mirror of
https://github.com/JetBrains/JetBrainsRuntime.git
synced 2025-12-06 09:29:38 +01:00
8329816: Add SLEEF version 3.6.1
Reviewed-by: erikj, mli, luhenry
This commit is contained in:
@@ -568,6 +568,10 @@ $(eval $(call SetupTarget, update-build-docs, \
|
||||
MAKEFILE := UpdateBuildDocs, \
|
||||
))
|
||||
|
||||
$(eval $(call SetupTarget, update-sleef-source, \
|
||||
MAKEFILE := UpdateSleefSource, \
|
||||
))
|
||||
|
||||
$(eval $(call SetupTarget, update-x11wrappers, \
|
||||
MAKEFILE := UpdateX11Wrappers, \
|
||||
DEPS := java.base-copy buildtools-jdk, \
|
||||
|
||||
153
make/UpdateSleefSource.gmk
Normal file
153
make/UpdateSleefSource.gmk
Normal file
@@ -0,0 +1,153 @@
|
||||
#
|
||||
# Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
|
||||
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
#
|
||||
# This code is free software; you can redistribute it and/or modify it
|
||||
# under the terms of the GNU General Public License version 2 only, as
|
||||
# published by the Free Software Foundation. Oracle designates this
|
||||
# particular file as subject to the "Classpath" exception as provided
|
||||
# by Oracle in the LICENSE file that accompanied this code.
|
||||
#
|
||||
# This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
# version 2 for more details (a copy is included in the LICENSE file that
|
||||
# accompanied this code).
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License version
|
||||
# 2 along with this work; if not, write to the Free Software Foundation,
|
||||
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
#
|
||||
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
# or visit www.oracle.com if you need additional information or have any
|
||||
# questions.
|
||||
#
|
||||
|
||||
################################################################################
|
||||
|
||||
default: all
|
||||
|
||||
include $(SPEC)
|
||||
include MakeBase.gmk
|
||||
|
||||
include CopyFiles.gmk
|
||||
include Execute.gmk
|
||||
|
||||
################################################################################
|
||||
# This file is responsible for updating the generated sleef source code files
|
||||
# that are checked in to the JDK repo, and that are actually used when building.
|
||||
# This target needs to be re-run every time the source code of libsleef is
|
||||
# updated from upstream.
|
||||
################################################################################
|
||||
|
||||
ifneq ($(COMPILE_TYPE), cross)
|
||||
$(error Only cross-compilation of libsleef is currently supported)
|
||||
endif
|
||||
|
||||
ifeq ($(CMAKE), )
|
||||
$(error CMake not found. Please install cmake and rerun configure)
|
||||
endif
|
||||
|
||||
ifneq ($(OPENJDK_BUILD_OS), linux)
|
||||
$(error This target is only supported on linux)
|
||||
endif
|
||||
|
||||
SLEEF_SUPPORT_DIR := $(MAKESUPPORT_OUTPUTDIR)/sleef
|
||||
SLEEF_SOURCE_BASE_DIR := $(TOPDIR)/src/jdk.incubator.vector/linux/native/libsleef
|
||||
SLEEF_SOURCE_DIR := $(SLEEF_SOURCE_BASE_DIR)/upstream
|
||||
SLEEF_TARGET_DIR := $(SLEEF_SOURCE_BASE_DIR)/generated
|
||||
SLEEF_NATIVE_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/native
|
||||
SLEEF_CROSS_BUILD_DIR := $(SLEEF_SUPPORT_DIR)/cross
|
||||
|
||||
ifeq ($(OPENJDK_TARGET_CPU), aarch64)
|
||||
CROSS_COMPILATION_FILENAMES := sleefinline_advsimd.h sleefinline_sve.h
|
||||
EXTRA_CROSS_OPTIONS := -DSLEEF_ENFORCE_SVE=TRUE
|
||||
else ifeq ($(OPENJDK_TARGET_CPU), riscv64)
|
||||
CROSS_COMPILATION_FILENAMES := sleefinline_rvvm1.h
|
||||
EXTRA_CROSS_OPTIONS := -DSLEEF_ENFORCE_RVVM1=TRUE
|
||||
else
|
||||
$(error Unsupported platform)
|
||||
endif
|
||||
CROSS_COMPILATION_SRC_FILES := $(addprefix $(SLEEF_CROSS_BUILD_DIR)/include/, \
|
||||
$(CROSS_COMPILATION_FILENAMES))
|
||||
|
||||
ifeq ($(TOOLCHAIN_TYPE), clang)
|
||||
SLEEF_TOOLCHAIN_TYPE := llvm
|
||||
else
|
||||
SLEEF_TOOLCHAIN_TYPE := $(TOOLCHAIN_TYPE)
|
||||
endif
|
||||
|
||||
SLEEF_CMAKE_FILE := toolchains/$(OPENJDK_TARGET_CPU)-$(SLEEF_TOOLCHAIN_TYPE).cmake
|
||||
|
||||
# We need to run CMake twice, first using it to configure the build, and then
|
||||
# to actually build; and we need to do this twice, once for a native build
|
||||
# and once for the cross-compilation build.
|
||||
|
||||
$(eval $(call SetupExecute, sleef_native_config, \
|
||||
INFO := Configuring native sleef build, \
|
||||
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
|
||||
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) -S . -B \
|
||||
$(SLEEF_NATIVE_BUILD_DIR), \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_native_config)
|
||||
|
||||
$(eval $(call SetupExecute, sleef_native_build, \
|
||||
INFO := Building native sleef, \
|
||||
DEPS := $(sleef_native_config), \
|
||||
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
|
||||
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) --build \
|
||||
$(SLEEF_NATIVE_BUILD_DIR) -j, \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_native_build)
|
||||
|
||||
$(eval $(call SetupExecute, sleef_cross_config, \
|
||||
INFO := Configuring cross-compiling sleef build, \
|
||||
DEPS := $(sleef_native_build), \
|
||||
OUTPUT_DIR := $(SLEEF_CROSS_BUILD_DIR), \
|
||||
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) -S . -B \
|
||||
$(SLEEF_CROSS_BUILD_DIR) \
|
||||
-DCMAKE_C_COMPILER=$(CC) \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$(SLEEF_CMAKE_FILE) \
|
||||
-DNATIVE_BUILD_DIR=$(SLEEF_NATIVE_BUILD_DIR) \
|
||||
-DSLEEF_BUILD_INLINE_HEADERS=TRUE \
|
||||
$(EXTRA_CROSS_OPTIONS), \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_cross_config)
|
||||
|
||||
$(eval $(call SetupExecute, sleef_cross_build, \
|
||||
INFO := Building cross-compiling sleef, \
|
||||
DEPS := $(sleef_cross_config), \
|
||||
OUTPUT_DIR := $(SLEEF_NATIVE_BUILD_DIR), \
|
||||
COMMAND := cd $(SLEEF_SOURCE_DIR) && $(CMAKE) --build \
|
||||
$(SLEEF_CROSS_BUILD_DIR) -j, \
|
||||
))
|
||||
|
||||
TARGETS := $(sleef_cross_build)
|
||||
|
||||
$(CROSS_COMPILATION_SRC_FILES): $(sleef_cross_build)
|
||||
|
||||
# Finally, copy the generated files (and one needed static file) into our
|
||||
# target directory.
|
||||
|
||||
$(eval $(call SetupCopyFiles, copy_static_sleef_source, \
|
||||
FILES := $(SLEEF_SOURCE_DIR)/src/common/misc.h, \
|
||||
DEST := $(SLEEF_TARGET_DIR), \
|
||||
))
|
||||
|
||||
TARGETS := $(copy_static_sleef_source)
|
||||
|
||||
$(eval $(call SetupCopyFiles, copy_generated_sleef_source, \
|
||||
FILES := $(CROSS_COMPILATION_SRC_FILES), \
|
||||
DEST := $(SLEEF_TARGET_DIR), \
|
||||
))
|
||||
|
||||
TARGETS := $(copy_generated_sleef_source)
|
||||
|
||||
################################################################################
|
||||
|
||||
all: $(TARGETS)
|
||||
|
||||
.PHONY: all default
|
||||
@@ -99,6 +99,7 @@ AC_DEFUN_ONCE([BASIC_SETUP_TOOLS],
|
||||
UTIL_REQUIRE_SPECIAL(FGREP, [AC_PROG_FGREP])
|
||||
|
||||
# Optional tools, we can do without them
|
||||
UTIL_LOOKUP_PROGS(CMAKE, cmake)
|
||||
UTIL_LOOKUP_PROGS(DF, df)
|
||||
UTIL_LOOKUP_PROGS(GIT, git)
|
||||
UTIL_LOOKUP_PROGS(NICE, nice)
|
||||
|
||||
@@ -719,6 +719,7 @@ CCACHE := @CCACHE@
|
||||
# CD is going away, but remains to cater for legacy makefiles.
|
||||
CD := cd
|
||||
CHMOD := @CHMOD@
|
||||
CMAKE := @CMAKE@
|
||||
CODESIGN := @CODESIGN@
|
||||
CP := @CP@
|
||||
CUT := @CUT@
|
||||
|
||||
439
src/jdk.incubator.vector/linux/legal/sleef.md
Normal file
439
src/jdk.incubator.vector/linux/legal/sleef.md
Normal file
@@ -0,0 +1,439 @@
|
||||
## SLEEF v3.6.1
|
||||
|
||||
### Notice
|
||||
```
|
||||
Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors
|
||||
|
||||
-------
|
||||
src/arch/helpersve.h has the following copyright:
|
||||
Copyright ARM Ltd. 2010 - 2024.
|
||||
-------
|
||||
src/gencoef/{dp.h, gencoef.c, ld.h, qp.h, simplexfr.c, sp.h} have no copyright but has the following license text:
|
||||
// The code is distributed under the Creative Commons Attribution 4.0 International License.
|
||||
// https://creativecommons.org/licenses/by/4.0/
|
||||
Attribution 4.0 International
|
||||
```
|
||||
|
||||
### LICENSE Boost v1.0
|
||||
```
|
||||
|
||||
Boost Software License - Version 1.0 - August 17th, 2003
|
||||
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
```
|
||||
|
||||
### LICENSE Creative Commons Attribution 4.0 International License
|
||||
|
||||
```
|
||||
Creative Commons Corporation ("Creative Commons") is not a law firm and
|
||||
does not provide legal services or legal advice. Distribution of
|
||||
Creative Commons public licenses does not create a lawyer-client or
|
||||
other relationship. Creative Commons makes its licenses and related
|
||||
information available on an "as-is" basis. Creative Commons gives no
|
||||
warranties regarding its licenses, any material licensed under their
|
||||
terms and conditions, or any related information. Creative Commons
|
||||
disclaims all liability for damages resulting from their use to the
|
||||
fullest extent possible.
|
||||
|
||||
Using Creative Commons Public Licenses
|
||||
|
||||
Creative Commons public licenses provide a standard set of terms and
|
||||
conditions that creators and other rights holders may use to share
|
||||
original works of authorship and other material subject to copyright
|
||||
and certain other rights specified in the public license below. The
|
||||
following considerations are for informational purposes only, are not
|
||||
exhaustive, and do not form part of our licenses.
|
||||
|
||||
Considerations for licensors: Our public licenses are
|
||||
intended for use by those authorized to give the public
|
||||
permission to use material in ways otherwise restricted by
|
||||
copyright and certain other rights. Our licenses are
|
||||
irrevocable. Licensors should read and understand the terms
|
||||
and conditions of the license they choose before applying it.
|
||||
Licensors should also secure all rights necessary before
|
||||
applying our licenses so that the public can reuse the
|
||||
material as expected. Licensors should clearly mark any
|
||||
material not subject to the license. This includes other CC-
|
||||
licensed material, or material used under an exception or
|
||||
limitation to copyright. More considerations for licensors:
|
||||
wiki.creativecommons.org/Considerations_for_licensors
|
||||
|
||||
Considerations for the public: By using one of our public
|
||||
licenses, a licensor grants the public permission to use the
|
||||
licensed material under specified terms and conditions. If
|
||||
the licensor's permission is not necessary for any reason--for
|
||||
example, because of any applicable exception or limitation to
|
||||
copyright--then that use is not regulated by the license. Our
|
||||
licenses grant only permissions under copyright and certain
|
||||
other rights that a licensor has authority to grant. Use of
|
||||
the licensed material may still be restricted for other
|
||||
reasons, including because others have copyright or other
|
||||
rights in the material. A licensor may make special requests,
|
||||
such as asking that all changes be marked or described.
|
||||
Although not required by our licenses, you are encouraged to
|
||||
respect those requests where reasonable. More considerations
|
||||
for the public:
|
||||
wiki.creativecommons.org/Considerations_for_licensees
|
||||
|
||||
=======================================================================
|
||||
|
||||
Creative Commons Attribution 4.0 International Public License
|
||||
|
||||
By exercising the Licensed Rights (defined below), You accept and agree
|
||||
to be bound by the terms and conditions of this Creative Commons
|
||||
Attribution 4.0 International Public License ("Public License"). To the
|
||||
extent this Public License may be interpreted as a contract, You are
|
||||
granted the Licensed Rights in consideration of Your acceptance of
|
||||
these terms and conditions, and the Licensor grants You such rights in
|
||||
consideration of benefits the Licensor receives from making the
|
||||
Licensed Material available under these terms and conditions.
|
||||
|
||||
|
||||
Section 1 -- Definitions.
|
||||
|
||||
a. Adapted Material means material subject to Copyright and Similar
|
||||
Rights that is derived from or based upon the Licensed Material
|
||||
and in which the Licensed Material is translated, altered,
|
||||
arranged, transformed, or otherwise modified in a manner requiring
|
||||
permission under the Copyright and Similar Rights held by the
|
||||
Licensor. For purposes of this Public License, where the Licensed
|
||||
Material is a musical work, performance, or sound recording,
|
||||
Adapted Material is always produced where the Licensed Material is
|
||||
synched in timed relation with a moving image.
|
||||
|
||||
b. Adapter's License means the license You apply to Your Copyright
|
||||
and Similar Rights in Your contributions to Adapted Material in
|
||||
accordance with the terms and conditions of this Public License.
|
||||
|
||||
c. Copyright and Similar Rights means copyright and/or similar rights
|
||||
closely related to copyright including, without limitation,
|
||||
performance, broadcast, sound recording, and Sui Generis Database
|
||||
Rights, without regard to how the rights are labeled or
|
||||
categorized. For purposes of this Public License, the rights
|
||||
specified in Section 2(b)(1)-(2) are not Copyright and Similar
|
||||
Rights.
|
||||
|
||||
d. Effective Technological Measures means those measures that, in the
|
||||
absence of proper authority, may not be circumvented under laws
|
||||
fulfilling obligations under Article 11 of the WIPO Copyright
|
||||
Treaty adopted on December 20, 1996, and/or similar international
|
||||
agreements.
|
||||
|
||||
e. Exceptions and Limitations means fair use, fair dealing, and/or
|
||||
any other exception or limitation to Copyright and Similar Rights
|
||||
that applies to Your use of the Licensed Material.
|
||||
|
||||
f. Licensed Material means the artistic or literary work, database,
|
||||
or other material to which the Licensor applied this Public
|
||||
License.
|
||||
|
||||
g. Licensed Rights means the rights granted to You subject to the
|
||||
terms and conditions of this Public License, which are limited to
|
||||
all Copyright and Similar Rights that apply to Your use of the
|
||||
Licensed Material and that the Licensor has authority to license.
|
||||
|
||||
h. Licensor means the individual(s) or entity(ies) granting rights
|
||||
under this Public License.
|
||||
|
||||
i. Share means to provide material to the public by any means or
|
||||
process that requires permission under the Licensed Rights, such
|
||||
as reproduction, public display, public performance, distribution,
|
||||
dissemination, communication, or importation, and to make material
|
||||
available to the public including in ways that members of the
|
||||
public may access the material from a place and at a time
|
||||
individually chosen by them.
|
||||
|
||||
j. Sui Generis Database Rights means rights other than copyright
|
||||
resulting from Directive 96/9/EC of the European Parliament and of
|
||||
the Council of 11 March 1996 on the legal protection of databases,
|
||||
as amended and/or succeeded, as well as other essentially
|
||||
equivalent rights anywhere in the world.
|
||||
|
||||
k. You means the individual or entity exercising the Licensed Rights
|
||||
under this Public License. Your has a corresponding meaning.
|
||||
|
||||
|
||||
Section 2 -- Scope.
|
||||
|
||||
a. License grant.
|
||||
|
||||
1. Subject to the terms and conditions of this Public License,
|
||||
the Licensor hereby grants You a worldwide, royalty-free,
|
||||
non-sublicensable, non-exclusive, irrevocable license to
|
||||
exercise the Licensed Rights in the Licensed Material to:
|
||||
|
||||
a. reproduce and Share the Licensed Material, in whole or
|
||||
in part; and
|
||||
|
||||
b. produce, reproduce, and Share Adapted Material.
|
||||
|
||||
2. Exceptions and Limitations. For the avoidance of doubt, where
|
||||
Exceptions and Limitations apply to Your use, this Public
|
||||
License does not apply, and You do not need to comply with
|
||||
its terms and conditions.
|
||||
|
||||
3. Term. The term of this Public License is specified in Section
|
||||
6(a).
|
||||
|
||||
4. Media and formats; technical modifications allowed. The
|
||||
Licensor authorizes You to exercise the Licensed Rights in
|
||||
all media and formats whether now known or hereafter created,
|
||||
and to make technical modifications necessary to do so. The
|
||||
Licensor waives and/or agrees not to assert any right or
|
||||
authority to forbid You from making technical modifications
|
||||
necessary to exercise the Licensed Rights, including
|
||||
technical modifications necessary to circumvent Effective
|
||||
Technological Measures. For purposes of this Public License,
|
||||
simply making modifications authorized by this Section 2(a)
|
||||
(4) never produces Adapted Material.
|
||||
|
||||
5. Downstream recipients.
|
||||
|
||||
a. Offer from the Licensor -- Licensed Material. Every
|
||||
recipient of the Licensed Material automatically
|
||||
receives an offer from the Licensor to exercise the
|
||||
Licensed Rights under the terms and conditions of this
|
||||
Public License.
|
||||
|
||||
b. No downstream restrictions. You may not offer or impose
|
||||
any additional or different terms or conditions on, or
|
||||
apply any Effective Technological Measures to, the
|
||||
Licensed Material if doing so restricts exercise of the
|
||||
Licensed Rights by any recipient of the Licensed
|
||||
Material.
|
||||
|
||||
6. No endorsement. Nothing in this Public License constitutes or
|
||||
may be construed as permission to assert or imply that You
|
||||
are, or that Your use of the Licensed Material is, connected
|
||||
with, or sponsored, endorsed, or granted official status by,
|
||||
the Licensor or others designated to receive attribution as
|
||||
provided in Section 3(a)(1)(A)(i).
|
||||
|
||||
b. Other rights.
|
||||
|
||||
1. Moral rights, such as the right of integrity, are not
|
||||
licensed under this Public License, nor are publicity,
|
||||
privacy, and/or other similar personality rights; however, to
|
||||
the extent possible, the Licensor waives and/or agrees not to
|
||||
assert any such rights held by the Licensor to the limited
|
||||
extent necessary to allow You to exercise the Licensed
|
||||
Rights, but not otherwise.
|
||||
|
||||
2. Patent and trademark rights are not licensed under this
|
||||
Public License.
|
||||
|
||||
3. To the extent possible, the Licensor waives any right to
|
||||
collect royalties from You for the exercise of the Licensed
|
||||
Rights, whether directly or through a collecting society
|
||||
under any voluntary or waivable statutory or compulsory
|
||||
licensing scheme. In all other cases the Licensor expressly
|
||||
reserves any right to collect such royalties.
|
||||
|
||||
|
||||
Section 3 -- License Conditions.
|
||||
|
||||
Your exercise of the Licensed Rights is expressly made subject to the
|
||||
following conditions.
|
||||
|
||||
a. Attribution.
|
||||
|
||||
1. If You Share the Licensed Material (including in modified
|
||||
form), You must:
|
||||
|
||||
a. retain the following if it is supplied by the Licensor
|
||||
with the Licensed Material:
|
||||
|
||||
i. identification of the creator(s) of the Licensed
|
||||
Material and any others designated to receive
|
||||
attribution, in any reasonable manner requested by
|
||||
the Licensor (including by pseudonym if
|
||||
designated);
|
||||
|
||||
ii. a copyright notice;
|
||||
|
||||
iii. a notice that refers to this Public License;
|
||||
|
||||
iv. a notice that refers to the disclaimer of
|
||||
warranties;
|
||||
|
||||
v. a URI or hyperlink to the Licensed Material to the
|
||||
extent reasonably practicable;
|
||||
|
||||
b. indicate if You modified the Licensed Material and
|
||||
retain an indication of any previous modifications; and
|
||||
|
||||
c. indicate the Licensed Material is licensed under this
|
||||
Public License, and include the text of, or the URI or
|
||||
hyperlink to, this Public License.
|
||||
|
||||
2. You may satisfy the conditions in Section 3(a)(1) in any
|
||||
reasonable manner based on the medium, means, and context in
|
||||
which You Share the Licensed Material. For example, it may be
|
||||
reasonable to satisfy the conditions by providing a URI or
|
||||
hyperlink to a resource that includes the required
|
||||
information.
|
||||
|
||||
3. If requested by the Licensor, You must remove any of the
|
||||
information required by Section 3(a)(1)(A) to the extent
|
||||
reasonably practicable.
|
||||
|
||||
4. If You Share Adapted Material You produce, the Adapter's
|
||||
License You apply must not prevent recipients of the Adapted
|
||||
Material from complying with this Public License.
|
||||
|
||||
|
||||
Section 4 -- Sui Generis Database Rights.
|
||||
|
||||
Where the Licensed Rights include Sui Generis Database Rights that
|
||||
apply to Your use of the Licensed Material:
|
||||
|
||||
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
|
||||
to extract, reuse, reproduce, and Share all or a substantial
|
||||
portion of the contents of the database;
|
||||
|
||||
b. if You include all or a substantial portion of the database
|
||||
contents in a database in which You have Sui Generis Database
|
||||
Rights, then the database in which You have Sui Generis Database
|
||||
Rights (but not its individual contents) is Adapted Material; and
|
||||
|
||||
c. You must comply with the conditions in Section 3(a) if You Share
|
||||
all or a substantial portion of the contents of the database.
|
||||
|
||||
For the avoidance of doubt, this Section 4 supplements and does not
|
||||
replace Your obligations under this Public License where the Licensed
|
||||
Rights include other Copyright and Similar Rights.
|
||||
|
||||
|
||||
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
|
||||
|
||||
a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
|
||||
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
|
||||
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
|
||||
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
|
||||
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
|
||||
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
|
||||
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
|
||||
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
|
||||
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
|
||||
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
|
||||
|
||||
b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
|
||||
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
|
||||
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
|
||||
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
|
||||
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
|
||||
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
|
||||
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
|
||||
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
|
||||
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
|
||||
|
||||
c. The disclaimer of warranties and limitation of liability provided
|
||||
above shall be interpreted in a manner that, to the extent
|
||||
possible, most closely approximates an absolute disclaimer and
|
||||
waiver of all liability.
|
||||
|
||||
|
||||
Section 6 -- Term and Termination.
|
||||
|
||||
a. This Public License applies for the term of the Copyright and
|
||||
Similar Rights licensed here. However, if You fail to comply with
|
||||
this Public License, then Your rights under this Public License
|
||||
terminate automatically.
|
||||
|
||||
b. Where Your right to use the Licensed Material has terminated under
|
||||
Section 6(a), it reinstates:
|
||||
|
||||
1. automatically as of the date the violation is cured, provided
|
||||
it is cured within 30 days of Your discovery of the
|
||||
violation; or
|
||||
|
||||
2. upon express reinstatement by the Licensor.
|
||||
|
||||
For the avoidance of doubt, this Section 6(b) does not affect any
|
||||
right the Licensor may have to seek remedies for Your violations
|
||||
of this Public License.
|
||||
|
||||
c. For the avoidance of doubt, the Licensor may also offer the
|
||||
Licensed Material under separate terms or conditions or stop
|
||||
distributing the Licensed Material at any time; however, doing so
|
||||
will not terminate this Public License.
|
||||
|
||||
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
|
||||
License.
|
||||
|
||||
|
||||
Section 7 -- Other Terms and Conditions.
|
||||
|
||||
a. The Licensor shall not be bound by any additional or different
|
||||
terms or conditions communicated by You unless expressly agreed.
|
||||
|
||||
b. Any arrangements, understandings, or agreements regarding the
|
||||
Licensed Material not stated herein are separate from and
|
||||
independent of the terms and conditions of this Public License.
|
||||
|
||||
|
||||
Section 8 -- Interpretation.
|
||||
|
||||
a. For the avoidance of doubt, this Public License does not, and
|
||||
shall not be interpreted to, reduce, limit, restrict, or impose
|
||||
conditions on any use of the Licensed Material that could lawfully
|
||||
be made without permission under this Public License.
|
||||
|
||||
b. To the extent possible, if any provision of this Public License is
|
||||
deemed unenforceable, it shall be automatically reformed to the
|
||||
minimum extent necessary to make it enforceable. If the provision
|
||||
cannot be reformed, it shall be severed from this Public License
|
||||
without affecting the enforceability of the remaining terms and
|
||||
conditions.
|
||||
|
||||
c. No term or condition of this Public License will be waived and no
|
||||
failure to comply consented to unless expressly agreed to by the
|
||||
Licensor.
|
||||
|
||||
d. Nothing in this Public License constitutes or may be interpreted
|
||||
as a limitation upon, or waiver of, any privileges and immunities
|
||||
that apply to the Licensor or You, including from the legal
|
||||
processes of any jurisdiction or authority.
|
||||
|
||||
|
||||
=======================================================================
|
||||
|
||||
Creative Commons is not a party to its public
|
||||
licenses. Notwithstanding, Creative Commons may elect to apply one of
|
||||
its public licenses to material it publishes and in those instances
|
||||
will be considered the “Licensor.” The text of the Creative Commons
|
||||
public licenses is dedicated to the public domain under the CC0 Public
|
||||
Domain Dedication. Except for the limited purpose of indicating that
|
||||
material is shared under a Creative Commons public license or as
|
||||
otherwise permitted by the Creative Commons policies published at
|
||||
creativecommons.org/policies, Creative Commons does not authorize the
|
||||
use of the trademark "Creative Commons" or any other trademark or logo
|
||||
of Creative Commons without its prior written consent including,
|
||||
without limitation, in connection with any unauthorized modifications
|
||||
to any of its public licenses or any other arrangements,
|
||||
understandings, or agreements concerning use of licensed material. For
|
||||
the avoidance of doubt, this paragraph does not form part of the
|
||||
public licenses.
|
||||
|
||||
Creative Commons may be contacted at creativecommons.org.
|
||||
```
|
||||
54
src/jdk.incubator.vector/linux/native/libsleef/README.md
Normal file
54
src/jdk.incubator.vector/linux/native/libsleef/README.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# About SLEEF
|
||||
|
||||
This directory contains the source code for the SLEEF library, the
|
||||
**SIMD Library for Evaluating Elementary Functions**. For more information on
|
||||
SLEEF, see https://sleef.org/.
|
||||
|
||||
The currently imported libsleef sources is version 3.6.1, which has
|
||||
git tag `3.6.1` and git commit hash `6ee14bcae5fe92c2ff8b000d5a01102dab08d774`.
|
||||
|
||||
# About the libsleef integration in the JDK
|
||||
|
||||
The upstream original source code is available in
|
||||
`src/jdk.incubator.vector/linux/native/libsleef/upstream`. However, this code is
|
||||
not directly usable in the JDK build system, but is instead used as the base for
|
||||
the generation of additional souce code files. This generation is done by
|
||||
the libsleef CMake files. If this should have been done at build time, it would
|
||||
have meant adding CMake as a required dependency to build the JDK.
|
||||
|
||||
Instead, we create these generated files only once, when we import a new
|
||||
version of the libsleef source code, and check in the generated files into
|
||||
the JDK source tree. The generated files reside in
|
||||
`src/jdk.incubator.vector/linux/native/libsleef/generated`.
|
||||
|
||||
# Import instructions
|
||||
|
||||
To update the version of libsleef that is used in the JDK, clone
|
||||
`https://github.com/shibatch/sleef.git`, and copy all files, except the `docs`,
|
||||
`.github` and `.git` directories, into
|
||||
`src/jdk.incubator.vector/linux/native/libsleef/upstream`.
|
||||
|
||||
The libsleef source code does not follow the JDK whitespace rules as enforced by
|
||||
jcheck. You will need to remove trailing whitespace, and expand tabs to 8
|
||||
spaces in the imported source code.
|
||||
|
||||
Update the note above with information about what version you import.
|
||||
|
||||
You will need to repeat the process below for each of the platforms in the JDK
|
||||
that uses libsleef; currently this is aarch64 and riscv64. The rest of this
|
||||
instruction assumes you are doing this on linux/x64; at this point, any other
|
||||
setup is not supported. Also, make sure you have CMake installed.
|
||||
|
||||
First, run configure for cross-compiling to your selected target platform
|
||||
(e.g. aarch64).
|
||||
|
||||
Run `make update-sleef-source` to process the upstream source code and
|
||||
store the generated files in the `generated` directory.
|
||||
|
||||
Now, you can repeat this for the next platform. For instance, you can
|
||||
create a separate configuration using `configure --with-conf-name=riscv64` and
|
||||
then generate the updated libsleef source code by
|
||||
`make update-sleef-source CONF=riscv64`.
|
||||
|
||||
Finally, verify with git that the local changes made to the files in
|
||||
`src/jdk.incubator.vector/linux/native/libsleef/generated` look okay.
|
||||
332
src/jdk.incubator.vector/linux/native/libsleef/generated/misc.h
Normal file
332
src/jdk.incubator.vector/linux/native/libsleef/generated/misc.h
Normal file
@@ -0,0 +1,332 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
//
|
||||
|
||||
#ifndef __MISC_H__
|
||||
#define __MISC_H__
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.141592653589793238462643383279502884
|
||||
#endif
|
||||
|
||||
#ifndef M_PIl
|
||||
#define M_PIl 3.141592653589793238462643383279502884L
|
||||
#endif
|
||||
|
||||
#ifndef M_1_PI
|
||||
#define M_1_PI 0.318309886183790671537767526745028724
|
||||
#endif
|
||||
|
||||
#ifndef M_1_PIl
|
||||
#define M_1_PIl 0.318309886183790671537767526745028724L
|
||||
#endif
|
||||
|
||||
#ifndef M_2_PI
|
||||
#define M_2_PI 0.636619772367581343075535053490057448
|
||||
#endif
|
||||
|
||||
#ifndef M_2_PIl
|
||||
#define M_2_PIl 0.636619772367581343075535053490057448L
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef SLEEF_FP_ILOGB0
|
||||
#define SLEEF_FP_ILOGB0 ((int)0x80000000)
|
||||
#endif
|
||||
|
||||
#ifndef SLEEF_FP_ILOGBNAN
|
||||
#define SLEEF_FP_ILOGBNAN ((int)2147483647)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)
|
||||
#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)
|
||||
|
||||
#define SLEEF_FLT_MIN 0x1p-126
|
||||
#define SLEEF_DBL_MIN 0x1p-1022
|
||||
#define SLEEF_INT_MAX 2147483647
|
||||
#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324
|
||||
#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F
|
||||
|
||||
//
|
||||
|
||||
/*
|
||||
PI_A to PI_D are constants that satisfy the following two conditions.
|
||||
|
||||
* For PI_A, PI_B and PI_C, the last 28 bits are zero.
|
||||
* PI_A + PI_B + PI_C + PI_D is close to PI as much as possible.
|
||||
|
||||
The argument of a trig function is multiplied by 1/PI, and the
|
||||
integral part is divided into two parts, each has at most 28
|
||||
bits. So, the maximum argument that could be correctly reduced
|
||||
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
|
||||
double precision calculation, the actual maximum argument that can
|
||||
be correctly reduced is around 2^47.
|
||||
*/
|
||||
|
||||
#define PI_A 3.1415926218032836914
|
||||
#define PI_B 3.1786509424591713469e-08
|
||||
#define PI_C 1.2246467864107188502e-16
|
||||
#define PI_D 1.2736634327021899816e-24
|
||||
#define TRIGRANGEMAX 1e+14
|
||||
|
||||
/*
|
||||
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
|
||||
|
||||
* The last 3 bits of PI_A2 are zero.
|
||||
* PI_A2 + PI_B2 is close to PI as much as possible.
|
||||
|
||||
The argument of a trig function is multiplied by 1/PI, and the
|
||||
integral part is multiplied by PI_A2. So, the maximum argument that
|
||||
could be correctly reduced should be 2^(3-1) PI = 12.6. By testing,
|
||||
we confirmed that it correctly reduces the argument up to around 15.
|
||||
*/
|
||||
|
||||
#define PI_A2 3.141592653589793116
|
||||
#define PI_B2 1.2246467991473532072e-16
|
||||
#define TRIGRANGEMAX2 15
|
||||
|
||||
#define M_2_PI_H 0.63661977236758138243
|
||||
#define M_2_PI_L -3.9357353350364971764e-17
|
||||
|
||||
#define SQRT_DBL_MAX 1.3407807929942596355e+154
|
||||
|
||||
#define TRIGRANGEMAX3 1e+9
|
||||
|
||||
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
|
||||
|
||||
#define L2U .69314718055966295651160180568695068359375
|
||||
#define L2L .28235290563031577122588448175013436025525412068e-12
|
||||
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
|
||||
|
||||
#define L10U 0.30102999566383914498 // log 2 / log 10
|
||||
#define L10L 1.4205023227266099418e-13
|
||||
#define LOG10_2 3.3219280948873623478703194294893901758648313930
|
||||
|
||||
#define L10Uf 0.3010253906f
|
||||
#define L10Lf 4.605038981e-06f
|
||||
|
||||
//
|
||||
|
||||
#define PI_Af 3.140625f
|
||||
#define PI_Bf 0.0009670257568359375f
|
||||
#define PI_Cf 6.2771141529083251953e-07f
|
||||
#define PI_Df 1.2154201256553420762e-10f
|
||||
#define TRIGRANGEMAXf 39000
|
||||
|
||||
#define PI_A2f 3.1414794921875f
|
||||
#define PI_B2f 0.00011315941810607910156f
|
||||
#define PI_C2f 1.9841872589410058936e-09f
|
||||
#define TRIGRANGEMAX2f 125.0f
|
||||
|
||||
#define TRIGRANGEMAX4f 8e+6f
|
||||
|
||||
#define SQRT_FLT_MAX 18446743523953729536.0
|
||||
|
||||
#define L2Uf 0.693145751953125f
|
||||
#define L2Lf 1.428606765330187045e-06f
|
||||
|
||||
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef ABS
|
||||
#define ABS(x) ((x) < 0 ? -(x) : (x))
|
||||
#endif
|
||||
|
||||
#define stringify(s) stringify_(s)
|
||||
#define stringify_(s) #s
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
typedef long double longdouble;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_double2_DEFINED
|
||||
typedef struct {
|
||||
double x, y;
|
||||
} Sleef_double2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_float2_DEFINED
|
||||
typedef struct {
|
||||
float x, y;
|
||||
} Sleef_float2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_longdouble2_DEFINED
|
||||
typedef struct {
|
||||
long double x, y;
|
||||
} Sleef_longdouble2;
|
||||
#endif
|
||||
|
||||
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
|
||||
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
|
||||
#define RESTRICT __restrict__
|
||||
|
||||
#ifndef __arm__
|
||||
#define ALIGNED(x) __attribute__((aligned(x)))
|
||||
#else
|
||||
#define ALIGNED(x)
|
||||
#endif
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE SLEEF_ALWAYS_INLINE
|
||||
#define EXPORT SLEEF_INLINE
|
||||
#define CONST SLEEF_CONST
|
||||
#define NOEXPORT
|
||||
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define CONST __attribute__((const))
|
||||
#define INLINE __attribute__((always_inline))
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __stdcall __declspec(dllexport)
|
||||
#define NOEXPORT
|
||||
#else // #ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT
|
||||
#define NOEXPORT
|
||||
#endif // #ifndef SLEEF_STATIC_LIBS
|
||||
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
#define EXPORT __attribute__((visibility("default")))
|
||||
#define NOEXPORT __attribute__ ((visibility ("hidden")))
|
||||
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
|
||||
#endif // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define SLEEF_NAN __builtin_nan("")
|
||||
#define SLEEF_NANf __builtin_nanf("")
|
||||
#define SLEEF_NANl __builtin_nanl("")
|
||||
#define SLEEF_INFINITY __builtin_inf()
|
||||
#define SLEEF_INFINITYf __builtin_inff()
|
||||
#define SLEEF_INFINITYl __builtin_infl()
|
||||
|
||||
#if defined(__INTEL_COMPILER) || defined (__clang__)
|
||||
#define SLEEF_INFINITYq __builtin_inf()
|
||||
#define SLEEF_NANq __builtin_nan("")
|
||||
#else
|
||||
#define SLEEF_INFINITYq __builtin_infq()
|
||||
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
|
||||
#endif
|
||||
|
||||
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE SLEEF_ALWAYS_INLINE
|
||||
#define CONST SLEEF_CONST
|
||||
#define EXPORT SLEEF_INLINE
|
||||
#define NOEXPORT
|
||||
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE __forceinline
|
||||
#define CONST
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __declspec(dllexport)
|
||||
#define NOEXPORT
|
||||
#else
|
||||
#define EXPORT
|
||||
#define NOEXPORT
|
||||
#endif
|
||||
|
||||
#endif // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define RESTRICT
|
||||
#define ALIGNED(x)
|
||||
#define LIKELY(condition) (condition)
|
||||
#define UNLIKELY(condition) (condition)
|
||||
|
||||
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#define SLEEF_INFINITY (1e+300 * 1e+300)
|
||||
#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)
|
||||
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
|
||||
#define SLEEF_NANf ((float)SLEEF_NAN)
|
||||
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
|
||||
#define SLEEF_NANl ((long double)SLEEF_NAN)
|
||||
|
||||
#if (defined(_M_AMD64) || defined(_M_X64))
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#define __SSE3__
|
||||
#define __SSE4_1__
|
||||
#endif
|
||||
#elif _M_IX86_FP == 2
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#define __SSE3__
|
||||
#define __SSE4_1__
|
||||
#endif
|
||||
#elif _M_IX86_FP == 1
|
||||
#ifndef __SSE__
|
||||
#define __SSE__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#if !defined(__linux__)
|
||||
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
|
||||
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
|
||||
#define isnanf(x) ((x) != (x))
|
||||
#define isnanl(x) ((x) != (x))
|
||||
#endif
|
||||
|
||||
#endif // #ifndef __MISC_H__
|
||||
|
||||
#ifdef ENABLE_AAVPCS
|
||||
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
|
||||
#else
|
||||
#define VECTOR_CC
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
|
||||
#if !defined (__clang__)
|
||||
#pragma GCC diagnostic ignored "-Wattribute-alias"
|
||||
#pragma GCC diagnostic ignored "-Wlto-type-mismatch"
|
||||
#pragma GCC diagnostic ignored "-Wstringop-overflow"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable:4101) // warning C4101: 'v': unreferenced local variable
|
||||
#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
|
||||
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
|
||||
#pragma warning(disable:4267) // warning C4267: 'initializing': conversion from 'size_t' to 'const int', possible loss of data
|
||||
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
|
||||
#endif
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,255 @@
|
||||
## 3.6.1 - 2024-06-10
|
||||
|
||||
This patch release provides important bug fixes, including a fix
|
||||
for API compatibility with 3.5 (#534).
|
||||
The support and test for some features is still limited, as
|
||||
documented in [README](./README.md), however significant progress
|
||||
was made in order to test on Linux, macOS and Windows.
|
||||
|
||||
### Added
|
||||
- Add support for RISC-V in DFT, QUAD and inline headers (#503,
|
||||
#522).
|
||||
- Add GHA workflow to run CI tests on Windows x86 (#540) and macOS
|
||||
x86/aarch64 (#543). And update test matrix.
|
||||
- Add GHA workflows to run examples in CI (#550).
|
||||
|
||||
### Changed
|
||||
- Cleanup/Improve support for RISC-V in LIBM (#520, #521).
|
||||
- Update supported environment in documentation (#529, #549),
|
||||
including website and test matrix from README.
|
||||
|
||||
### Fixed
|
||||
- Major fix and cleanup of CMakeLists.txt (#531).
|
||||
- Fix compatibility issue after removal of quad and long double
|
||||
sincospi (#545). Restores functions that are missing in 3.6.
|
||||
- Various bug fixes (#528, #533, #536, #537).
|
||||
|
||||
## 3.6 - 2024-02-14
|
||||
|
||||
This release follows a long period of inactivity. The library is now
|
||||
being actively maintained. However, the support and test for some
|
||||
features is currently limited, as documented in [README](./README.md).
|
||||
|
||||
### Added
|
||||
- Add documentation for the quad precision math library
|
||||
- Enable generation of inline header file for CUDA (PR #337)
|
||||
- Add support for System/390 z15 support (PR #343)
|
||||
- Add support for POWER 9 (PR #360)
|
||||
- Add quad-precision functions (PR #375, #377, #380, #381, #382, #383,
|
||||
#385, #386, #387)
|
||||
- Add preliminary support for iOS and Android (PR #388, #389)
|
||||
- Add OpenMP pragmas to the function declarations in sleef.h to enable
|
||||
auto-vectorization by GCC (PR #404, #406)
|
||||
- Add new public CI test infrastructure using GitHub Actions (PR #476)
|
||||
- Add support for RISC-V in libm (PR #477)
|
||||
|
||||
### Removed
|
||||
- Remove old CI scripts based on Travis/Jenkins/Appveyor (PR #502)
|
||||
|
||||
### Changed
|
||||
- Optimise error functions (PR #370)
|
||||
- Update CMake package config (PR #412)
|
||||
- Update documentation and move doc/website to main repository (PR #504,
|
||||
#513)
|
||||
- Add SLEEF_ prefix to user-facing CMake options (PR #509)
|
||||
- Disable SVE on Darwin (PR #512)
|
||||
|
||||
### Fixed
|
||||
- Fix parallel builds with GNU make (PR #491)
|
||||
- Various bug fixes (PR #492, #499, #508)
|
||||
|
||||
## 3.5.1 - 2020-09-15
|
||||
### Changed
|
||||
- Fixed a bug in handling compiler options
|
||||
|
||||
## 3.5 - 2020-09-01
|
||||
- IBM System/390 support is added.
|
||||
- The library can be built with Clang on Windows.
|
||||
- Static libraries with LTO can be generated.
|
||||
- Alternative division and sqrt methods can be chosen with AArch64.
|
||||
- Header files for inlining the whole SLEEF functions can be generated.
|
||||
- IEEE remainder function is added.
|
||||
- GCC-10 can now build SLEEF with SVE support.
|
||||
|
||||
## 3.4.1 - 2019-10-01
|
||||
### Changed
|
||||
- Fixed accuracy problem with tan_u35, atan_u10, log2f_u35 and exp10f_u10.
|
||||
https://github.com/shibatch/sleef/pull/260
|
||||
https://github.com/shibatch/sleef/pull/265
|
||||
https://github.com/shibatch/sleef/pull/267
|
||||
- SVE intrinsics that are not supported in newer ACLE are replaced.
|
||||
https://github.com/shibatch/sleef/pull/268
|
||||
- FMA4 detection problem is fixed.
|
||||
https://github.com/shibatch/sleef/pull/262
|
||||
- Compilation problem under Windows with MinGW is fixed.
|
||||
https://github.com/shibatch/sleef/pull/266
|
||||
|
||||
## 3.4 - 2019-04-28
|
||||
### Added
|
||||
- Faster and low precision functions are added.
|
||||
https://github.com/shibatch/sleef/pull/229
|
||||
- Functions that return consistent results across platforms are
|
||||
added
|
||||
https://github.com/shibatch/sleef/pull/216
|
||||
https://github.com/shibatch/sleef/pull/224
|
||||
- Quad precision math library(libsleefquad) is added
|
||||
https://github.com/shibatch/sleef/pull/235
|
||||
https://github.com/shibatch/sleef/pull/237
|
||||
https://github.com/shibatch/sleef/pull/240
|
||||
- AArch64 Vector Procedure Call Standard (AAVPCS) support.
|
||||
### Changed
|
||||
- Many functions are now faster
|
||||
- Testers are now faster
|
||||
|
||||
## 3.3.1 - 2018-08-20
|
||||
### Added
|
||||
- FreeBSD support is added
|
||||
### Changed
|
||||
- i386 build problem is fixed
|
||||
- Trigonometric functions now evaluate correctly with full FP
|
||||
domain.
|
||||
https://github.com/shibatch/sleef/pull/210
|
||||
|
||||
## 3.3 - 2018-07-06
|
||||
### Added
|
||||
- SVE target support is added to libsleef.
|
||||
https://github.com/shibatch/sleef/pull/180
|
||||
- SVE target support is added to DFT. With this patch, DFT operations
|
||||
can be carried out using 256, 512, 1024 and 2048-bit wide vectors
|
||||
according to runtime availability of vector registers and operators.
|
||||
https://github.com/shibatch/sleef/pull/182
|
||||
- 3.5-ULP versions of sinh, cosh, tanh, sinhf, coshf, tanhf, and the
|
||||
corresponding testing functionalities are added.
|
||||
https://github.com/shibatch/sleef/pull/192
|
||||
- Power VSX target support is added to libsleef.
|
||||
https://github.com/shibatch/sleef/pull/195
|
||||
- Payne-Hanek like argument reduction is added to libsleef.
|
||||
https://github.com/shibatch/sleef/pull/197
|
||||
|
||||
## 3.2 - 2018-02-26
|
||||
### Added
|
||||
- The whole build system of the project migrated from makefiles to
|
||||
cmake. In particualr this includes `libsleef`, `libsleefgnuabi`,
|
||||
`libdft` and all the tests.
|
||||
- Benchmarks that compare `libsleef` vs `SVML` on X86 Linux are
|
||||
available in the project tree under src/libm-benchmarks directory.
|
||||
- Extensive upstream testing via Travis CI and Appveyor, on the
|
||||
following systems:
|
||||
* OS: Windows / Linux / OSX.
|
||||
* Compilers: gcc / clang / MSVC.
|
||||
* Targets: X86 (SSE/AVX/AVX2/AVX512F), AArch64 (Advanced SIMD), ARM
|
||||
(NEON). Emulators like QEMU or SDE can be used to run the tests.
|
||||
- Added the following new vector functions (with relative testing):
|
||||
* `log2`
|
||||
- New compatibility tests have been added to check that
|
||||
`libsleefgnuabi` exports the GNUABI symbols correctly.
|
||||
- The library can be compiled to an LLVM bitcode object.
|
||||
- Added masked interface to the library to support AVX512F masked
|
||||
vectorization.
|
||||
|
||||
### Changed
|
||||
- Use native instructions if available for `sqrt`.
|
||||
- Fixed fmax and fmin behavior on AArch64:
|
||||
https://github.com/shibatch/sleef/pull/140
|
||||
- Speed improvements for `asin`, `acos`, `fmod` and `log`. Computation
|
||||
speed of other functions are also improved by general optimization.
|
||||
https://github.com/shibatch/sleef/pull/97
|
||||
- Removed `libm` dependency.
|
||||
|
||||
### Removed
|
||||
- Makefile build system
|
||||
|
||||
## 3.1 - 2017-07-19
|
||||
- Added AArch64 support
|
||||
- Implemented the remaining C99 math functions : lgamma, tgamma,
|
||||
erf, erfc, fabs, copysign, fmax, fmin, fdim, trunc, floor, ceil,
|
||||
round, rint, modf, ldexp, nextafter, frexp, hypot, and fmod.
|
||||
- Added dispatcher for x86 functions
|
||||
- Improved reduction of trigonometric functions
|
||||
- Added support for 32-bit x86, Cygwin, etc.
|
||||
- Improved tester
|
||||
|
||||
## 3.0 - 2017-02-07
|
||||
- New API is defined
|
||||
- Functions for DFT are added
|
||||
- sincospi functions are added
|
||||
- gencoef now supports single, extended and quad precision in addition to double precision
|
||||
- Linux, Windows and Mac OS X are supported
|
||||
- GCC, Clang, Intel Compiler, Microsoft Visual C++ are supported
|
||||
- The library can be compiled as DLLs
|
||||
- Files needed for creating a debian package are now included
|
||||
|
||||
## 2.120 - 2017-01-30
|
||||
- Relicensed to Boost Software License Version 1.0
|
||||
|
||||
## 2.110 - 2016-12-11
|
||||
- The valid range of argument is extended for trig functions
|
||||
- Specification of each functions regarding to the domain and accuracy is added
|
||||
- A coefficient generation tool is added
|
||||
- New testing tools are introduced
|
||||
- Following functions returned incorrect values when the argument is very large or small : exp, pow, asinh, acosh
|
||||
- SIMD xsin and xcos returned values more than 1 when FMA is enabled
|
||||
- Pure C cbrt returned incorrect values when the argument is negative
|
||||
- tan_u1 returned values with more than 1 ulp of error on rare occasions
|
||||
- Removed support for Java language(because no one seems using this)
|
||||
|
||||
## 2.100 - 2016-12-04
|
||||
- Added support for AVX-512F and Clang Extended Vectors.
|
||||
|
||||
## 2.90 - 2016-11-27
|
||||
- Added ilogbf. All the reported bugs(listed below) are fixed.
|
||||
- Log function returned incorrect values when the argument is very small.
|
||||
- Signs of returned values were incorrect when the argument is signed zero.
|
||||
- Tester incorrectly counted ULP in some cases.
|
||||
- ilogb function returned incorrect values in some cases.
|
||||
|
||||
## 2.80 - 2013-05-18
|
||||
- Added support for ARM NEON. Added higher accuracy single
|
||||
precision functions : sinf_u1, cosf_u1, sincosf_u1, tanf_u1, asinf_u1,
|
||||
acosf_u1, atanf_u1, atan2f_u1, logf_u1, and cbrtf_u1.
|
||||
|
||||
## 2.70 - 2013-04-30
|
||||
- Added higher accuracy functions : sin_u1, cos_u1, sincos_u1,
|
||||
tan_u1, asin_u1, acos_u1, atan_u1, atan2_u1, log_u1, and
|
||||
cbrt_u1. These functions evaluate the corresponding function with at
|
||||
most 1 ulp of error.
|
||||
|
||||
## 2.60 - 2013-03-26
|
||||
- Added the remaining single precision functions : powf, sinhf,
|
||||
coshf, tanhf, exp2f, exp10f, log10f, log1pf. Added support for FMA4
|
||||
(for AMD Bulldozer). Added more test cases. Fixed minor bugs (which
|
||||
degraded accuracy in some rare cases).
|
||||
|
||||
## 2.50 - 2013-03-12
|
||||
- Added support for AVX2. SLEEF now compiles with ICC.
|
||||
|
||||
## 2.40 - 2013-03-07
|
||||
- Fixed incorrect denormal/nonnumber handling in ldexp, ldexpf,
|
||||
sinf and cosf. Removed support for Go language.
|
||||
|
||||
## 2.31 - 2012-07-05
|
||||
- Added sincosf.
|
||||
|
||||
## 2.30 - 2012-01-20
|
||||
- Added single precision functions : sinf, cosf, tanf, asinf,
|
||||
acosf, atanf, logf, expf, atan2f and cbrtf.
|
||||
|
||||
## 2.20 - 2012-01-09
|
||||
- Added exp2, exp10, expm1, log10, log1p, and cbrt.
|
||||
|
||||
## 2.10 - 2012-01-05
|
||||
- asin() and acos() are back.
|
||||
- Added ilogb() and ldexp().
|
||||
- Added hyperbolic functions.
|
||||
- Eliminated dependency on frexp, ldexp, fabs, isnan and isinf.
|
||||
|
||||
## 2.00 - 2011-12-30
|
||||
- All of the algorithm has been updated.
|
||||
- Both accuracy and speed are improved since version 1.10.
|
||||
- Denormal number handling is also improved.
|
||||
|
||||
## 1.10 - 2010-06-22
|
||||
- AVX support is added. Accuracy tester is added.
|
||||
|
||||
## 1.00 - 2010-05-15
|
||||
- Initial release
|
||||
@@ -0,0 +1,339 @@
|
||||
cmake_minimum_required(VERSION 3.18)
|
||||
project(SLEEF VERSION 3.6.1 LANGUAGES C)
|
||||
|
||||
set(SLEEF_SOVERSION ${SLEEF_VERSION_MAJOR})
|
||||
|
||||
# Options
|
||||
|
||||
option(SLEEF_BUILD_STATIC_TEST_BINS "Build statically linked test executables" OFF)
|
||||
option(SLEEF_ENABLE_LTO "Enable LTO on GCC or ThinLTO on clang" OFF)
|
||||
option(SLEEF_BUILD_LIBM "libsleef will be built." ON)
|
||||
option(SLEEF_BUILD_DFT "libsleefdft will be built." OFF)
|
||||
option(SLEEF_BUILD_QUAD "libsleefquad will be built." OFF)
|
||||
option(SLEEF_BUILD_GNUABI_LIBS "libsleefgnuabi will be built." ON)
|
||||
option(SLEEF_BUILD_SCALAR_LIB "libsleefscalar will be built." OFF)
|
||||
option(SLEEF_BUILD_TESTS "Tests will be built." ON)
|
||||
option(SLEEF_BUILD_INLINE_HEADERS "Build header for inlining whole SLEEF functions" OFF)
|
||||
|
||||
option(SLEEF_TEST_ALL_IUT "Perform tests on implementations with all vector extensions" OFF)
|
||||
option(SLEEF_SHOW_CONFIG "Show SLEEF configuration status messages." ON)
|
||||
option(SLEEF_SHOW_ERROR_LOG "Show cmake error log." OFF)
|
||||
option(SLEEF_ASAN "Enable address sanitizing on all targets." OFF)
|
||||
|
||||
option(SLEEF_ENFORCE_TESTER "Build fails if tester is not available" OFF)
|
||||
option(SLEEF_ENFORCE_TESTER3 "Build fails if tester3 is not built" OFF)
|
||||
|
||||
option(SLEEF_ENABLE_ALTDIV "Enable alternative division method (aarch64 only)" OFF)
|
||||
option(SLEEF_ENABLE_ALTSQRT "Enable alternative sqrt method (aarch64 only)" OFF)
|
||||
|
||||
option(SLEEF_DISABLE_FFTW "Disable testing the DFT library with FFTW" OFF)
|
||||
option(SLEEF_DISABLE_MPFR "Disable testing with the MPFR library" OFF)
|
||||
option(SLEEF_DISABLE_SSL "Disable testing with the SSL library" OFF)
|
||||
|
||||
option(SLEEF_ENABLE_CUDA "Enable CUDA" OFF)
|
||||
option(SLEEF_ENABLE_CXX "Enable C++" OFF)
|
||||
|
||||
#
|
||||
|
||||
if (DEFINED SLEEF_BUILD_SHARED_LIBS)
|
||||
set(BUILD_SHARED_LIBS ${SLEEF_BUILD_SHARED_LIBS})
|
||||
endif ()
|
||||
|
||||
if (SLEEF_SHOW_CONFIG)
|
||||
# Normalize the value of BUILD_SHARED_LIBS so that it displays nicely
|
||||
# in the configuration display
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set(BUILD_SHARED_LIBS ON)
|
||||
else ()
|
||||
set(BUILD_SHARED_LIBS OFF)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
# Function used to generate safe command arguments for add_custom_command
|
||||
function(command_arguments PROPNAME)
|
||||
set(quoted_args "")
|
||||
foreach(arg ${ARGN})
|
||||
list(APPEND quoted_args "\"${arg}\"" )
|
||||
endforeach()
|
||||
set(${PROPNAME} ${quoted_args} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
# Helper function for concatenating several files
|
||||
function(sleef_concat_files)
|
||||
cmake_parse_arguments(concat_required "" "OUTPUT" "SOURCES" ${ARGN})
|
||||
if("${concat_required_OUTPUT}" STREQUAL "")
|
||||
message(FATAL_ERROR "Must pass OUTPUT to sleef_concat_files")
|
||||
endif()
|
||||
|
||||
if(NOT concat_required_SOURCES)
|
||||
message(FATAL_ERROR "sleef_concat_files not passed any SOURCES")
|
||||
endif()
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT ${concat_required_OUTPUT}
|
||||
COMMAND ${CMAKE_COMMAND} -E cat ${concat_required_SOURCES} > ${concat_required_OUTPUT}
|
||||
DEPENDS ${concat_required_SOURCES}
|
||||
COMMAND_EXPAND_LISTS)
|
||||
endfunction()
|
||||
|
||||
# Settings
|
||||
|
||||
set(SLEEF_ALL_SUPPORTED_EXTENSIONS
|
||||
AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86
|
||||
SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64
|
||||
NEON32 NEON32VFPV4 # Aarch32
|
||||
VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64
|
||||
VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z
|
||||
RVVM1NOFMA RVVM1 RVVM2NOFMA RVVM2 # RISC-V Vectors
|
||||
PUREC_SCALAR PURECFMA_SCALAR # Generic type
|
||||
CACHE STRING "List of SIMD architectures supported by libsleef."
|
||||
)
|
||||
|
||||
set(SLEEF_SUPPORTED_LIBM_EXTENSIONS
|
||||
AVX512FNOFMA AVX512F AVX2 AVX2128 FMA4 AVX SSE4 SSE2 # x86
|
||||
SVENOFMA SVE ADVSIMDNOFMA ADVSIMD # Aarch64
|
||||
NEON32 NEON32VFPV4 # Aarch32
|
||||
VSX VSXNOFMA VSX3 VSX3NOFMA # PPC64
|
||||
VXE VXENOFMA VXE2 VXE2NOFMA # IBM Z
|
||||
RVVM1NOFMA RVVM1 RVVM2NOFMA RVVM2 # RISC-V Vectors
|
||||
PUREC_SCALAR PURECFMA_SCALAR # Generic type
|
||||
CACHE STRING "List of SIMD architectures supported by libsleef."
|
||||
)
|
||||
set(SLEEF_SUPPORTED_GNUABI_EXTENSIONS
|
||||
SSE2 AVX AVX2 AVX512F ADVSIMD SVE
|
||||
CACHE STRING "List of SIMD architectures supported by libsleef for GNU ABI."
|
||||
)
|
||||
|
||||
set(SLEEF_SUPPORTED_QUAD_EXTENSIONS
|
||||
PUREC_SCALAR PURECFMA_SCALAR SSE2 AVX2128 AVX2 AVX512F ADVSIMD SVE VSX VSX3 VXE VXE2 RVVM1 RVVM2)
|
||||
|
||||
# MKMASKED_PARAMS
|
||||
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_AVX512F_dp avx512f e 8)
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_AVX512F_sp avx512f e -16)
|
||||
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_dp sve s 2)
|
||||
command_arguments(MKMASKED_PARAMS_GNUABI_SVE_sp sve s -4)
|
||||
|
||||
#
|
||||
|
||||
set(COSTOVERRIDE_AVX512F 10)
|
||||
set(COSTOVERRIDE_AVX512FNOFMA 10)
|
||||
set(COSTOVERRIDE_AVX2 2)
|
||||
set(COSTOVERRIDE_AVX 2)
|
||||
set(COSTOVERRIDE_NEON32 2)
|
||||
set(COSTOVERRIDE_NEON32VFPV4 2)
|
||||
set(COSTOVERRIDE_SVE 10)
|
||||
set(COSTOVERRIDE_SVENOFMA 10)
|
||||
set(COSTOVERRIDE_RVVM1 10)
|
||||
set(COSTOVERRIDE_RVVM1NOFMA 10)
|
||||
set(COSTOVERRIDE_RVVM2 20)
|
||||
set(COSTOVERRIDE_RVVM2NOFMA 20)
|
||||
|
||||
#
|
||||
|
||||
enable_testing()
|
||||
|
||||
if (SLEEF_ENABLE_CXX)
|
||||
enable_language(CXX)
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENABLE_CUDA)
|
||||
enable_language(CUDA)
|
||||
endif()
|
||||
|
||||
# For specifying installation directories
|
||||
include(GNUInstallDirs)
|
||||
|
||||
if(NOT DEFINED sleef_SOURCE_DIR)
|
||||
set(sleef_SOURCE_DIR ${CMAKE_SOURCE_DIR})
|
||||
endif()
|
||||
|
||||
if(NOT DEFINED sleef_BINARY_DIR)
|
||||
set(sleef_BINARY_DIR ${CMAKE_BINARY_DIR})
|
||||
endif()
|
||||
|
||||
# Sanity check for in-source builds which we do not want to happen
|
||||
if(sleef_SOURCE_DIR STREQUAL sleef_BINARY_DIR)
|
||||
message(FATAL_ERROR "SLEEF does not allow in-source builds.
|
||||
You can refer to docs/build-with-cmake.md for instructions on how provide a \
|
||||
separate build directory. Note: Please remove autogenerated file \
|
||||
`CMakeCache.txt` and directory `CMakeFiles` in the current directory.")
|
||||
endif()
|
||||
|
||||
if(SLEEF_ENABLE_LTO AND BUILD_SHARED_LIBS)
|
||||
message(FATAL_ERROR "SLEEF_ENABLE_LTO and BUILD_SHARED_LIBS cannot be specified at the same time")
|
||||
endif(SLEEF_ENABLE_LTO AND BUILD_SHARED_LIBS)
|
||||
|
||||
if(SLEEF_ENABLE_LTO)
|
||||
include(CheckIPOSupported)
|
||||
check_ipo_supported(RESULT supported OUTPUT error)
|
||||
endif()
|
||||
|
||||
# Set output directories for the library files
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/bin)
|
||||
|
||||
foreach(CONFIG ${CMAKE_CONFIGURATION_TYPES})
|
||||
string(TOUPPER ${CONFIG} CONFIG)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/lib)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${CONFIG} ${PROJECT_BINARY_DIR}/bin)
|
||||
endforeach(CONFIG CMAKE_CONFIGURATION_TYPES)
|
||||
|
||||
# Path for finding cmake modules
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules)
set(SLEEF_SCRIPT_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Scripts CACHE PATH
  "Path for finding sleef specific cmake scripts")

# Clang on Windows reports an MSVC "simulate id"; remember that so later
# flag handling can account for MSVC-style headers and options.
if (CMAKE_C_COMPILER_ID MATCHES "Clang" AND "x${CMAKE_C_SIMULATE_ID}" STREQUAL "xMSVC")
  message(STATUS "Building with Clang on Windows")
  set(SLEEF_CLANG_ON_WINDOWS TRUE)
endif()

# sleef-config.h.in passes cmake settings to the source code
include(Configure.cmake)
configure_file(
  ${PROJECT_SOURCE_DIR}/sleef-config.h.in
  ${PROJECT_BINARY_DIR}/include/sleef-config.h @ONLY)

# We like to have a documented index of all targets in the project. The
# variables listed below carry the names of the targets defined throughout
# the project.

# Generates object file (shared library) `libsleef`
# Defined in src/libm/CMakeLists.txt via command add_library
set(TARGET_LIBSLEEF "sleef")
set(TARGET_LIBSLEEFGNUABI "sleefgnuabi")

# Generates the sleef.h headers and all the rename headers
# Defined in src/libm/CMakeLists.txt via custom commands and a custom target
set(TARGET_HEADERS "headers")
set(TARGET_INLINE_HEADERS "inline_headers")
set(TARGET_QINLINE_HEADERS "quad_inline_headers")
set(TARGET_LIBINLINE "sleefinline")

# Generates executable files for running the test suite
# Defined in src/libm-tester/CMakeLists.txt via command add_executable
set(TARGET_TESTER "tester")
set(TARGET_IUT "iut")

# The target to generate LLVM bitcode only, available when SLEEF_ENABLE_LLVM_BITCODE is passed to cmake
set(TARGET_LLVM_BITCODE "llvm-bitcode")

# Generates the helper executable file mkrename needed to write the sleef header
set(TARGET_MKRENAME "mkrename")
set(TARGET_MKRENAME_GNUABI "mkrename_gnuabi")
set(TARGET_MKMASKED_GNUABI "mkmasked_gnuabi")

# Generates the helper executable file mkdisp needed to write the sleef header
set(TARGET_MKDISP "mkdisp")
set(TARGET_MKALIAS "mkalias")

# Generates static library common
# Defined in src/common/CMakeLists.txt via command add_library
set(TARGET_LIBCOMMON_OBJ "common")
set(TARGET_LIBARRAYMAP_OBJ "arraymap")
|
||||
|
||||
# Adds an executable that must run on the build host. When cross-compiling,
# the binary cannot be produced by this build, so it is imported from a
# previously completed native build rooted at NATIVE_BUILD_DIR instead.
function(add_host_executable TARGETNAME)
  if (CMAKE_CROSSCOMPILING)
    add_executable(${TARGETNAME} IMPORTED GLOBAL)
    set_property(TARGET ${TARGETNAME} PROPERTY IMPORTED_LOCATION ${NATIVE_BUILD_DIR}/bin/${TARGETNAME})
  else()
    add_executable(${TARGETNAME} ${ARGN})
    # Ensure that Darwin host executable is built as universal binary
    if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
      target_compile_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
      target_link_options(${TARGETNAME} PRIVATE -arch "${CMAKE_HOST_SYSTEM_PROCESSOR}")
    endif()
  endif()
endfunction()
|
||||
|
||||
# Defines ENABLE_AAVPCS=1 on a host-built target. Imported targets (the
# cross-compiling case in add_host_executable) cannot take compile
# definitions, so this is intentionally a no-op when cross-compiling.
function(host_target_AAVPCS_definitions TARGETNAME)
  if (NOT CMAKE_CROSSCOMPILING)
    target_compile_definitions(${TARGETNAME} PRIVATE ENABLE_AAVPCS=1)
  endif()
endfunction()
|
||||
|
||||
# Generates object file (shared library) `libsleefdft`
# Defined in src/dft/CMakeLists.txt via command add_library
set(TARGET_LIBDFT "sleefdft")

# Check subdirectories
add_subdirectory("src")

# Install the CMake package config
include(CMakePackageConfigHelpers)

write_basic_package_version_file(
  sleefConfigVersion.cmake
  COMPATIBILITY SameMajorVersion
)

# Where the package config files land, relative to the install prefix;
# user-overridable (e.g. lib64 layouts).
set(
  SLEEF_INSTALL_CMAKEDIR "${CMAKE_INSTALL_LIBDIR}/cmake/sleef"
  CACHE STRING "CMake package config location relative to the install prefix"
)

mark_as_advanced(SLEEF_INSTALL_CMAKEDIR)

install(
  FILES
  "${PROJECT_SOURCE_DIR}/sleefConfig.cmake"
  "${PROJECT_BINARY_DIR}/sleefConfigVersion.cmake"
  DESTINATION "${SLEEF_INSTALL_CMAKEDIR}"
  COMPONENT sleef_Development
)

install(
  EXPORT sleefTargets
  NAMESPACE sleef::
  DESTINATION "${SLEEF_INSTALL_CMAKEDIR}"
  COMPONENT sleef_Development
)
|
||||
|
||||
# Extra messages at configuration time. By default is active, it can be
# turned off by invoking cmake with "-DSLEEF_SHOW_CONFIG=OFF".
if(SLEEF_SHOW_CONFIG)
  message(STATUS "Configuring build for ${PROJECT_NAME}-v${SLEEF_VERSION}")
  message(" Target system: ${CMAKE_SYSTEM}")
  # On Darwin the effective target architecture is carried by
  # CMAKE_OSX_ARCHITECTURES rather than CMAKE_SYSTEM_PROCESSOR.
  if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
    message(" Target processor: ${CMAKE_OSX_ARCHITECTURES}")
  else()
    message(" Target processor: ${CMAKE_SYSTEM_PROCESSOR}")
  endif()
  message(" Host system: ${CMAKE_HOST_SYSTEM}")
  message(" Host processor: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
  message(" Detected C compiler: ${CMAKE_C_COMPILER_ID} @ ${CMAKE_C_COMPILER}")
  message(" CMake: ${CMAKE_VERSION}")
  message(" Make program: ${CMAKE_MAKE_PROGRAM}")
  if(CMAKE_CROSSCOMPILING)
    message(" Crosscompiling SLEEF.")
    message(" Native build dir: ${NATIVE_BUILD_DIR}")
  endif()
  message(STATUS "Using option `${SLEEF_C_FLAGS}` to compile libsleef")
  message(STATUS "Building shared libs : " ${BUILD_SHARED_LIBS})
  message(STATUS "Building static test bins: " ${SLEEF_BUILD_STATIC_TEST_BINS})
  message(STATUS "MPFR : " ${LIB_MPFR})
  if (MPFR_INCLUDE_DIR)
    message(STATUS "MPFR header file in " ${MPFR_INCLUDE_DIR})
  endif()
  message(STATUS "GMP : " ${LIBGMP})
  message(STATUS "RT : " ${LIBRT})
  message(STATUS "FFTW3 : " ${LIBFFTW3})
  message(STATUS "OPENSSL : " ${OPENSSL_VERSION})
  message(STATUS "SDE : " ${SDE_COMMAND})
  if (SLEEF_BUILD_INLINE_HEADERS)
    message(STATUS "SED : " ${SED_COMMAND})
  endif()
  message(STATUS "COMPILER_SUPPORTS_OPENMP : " ${COMPILER_SUPPORTS_OPENMP})
  if(ENABLE_GNUABI)
    message(STATUS "A version of SLEEF compatible with libm and libmvec in GNU libc will be produced (${TARGET_LIBSLEEFGNUABI}.so)")
  endif()
  if (COMPILER_SUPPORTS_SVE)
    message(STATUS "Building SLEEF with VLA SVE support")
    if (ARMIE_COMMAND)
      message(STATUS "Arm Instruction Emulator found at ${ARMIE_COMMAND}")
      message(STATUS "SVE testing is done with ${SVE_VECTOR_BITS}-bits vectors.")
    endif()
  endif()
  if(FORCE_AAVPCS)
    message(STATUS "Building SLEEF with AArch64 Vector PCS support")
  endif()
endif()
|
||||
@@ -0,0 +1,27 @@
|
||||
# List of contributors
|
||||
|
||||
These lists are not exhaustive and only provide most relevant contact information.
|
||||
For an exhaustive list of contributors please refer to the
|
||||
[GitHub contributors section for SLEEF](https://github.com/shibatch/sleef/graphs/contributors).
|
||||
|
||||
## Maintainers
|
||||
|
||||
| Name | Affiliation | Github profile |
|
||||
| -------------------- | ----------------------- | ---------------------------------- |
|
||||
| Pierre Blanchard | Arm Ltd. | https://github.com/blapie |
|
||||
| Joana Cruz | Arm Ltd. | https://github.com/joanaxcruz |
|
||||
| Joe Ramsay | Arm Ltd. | https://github.com/joeramsay |
|
||||
| Naoki Shibata | Nara Institute of Science and Technology | https://github.com/shibatch |
|
||||
|
||||
## Contributors
|
||||
|
||||
| Name | Affiliation | Github profile |
|
||||
| -------------------- | ----------------------- | ---------------------------------- |
|
||||
| Anonymous | | https://github.com/friendlyanon |
|
||||
| Diana Bite | Former Arm Ltd. | https://github.com/diaena |
|
||||
| Ludovic Henry | Rivos Inc. | https://github.com/luhenry |
|
||||
| Martin Krastev | Chaos Group | https://github.com/blu |
|
||||
| Jilayne Lovejoy | Former Arm Inc. | https://github.com/jlovejoy |
|
||||
| Kerry McLaughlin | Arm Ltd. | https://github.com/kmclaughlin-arm |
|
||||
| Alexandre Mutel | Unity Technologies | https://github.com/xoofx |
|
||||
| Francesco Petrogalli | Former Arm Ltd. | https://github.com/fpetrogalli-arm |
|
||||
@@ -0,0 +1,860 @@
|
||||
include(CheckCCompilerFlag)
include(CheckCSourceCompiles)
include(CheckTypeSize)
include(CheckLanguage)

# Static test binaries: restrict library search to archives and link -static.
if (SLEEF_BUILD_STATIC_TEST_BINS)
  set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
  set(BUILD_SHARED_LIBS OFF)
  set(CMAKE_EXE_LINKER_FLAGS "-static")
endif()

set(OPENSSL_EXTRA_LIBRARIES "" CACHE STRING "Extra libraries for openssl")
if (NOT CMAKE_CROSSCOMPILING AND NOT SLEEF_FORCE_FIND_PACKAGE_SSL)
  if (SLEEF_BUILD_STATIC_TEST_BINS)
    set(OPENSSL_USE_STATIC_LIBS TRUE)
  endif()
  find_package(OpenSSL)
  if (OPENSSL_FOUND)
    set(SLEEF_OPENSSL_FOUND TRUE)
    set(SLEEF_OPENSSL_LIBRARIES ${OPENSSL_LIBRARIES})
    # Work around for tester3 sig segv, when linking versions of openssl (1.1.1) statically.
    # This is a known issue https://github.com/openssl/openssl/issues/13872.
    if (SLEEF_BUILD_STATIC_TEST_BINS)
      string(REGEX REPLACE
        "-lpthread" "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive"
        SLEEF_OPENSSL_LIBRARIES "${OPENSSL_LIBRARIES}")
    endif()
    set(SLEEF_OPENSSL_VERSION ${OPENSSL_VERSION})
    set(SLEEF_OPENSSL_LIBRARIES ${SLEEF_OPENSSL_LIBRARIES} ${OPENSSL_EXTRA_LIBRARIES})
    set(SLEEF_OPENSSL_INCLUDE_DIR ${OPENSSL_INCLUDE_DIR})
  endif()
else()
  # find_package cannot find OpenSSL when cross-compiling
  find_library(LIBSSL ssl)
  find_library(LIBCRYPTO crypto)
  if (LIBSSL AND LIBCRYPTO)
    set(SLEEF_OPENSSL_FOUND TRUE)
    set(SLEEF_OPENSSL_LIBRARIES ${LIBSSL} ${LIBCRYPTO} ${OPENSSL_EXTRA_LIBRARIES})
    set(SLEEF_OPENSSL_VERSION ${LIBSSL})
  endif()
endif()

if (SLEEF_ENFORCE_TESTER3 AND NOT SLEEF_OPENSSL_FOUND)
  message(FATAL_ERROR "SLEEF_ENFORCE_TESTER3 is specified and OpenSSL not found")
endif()
|
||||
|
||||
# Some toolchains require explicit linking of the libraries following.
find_library(LIB_MPFR mpfr)
find_library(LIBM m)
find_library(LIBGMP gmp)
find_library(LIBRT rt)
find_library(LIBFFTW3 fftw3)

if (LIB_MPFR)
  find_path(MPFR_INCLUDE_DIR
    NAMES mpfr.h
    ONLY_CMAKE_FIND_ROOT_PATH)
endif()

if (LIBFFTW3)
  find_path(FFTW3_INCLUDE_DIR
    NAMES fftw3.h
    ONLY_CMAKE_FIND_ROOT_PATH)
endif()

# Normalise "NOTFOUND" results to empty strings so these variables can be
# dropped straight into link lines.
if (NOT LIBM)
  set(LIBM "")
endif()

if (NOT LIBRT)
  set(LIBRT "")
endif()

# Explicit opt-outs override whatever was found above.
if (SLEEF_DISABLE_MPFR)
  set(LIB_MPFR "")
endif()

if (SLEEF_DISABLE_SSL)
  set(SLEEF_OPENSSL_FOUND FALSE)
endif()
|
||||
|
||||
# Force set default build type if none was specified
# Note: some sleef code requires the optimisation flags turned on
if(NOT CMAKE_BUILD_TYPE)
  message(STATUS "Setting build type to 'Release' (required for full support).")
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS
    "Debug" "Release" "RelWithDebInfo" "MinSizeRel")
endif()

# Sanitizers
if(SLEEF_ASAN)
  # Add address sanitizing to all targets
  add_compile_options(-fno-omit-frame-pointer -fsanitize=address)
  add_link_options(-fno-omit-frame-pointer -fsanitize=address)
endif()

# TARGET PROCESSOR DETECTION
set(SLEEF_TARGET_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}")
if(CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "^(x86_64|arm64)$")
  set(SLEEF_TARGET_PROCESSOR "${CMAKE_OSX_ARCHITECTURES}")
endif()

# PLATFORM DETECTION
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
  set(SLEEF_ARCH_32BIT ON CACHE INTERNAL "True for 32-bit architecture.")
endif()

if(SLEEF_TARGET_PROCESSOR MATCHES "(x86|AMD64|amd64|^i.86$)")
  set(SLEEF_ARCH_X86 ON CACHE INTERNAL "True for x86 architecture.")

  set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mavx2;-mfma")
elseif(SLEEF_TARGET_PROCESSOR MATCHES "aarch64|arm64")
  set(SLEEF_ARCH_AARCH64 ON CACHE INTERNAL "True for Aarch64 architecture.")
  # Aarch64 requires support for advsimdfma4
  set(COMPILER_SUPPORTS_ADVSIMD 1)
  set(COMPILER_SUPPORTS_ADVSIMDNOFMA 1)

elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
  set(SLEEF_ARCH_AARCH32 ON CACHE INTERNAL "True for Aarch32 architecture.")
  set(COMPILER_SUPPORTS_NEON32 1)
  set(COMPILER_SUPPORTS_NEON32VFPV4 1)

  set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mfpu=vfpv4")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
  set(SLEEF_ARCH_PPC64 ON CACHE INTERNAL "True for PPC64 architecture.")

  set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-mvsx")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
  set(SLEEF_ARCH_S390X ON CACHE INTERNAL "True for IBM Z architecture.")

  set(CLANG_FLAGS_ENABLE_PUREC_SCALAR "-march=z14;-mzvector")
  set(CLANG_FLAGS_ENABLE_PURECFMA_SCALAR "-march=z14;-mzvector")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
  set(SLEEF_ARCH_RISCV64 ON CACHE INTERNAL "True for RISCV64 architecture.")
endif()

# The portable pure-C scalar kernels are available on every platform.
set(COMPILER_SUPPORTS_PUREC_SCALAR 1)
set(COMPILER_SUPPORTS_PURECFMA_SCALAR 1)
|
||||
|
||||
# Compiler feature detection

# Detect CLANG executable path (on both Windows and Linux/OSX)
if(NOT CLANG_EXE_PATH)
  # If the current compiler used by CMAKE is already clang, use this one directly
  if(CMAKE_C_COMPILER MATCHES "clang")
    set(CLANG_EXE_PATH ${CMAKE_C_COMPILER})
  else()
    # Else we may find clang on the path?
    find_program(CLANG_EXE_PATH NAMES clang "clang-11" "clang-10" "clang-9" "clang-8" "clang-7" "clang-6.0" "clang-5.0" "clang-4.0" "clang-3.9")
  endif()
endif()

# Allow to define the Gcc/Clang here
# As we might compile the lib with MSVC, but generates bitcode with CLANG
# Intel vector extensions.
set(CLANG_FLAGS_ENABLE_SSE2 "-msse2")
set(CLANG_FLAGS_ENABLE_SSE4 "-msse4.1")
set(CLANG_FLAGS_ENABLE_AVX "-mavx")
set(CLANG_FLAGS_ENABLE_FMA4 "-mfma4")
set(CLANG_FLAGS_ENABLE_AVX2 "-mavx2;-mfma")
set(CLANG_FLAGS_ENABLE_AVX2128 "-mavx2;-mfma")
set(CLANG_FLAGS_ENABLE_AVX512F "-mavx512f")
set(CLANG_FLAGS_ENABLE_AVX512FNOFMA "-mavx512f")
set(CLANG_FLAGS_ENABLE_NEON32 "--target=arm-linux-gnueabihf;-mcpu=cortex-a8")
set(CLANG_FLAGS_ENABLE_NEON32VFPV4 "-march=armv7-a;-mfpu=neon-vfpv4")
# Arm AArch64 vector extensions.
set(CLANG_FLAGS_ENABLE_SVE "-march=armv8-a+sve")
set(CLANG_FLAGS_ENABLE_SVENOFMA "-march=armv8-a+sve")
# PPC64
set(CLANG_FLAGS_ENABLE_VSX "-mcpu=power8")
set(CLANG_FLAGS_ENABLE_VSXNOFMA "-mcpu=power8")
set(CLANG_FLAGS_ENABLE_VSX3 "-mcpu=power9")
set(CLANG_FLAGS_ENABLE_VSX3NOFMA "-mcpu=power9")
# IBM z
set(CLANG_FLAGS_ENABLE_VXE "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_VXENOFMA "-march=z14;-mzvector")
set(CLANG_FLAGS_ENABLE_VXE2 "-march=z15;-mzvector")
set(CLANG_FLAGS_ENABLE_VXE2NOFMA "-march=z15;-mzvector")
# RISC-V
set(CLANG_FLAGS_ENABLE_RVVM1 "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM1NOFMA "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM2 "-march=rv64gcv_zba_zbb_zbs")
set(CLANG_FLAGS_ENABLE_RVVM2NOFMA "-march=rv64gcv_zba_zbb_zbs")

set(FLAGS_OTHERS "")
|
||||
|
||||
# All variables storing compiler flags should be prefixed with FLAGS_
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
  # Always compile sleef with -ffp-contract.
  set(FLAGS_STRICTMATH "-ffp-contract=off")
  set(FLAGS_FASTMATH "-ffast-math")
  set(FLAGS_NOSTRICTALIASING "-fno-strict-aliasing")

  if (SLEEF_ARCH_X86 AND SLEEF_ARCH_32BIT)
    string(CONCAT FLAGS_STRICTMATH ${FLAGS_STRICTMATH} " -msse2 -mfpmath=sse")
    string(CONCAT FLAGS_FASTMATH ${FLAGS_FASTMATH} " -msse2 -mfpmath=sse")
  endif()

  # Without the options below, gcc generates calls to libm
  string(CONCAT FLAGS_OTHERS "-fno-math-errno -fno-trapping-math")

  # Intel vector extensions.
  foreach(SIMD ${SLEEF_ALL_SUPPORTED_EXTENSIONS})
    set(FLAGS_ENABLE_${SIMD} ${CLANG_FLAGS_ENABLE_${SIMD}})
  endforeach()

  # Warning flags.
  set(FLAGS_WALL "-Wall -Wno-unused-function -Wno-attributes -Wno-unused-result")
  if(CMAKE_C_COMPILER_ID MATCHES "GNU")
    # The following compiler option is needed to suppress the warning
    # "AVX vector return without AVX enabled changes the ABI" at
    # src/arch/helpervecext.h:88
    string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -Wno-psabi")
    set(FLAGS_ENABLE_NEON32 "-mfpu=neon")
  endif()

  if(CMAKE_C_COMPILER_ID MATCHES "Clang" AND SLEEF_ENABLE_LTO)
    if (NOT SLEEF_LLVM_AR_COMMAND)
      find_program(SLEEF_LLVM_AR_COMMAND "llvm-ar")
    endif()
    if (SLEEF_LLVM_AR_COMMAND)
      set(CMAKE_AR ${SLEEF_LLVM_AR_COMMAND})
      set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> rcs <TARGET> <LINK_FLAGS> <OBJECTS>")
      set(CMAKE_C_ARCHIVE_FINISH "true")
    endif()
    string(CONCAT FLAGS_OTHERS "-flto=thin")
  endif()

  # Flags for generating inline headers
  set(FLAG_PREPROCESS "-E")
  set(FLAG_PRESERVE_COMMENTS "-C")
  set(FLAG_INCLUDE "-I")
  set(FLAG_DEFINE "-D")

  if (SLEEF_CLANG_ON_WINDOWS)
    # The following line is required to prevent clang from displaying
    # many warnings. Clang on Windows references MSVC header files,
    # which have deprecation and security attributes for many
    # functions.

    string(CONCAT FLAGS_WALL ${FLAGS_WALL} " -D_CRT_SECURE_NO_WARNINGS -D_CRT_NONSTDC_NO_DEPRECATE -Wno-deprecated-declarations")
  endif()
elseif(MSVC)
  # Intel vector extensions.
  if (CMAKE_CL_64)
    set(FLAGS_ENABLE_SSE2 /D__SSE2__)
    set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__)
  else()
    set(FLAGS_ENABLE_SSE2 /D__SSE2__ /arch:SSE2)
    set(FLAGS_ENABLE_SSE4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /arch:SSE2)
  endif()
  set(FLAGS_ENABLE_AVX /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /arch:AVX)
  set(FLAGS_ENABLE_FMA4 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__FMA4__ /arch:AVX2)
  set(FLAGS_ENABLE_AVX2 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
  set(FLAGS_ENABLE_AVX2128 /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
  set(FLAGS_ENABLE_AVX512F /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__AVX512F__ /arch:AVX2)
  set(FLAGS_ENABLE_AVX512FNOFMA /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /D__AVX512F__ /arch:AVX2)
  set(FLAGS_ENABLE_PURECFMA_SCALAR /D__SSE2__ /D__SSE3__ /D__SSE4_1__ /D__AVX__ /D__AVX2__ /arch:AVX2)
  set(FLAGS_WALL "/D_CRT_SECURE_NO_WARNINGS /D_CRT_NONSTDC_NO_DEPRECATE")

  set(FLAGS_NO_ERRNO "")

  set(FLAG_PREPROCESS "/E")
  set(FLAG_PRESERVE_COMMENTS "/C")
  set(FLAG_INCLUDE "/I")
  set(FLAG_DEFINE "/D")
elseif(CMAKE_C_COMPILER_ID MATCHES "Intel")
  set(FLAGS_ENABLE_SSE2 "-msse2")
  set(FLAGS_ENABLE_SSE4 "-msse4.1")
  set(FLAGS_ENABLE_AVX "-mavx")
  set(FLAGS_ENABLE_AVX2 "-march=core-avx2")
  set(FLAGS_ENABLE_AVX2128 "-march=core-avx2")
  set(FLAGS_ENABLE_AVX512F "-xCOMMON-AVX512")
  set(FLAGS_ENABLE_AVX512FNOFMA "-xCOMMON-AVX512")
  set(FLAGS_ENABLE_PURECFMA_SCALAR "-march=core-avx2;-fno-strict-aliasing")
  set(FLAGS_ENABLE_FMA4 "-msse2") # This is a dummy flag
  if(CMAKE_C_COMPILER_ID MATCHES "IntelLLVM")
    set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_types")
    set(FLAGS_FASTMATH "-fp-model fast -Qoption,cpp,--extended_float_types")
  else()
    set(FLAGS_STRICTMATH "-fp-model strict -Qoption,cpp,--extended_float_type")
    set(FLAGS_FASTMATH "-fp-model fast=2 -Qoption,cpp,--extended_float_type")
  endif()
  set(FLAGS_NOSTRICTALIASING "-fno-strict-aliasing")
  set(FLAGS_WALL "-fmax-errors=3 -Wall -Wno-unused -Wno-attributes")

  set(FLAGS_NO_ERRNO "")

  set(FLAG_PREPROCESS "-E")
  set(FLAG_PRESERVE_COMMENTS "-C")
  set(FLAG_INCLUDE "-I")
  set(FLAG_DEFINE "-D")
endif()
|
||||
|
||||
set(SLEEF_C_FLAGS "${FLAGS_WALL} ${FLAGS_STRICTMATH} ${FLAGS_OTHERS}")
# gcc 7 and newer miscompiles the DFT under -ffast-math, so drop it there.
if(CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.99)
  set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_OTHERS}")
else()
  set(DFT_C_FLAGS "${FLAGS_WALL} ${FLAGS_NOSTRICTALIASING} ${FLAGS_FASTMATH} ${FLAGS_OTHERS}")
endif()

if(CMAKE_C_COMPILER_ID MATCHES "GNU")
  set(FLAGS_ENABLE_SVE "${FLAGS_ENABLE_SVE};-fno-tree-vrp")
endif()

# 32-bit x86 needs SSE math to get predictable floating-point results.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "GNU")
  set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
  set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse -m128bit-long-double")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^i.86$" AND CMAKE_C_COMPILER_ID MATCHES "Clang")
  set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -msse2 -mfpmath=sse")
  set(DFT_C_FLAGS "${DFT_C_FLAGS} -msse2 -mfpmath=sse")
endif()

if(CYGWIN OR MINGW)
  set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-asynchronous-unwind-tables")
  set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-asynchronous-unwind-tables")
endif()

# Work around aarch64 code-generation issues in gcc (9.3, 10.2).
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 9.3 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 10.2)
  set(SLEEF_C_FLAGS "${SLEEF_C_FLAGS} -fno-shrink-wrap -fno-tree-vrp")
  set(DFT_C_FLAGS "${DFT_C_FLAGS} -fno-shrink-wrap -fno-tree-vrp")
endif()
|
||||
|
||||
# FEATURE DETECTION

# Long double

option(SLEEF_DISABLE_LONG_DOUBLE "Disable long double" OFF)
option(SLEEF_ENFORCE_LONG_DOUBLE "Build fails if long double is not supported by the compiler" OFF)

if(NOT SLEEF_DISABLE_LONG_DOUBLE)
  CHECK_TYPE_SIZE("long double" LD_SIZE)
  if(LD_SIZE GREATER "9")
    # This is needed to check since internal compiler error occurs with gcc 4.x
    CHECK_C_SOURCE_COMPILES("
  typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*2)));
  vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
  int main() { vlongdouble vld = vcast_vl_l(0);
  }" COMPILER_SUPPORTS_LONG_DOUBLE)
  endif()
else()
  message(STATUS "Support for long double disabled by CMake option")
endif()

if (SLEEF_ENFORCE_LONG_DOUBLE AND NOT COMPILER_SUPPORTS_LONG_DOUBLE)
  message(FATAL_ERROR "SLEEF_ENFORCE_LONG_DOUBLE is specified and that feature is disabled or not supported by the compiler")
endif()

# float128

option(SLEEF_DISABLE_FLOAT128 "Disable float128" OFF)
option(SLEEF_ENFORCE_FLOAT128 "Build fails if float128 is not supported by the compiler" OFF)

if(NOT SLEEF_DISABLE_FLOAT128)
  CHECK_C_SOURCE_COMPILES("
  int main() { __float128 r = 1;
  }" COMPILER_SUPPORTS_FLOAT128)
else()
  message(STATUS "Support for float128 disabled by CMake option")
endif()

if (SLEEF_ENFORCE_FLOAT128 AND NOT COMPILER_SUPPORTS_FLOAT128)
  message(FATAL_ERROR "SLEEF_ENFORCE_FLOAT128 is specified and that feature is disabled or not supported by the compiler")
endif()

# quadmath.h availability is checked separately from bare __float128 support.
if(COMPILER_SUPPORTS_FLOAT128)
  CHECK_C_SOURCE_COMPILES("
  #include <quadmath.h>
  int main() { __float128 r = 1;
  }" COMPILER_SUPPORTS_QUADMATH)
endif()
|
||||
|
||||
# SSE2

option(SLEEF_DISABLE_SSE2 "Disable SSE2" OFF)
option(SLEEF_ENFORCE_SSE2 "Build fails if SSE2 is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE2)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE2}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m128d r = _mm_mul_pd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
    COMPILER_SUPPORTS_SSE2)
endif()

if (SLEEF_ENFORCE_SSE2 AND NOT COMPILER_SUPPORTS_SSE2)
  message(FATAL_ERROR "SLEEF_ENFORCE_SSE2 is specified and that feature is disabled or not supported by the compiler")
endif()

# SSE 4.1

option(SLEEF_DISABLE_SSE4 "Disable SSE4" OFF)
option(SLEEF_ENFORCE_SSE4 "Build fails if SSE4 is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_SSE4)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SSE4}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m128d r = _mm_floor_sd(_mm_set1_pd(1), _mm_set1_pd(2)); }"
    COMPILER_SUPPORTS_SSE4)
endif()

if (SLEEF_ENFORCE_SSE4 AND NOT COMPILER_SUPPORTS_SSE4)
  message(FATAL_ERROR "SLEEF_ENFORCE_SSE4 is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# AVX

# Fix: the first option below was previously declared as SLEEF_ENFORCE_AVX
# with the "Disable AVX" help string, duplicating the ENFORCE option and
# leaving SLEEF_DISABLE_AVX (tested in the guard below, and matching the
# DISABLE/ENFORCE pair used by every other extension) undeclared.
option(SLEEF_DISABLE_AVX "Disable AVX" OFF)
option(SLEEF_ENFORCE_AVX "Build fails if AVX is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m256d r = _mm256_add_pd(_mm256_set1_pd(1), _mm256_set1_pd(2));
  }" COMPILER_SUPPORTS_AVX)
endif()

if (SLEEF_ENFORCE_AVX AND NOT COMPILER_SUPPORTS_AVX)
  message(FATAL_ERROR "SLEEF_ENFORCE_AVX is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# FMA4

option(SLEEF_DISABLE_FMA4 "Disable FMA4" OFF)
option(SLEEF_ENFORCE_FMA4 "Build fails if FMA4 is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_FMA4)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_FMA4}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m256d r = _mm256_macc_pd(_mm256_set1_pd(1), _mm256_set1_pd(2), _mm256_set1_pd(3)); }"
    COMPILER_SUPPORTS_FMA4)
endif()

if (SLEEF_ENFORCE_FMA4 AND NOT COMPILER_SUPPORTS_FMA4)
  message(FATAL_ERROR "SLEEF_ENFORCE_FMA4 is specified and that feature is disabled or not supported by the compiler")
endif()

# AVX2

option(SLEEF_DISABLE_AVX2 "Disable AVX2" OFF)
option(SLEEF_ENFORCE_AVX2 "Build fails if AVX2 is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX2)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX2}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  int main() {
    __m256i r = _mm256_abs_epi32(_mm256_set1_epi32(1)); }"
    COMPILER_SUPPORTS_AVX2)

  # AVX2 implies AVX2128
  if(COMPILER_SUPPORTS_AVX2)
    set(COMPILER_SUPPORTS_AVX2128 1)
  endif()
endif()

if (SLEEF_ENFORCE_AVX2 AND NOT COMPILER_SUPPORTS_AVX2)
  message(FATAL_ERROR "SLEEF_ENFORCE_AVX2 is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# AVX512F

option(SLEEF_DISABLE_AVX512F "Disable AVX512F" OFF)
option(SLEEF_ENFORCE_AVX512F "Build fails if AVX512F is not supported by the compiler" OFF)

if(SLEEF_ARCH_X86 AND NOT SLEEF_DISABLE_AVX512F)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_AVX512F}")
  CHECK_C_SOURCE_COMPILES("
  #if defined(_MSC_VER)
  #include <intrin.h>
  #else
  #include <x86intrin.h>
  #endif
  __m512 addConstant(__m512 arg) {
    return _mm512_add_ps(arg, _mm512_set1_ps(1.f));
  }
  int main() {
    __m512i a = _mm512_set1_epi32(1);
    __m256i ymm = _mm512_extracti64x4_epi64(a, 0);
    __mmask16 m = _mm512_cmp_epi32_mask(a, a, _MM_CMPINT_EQ);
    __m512i r = _mm512_andnot_si512(a, a); }"
    COMPILER_SUPPORTS_AVX512F)

  if (COMPILER_SUPPORTS_AVX512F)
    set(COMPILER_SUPPORTS_AVX512FNOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_AVX512F AND NOT COMPILER_SUPPORTS_AVX512F)
  message(FATAL_ERROR "SLEEF_ENFORCE_AVX512F is specified and that feature is disabled or not supported by the compiler")
endif()

# SVE

option(SLEEF_DISABLE_SVE "Disable SVE" OFF)
option(SLEEF_ENFORCE_SVE "Build fails if SVE is not supported by the compiler" OFF)

# Darwin does not support SVE yet (see issue #474),
# therefore we disable SVE on Darwin systems.
if(SLEEF_ARCH_AARCH64 AND NOT SLEEF_DISABLE_SVE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin")
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_SVE}")
  CHECK_C_SOURCE_COMPILES("
  #include <arm_sve.h>
  int main() {
    svint32_t r = svdup_n_s32(1); }"
    COMPILER_SUPPORTS_SVE)

  if(COMPILER_SUPPORTS_SVE)
    set(COMPILER_SUPPORTS_SVENOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_SVE AND NOT COMPILER_SUPPORTS_SVE)
  message(FATAL_ERROR "SLEEF_ENFORCE_SVE is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# VSX

option(SLEEF_DISABLE_VSX "Disable VSX" OFF)
option(SLEEF_ENFORCE_VSX "Build fails if VSX is not supported by the compiler" OFF)

if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX}")
  CHECK_C_SOURCE_COMPILES("
  #include <altivec.h>
  #ifndef __LITTLE_ENDIAN__
  #error \"Only VSX(ISA2.07) little-endian mode is supported \"
  #endif
  int main() {
    vector double d;
    vector unsigned char p = {
      4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11
    };
    d = vec_perm(d, d, p);
  }"
    COMPILER_SUPPORTS_VSX)

  if (COMPILER_SUPPORTS_VSX)
    set(COMPILER_SUPPORTS_VSXNOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_VSX AND NOT COMPILER_SUPPORTS_VSX)
  message(FATAL_ERROR "SLEEF_ENFORCE_VSX is specified and that feature is disabled or not supported by the compiler")
endif()

# VSX3

option(SLEEF_DISABLE_VSX3 "Disable VSX3" OFF)
option(SLEEF_ENFORCE_VSX3 "Build fails if VSX3 is not supported by the compiler" OFF)

if(SLEEF_ARCH_PPC64 AND NOT SLEEF_DISABLE_VSX3)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VSX3}")
  CHECK_C_SOURCE_COMPILES("
  #include <altivec.h>
  #ifndef __LITTLE_ENDIAN__
  #error \"Only VSX3 little-endian mode is supported \"
  #endif
  int main() {
    static vector double d;
    static vector unsigned long long a, b;

    d = vec_insert_exp(a, b);
  }"
    COMPILER_SUPPORTS_VSX3)

  if (COMPILER_SUPPORTS_VSX3)
    set(COMPILER_SUPPORTS_VSX3NOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_VSX3 AND NOT COMPILER_SUPPORTS_VSX3)
  message(FATAL_ERROR "SLEEF_ENFORCE_VSX3 is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# IBM Z

option(SLEEF_DISABLE_VXE "Disable VXE" OFF)
option(SLEEF_ENFORCE_VXE "Build fails if VXE is not supported by the compiler" OFF)

if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE}")
  CHECK_C_SOURCE_COMPILES("
  #include <vecintrin.h>
  int main() {
    __vector float d;
    d = vec_sqrt(d);
  }"
    COMPILER_SUPPORTS_VXE)

  if(COMPILER_SUPPORTS_VXE)
    set(COMPILER_SUPPORTS_VXENOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_VXE AND NOT COMPILER_SUPPORTS_VXE)
  message(FATAL_ERROR "SLEEF_ENFORCE_VXE is specified and that feature is disabled or not supported by the compiler")
endif()

# VXE2

option(SLEEF_DISABLE_VXE2 "Disable VXE2" OFF)
option(SLEEF_ENFORCE_VXE2 "Build fails if VXE2 is not supported by the compiler" OFF)

if(SLEEF_ARCH_S390X AND NOT SLEEF_DISABLE_VXE2)
  string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_VXE2}")
  CHECK_C_SOURCE_COMPILES("
  #include <vecintrin.h>
  int main() {
    __vector float d;
    d = vec_sqrt(d);
  }"
    COMPILER_SUPPORTS_VXE2)

  if(COMPILER_SUPPORTS_VXE2)
    set(COMPILER_SUPPORTS_VXE2NOFMA 1)
  endif()
endif()

if (SLEEF_ENFORCE_VXE2 AND NOT COMPILER_SUPPORTS_VXE2)
  message(FATAL_ERROR "SLEEF_ENFORCE_VXE2 is specified and that feature is disabled or not supported by the compiler")
endif()
|
||||
|
||||
# RVVM1
|
||||
|
||||
option(SLEEF_DISABLE_RVVM1 "Disable RVVM1" OFF)
|
||||
option(SLEEF_ENFORCE_RVVM1 "Build fails if RVVM1 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM1)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM1}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <riscv_vector.h>
|
||||
int main() {
|
||||
vint32m1_t r = __riscv_vmv_v_x_i32m1(1, __riscv_vlenb() * 8 / 32); }"
|
||||
COMPILER_SUPPORTS_RVVM1)
|
||||
|
||||
if(COMPILER_SUPPORTS_RVVM1)
|
||||
set(COMPILER_SUPPORTS_RVVM1NOFMA 1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENFORCE_RVVM1 AND NOT COMPILER_SUPPORTS_RVVM1)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_RVVM1 is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# RVVM2
|
||||
|
||||
option(SLEEF_DISABLE_RVVM2 "Disable RVVM2" OFF)
|
||||
option(SLEEF_ENFORCE_RVVM2 "Build fails if RVVM2 is not supported by the compiler" OFF)
|
||||
|
||||
if(SLEEF_ARCH_RISCV64 AND NOT SLEEF_DISABLE_RVVM2)
|
||||
string (REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${FLAGS_ENABLE_RVVM2}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <riscv_vector.h>
|
||||
int main() {
|
||||
vint32m2_t r = __riscv_vmv_v_x_i32m2(1, 2 * __riscv_vlenb() * 8 / 32); }"
|
||||
COMPILER_SUPPORTS_RVVM2)
|
||||
|
||||
if(COMPILER_SUPPORTS_RVVM2)
|
||||
set(COMPILER_SUPPORTS_RVVM2NOFMA 1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENFORCE_RVVM2 AND NOT COMPILER_SUPPORTS_RVVM2)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_RVVM2 is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# CUDA
|
||||
|
||||
option(SLEEF_ENFORCE_CUDA "Build fails if CUDA is not supported" OFF)
|
||||
|
||||
if (SLEEF_ENFORCE_CUDA AND NOT CMAKE_CUDA_COMPILER)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_CUDA is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# OpenMP
|
||||
|
||||
option(SLEEF_DISABLE_OPENMP "Disable OPENMP" OFF)
|
||||
option(SLEEF_ENFORCE_OPENMP "Build fails if OPENMP is not supported by the compiler" OFF)
|
||||
|
||||
if(NOT SLEEF_DISABLE_OPENMP)
|
||||
find_package(OpenMP)
|
||||
# Check if compilation with OpenMP really succeeds
|
||||
# It might not succeed even though find_package(OpenMP) succeeds.
|
||||
if(OPENMP_FOUND)
|
||||
set (CMAKE_REQUIRED_FLAGS "${OpenMP_C_FLAGS}")
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#include <stdio.h>
|
||||
int main() {
|
||||
int i;
|
||||
#pragma omp parallel for
|
||||
for(i=0;i < 10;i++) { putchar(0); }
|
||||
}"
|
||||
COMPILER_SUPPORTS_OPENMP)
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#pragma omp declare simd notinbranch
|
||||
double func(double x) { return x + 1; }
|
||||
double a[1024];
|
||||
int main() {
|
||||
#pragma omp parallel for simd
|
||||
for (int i = 0; i < 1024; i++) a[i] = func(a[i]);
|
||||
}
|
||||
"
|
||||
COMPILER_SUPPORTS_OMP_SIMD)
|
||||
endif(OPENMP_FOUND)
|
||||
else()
|
||||
message(STATUS "Support for OpenMP disabled by CMake option")
|
||||
endif()
|
||||
|
||||
if (SLEEF_ENFORCE_OPENMP AND NOT COMPILER_SUPPORTS_OPENMP)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_OPENMP is specified and that feature is disabled or not supported by the compiler")
|
||||
endif()
|
||||
|
||||
# Weak aliases
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#if defined(__CYGWIN__)
|
||||
#define EXPORT __stdcall __declspec(dllexport)
|
||||
#else
|
||||
#define EXPORT
|
||||
#endif
|
||||
EXPORT int f(int a) {
|
||||
return a + 2;
|
||||
}
|
||||
EXPORT int g(int a) __attribute__((weak, alias(\"f\")));
|
||||
int main(void) {
|
||||
return g(2);
|
||||
}"
|
||||
COMPILER_SUPPORTS_WEAK_ALIASES)
|
||||
if (COMPILER_SUPPORTS_WEAK_ALIASES AND
|
||||
NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm" AND
|
||||
NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64" AND
|
||||
NOT SLEEF_CLANG_ON_WINDOWS AND
|
||||
NOT MINGW AND SLEEF_BUILD_GNUABI_LIBS)
|
||||
set(ENABLE_GNUABI ${COMPILER_SUPPORTS_WEAK_ALIASES})
|
||||
endif()
|
||||
|
||||
# Built-in math functions
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
int main(void) {
|
||||
double a = __builtin_sqrt (2);
|
||||
float b = __builtin_sqrtf(2);
|
||||
}"
|
||||
COMPILER_SUPPORTS_BUILTIN_MATH)
|
||||
|
||||
# SYS_getrandom
|
||||
|
||||
CHECK_C_SOURCE_COMPILES("
|
||||
#define _GNU_SOURCE
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/random.h>
|
||||
int main(void) {
|
||||
int i;
|
||||
syscall(SYS_getrandom, &i, sizeof(i), 0);
|
||||
}"
|
||||
COMPILER_SUPPORTS_SYS_GETRANDOM)
|
||||
|
||||
#
|
||||
|
||||
# Reset used flags
|
||||
set(CMAKE_REQUIRED_FLAGS)
|
||||
set(CMAKE_REQUIRED_LIBRARIES)
|
||||
|
||||
# Save the default C flags
|
||||
set(ORG_CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||
|
||||
##
|
||||
|
||||
# Check if sde64 command is available
|
||||
|
||||
find_program(SDE_COMMAND sde64)
|
||||
if (NOT SDE_COMMAND)
|
||||
find_program(SDE_COMMAND sde)
|
||||
endif()
|
||||
|
||||
# Check if armie command is available
|
||||
|
||||
find_program(ARMIE_COMMAND armie)
|
||||
if (NOT SVE_VECTOR_BITS)
|
||||
set(SVE_VECTOR_BITS 128)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
find_program(FILECHECK_COMMAND NAMES FileCheck FileCheck-11 FileCheck-10 FileCheck-9)
|
||||
|
||||
#
|
||||
|
||||
find_program(SED_COMMAND sed)
|
||||
|
||||
##
|
||||
|
||||
if(SLEEF_SHOW_ERROR_LOG)
|
||||
if (EXISTS ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log)
|
||||
file(READ ${PROJECT_BINARY_DIR}/CMakeFiles/CMakeError.log FILE_CONTENT)
|
||||
message("")
|
||||
message("")
|
||||
message("====== Content of CMakeError.log ======")
|
||||
message("")
|
||||
message("${FILE_CONTENT}")
|
||||
message("")
|
||||
message("======== End of CMakeError.log ========")
|
||||
message("")
|
||||
message("")
|
||||
endif()
|
||||
endif(SLEEF_SHOW_ERROR_LOG)
|
||||
|
||||
if (MSVC OR SLEEF_CLANG_ON_WINDOWS)
|
||||
set(COMPILER_SUPPORTS_OPENMP FALSE) # At this time, OpenMP is not supported on MSVC
|
||||
endif()
|
||||
|
||||
##
|
||||
|
||||
# Set common definitions
|
||||
|
||||
if (NOT BUILD_SHARED_LIBS)
|
||||
set(COMMON_TARGET_DEFINITIONS SLEEF_STATIC_LIBS=1)
|
||||
set(SLEEF_STATIC_LIBS 1)
|
||||
endif()
|
||||
|
||||
if (COMPILER_SUPPORTS_WEAK_ALIASES)
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_ALIAS=1)
|
||||
endif()
|
||||
|
||||
if (COMPILER_SUPPORTS_SYS_GETRANDOM)
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_SYS_getrandom=1)
|
||||
endif()
|
||||
@@ -0,0 +1,23 @@
|
||||
Boost Software License - Version 1.0 - August 17th, 2003
|
||||
|
||||
Permission is hereby granted, free of charge, to any person or organization
|
||||
obtaining a copy of the software and accompanying documentation covered by
|
||||
this license (the "Software") to use, reproduce, display, distribute,
|
||||
execute, and transmit the Software, and to prepare derivative works of the
|
||||
Software, and to permit third-parties to whom the Software is furnished to
|
||||
do so, all subject to the following:
|
||||
|
||||
The copyright notices in the Software and this entire statement, including
|
||||
the above license grant, this restriction and the following disclaimer,
|
||||
must be included in all copies of the Software, in whole or in part, and
|
||||
all derivative works of the Software, unless such copies or derivative
|
||||
works are solely in the form of machine-executable object code generated by
|
||||
a source language processor.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
|
||||
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
|
||||
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
|
||||
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
@@ -0,0 +1,221 @@
|
||||
# SLEEF
|
||||
|
||||

|
||||
[](https://ieeexplore.ieee.org/document/8936472)
|
||||
[](https://www.boost.org/LICENSE_1_0.txt)
|
||||

|
||||
[](https://spack.readthedocs.io/en/v0.16.2/package_list.html#sleef)
|
||||
[](https://sourceforge.net/projects/sleef/)
|
||||
|
||||
SLEEF is a library that implements vectorized versions of C standard math functions. This library also includes DFT subroutines.
|
||||
|
||||
- **Web Page:** [https://sleef.org/][webpage_url]
|
||||
- **Sources:** [https://github.com/shibatch/sleef][repo_url]
|
||||
|
||||
## Supported environment
|
||||
|
||||
### Test matrix
|
||||
|
||||
The following table summarises currently supported vector extensions, compilers and OS-es.
|
||||
|
||||
:green_circle: : Tested extensively in CI.
|
||||
|
||||
:yellow_circle: : Tested partially in CI.
|
||||
|
||||
:x: : Currently failing some tests in CI.
|
||||
|
||||
:white_circle: : Not tested in CI. Might have passed tests in previous CI framework.
|
||||
|
||||
[This issue](https://github.com/shibatch/sleef/issues/481) tracks progress on improving test coverage.
|
||||
Compilation of SLEEF on previously supported environments might still be safe, we just cannot verify it yet.
|
||||
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<th colspan="2" rowspan="2"></th>
|
||||
<th colspan="9">OS/Compiler</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th colspan="3">Linux</th>
|
||||
<th colspan="2">macOS</th>
|
||||
<th colspan="4">Windows</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Arch.</th>
|
||||
<th>Vector Extensions</th>
|
||||
<th>gcc</th><th>llvm</th><th>icc</th>
|
||||
<th>gcc</th><th>llvm</th>
|
||||
<th>gcc</th><th>llvm-gnu</th><th>llvm-msvc</th><th>msvc</th>
|
||||
</tr>
|
||||
<tr align="center"><th>x86_64</th><th>SSE2, SSE4,<br>AVX, AVX2, AVX512F</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>:white_circle:</td>
|
||||
<td>:white_circle:</td><td>:green_circle:</td>
|
||||
<td>:white_circle:</td><td>:yellow_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>x86 32bit<br>(i386)</th><th>SSE</th>
|
||||
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>AArch64<br>(arm)</th><th>Neon, SVE</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="1">N/A</td><td>:green_circle:</td>
|
||||
<td colspan="1">N/A</td><td>:white_circle:</td><td>:white_circle:</td><td>:white_circle:</td>
|
||||
</tr>
|
||||
<tr align="center"><th>AArch32<br>(armhf)</th><th>NEON</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>PowerPC<br>(ppc64el)</th><th>VSX, VSX3</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>IBM/Z<br>(s390x)</th><th>VXE, VXE2</th>
|
||||
<td>:green_circle:</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
<tr align="center"><th>RISC-V<br>(riscv64)</th><th>RVV1, RVV2</th>
|
||||
<td>N/A (14+)</td><td>:green_circle:</td><td>N/A</td>
|
||||
<td colspan="2">N/A</td>
|
||||
<td colspan="4">N/A</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
### Component support
|
||||
|
||||
The above table is valid for libm in single, double and quadruple precision, as well as fast Discrete Fourier Transform (DFT).
|
||||
|
||||
Generation of inline headers is also supported for most vector extensions.
|
||||
|
||||
LTO is not tested in CI yet, except on Windows.
|
||||
|
||||
### Compiler support
|
||||
|
||||
Results are displayed for gcc 11 and llvm 17, the compiler versions used in CI tests with GitHub Actions.
|
||||
|
||||
Older versions should be supported too, while newer ones are either not tested or have known issues.
|
||||
|
||||
Some compiler versions simply do not support certain vector extensions, for instance SVE is only supported for gcc version 9 onwards.
|
||||
|
||||
Similarly, the RISC-V interface in SLEEF is based on version 1.0 of the intrinsics, which is only supported from llvm version 17 and gcc version 14 onwards.
|
||||
|
||||
Toolchain files provide some information on supported compiler versions.
|
||||
|
||||
### OS support
|
||||
|
||||
Only Linux distributions and macOS are fully tested in CI and thus officially supported.
|
||||
|
||||
Building SLEEF for Windows on x86 machines was officially supported ( :white_circle: ), as of 3.5.1,
|
||||
however it is only partially tested due to [known limitations of the test suite with MinGW or MSYS2](https://github.com/shibatch/sleef/issues/544).
|
||||
As a result tests for Windows on x86 only include DFT for now (other tests are disabled in build system),
|
||||
but all components are built.
|
||||
|
||||
Support for iOS and Android is only preliminary on AArch64.
|
||||
|
||||
SVE is not supported on Darwin-based system and therefore automatically disabled by SLEEF on Darwin.
|
||||
|
||||
### More on supported environment
|
||||
|
||||
Refer to our web page for [more on supported environment][supported_env_url].
|
||||
|
||||
## Install SLEEF dependencies
|
||||
|
||||
The library itself does not have any additional dependency.
|
||||
|
||||
However some tests require:
|
||||
|
||||
- libssl and libcrypto, that can be provided by installing openssl.
|
||||
- libm, libgmp and libmpfr
|
||||
- libfftw.
|
||||
|
||||
These tests can be disabled if necessary.
|
||||
|
||||
## How to build SLEEF
|
||||
|
||||
We recommend relying on CMake as much as possible in the build process to ensure portability.
|
||||
**CMake 3.18+** is the minimum required.
|
||||
|
||||
1. Check out the source code from our GitHub repository
|
||||
|
||||
```
|
||||
git clone https://github.com/shibatch/sleef
|
||||
```
|
||||
|
||||
2. Make a separate directory to create an out-of-source build
|
||||
|
||||
```
|
||||
cd sleef && mkdir build
|
||||
```
|
||||
|
||||
3. Run cmake to configure the project
|
||||
|
||||
```
|
||||
cmake -S . -B build
|
||||
```
|
||||
|
||||
By default this will generate shared libraries. In order to generate static libraries, pass option `-DBUILD_SHARED_LIBS=OFF`.
|
||||
|
||||
For more verbose output add option `-DSLEEF_SHOW_CONFIG=ON`.
|
||||
|
||||
4. Run make to build the project
|
||||
|
||||
```
|
||||
cmake --build build -j --clean-first
|
||||
```
|
||||
|
||||
5. Run tests using ctests
|
||||
|
||||
```
|
||||
ctest --test-dir build -j
|
||||
```
|
||||
|
||||
For more detailed build instructions please refer to the [dedicated section on CMake](./docs/build-with-cmake.md) or to [our web page][build_info_url].
|
||||
|
||||
## Install SLEEF
|
||||
|
||||
### From source
|
||||
|
||||
Assuming following instructions were followed.
|
||||
|
||||
6. Install to specified directory `<prefix>`
|
||||
|
||||
```
|
||||
cmake --install build --prefix=<prefix>
|
||||
```
|
||||
|
||||
### Using Spack
|
||||
|
||||
SLEEF can also be directly installed using Spack.
|
||||
|
||||
```
|
||||
spack install sleef@master
|
||||
```
|
||||
|
||||
### Uninstall
|
||||
|
||||
In order to uninstall SLEEF library and headers run
|
||||
|
||||
```
|
||||
sudo xargs rm -v < build/install_manifest.txt
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
The software is distributed under the Boost Software License, Version 1.0.
|
||||
See accompanying file [LICENSE.txt](./LICENSE.txt) or copy at [http://www.boost.org/LICENSE_1_0.txt][license_url].
|
||||
Contributions to this project are accepted under the same license.
|
||||
|
||||
Copyright © 2010-2024 SLEEF Project, Naoki Shibata and contributors.<br/>
|
||||
|
||||
|
||||
<!-- Repository links -->
|
||||
|
||||
[webpage_url]: https://sleef.org/
|
||||
[build_info_url]: https://sleef.org/compile.xhtml
|
||||
[supported_env_url]: https://sleef.org/index.xhtml#environment
|
||||
[repo_url]: https://github.com/shibatch/sleef
|
||||
[repo_license_url]: https://github.com/shibatch/sleef/blob/main/LICENSE.txt
|
||||
[license_url]: http://www.boost.org/LICENSE_1_0.txt
|
||||
@@ -0,0 +1,71 @@
|
||||
#ifndef __SLEEFDFT_H__
|
||||
#define __SLEEFDFT_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#define SLEEF_MODE_FORWARD (0 << 0)
|
||||
#define SLEEF_MODE_BACKWARD (1 << 0)
|
||||
|
||||
#define SLEEF_MODE_COMPLEX (0 << 1)
|
||||
#define SLEEF_MODE_REAL (1 << 1)
|
||||
|
||||
#define SLEEF_MODE_ALT (1 << 2)
|
||||
#define SLEEF_MODE_FFTWCOMPAT (1 << 3)
|
||||
|
||||
#define SLEEF_MODE_DEBUG (1 << 10)
|
||||
#define SLEEF_MODE_VERBOSE (1 << 11)
|
||||
#define SLEEF_MODE_NO_MT (1 << 12)
|
||||
|
||||
#define SLEEF_MODE_ESTIMATE (1 << 20)
|
||||
#define SLEEF_MODE_MEASURE (2 << 20)
|
||||
|
||||
#if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
|
||||
#ifdef IMPORT_IS_EXPORT
|
||||
#define IMPORT __declspec(dllexport)
|
||||
#else // #ifdef IMPORT_IS_EXPORT
|
||||
#define IMPORT __declspec(dllimport)
|
||||
#if (defined(_MSC_VER))
|
||||
#pragma comment(lib,"sleefdft.lib")
|
||||
#endif // #if (defined(_MSC_VER))
|
||||
#endif // #ifdef IMPORT_IS_EXPORT
|
||||
#else // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
|
||||
#define IMPORT
|
||||
#endif // #if (defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__) || defined(_MSC_VER)) && !defined(SLEEF_STATIC_LIBS)
|
||||
|
||||
IMPORT struct SleefDFT *SleefDFT_double_init1d(uint32_t n, const double *in, double *out, uint64_t mode);
|
||||
IMPORT struct SleefDFT *SleefDFT_double_init2d(uint32_t n, uint32_t m, const double *in, double *out, uint64_t mode);
|
||||
IMPORT void SleefDFT_double_execute(struct SleefDFT *ptr, const double *in, double *out);
|
||||
|
||||
IMPORT struct SleefDFT *SleefDFT_float_init1d(uint32_t n, const float *in, float *out, uint64_t mode);
|
||||
IMPORT struct SleefDFT *SleefDFT_float_init2d(uint32_t n, uint32_t m, const float *in, float *out, uint64_t mode);
|
||||
IMPORT void SleefDFT_float_execute(struct SleefDFT *ptr, const float *in, float *out);
|
||||
|
||||
IMPORT void SleefDFT_dispose(struct SleefDFT *ptr);
|
||||
|
||||
IMPORT void SleefDFT_setPath(struct SleefDFT *ptr, char *pathStr);
|
||||
|
||||
//
|
||||
|
||||
IMPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode);
|
||||
|
||||
#define SLEEF_PLAN_AUTOMATIC 0
|
||||
#define SLEEF_PLAN_READONLY (1 << 0)
|
||||
#define SLEEF_PLAN_RESET (1 << 1)
|
||||
#define SLEEF_PLAN_BUILDALLPLAN (1 << 2)
|
||||
#define SLEEF_PLAN_NOLOCK (1 << 3)
|
||||
#define SLEEF_PLAN_MEASURE (1 << 29)
|
||||
#define SLEEF_PLAN_REFERTOENVVAR (1 << 30)
|
||||
|
||||
#undef IMPORT
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // #ifndef __SLEEFDFT_H__
|
||||
@@ -0,0 +1,11 @@
|
||||
// Configuration of @PROJECT_NAME@ /////////////////////////////////////////////
|
||||
|
||||
#ifndef SLEEF_CONFIG_H
|
||||
#define SLEEF_CONFIG_H
|
||||
|
||||
#define SLEEF_VERSION_MAJOR @SLEEF_VERSION_MAJOR@
|
||||
#define SLEEF_VERSION_MINOR @SLEEF_VERSION_MINOR@
|
||||
|
||||
#cmakedefine SLEEF_STATIC_LIBS
|
||||
|
||||
#endif // SLEEF_CONFIG_H
|
||||
@@ -0,0 +1 @@
|
||||
include("${CMAKE_CURRENT_LIST_DIR}/sleefTargets.cmake")
|
||||
@@ -0,0 +1,22 @@
|
||||
include_directories("common")
|
||||
include_directories("arch")
|
||||
|
||||
add_subdirectory("libm")
|
||||
if (SLEEF_BUILD_TESTS AND NOT MINGW)
|
||||
add_subdirectory("libm-tester")
|
||||
endif()
|
||||
add_subdirectory("common")
|
||||
|
||||
if (SLEEF_BUILD_DFT)
|
||||
add_subdirectory("dft")
|
||||
if (SLEEF_BUILD_TESTS)
|
||||
add_subdirectory("dft-tester")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (SLEEF_BUILD_QUAD)
|
||||
add_subdirectory("quad")
|
||||
if (SLEEF_BUILD_TESTS AND NOT MINGW)
|
||||
add_subdirectory("quad-tester")
|
||||
endif()
|
||||
endif()
|
||||
@@ -0,0 +1,837 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright ARM Ltd. 2010 - 2024. */
|
||||
/* Distributed under the Boost Software License, Version 1.0. */
|
||||
/* (See accompanying file LICENSE.txt or copy at */
|
||||
/* http://www.boost.org/LICENSE_1_0.txt) */
|
||||
/*********************************************************************/
|
||||
|
||||
#if !defined(__ARM_NEON) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify advsimd flags.
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <arm_neon.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 1
|
||||
//@#define LOG2VECTLENDP 1
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP 2
|
||||
//@#define LOG2VECTLENSP 2
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#if CONFIG == 1
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
#endif
|
||||
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
|
||||
#define ISANAME "AArch64 AdvSIMD"
|
||||
|
||||
// Mask definition
|
||||
typedef uint32x4_t vmask;
|
||||
typedef uint32x4_t vopmask;
|
||||
|
||||
// Single precision definitions
|
||||
typedef float32x4_t vfloat;
|
||||
typedef int32x4_t vint2;
|
||||
|
||||
// Double precision definitions
|
||||
typedef float64x2_t vdouble;
|
||||
typedef int32x2_t vint;
|
||||
|
||||
typedef int64x2_t vint64;
|
||||
typedef uint64x2_t vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
#define DFTPRIORITY 10
|
||||
|
||||
static INLINE int vavailability_i(int name) { return 3; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static INLINE VECTOR_CC int vtestallones_i_vo32(vopmask g) {
|
||||
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
|
||||
uint32x2_t x1 = vpmin_u32(x0, x0);
|
||||
return vget_lane_u32(x1, 0);
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC int vtestallones_i_vo64(vopmask g) {
|
||||
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
|
||||
uint32x2_t x1 = vpmin_u32(x0, x0);
|
||||
return vget_lane_u32(x1, 0);
|
||||
}
|
||||
|
||||
// Vector load / store
|
||||
static INLINE VECTOR_CC vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }
|
||||
static INLINE VECTOR_CC vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }
|
||||
static INLINE VECTOR_CC void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
|
||||
static INLINE VECTOR_CC void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
|
||||
static INLINE VECTOR_CC vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }
|
||||
static INLINE VECTOR_CC vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
|
||||
static INLINE VECTOR_CC void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
|
||||
static INLINE VECTOR_CC void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
|
||||
static INLINE VECTOR_CC vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
|
||||
static INLINE VECTOR_CC void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
|
||||
static INLINE VECTOR_CC vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
|
||||
static INLINE VECTOR_CC void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }
|
||||
|
||||
static INLINE VECTOR_CC vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
|
||||
return ((vdouble) { ptr[vget_lane_s32(vi, 0)], ptr[vget_lane_s32(vi, 1)]} );
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
return ((vfloat) {
|
||||
ptr[vgetq_lane_s32(vi2, 0)],
|
||||
ptr[vgetq_lane_s32(vi2, 1)],
|
||||
ptr[vgetq_lane_s32(vi2, 2)],
|
||||
ptr[vgetq_lane_s32(vi2, 3)]
|
||||
});
|
||||
}
|
||||
|
||||
// Basic logical operations for mask
|
||||
static INLINE VECTOR_CC vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
|
||||
static INLINE VECTOR_CC vmask vandnot_vm_vm_vm(vmask x, vmask y) {
|
||||
return vbicq_u32(y, x);
|
||||
}
|
||||
static INLINE VECTOR_CC vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE VECTOR_CC vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
|
||||
|
||||
// Mask <--> single precision reinterpret
|
||||
static INLINE VECTOR_CC vmask vreinterpret_vm_vf(vfloat vf) {
|
||||
return vreinterpretq_u32_f32(vf);
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vreinterpret_vf_vm(vmask vm) {
|
||||
return vreinterpretq_f32_u32(vm);
|
||||
}
|
||||
static INLINE VECTOR_CC vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }
|
||||
static INLINE VECTOR_CC vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }
|
||||
|
||||
// Mask <--> double precision reinterpret
|
||||
static INLINE VECTOR_CC vmask vreinterpret_vm_vd(vdouble vd) {
|
||||
return vreinterpretq_u32_f64(vd);
|
||||
}
|
||||
static INLINE VECTOR_CC vdouble vreinterpret_vd_vm(vmask vm) {
|
||||
return vreinterpretq_f64_u32(vm);
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vreinterpret_vf_vi2(vint2 vm) {
|
||||
return vreinterpretq_f32_s32(vm);
|
||||
}
|
||||
static INLINE VECTOR_CC vint2 vreinterpret_vi2_vf(vfloat vf) {
|
||||
return vreinterpretq_s32_f32(vf);
|
||||
}
|
||||
|
||||
/****************************************/
|
||||
/* Single precision FP operations */
|
||||
/****************************************/
|
||||
// Broadcast
|
||||
static INLINE VECTOR_CC vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
|
||||
|
||||
// Add, Sub, Mul
|
||||
static INLINE VECTOR_CC vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
|
||||
return vaddq_f32(x, y);
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
|
||||
return vsubq_f32(x, y);
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
|
||||
return vmulq_f32(x, y);
|
||||
}
|
||||
|
||||
// |x|, -x
|
||||
static INLINE VECTOR_CC vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
|
||||
static INLINE VECTOR_CC vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
|
||||
|
||||
#if CONFIG == 1
|
||||
// Multiply accumulate: z = z + x * y
|
||||
static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
|
||||
return vfmaq_f32(z, x, y);
|
||||
}
|
||||
// Multiply subtract: z = z - x * y
|
||||
static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
|
||||
return vfmsq_f32(z, x, y);
|
||||
}
|
||||
// Multiply subtract: z = x * y - z
|
||||
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
|
||||
return vneg_vf_vf(vfmsq_f32(z, x, y));
|
||||
}
|
||||
#else
|
||||
static INLINE VECTOR_CC vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE VECTOR_CC vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
|
||||
static INLINE VECTOR_CC vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
#endif
|
||||
|
||||
static INLINE VECTOR_CC vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
|
||||
return vfmaq_f32(z, x, y);
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
|
||||
return vfmsq_f32(z, x, y);
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
|
||||
return vfma_vf_vf_vf_vf(x, y, vneg_vf_vf(z));
|
||||
}
|
||||
|
||||
// Reciprocal 1/x, Division, Square root
|
||||
static INLINE VECTOR_CC vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
|
||||
#ifndef SLEEF_ENABLE_ALTDIV
|
||||
return vdivq_f32(n, d);
|
||||
#else
|
||||
// Finite numbers (including denormal) only, gives mostly correctly rounded result
|
||||
float32x4_t t, u, x, y;
|
||||
uint32x4_t i0, i1;
|
||||
i0 = vandq_u32(vreinterpretq_u32_f32(n), vdupq_n_u32(0x7c000000));
|
||||
i1 = vandq_u32(vreinterpretq_u32_f32(d), vdupq_n_u32(0x7c000000));
|
||||
i0 = vsubq_u32(vdupq_n_u32(0x7d000000), vshrq_n_u32(vaddq_u32(i0, i1), 1));
|
||||
t = vreinterpretq_f32_u32(i0);
|
||||
y = vmulq_f32(d, t);
|
||||
x = vmulq_f32(n, t);
|
||||
t = vrecpeq_f32(y);
|
||||
t = vmulq_f32(t, vrecpsq_f32(y, t));
|
||||
t = vmulq_f32(t, vrecpsq_f32(y, t));
|
||||
u = vmulq_f32(x, t);
|
||||
u = vfmaq_f32(u, vfmsq_f32(x, y, u), t);
|
||||
return u;
|
||||
#endif
|
||||
}
|
||||
static INLINE VECTOR_CC vfloat vrec_vf_vf(vfloat d) {
|
||||
#ifndef SLEEF_ENABLE_ALTDIV
|
||||
return vdiv_vf_vf_vf(vcast_vf_f(1.0f), d);
|
||||
#else
|
||||
return vbslq_f32(vceqq_f32(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)),
|
||||
vcast_vf_f(0), vdiv_vf_vf_vf(vcast_vf_f(1.0f), d));
|
||||
#endif
|
||||
}
|
||||
|
||||
static INLINE VECTOR_CC vfloat vsqrt_vf_vf(vfloat d) {
|
||||
#ifndef SLEEF_ENABLE_ALTSQRT
|
||||
return vsqrtq_f32(d);
|
||||
#else
|
||||
// Gives correctly rounded result for all input range
|
||||
vfloat w, x, y, z;
|
||||
|
||||
y = vrsqrteq_f32(d);
|
||||
x = vmul_vf_vf_vf(d, y); w = vmul_vf_vf_vf(vcast_vf_f(0.5), y);
|
||||
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(0.5));
|
||||
x = vfma_vf_vf_vf_vf(x, y, x); w = vfma_vf_vf_vf_vf(w, y, w);
|
||||
|
||||
y = vfmanp_vf_vf_vf_vf(x, w, vcast_vf_f(1.5)); w = vadd_vf_vf_vf(w, w);
|
||||
w = vmul_vf_vf_vf(w, y);
|
||||
x = vmul_vf_vf_vf(w, d);
|
||||
y = vfmapn_vf_vf_vf_vf(w, d, x); z = vfmanp_vf_vf_vf_vf(w, x, vcast_vf_f(1));
|
||||
z = vfmanp_vf_vf_vf_vf(w, y, z); w = vmul_vf_vf_vf(vcast_vf_f(0.5), x);
|
||||
w = vfma_vf_vf_vf_vf(w, z, y);
|
||||
w = vadd_vf_vf_vf(w, x);
|
||||
|
||||
return vbslq_f32(vorrq_u32(vceqq_f32(d, vcast_vf_f(0)),
|
||||
vceqq_f32(d, vcast_vf_f(SLEEF_INFINITYf))), d, w);
|
||||
#endif
|
||||
}
|
||||
|
||||
// max, min
static INLINE VECTOR_CC vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
  return vmaxq_f32(x, y);
}
static INLINE VECTOR_CC vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {
  return vminq_f32(x, y);
}

// Comparisons
// Each returns a per-lane all-ones/all-zeros 32-bit mask.
static INLINE VECTOR_CC vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
static INLINE VECTOR_CC vmask vneq_vm_vf_vf(vfloat x, vfloat y) {
  // NEON has no "compare not equal"; invert the equality mask.
  return vmvnq_u32(vceqq_f32(x, y));
}
static INLINE VECTOR_CC vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
static INLINE VECTOR_CC vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
static INLINE VECTOR_CC vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
static INLINE VECTOR_CC vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }

// Conditional select
// Per-bit select: result bit comes from x where mask bit is 1, else from y.
static INLINE VECTOR_CC vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) {
  return vbslq_f32(mask, x, y);
}

// int <--> float conversions
// Truncation rounds toward zero.
static INLINE VECTOR_CC vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
static INLINE VECTOR_CC vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
static INLINE VECTOR_CC vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
static INLINE VECTOR_CC vint2 vrint_vi2_vf(vfloat d) {
  // Round to nearest (ties to even) first, then convert.
  return vcvtq_s32_f32(vrndnq_f32(d));
}
|
||||
|
||||
/***************************************/
|
||||
/* Single precision integer operations */
|
||||
/***************************************/
|
||||
|
||||
// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vaddq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vsubq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }

// Logical operations
static INLINE VECTOR_CC vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vandq_s32(x, y);
}
// "andnot" is (~x) & y; note vbicq computes first-arg & ~second-arg,
// hence the swapped operand order.
static INLINE VECTOR_CC vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vbicq_s32(y, x);
}
static INLINE VECTOR_CC vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vorrq_s32(x, y);
}
static INLINE VECTOR_CC vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
  return veorq_s32(x, y);
}

// Shifts
// These must be macros: the NEON shift-by-immediate intrinsics require a
// compile-time constant count. The "//@#" copies are consumed by SLEEF's
// header generator and must mirror the real definitions exactly.
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
// Logical (zero-fill) right shift: go through unsigned.
#define vsrl_vi2_vi2_i(x, c) \
  vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))

// Arithmetic (sign-extending) right shift.
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
//@#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
//@#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
#define vsrl_vi_vi_i(x, c) \
  vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))
//@#define vsrl_vi_vi_i(x, c) vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))

// Comparison returning masks
static INLINE VECTOR_CC vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
// NOTE(review): despite the "gt" name this uses vcgeq_s32 (>=), not
// vcgtq_s32 (>). This matches upstream SLEEF 3.6.1 — confirm against
// upstream before relying on strict greater-than semantics here.
static INLINE VECTOR_CC vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); }

// Comparison returning integers
static INLINE VECTOR_CC vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vreinterpretq_s32_u32(vcgtq_s32(x, y));
}
static INLINE VECTOR_CC vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  return vreinterpretq_s32_u32(vceqq_s32(x, y));
}

// Conditional select
static INLINE VECTOR_CC vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
  return vbslq_s32(m, x, y);
}
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
/****************************************/
/* Double precision FP operations       */
/****************************************/
// Broadcast a scalar double to all lanes.
static INLINE VECTOR_CC vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }

// Add, Sub, Mul
static INLINE VECTOR_CC vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
  return vaddq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
  return vsubq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
  return vmulq_f64(x, y);
}

// |x|, -x
static INLINE VECTOR_CC vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
static INLINE VECTOR_CC vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }

// max, min
static INLINE VECTOR_CC vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
  return vmaxq_f64(x, y);
}
static INLINE VECTOR_CC vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
  return vminq_f64(x, y);
}
|
||||
|
||||
#if CONFIG == 1
// CONFIG == 1 maps the "mla" family onto true fused multiply-add, so these
// are computed with a single rounding.
// Multiply accumulate: z = z + x * y
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vfmaq_f64(z, x, y);
}

// z - x * y
static INLINE VECTOR_CC vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vfmsq_f64(z, x, y);
}

//[z = x * y - z]
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  return vneg_vd_vd(vfmsq_f64(z, x, y));
}
#else
// Non-FMA configuration: separate multiply and add (two roundings).
// NOTE(review): no vmlanp variant is defined in this branch — presumably it
// is unused when CONFIG != 1; matches upstream.
static INLINE VECTOR_CC vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif

// The vfma* family always uses true FMA regardless of CONFIG.
static INLINE VECTOR_CC vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y
  return vfmaq_f64(z, x, y);
}

static INLINE VECTOR_CC vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
  return vfmsq_f64(z, x, y);
}

static INLINE VECTOR_CC vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
  return vfma_vd_vd_vd_vd(x, y, vneg_vd_vd(z));
}
|
||||
|
||||
// Reciprocal 1/x, Division, Square root
// Lane-wise n / d. Default path uses the hardware divide; with
// SLEEF_ENABLE_ALTDIV it uses a software reciprocal iteration.
static INLINE VECTOR_CC vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
#ifndef SLEEF_ENABLE_ALTDIV
  return vdivq_f64(n, d);
#else
  // Finite numbers (including denormal) only, gives mostly correctly rounded result
  float64x2_t t, u, x, y;
  uint64x2_t i0, i1;
  // Build a power-of-two-ish scale factor t from the exponent bits of n and d
  // so that the scaled operands stay in a safe range for the iteration.
  i0 = vandq_u64(vreinterpretq_u64_f64(n), vdupq_n_u64(0x7fc0000000000000L));
  i1 = vandq_u64(vreinterpretq_u64_f64(d), vdupq_n_u64(0x7fc0000000000000L));
  i0 = vsubq_u64(vdupq_n_u64(0x7fd0000000000000L), vshrq_n_u64(vaddq_u64(i0, i1), 1));
  t = vreinterpretq_f64_u64(i0);
  y = vmulq_f64(d, t);
  x = vmulq_f64(n, t);
  // Three Newton-Raphson refinements of the reciprocal estimate of y.
  t = vrecpeq_f64(y);
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  t = vmulq_f64(t, vrecpsq_f64(y, t));
  u = vmulq_f64(x, t);
  // Correct u by the FMA residual x - y*u to tighten the rounding.
  u = vfmaq_f64(u, vfmsq_f64(x, y, u), t);
  return u;
#endif
}
|
||||
// Lane-wise reciprocal 1/d.
// (The 1.0f literals promote exactly to double 1.0, so they are harmless.)
static INLINE VECTOR_CC vdouble vrec_vd_vd(vdouble d) {
#ifndef SLEEF_ENABLE_ALTDIV
  return vdiv_vd_vd_vd(vcast_vd_d(1.0f), d);
#else
  // The ALTDIV path only handles finite inputs, so map |d| == inf to 0
  // explicitly here.
  return vbslq_f64(vceqq_f64(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY)),
                   vcast_vd_d(0), vdiv_vd_vd_vd(vcast_vd_d(1.0f), d));
#endif
}
|
||||
|
||||
// Lane-wise double-precision square root.
// Default path uses the hardware instruction; the SLEEF_ENABLE_ALTSQRT
// variant refines a reciprocal-sqrt estimate by Newton-Raphson.
static INLINE VECTOR_CC vdouble vsqrt_vd_vd(vdouble d) {
#ifndef SLEEF_ENABLE_ALTSQRT
  return vsqrtq_f64(d);
#else
  // Gives correctly rounded result for all input range
  vdouble w, x, y, z;

  y = vrsqrteq_f64(d);  // initial 1/sqrt(d) estimate
  // x approximates sqrt(d), w approximates 0.5/sqrt(d).
  x = vmul_vd_vd_vd(d, y); w = vmul_vd_vd_vd(vcast_vd_d(0.5), y);
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);
  // Second, identical Newton-Raphson step: double precision needs one more
  // iteration than the float version. The repetition is intentional.
  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(0.5));
  x = vfma_vd_vd_vd_vd(x, y, x); w = vfma_vd_vd_vd_vd(w, y, w);

  y = vfmanp_vd_vd_vd_vd(x, w, vcast_vd_d(1.5)); w = vadd_vd_vd_vd(w, w);
  w = vmul_vd_vd_vd(w, y);
  x = vmul_vd_vd_vd(w, d);
  // Final correction from exact FMA residuals for correct rounding.
  y = vfmapn_vd_vd_vd_vd(w, d, x); z = vfmanp_vd_vd_vd_vd(w, x, vcast_vd_d(1));
  z = vfmanp_vd_vd_vd_vd(w, y, z); w = vmul_vd_vd_vd(vcast_vd_d(0.5), x);
  w = vfma_vd_vd_vd_vd(w, z, y);
  w = vadd_vd_vd_vd(w, x);

  // Pass 0 and +inf through unchanged.
  return vbslq_f64(vorrq_u64(vceqq_f64(d, vcast_vd_d(0)),
                             vceqq_f64(d, vcast_vd_d(SLEEF_INFINITY))), d, w);
#endif
}
|
||||
|
||||
/* Comparisons */
// vopmask is a 32-bit-lane mask type; 64-bit compare results are
// reinterpreted so each double lane occupies two identical 32-bit mask lanes.
static INLINE VECTOR_CC vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vceqq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
  // No "compare not equal" on NEON; invert the equality mask.
  return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(x, y)));
}
static INLINE VECTOR_CC vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vcltq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vcgtq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vcleq_f64(x, y));
}
static INLINE VECTOR_CC vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
  return vreinterpretq_u32_u64(vcgeq_f64(x, y));
}

// Conditional select
static INLINE VECTOR_CC vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
  return vbslq_f64(vreinterpretq_u64_u32(mask), x, y);
}
|
||||
|
||||
#if 1
// Scalar-constant selects built on vsel_vd_vo_vd_vd: pick among broadcast
// doubles according to one, two or three cascaded condition masks.
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

// o0 ? d0 : (o1 ? d1 : d2)
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

// o0 ? d0 : (o1 ? d1 : (o2 ? d2 : d3))
static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}
#else
// Disabled alternative based on byte-table lookup (vqtbl*q_u8).
// This implementation is slower on the current CPU models (as of May 2017.)
// I(Naoki Shibata) expect that on future CPU models with hardware similar to Super Shuffle Engine, this implementation will be faster.
static INLINE CONST VECTOR_CC vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
  // Build per-byte indices selecting the low or high 8 bytes of the table.
  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
                            (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });

  uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };
  return (vdouble) vqtbl1q_u8(tab, idx);
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  // Cascade the three masks into per-byte indices over a 32-byte table.
  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
                            vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },
                                     vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },
                                              (uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));

  uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } };
  return (vdouble) vqtbl2q_u8(tab, idx);
}

static INLINE VECTOR_CC vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
}
#endif
|
||||
|
||||
// Round to nearest integer-valued FP, ties to even.
static INLINE VECTOR_CC vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }
static INLINE VECTOR_CC vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }

/****************************************/
/* int <--> float conversions           */
/****************************************/
// Convert 2 doubles to 2 int32 (truncating toward zero), narrowed into the
// low half of a 64-bit vint.
// NOTE(review): this uses vmovn (non-saturating narrow) while vrint_vi_vd
// below uses vqmovn (saturating) — matches upstream; confirm overflow
// behavior is not relied upon here.
static INLINE VECTOR_CC vint vtruncate_vi_vd(vdouble vf) {
  return vmovn_s64(vcvtq_s64_f64(vf));
}
static INLINE VECTOR_CC vdouble vcast_vd_vi(vint vi) {
  return vcvtq_f64_s64(vmovl_s32(vi));
}
static INLINE VECTOR_CC vint vcast_vi_i(int i) { return vdup_n_s32(i); }
// Round to nearest, then convert with saturating narrow.
static INLINE VECTOR_CC vint vrint_vi_vd(vdouble d) {
  return vqmovn_s64(vcvtq_s64_f64(vrndnq_f64(d)));
}
|
||||
|
||||
/***************************************/
/* Integer operations                  */
/***************************************/
// vint is a 64-bit (2-lane int32) vector; these use the non-q NEON forms.

// Add, Sub, Neg (-x)
static INLINE VECTOR_CC vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }
static INLINE VECTOR_CC vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }
static INLINE VECTOR_CC vint vneg_vi_vi(vint e) { return vneg_s32(e); }

// Logical operations
static INLINE VECTOR_CC vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }
// (~x) & y — vbic's operand order is (kept, cleared-by).
static INLINE VECTOR_CC vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }
static INLINE VECTOR_CC vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }
static INLINE VECTOR_CC vint vxor_vi_vi_vi(vint x, vint y) { return veor_s32(x, y); }

// Comparison returning masks
// Widen the 64-bit compare result to a full vopmask; the upper half is zero.
static INLINE VECTOR_CC vopmask veq_vo_vi_vi(vint x, vint y) {
  return vcombine_u32(vceq_s32(x, y), vdup_n_u32(0));
}

// Conditional select
// Only the low half of the 128-bit mask is relevant for the 64-bit vint.
static INLINE VECTOR_CC vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) {
  return vbsl_s32(vget_low_u32(m), x, y);
}
|
||||
|
||||
/***************************************/
/* Predicates                          */
/***************************************/
// |d| == inf (either sign)?
static INLINE VECTOR_CC vopmask visinf_vo_vd(vdouble d) {
  const float64x2_t inf = vdupq_n_f64(SLEEF_INFINITY);
  const float64x2_t neg_inf = vdupq_n_f64(-SLEEF_INFINITY);
  uint64x2_t cmp = vorrq_u64(vceqq_f64(d, inf), vceqq_f64(d, neg_inf));
  return vreinterpretq_u32_u64(cmp);
}

// NaN test: a NaN is the only value not equal to itself.
static INLINE VECTOR_CC vopmask visnan_vo_vd(vdouble d) {
  return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(d, d)));
}

// d == +inf?
static INLINE VECTOR_CC vopmask vispinf_vo_vd(vdouble d) {
  return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(SLEEF_INFINITY)));
}

// d == -inf?
static INLINE VECTOR_CC vopmask visminf_vo_vd(vdouble d) {
  return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(-SLEEF_INFINITY)));
}
|
||||
|
||||
// Per-bit select on floats using a vopmask.
static INLINE VECTOR_CC vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
  return vbslq_f32(mask, x, y);
}

// o ? v1 : v0, with scalar constants broadcast per lane.
static INLINE CONST VECTOR_CC vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

// o0 ? d0 : (o1 ? d1 : d2)
static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

// o0 ? d0 : (o1 ? d1 : (o2 ? d2 : d3))
static INLINE VECTOR_CC vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

// Float comparisons returning vopmask (32-bit lane masks, no reinterpret
// needed since float lanes are already 32 bits).
static INLINE VECTOR_CC vopmask veq_vo_vf_vf(vfloat x, vfloat y) {
  return vceqq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vneq_vo_vf_vf(vfloat x, vfloat y) {
  return vmvnq_u32(vceqq_f32(x, y));
}
static INLINE VECTOR_CC vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {
  return vcltq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vle_vo_vf_vf(vfloat x, vfloat y) {
  return vcleq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {
  return vcgtq_f32(x, y);
}
static INLINE VECTOR_CC vopmask vge_vo_vf_vf(vfloat x, vfloat y) {
  return vcgeq_f32(x, y);
}
|
||||
|
||||
// Integer comparisons returning vopmask.
static INLINE VECTOR_CC vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
  return vceqq_s32(x, y);
}
static INLINE VECTOR_CC vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
  return vcgtq_s32(x, y);
}
// 64-bit vint compare widened to a full vopmask (upper half zero).
static INLINE VECTOR_CC vopmask vgt_vo_vi_vi(vint x, vint y) {
  return vcombine_u32(vcgt_s32(x, y), vdup_n_u32(0));
}
// Single-precision infinity / NaN predicates, built on the vo comparisons.
static INLINE VECTOR_CC vopmask visinf_vo_vf(vfloat d) {
  return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask vispinf_vo_vf(vfloat d) {
  return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf));
}
static INLINE VECTOR_CC vopmask visminf_vo_vf(vfloat d) {
  return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf));
}
// NaN is the only value not equal to itself.
static INLINE VECTOR_CC vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

// Narrow a 64-bit-lane mask to 32-bit lanes (take every other 32-bit lane).
static INLINE VECTOR_CC vopmask vcast_vo32_vo64(vopmask m) {
  return vuzpq_u32(m, m).val[0];
}
// Widen a 32-bit-lane mask to 64-bit lanes (duplicate each 32-bit lane).
static INLINE VECTOR_CC vopmask vcast_vo64_vo32(vopmask m) {
  return vzipq_u32(m, m).val[0];
}
// Broadcast a C truth value to an all-ones / all-zeros mask.
static INLINE VECTOR_CC vopmask vcast_vo_i(int i) {
  return vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)(i ? -1 : 0)));
}
|
||||
|
||||
// Bitwise logic on opmasks.
static INLINE VECTOR_CC vopmask vand_vo_vo_vo(vopmask x, vopmask y) {
  return vandq_u32(x, y);
}
// (~x) & y — vbic's operand order is (kept, cleared-by).
static INLINE VECTOR_CC vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {
  return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vopmask vor_vo_vo_vo(vopmask x, vopmask y) {
  return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {
  return veorq_u32(x, y);
}

// Select / mask-combine between integers and opmasks.
static INLINE VECTOR_CC vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
  return vbslq_s32(m, x, y);
}
static INLINE VECTOR_CC vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
  return vandq_s32(vreinterpretq_s32_u32(x), y);
}
static INLINE VECTOR_CC vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
  return vbicq_s32(y, vreinterpretq_s32_u32(x))
;
}
static INLINE VECTOR_CC vint vandnot_vi_vo_vi(vopmask x, vint y) {
  // Only the low half of the 128-bit opmask applies to the 64-bit vint.
  return vbic_s32(y, vget_low_s32(vreinterpretq_s32_u32(x)));
}
// vmask and vopmask share the same representation here, so the vo32/vo64
// variants are plain bitwise ops.
static INLINE VECTOR_CC vmask vand_vm_vo32_vm(vopmask x, vmask y) {
  return vandq_u32(x, y);
}
|
||||
// vmask/vopmask bitwise combinations; vo32 and vo64 variants are identical
// here because both types are plain 128-bit registers on AdvSIMD.
static INLINE VECTOR_CC vmask vand_vm_vo64_vm(vopmask x, vmask y) {
  return vandq_u32(x, y);
}
// (~x) & y
static INLINE VECTOR_CC vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {
  return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
  return vbicq_u32(y, x);
}
static INLINE VECTOR_CC vmask vor_vm_vo32_vm(vopmask x, vmask y) {
  return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vmask vor_vm_vo64_vm(vopmask x, vmask y) {
  return vorrq_u32(x, y);
}
static INLINE VECTOR_CC vmask vxor_vm_vo32_vm(vopmask x, vmask y) {
  return veorq_u32(x, y);
}

// Round toward zero to integer-valued FP.
static INLINE VECTOR_CC vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }

// Build a mask whose 64-bit lanes are (i0 << 32) | i1.
static INLINE VECTOR_CC vmask vcast_vm_i_i(int i0, int i1) {
  return vreinterpretq_u32_u64(vdupq_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));
}
|
||||
|
||||
// Broadcast a 64-bit integer into each 64-bit lane of a vmask.
static INLINE vmask vcast_vm_i64(int64_t i) {
  return vreinterpretq_u32_u64(vdupq_n_u64((uint64_t)i));
}
static INLINE vmask vcast_vm_u64(uint64_t i) {
  return vreinterpretq_u32_u64(vdupq_n_u64(i));
}

// 64-bit lane-wise equality between masks.
static INLINE VECTOR_CC vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_u64(vceqq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)))
;
}

// 64-bit lane-wise integer addition of masks.
static INLINE VECTOR_CC vmask vadd64_vm_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_s64(vaddq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

// Select on a 64-bit vint using the low half of the opmask.
static INLINE VECTOR_CC vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
  return vbsl_s32(vget_low_u32(m), x, y);
}
|
||||
|
||||
// Logical operations
// AND a 64-bit vint with the low half of an opmask.
static INLINE VECTOR_CC vint vand_vi_vo_vi(vopmask x, vint y) {
  return vand_s32(vreinterpret_s32_u32(vget_low_u32(x)), y);
}

// Place each 32-bit element of vi into the UPPER 32 bits of a 64-bit lane
// (widen then swap 32-bit halves).
static INLINE VECTOR_CC vmask vcastu_vm_vi(vint vi) {
  return vrev64q_u32(vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi))));
}
// Inverse of vcastu_vm_vi: extract the upper 32 bits of each 64-bit lane.
static INLINE VECTOR_CC vint vcastu_vi_vm(vmask vi2) {
  return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vrev64q_u32(vi2))));
}
// Round toward zero to integer-valued FP.
static INLINE VECTOR_CC vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }
|
||||
|
||||
//

// Sign-flip masks for alternating-sign operations: XORing with the sign bit
// of -0.0 negates the corresponding lane.
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

// Negate odd lanes (posneg) or even lanes (negpos).
static INLINE VECTOR_CC vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE VECTOR_CC vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE VECTOR_CC vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
static INLINE VECTOR_CC vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }

// Alternating subtract/add (even lanes subtract, odd lanes add).
static INLINE VECTOR_CC vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
static INLINE VECTOR_CC vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
static INLINE VECTOR_CC vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE VECTOR_CC vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }

// Swap the two 64-bit halves (pair-wise reverse); reva2 on a 2-lane vector
// is the identity.
static INLINE VECTOR_CC vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget_high_u64((uint64x2_t)d0), vget_low_u64((uint64x2_t)d0)); }
static INLINE VECTOR_CC vdouble vreva2_vd_vd(vdouble vd) { return vd; }

// No non-temporal store on AdvSIMD: stream falls back to a normal store.
static INLINE VECTOR_CC void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
// With a 2-lane vector a strided pair-scatter degenerates to one store.
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }

// Swap adjacent float pairs / swap the two 64-bit halves.
static INLINE VECTOR_CC vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
static INLINE VECTOR_CC vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }

static INLINE VECTOR_CC void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }

// Scatter adjacent float pairs with a lane stride of `step` pairs.
static INLINE VECTOR_CC void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}

static INLINE VECTOR_CC void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
}
|
||||
|
||||
//

// Load a vquad from unaligned memory (VECTLENDP * 16 bytes).
static vquad loadu_vq_p(void *p) {
  vquad vq;
  memcpy(&vq, p, VECTLENDP * 16);
  return vq;
}

// Bit-for-bit conversions between the argument-passing quad type and the
// internal vquad representation.
static INLINE vquad cast_vq_aq(vargquad aq) {
  vquad vq;
  memcpy(&vq, &aq, VECTLENDP * 16);
  return vq;
}

static INLINE vargquad cast_aq_vq(vquad vq) {
  vargquad aq;
  memcpy(&aq, &vq, VECTLENDP * 16);
  return aq;
}
|
||||
|
||||
// Nonzero iff every bit of the mask is zero.
// Assumes each lane of g is all-ones or all-zeros (as compares produce):
// the OR/pairwise-max reduce to 0 or 0xffffffff, and ~lane is then
// nonzero (true) only for the all-zeros case.
static INLINE int vtestallzeros_i_vo64(vopmask g) {
  uint32x2_t x0 = vorr_u32(vget_low_u32(g), vget_high_u32(g));
  uint32x2_t x1 = vpmax_u32(x0, x0);
  return ~vget_lane_u32(x1, 0);
}
|
||||
|
||||
// Per-bit select between two masks.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return vbslq_u32(m, x, y); }

// 64-bit lane-wise subtraction of masks.
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_s64(vsubq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

// 64-bit lane-wise negation.
static INLINE vmask vneg64_vm_vm(vmask x) {
  return vreinterpretq_u32_s64(vnegq_s64(vreinterpretq_s64_u32(x)));
}

// Signed 64-bit lane-wise x > y.
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
  return vreinterpretq_u32_u64(vcgtq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
}

// 64-bit lane shifts; macros because the shift count must be a
// compile-time constant. "//@#" copies feed SLEEF's header generator.
#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
//@#define vsll64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x), c))
#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))
//@#define vsrl64_vm_vm_i(x, c) vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x), c))

// Sign-extend each int32 lane of vi into a 64-bit mask lane: zero-extend,
// then OR the sign bits (from the vi < 0 compare) into the upper halves.
static INLINE vmask vcast_vm_vi(vint vi) {
  vmask m = vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)));
  return vor_vm_vm_vm(vcastu_vm_vi(vreinterpret_s32_u32(vget_low_u32(vgt_vo_vi_vi(vcast_vi_i(0), vi)))), m);
}
// Narrow each 64-bit mask lane back to int32 (low halves).
static INLINE vint vcast_vi_vm(vmask vm) { return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vm))); }

// Bit-pattern reinterpretation between masks and 64-bit integer vectors.
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return vreinterpretq_u32_s64(v); }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return vreinterpretq_s64_u32(m); }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return vreinterpretq_u32_u64(v); }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return vreinterpretq_u64_u32(m); }
|
||||
@@ -0,0 +1,638 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Configuration check: CONFIG selects the ISA variant this helper targets.
// CONFIG == 1 is plain AVX; CONFIG == 4 is AVX + AMD FMA4. SLEEF_GENHEADER
// builds skip the compiler-flag checks (the generator has no target flags).
#if CONFIG == 1

#if !defined(__AVX__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx.
#endif

#elif CONFIG == 4

#if (!defined(__AVX__) || !defined(__FMA4__)) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx and -mfma4.
#endif

#else
#error CONFIG macro invalid or not defined
#endif

// Vector geometry: 4 doubles / 8 floats per 256-bit register.
// The "//@#" copies are consumed by SLEEF's header generator.
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 2
//@#define LOG2VECTLENDP 2
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

// Vector type aliases for the AVX (256-bit) target.
typedef __m256i vmask;
typedef __m256i vopmask;

typedef __m256d vdouble;
typedef __m128i vint;

typedef __m256 vfloat;
// AVX1 has no 256-bit integer arithmetic, so a wide int vector is a pair
// of 128-bit halves.
typedef struct { __m128i x, y; } vint2;

typedef __m256i vint64;
typedef __m256i vuint64;

// Quad-precision value carried as two mask-typed halves.
typedef struct {
  vmask x, y;
} vquad;

typedef vquad vargquad;
|
||||
|
||||
//

#if !defined(SLEEF_GENHEADER)

#ifndef __SLEEF_H__
// CPUID wrapper implemented elsewhere in SLEEF.
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif

// CPUID.(EAX=1):ECX bit 28 indicates AVX support.
static INLINE int cpuSupportsAVX() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 1, 0);
  return (reg[2] & (1 << 28)) != 0;
}

// CPUID.(EAX=0x80000001):ECX bit 16 indicates AMD FMA4 support.
static INLINE int cpuSupportsFMA4() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 0x80000001, 0);
  return (reg[2] & (1 << 16)) != 0;
}

#if CONFIG == 4 && defined(__AVX__) && defined(__FMA4__)
// Runtime availability of this ISA variant: 3 = available, 0 = not.
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX() && cpuSupportsFMA4();
  return d ? 3 : 0;
}

#define ENABLE_FMA_DP
#define ENABLE_FMA_SP

#define ISANAME "AVX + AMD FMA4"
#define DFTPRIORITY 21
#else
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX();
  return d ? 3 : 0;
}

#define ISANAME "AVX"
#define DFTPRIORITY 20
#endif

#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
// Prefetch into all cache levels.
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }

// Nonzero iff every bit of the 256-bit mask is set; AND the two 128-bit
// halves together and test that the result is all ones.
static INLINE int vtestallones_i_vo32(vopmask g) {
  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}

static INLINE int vtestallones_i_vo64(vopmask g) {
  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
|
||||
|
||||
//

// Broadcast and zero-cost bit-pattern reinterpretation.
static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }

//

// Unaligned load/store of the split vint2 (two 128-bit halves).
static vint2 vloadu_vi2_p(int32_t *p) {
  vint2 r;
  r.x = _mm_loadu_si128((__m128i *) p     );
  r.y = _mm_loadu_si128((__m128i *)(p + 4));
  return r;
}

static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
  _mm_storeu_si128((__m128i *) p     , v.x);
  _mm_storeu_si128((__m128i *)(p + 4), v.y);
}

// Unaligned load/store of a 128-bit vint.
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
//

// AVX1 has no 256-bit integer logic, so all mask/opmask bitwise ops are
// routed through the 256-bit *pd* (double) bitwise instructions via
// reinterpret casts. "andnot" computes (~x) & y, matching _mm256_andnot_pd.
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }

static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }

static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }

static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }

// Narrow a 64-bit-lane opmask to 32-bit lanes: convert each all-ones lane
// (read back as -1.0 after masking) to int32 -1.
static INLINE vopmask vcast_vo32_vo64(vopmask o) {
  return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0))));
}

// Widen a 32-bit-lane opmask to 64-bit lanes via the inverse conversion.
static INLINE vopmask vcast_vo64_vo32(vopmask o) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ));
}

// Broadcast a C truth value to an all-ones / all-zeros mask.
static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); }
|
||||
|
||||
// ---- rounding, truncation and int <-> double/mask lane conversion ----

static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }       // round to nearest, narrow to int32
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }  // truncate toward zero, narrow to int32
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }

// Spread 4 x i32 into the HIGH 32 bits of the four 64-bit mask lanes
// (low 32 bits cleared), built per 128-bit half with shuffles and masks.
static INLINE vmask vcastu_vm_vi(vint vi) {
  __m256i m = _mm256_castsi128_si256(_mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0)));
  return _mm256_insertf128_si256(m, _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0)), 1);
}

// Inverse: gather the HIGH 32 bits of each 64-bit lane into a 4 x i32 vector.
static INLINE vint vcastu_vi_vm(vmask vi) {
  return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(_mm256_castsi256_si128(vi)     , 0x0d), _mm_set_epi32( 0,  0, -1, -1)),
		      _mm_and_si128(_mm_shuffle_epi32(_mm256_extractf128_si256(vi, 1), 0xd0), _mm_set_epi32(-1, -1,  0,  0)));
}

// Build a mask whose every 64-bit lane is the pair (i0 high, i1 low).
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
  return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
}

// 64-bit lane equality without AVX2 integer compares: x == y iff x ^ y == 0,
// so XOR in the bit pattern of 1.0 and compare the result to 1.0 as doubles.
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ));
}

static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); }
|
||||
|
||||
// ---- double-precision arithmetic ----

static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
// Full-precision reciprocal via division (not the approximate rcp instruction).
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
// abs / neg operate on the sign bit only, using the bit pattern of -0.0.
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }

#if CONFIG == 1
// CONFIG == 1: no FMA available — multiply-add is an unfused mul then add/sub.
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
#else
// Fused variants (FMA4-style intrinsics):
//   macc = x*y + z, msub = x*y - z, nmacc = -(x*y) + z, nmsub = -(x*y) - z.
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); }
#endif

// ---- double-precision comparisons ----
// All are ordered-quiet (_OQ) except neq, which is unordered (_UQ): it is
// true when either operand is NaN.
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
// ---- operations on vint (4 x int32 in one 128-bit register) ----

static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }   // 0 - e

static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }

// vopmask is 256-bit; only its low 128 bits are meaningful for vint lanes.
static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }

static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }  // logical left
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }  // logical right
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }  // arithmetic right

static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }

// Same compares but widened into the 256-bit opmask container (high half undefined/zeroed by cast).
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }

// Per-lane select: x where o is set, else y.
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); }
|
||||
|
||||
// Per-lane select for doubles: x where o is set, else y.
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }

// Select between two scalar constants per lane.
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

// Cascaded selects: first matching mask wins, last constant is the default.
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}

// |d| == +inf (either sign of infinity).
static INLINE vopmask visinf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}

static INLINE vopmask vispinf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}

static INLINE vopmask visminf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}

// NaN is the only value unordered with itself (d != d).
static INLINE vopmask visnan_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
}
|
||||
|
||||
// Aligned / unaligned double loads and stores.
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }

// Gather emulation: AVX(1) has no gather instruction, so the indices are
// spilled to a stack array and the lanes assembled with scalar loads.
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
  int a[VECTLENDP];
  vstoreu_v_p_vi(a, vi);
  return _mm256_set_pd(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
// Extracts lane 0 so the debugger can display a scalar value.
static INLINE double vcast_d_vd(vdouble v) {
  double a[VECTLENDP];
  vstoreu_v_p_vd(a, v);
  return a[0];
}
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) {
|
||||
vint2 r;
|
||||
r.x = _mm256_castsi256_si128(vm);
|
||||
r.y = _mm256_extractf128_si256(vm, 1);
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) {
|
||||
vmask m = _mm256_castsi128_si256(vi.x);
|
||||
m = _mm256_insertf128_si256(m, vi.y, 1);
|
||||
return m;
|
||||
}
|
||||
|
||||
// ---- single-precision casts and reinterprets ----

static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }      // round to nearest
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); } // truncate toward zero
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; }
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }

static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }

// ---- single-precision arithmetic ----

static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
// Full-precision reciprocal via division (not the approximate rcp instruction).
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
// abs / neg operate on the sign bit only, using the bit pattern of -0.0f.
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }

#if CONFIG == 1
// CONFIG == 1: no FMA available — multiply-add is an unfused mul then add/sub.
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#else
// Fused variants: macc = x*y + z, msub = x*y - z, nmacc = -(x*y) + z, nmsub = -(x*y) - z.
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); }
#endif

// ---- single-precision comparisons (ordered-quiet; neq is unordered) ----
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
|
||||
vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) };
|
||||
return vi;
|
||||
}
|
||||
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
|
||||
vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) };
|
||||
return vi;
|
||||
}
|
||||
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) {
|
||||
vint2 vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) };
|
||||
return vi;
|
||||
}
|
||||
|
||||
// ---- 8 x int32 logic, shifts, compares and select ----
// Every operation is performed independently on the two 128-bit halves,
// since AVX(1) lacks 256-bit integer instructions.

static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) };
  return vi;
}

static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) };
  return vi;
}

static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) };
  return vi;
}

static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) };
  return vi;
}

// Mask variants: the 256-bit opmask is first split into vint2 halves.
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }

static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {          // logical left shift
  vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) };
  return vi;
}

static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {          // logical right shift
  vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) };
  return vi;
}

static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {          // arithmetic right shift
  vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) };
  return vi;
}

// Compares returning a 256-bit opmask.
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
  vint2 r;
  r.x = _mm_cmpeq_epi32(x.x, y.x);
  r.y = _mm_cmpeq_epi32(x.y, y.y);
  return vcast_vm_vi2(r);
}

static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
  vint2 r;
  r.x = _mm_cmpgt_epi32(x.x, y.x);
  r.y = _mm_cmpgt_epi32(x.y, y.y);
  return vcast_vm_vi2(r);
}

// Same compares but returning the result as a vint2.
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 r;
  r.x = _mm_cmpeq_epi32(x.x, y.x);
  r.y = _mm_cmpeq_epi32(x.y, y.y);
  return r;
}

static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
  vint2 r;
  r.x = _mm_cmpgt_epi32(x.x, y.x);
  r.y = _mm_cmpgt_epi32(x.y, y.y);
  return r;
}

// Per-lane select: x where m is set, else y.
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
  vint2 n = vcast_vi2_vm(m);
  vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) };
  return r;
}

// 64-bit lane addition, again emulated per 128-bit half.
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
  vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz;
  iz.x = _mm_add_epi64(ix.x, iy.x);
  iz.y = _mm_add_epi64(ix.y, iy.y);
  return vcast_vm_vi2(iz);
}
|
||||
|
||||
// Per-lane select for floats: x where o is set, else y.
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }

// Select between two scalar constants per lane.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

// Cascaded selects: first matching mask wins, last constant is the default.
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

// Infinity / NaN classification (NaN is the only value with d != d).
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
// ---- single-precision loads, stores and gather ----

static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }

// Gather emulation: indices are spilled to a stack array and the lanes
// assembled with scalar loads (no gather instruction in AVX(1)).
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
  int a[VECTLENSP];
  vstoreu_v_p_vi2(a, vi2);
  return _mm256_set_ps(ptr[a[7]], ptr[a[6]], ptr[a[5]], ptr[a[4]],
		       ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
}

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
// Extracts lane 0 so the debugger can display a scalar value.
static INLINE float vcast_f_vf(vfloat v) {
  float a[VECTLENSP];
  vstoreu_v_p_vf(a, v);
  return a[0];
}
#endif
|
||||
// ---- alternating-sign helpers ----
// PNMASK/NPMASK hold sign-bit patterns for alternating lanes; XORing with
// them flips the sign of the odd-indexed (PN) or even-indexed (NP) lanes.

#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })

static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }

// addsub: even lanes are subtracted, odd lanes are added.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }

#if CONFIG == 1
// No FMA: multiply then alternating subtract/add.
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
#else
// With FMA: fold the alternating sign into z via vnegpos, then fused mla.
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
#endif
|
||||
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
|
||||
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
|
||||
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
// Bit-exact conversions between the argument quad type (vargquad) and the
// internal vquad representation; both are VECTLENDP * 16 bytes.
static INLINE vquad cast_vq_aq(vargquad aq) {
  vquad vq;
  memcpy(&vq, &aq, VECTLENDP * 16);
  return vq;
}

static INLINE vargquad cast_aq_vq(vquad vq) {
  vargquad aq;
  memcpy(&aq, &vq, VECTLENDP * 16);
  return aq;
}
|
||||
|
||||
// True iff every bit of the 256-bit mask is zero: OR the halves, then check
// the byte-wise sign movemask is empty.
static INLINE int vtestallzeros_i_vo64(vopmask g) {
  return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
}

// Per-lane select on masks, routed through the FP blend.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
  return vreinterpret_vm_vd(_mm256_blendv_pd(vreinterpret_vd_vm(y), vreinterpret_vd_vm(x), vreinterpret_vd_vm(o)));
}

// 64-bit lane subtraction, emulated per 128-bit half.
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
  __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
  __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
  vmask r = _mm256_castsi128_si256(_mm_sub_epi64(xl, yl));
  return _mm256_insertf128_si256(r, _mm_sub_epi64(xh, yh), 1);
}

static INLINE vmask vneg64_vm_vm(vmask x) { return vsub64_vm_vm_vm(vcast_vm_i_i(0, 0), x); }

// Signed 64-bit greater-than, emulated per 128-bit half.
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
  __m128i xh = _mm256_extractf128_si256(x, 1), xl = _mm256_extractf128_si256(x, 0);
  __m128i yh = _mm256_extractf128_si256(y, 1), yl = _mm256_extractf128_si256(y, 0);
  vmask r = _mm256_castsi128_si256(_mm_cmpgt_epi64(xl, yl));
  return _mm256_insertf128_si256(r, _mm_cmpgt_epi64(xh, yh), 1);
}
|
||||
|
||||
// 64-bit lane shifts, emulated per 128-bit half (no 256-bit integer shifts
// in AVX(1)). Macros rather than functions so the shift count stays an
// immediate. The //@# single-line copies below appear to be markers consumed
// by SLEEF's header-generation tooling — verify against the generator.
#define vsll64_vm_vm_i(x, c) \
  _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), \
			  _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
#define vsrl64_vm_vm_i(x, c) \
  _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), \
			  _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)

//@#define vsll64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_slli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
//@#define vsrl64_vm_vm_i(x, c) _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_srli_epi64(_mm256_extractf128_si256(x, 0), c)), _mm_srli_epi64(_mm256_extractf128_si256(x, 1), c), 1)
|
||||
|
||||
// Sign-extend 4 x i32 to 4 x i64: shuffle the 32-bit words into the low half
// of each 64-bit lane, then OR in all-ones high words for negative inputs
// (vgt(0, vi) masks the lanes where vi < 0).
static INLINE vmask vcast_vm_vi(vint vi) {
  vint vi0 = _mm_and_si128(_mm_shuffle_epi32(vi, (1 << 4) | (1 << 6)), _mm_set_epi32(0, -1, 0, -1));
  vint vi1 = _mm_and_si128(_mm_shuffle_epi32(vi, (2 << 0) | (2 << 2) | (3 << 4) | (3 << 6)), _mm_set_epi32(0, -1, 0, -1));
  vmask m = _mm256_insertf128_si256(_mm256_castsi128_si256(vi0), vi1, 1);
  return vor_vm_vm_vm(vcastu_vm_vi(vand_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi), vcast_vi_i(-1))), m);
}
// Narrow 4 x i64 to 4 x i32 by taking the LOW 32 bits of each lane, gathered
// from the two 128-bit halves with float-domain shuffles.
static INLINE vint vcast_vi_vm(vmask vm) {
  return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
		      _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
}

// vint64/vuint64 share the vmask representation; these are identity casts.
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
@@ -0,0 +1,485 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 1
|
||||
|
||||
#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -mavx2.
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error CONFIG macro invalid or not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 2
|
||||
//@#define LOG2VECTLENDP 2
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
typedef __m256i vmask;
|
||||
typedef __m256i vopmask;
|
||||
|
||||
typedef __m256d vdouble;
|
||||
typedef __m128i vint;
|
||||
|
||||
typedef __m256 vfloat;
|
||||
typedef __m256i vint2;
|
||||
|
||||
typedef __m256i vint64;
|
||||
typedef __m256i vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
//
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef __SLEEF_H__
|
||||
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
|
||||
#endif
|
||||
|
||||
// CPUID leaf 7, subleaf 0: reg[1] is EBX; bit 5 is the AVX2 feature flag.
static INLINE int cpuSupportsAVX2() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 7, 0);
  return (reg[1] & (1 << 5)) != 0;
}

// CPUID leaf 1: reg[2] is ECX; bit 12 is the FMA feature flag.
static INLINE int cpuSupportsFMA() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 1, 0);
  return (reg[2] & (1 << 12)) != 0;
}
|
||||
|
||||
#if CONFIG == 1 && defined(__AVX2__)
|
||||
// Runtime availability check for this ISA extension: returns 3 when the CPU
// supports both AVX2 and FMA, 0 otherwise. The `name` parameter is part of
// the common helper interface but unused here.
static INLINE int vavailability_i(int name) {
  (void)name; // unused; availability is uniform for the whole extension
  int d = cpuSupportsAVX2() && cpuSupportsFMA();
  return d ? 3 : 0;
}
|
||||
#define ISANAME "AVX2"
|
||||
#define DFTPRIORITY 25
|
||||
#endif
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
// Prefetch into all cache levels.
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }

// True iff every bit of the 256-bit mask is one: AND the two 128-bit halves,
// then test-all-ones. Identical for 32- and 64-bit lane interpretations.
static INLINE int vtestallones_i_vo32(vopmask g) {
  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}

static INLINE int vtestallones_i_vo64(vopmask g) {
  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
}
|
||||
|
||||
// ---- scalar broadcast and reinterpretation (AVX2 header) ----

static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm); }

// ---- unaligned integer loads/stores ----
// In this header vint2 is a single __m256i, so one 256-bit move suffices.

static vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((__m256i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask o) {
|
||||
return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0));
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask o) {
|
||||
return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return _mm256_set1_epi64x(i ? -1 : 0); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
|
||||
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
|
||||
|
||||
static INLINE vmask vcastu_vm_vi(vint vi) {
|
||||
return _mm256_slli_epi64(_mm256_cvtepi32_epi64(vi), 32);
|
||||
}
|
||||
|
||||
static INLINE vint vcastu_vi_vm(vmask vi) {
|
||||
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)),
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0)));
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int i0, int i1) {
|
||||
return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm256_set1_epi64x(i); }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm256_set1_epi64x((uint64_t)i); }
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpeq_epi64(x, y); }
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_add_epi64(x, y); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
|
||||
|
||||
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
|
||||
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
|
||||
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
|
||||
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
|
||||
static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(m)); }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
|
||||
static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return _mm256_permutevar_pd(_mm256_set_pd(v1, v0, v1, v0), o); }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
__m256i v = _mm256_castpd_si256(vsel_vd_vo_vd_vd(o0, _mm256_castsi256_pd(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)),
|
||||
vsel_vd_vo_vd_vd(o1, _mm256_castsi256_pd(_mm256_set_epi32(3, 2, 3, 2, 3, 2, 3, 2)),
|
||||
vsel_vd_vo_vd_vd(o2, _mm256_castsi256_pd(_mm256_set_epi32(5, 4, 5, 4, 5, 4, 5, 4)),
|
||||
_mm256_castsi256_pd(_mm256_set_epi32(7, 6, 7, 6, 7, 6, 7, 6))))));
|
||||
return _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(_mm256_castpd_si256(_mm256_set_pd(d3, d2, d1, d0)), v));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
|
||||
}
|
||||
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
|
||||
}
|
||||
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
|
||||
}
|
||||
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
|
||||
}
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE double vcast_d_vd(vdouble v) {
|
||||
double s[4];
|
||||
_mm256_storeu_pd(s, v);
|
||||
return s[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm256_i32gather_pd(ptr, vi, 8); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
|
||||
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
|
||||
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
|
||||
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
|
||||
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
|
||||
return _mm256_blendv_epi8(y, x, m);
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
|
||||
|
||||
// At this point, the following three functions are implemented in a generic way,
|
||||
// but I will try target-specific optimization later on.
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE float vcast_f_vf(vfloat v) {
|
||||
float s[8];
|
||||
_mm256_storeu_ps(s, v);
|
||||
return s[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm256_i32gather_ps(ptr, vi2, 4); }
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
|
||||
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
|
||||
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
|
||||
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
|
||||
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, VECTLENDP * 16);
|
||||
return aq;
|
||||
}
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) {
|
||||
return _mm_movemask_epi8(_mm_or_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1))) == 0;
|
||||
}
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm256_blendv_epi8(y, x, o); }
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm256_sub_epi64(x, y); }
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm256_sub_epi64(vcast_vm_i_i(0, 0), x); }
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpgt_epi64(x, y); } // signed compare
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
|
||||
#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
|
||||
//@#define vsll64_vm_vm_i(x, c) _mm256_slli_epi64(x, c)
|
||||
//@#define vsrl64_vm_vm_i(x, c) _mm256_srli_epi64(x, c)
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) { return _mm256_cvtepi32_epi64(vi); } // signed 32-bit => 64-bit
|
||||
static INLINE vint vcast_vi_vm(vmask vm) { // signed 32-bit <= 64-bit
|
||||
return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vm)), _mm_set1_ps(0), 0x08)),
|
||||
_mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vm, 1)), 0x80)));
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
@@ -0,0 +1,463 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 1

// This helper requires AVX2 code generation (except when only generating headers).
#if !defined(__AVX2__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx2.
#endif

#else
#error CONFIG macro invalid or not defined
#endif
|
||||
|
||||
// Feature configuration for this (128-bit) helper: 2 doubles / 4 floats
// per vector, with FMA available for both precisions.  The //@# lines are
// mirrored into the generated public header by the SLEEF_GENHEADER pass.
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 1
//@#define LOG2VECTLENDP 1
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP

#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
// MSVC ships the intrinsics in <intrin.h>; other compilers use <x86intrin.h>.
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
// 128-bit vector type aliases used by this helper.
typedef __m128i vmask;    // generic bit-mask vector
typedef __m128i vopmask;  // comparison-result mask (all-ones / all-zeros lanes)

typedef __m128d vdouble;  // 2 x double
typedef __m128i vint;     // int32 lanes paired with the doubles

typedef __m128 vfloat;    // 4 x float
typedef __m128i vint2;    // 4 x int32, paired with the 4 float lanes

typedef __m128i vint64;   // 2 x int64
typedef __m128i vuint64;  // 2 x uint64

// Pair of masks used to carry quad-precision payloads.
typedef struct {
  vmask x, y;
} vquad;

typedef vquad vargquad;
|
||||
|
||||
//
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef __SLEEF_H__
|
||||
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
|
||||
#endif
|
||||
|
||||
static INLINE int cpuSupportsAVX2() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 7, 0);
|
||||
return (reg[1] & (1 << 5)) != 0;
|
||||
}
|
||||
|
||||
static INLINE int cpuSupportsFMA() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 12)) != 0;
|
||||
}
|
||||
|
||||
#if CONFIG == 1 && defined(__AVX2__)
|
||||
static INLINE int vavailability_i(int name) {
|
||||
int d = cpuSupportsAVX2() && cpuSupportsFMA();
|
||||
return d ? 3 : 0;
|
||||
}
|
||||
#define ISANAME "AVX2"
|
||||
#define DFTPRIORITY 25
|
||||
#endif
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i const *)p); }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return _mm_set1_epi64x(i ? -1 : 0); }
|
||||
|
||||
// Conversions, rounding, and 64-bit integer operations.

// Round/truncate doubles to packed 32-bit ints (results in the low half).
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }

// Round-to-nearest / truncate without leaving the FP domain (SSE4.1 round).
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }

static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }

// Move the two 32-bit ints into the upper halves of the 64-bit lanes
// (lower halves zeroed), and extract them back from the upper halves.
static INLINE vmask vcastu_vm_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
static INLINE vint vcastu_vi_vm(vmask vi) { return _mm_shuffle_epi32(vi, 0x0d); }

// Build a mask whose 64-bit lanes are the pair (i0:i1), i0 in the high dword.
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }

static INLINE vmask vcast_vm_i64(int64_t i) { return _mm_set1_epi64x(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm_set1_epi64x((uint64_t)i); }

// 64-bit lane compare/add (SSE4.1 cmpeq_epi64).
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
|
||||
|
||||
// Double-precision arithmetic, FMA, and comparisons.

static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); } // exact reciprocal via divide
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
// Clear / flip the sign bit via masking with -0.0.
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
// mla = x*y + z, mlapn = x*y - z, mlanp = -(x*y) + z (fused on this config).
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }

// Explicit FMA family: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmsub_pd(x, y, z); }

// Comparisons: ordered-quiet except NEQ, which is unordered so that a NaN
// operand compares as "not equal".
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
// 32-bit integer (vint) operations.

static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }

static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); } // (~x) & y
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }

// Masking a vint with a comparison mask is plain bitwise on this ISA.
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }

// Shifts by an immediate count: logical left/right, arithmetic right.
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }

// Compare results returned either as a vint or as a vopmask (same bits).
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }

static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }

// Lane-wise select: m ? x : y (blendv picks the 2nd source where mask set).
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
|
||||
|
||||
// Lane-wise select for doubles: o ? x : y.
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(o)); }

// Scalar-constant selects built on top of vsel_vd_vo_vd_vd:
// one, two, and three cascaded conditions.
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
}

static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
}

static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
}

// Classification predicates (per lane).
static INLINE vopmask visinf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm_cmp_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}

static INLINE vopmask vispinf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ));
}

static INLINE vopmask visminf_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ));
}

// NaN is the only value that compares unequal to itself.
static INLINE vopmask visnan_vo_vd(vdouble d) {
  return vreinterpret_vm_vd(_mm_cmp_pd(d, d, _CMP_NEQ_UQ));
}

// Aligned / unaligned loads and stores.
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }

static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }

// Gather doubles at ptr[vi[k]] (scale 8 = sizeof(double)).
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm_i32gather_pd(ptr, vi, 8); }

#if defined(_MSC_VER)
// This function is needed when debugging on MSVC.
// Extracts lane 0 through memory so the debugger can display it.
static INLINE double vcast_d_vd(vdouble v) {
  double a[VECTLENDP];
  vstoreu_v_p_vd(a, v);
  return a[0];
}
#endif
|
||||
|
||||
// Single-precision / vint2 casts and reinterprets.
// On this ISA vint2 and vmask are the same 128-bit register type, so the
// casts below are identity functions kept for interface uniformity.

static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }

static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvtps_epi32(vf)); }
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvttps_epi32(vf)); }
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
// Bit-pattern reinterprets (no value conversion).
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }

static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
|
||||
|
||||
// Single-precision arithmetic, FMA, and comparisons
// (structure mirrors the double-precision section above).

static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); } // exact reciprocal via divide
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
// Clear / flip the sign bit via masking with -0.0f.
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
// mla = x*y + z, mlapn = x*y - z, mlanp = -(x*y) + z (fused on this config).
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }

// Explicit FMA family: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmsub_ps(x, y, z); }

// Comparisons: ordered-quiet except NEQ (unordered, true for NaN operands).
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_EQ_OQ)); }
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_NEQ_UQ)); }
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LT_OQ)); }
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LE_OQ)); }
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GT_OQ)); }
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GE_OQ)); }
|
||||
|
||||
// 32-bit integer (vint2) operations — the single-precision counterpart of
// the vint section above.

static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_add_epi32(x, y); }
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_sub_epi32(x, y); }
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }

static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_and_si128(x, y); }
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_andnot_si128(x, y); } // (~x) & y
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_or_si128(x, y); }
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_xor_si128(x, y); }

static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }

// Shifts by an immediate count: logical left/right, arithmetic right.
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm_slli_epi32(x, c); }
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm_srli_epi32(x, c); }
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm_srai_epi32(x, c); }

// Compare results returned either as a vopmask or as a vint2 (same bits).
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }

// Lane-wise select: m ? x : y.
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
  return _mm_blendv_epi8(y, x, m);
}
|
||||
|
||||
// Lane-wise select for floats: o ? x : y.
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(o)); }

// Scalar-constant selects: one, two, and three cascaded conditions.
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
}

static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
}

static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
}

// Classification predicates; NaN detected via self-inequality.
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }

// Aligned / unaligned loads and stores.
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }

static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }

// Gather floats at ptr[vi2[k]] (scale 4 = sizeof(float)).
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm_i32gather_ps(ptr, vi2, 4); }

#ifdef _MSC_VER
// This function is needed when debugging on MSVC.
// Extracts lane 0 through memory so the debugger can display it.
static INLINE float vcast_f_vf(vfloat v) {
  float a[VECTLENSP];
  vstoreu_v_p_vf(a, v);
  return a[0];
}
#endif
|
||||
|
||||
// Sign-alternation, add/sub interleave, lane reversal, and scatter helpers.

// Sign masks that alternate per lane: PN = (+,-), NP = (-,+).
#define PNMASK ((vdouble) { +0.0, -0.0 })
#define NPMASK ((vdouble) { -0.0, +0.0 })
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })

// XOR with the alternating sign masks: posneg negates odd lanes,
// negpos negates even lanes.
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }

// addsub: even lanes subtract, odd lanes add.
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }

// x*y with z alternately subtracted/added, built from mla + negpos.
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }

// Swap the two double lanes; reversing a 1-pair vector is the identity.
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }

// Non-temporal store and 2-element scatter.  With 2-lane vectors only one
// pair is written, so the step argument is unused here.
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }

// Single-precision lane rearrangements: swap within pairs / swap the pairs.
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }

static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }

// Scatter float pairs: each 64-bit half of the vector (a pair of floats) is
// stored as one double-sized unit at ptr[(offset + step*k)*2].
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}

// Streaming variant; same addressing as vscatter2_v_p_i_i_vf (the stores
// themselves are not non-temporal here).
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
}
|
||||
|
||||
// Quad-precision carrier type (vquad = two vmask halves) helpers.

// Unaligned load of a vquad: the two vmask halves are read back to back.
static vquad loadu_vq_p(void *p) {
  vquad vq = {
    vloadu_vi2_p((int32_t *)p),
    vloadu_vi2_p((int32_t *)((uint8_t *)p + sizeof(vmask)))
  };
  return vq;
}

// Type-pun between vargquad (argument-passing layout) and vquad via memcpy;
// both are VECTLENDP * 16 bytes.
static INLINE vquad cast_vq_aq(vargquad aq) {
  vquad vq;
  memcpy(&vq, &aq, VECTLENDP * 16);
  return vq;
}

static INLINE vargquad cast_aq_vq(vquad vq) {
  vargquad aq;
  memcpy(&aq, &vq, VECTLENDP * 16);
  return aq;
}

// Unaligned store of a vquad, mirroring loadu_vq_p.
static void vstoreu_v_p_vq(void *p, vquad vq) {
  vstoreu_v_p_vi2((int32_t *)p, vcast_vi2_vm(vq.x));
  vstoreu_v_p_vi2((int32_t *)((uint8_t *)p + sizeof(vmask)), vcast_vi2_vm(vq.y));
}
|
||||
|
||||
// 64-bit mask arithmetic and vint <-> vmask conversions.

// True (1) iff every byte of the mask is zero.
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }

// Lane-wise select on raw masks: o ? x : y.
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return _mm_blendv_epi8(y, x, o); }

static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpgt_epi64(x, y); } // signed compare

// 64-bit shifts as macros because the count must be an immediate.
// (//@ lines appear to be markers for the SLEEF header generator.)
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)

// Sign-extend the two 32-bit ints into 64-bit lanes: m holds the
// zero-extended values, then the high halves are OR-ed with the sign mask
// computed from (0 > vi).
static INLINE vmask vcast_vm_vi(vint vi) {
  vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
  return vor_vm_vm_vm(vcastu_vm_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
}
// Truncate 64-bit lanes back to 32-bit ints (0x08 selects the low dwords).
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }

// vint64/vuint64 share the vmask register type; reinterprets are identity.
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
@@ -0,0 +1,600 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Configuration for the AVX512F helper.
// CONFIG == 1 enables hardware FMA; CONFIG == 2 is the no-FMA variant.
#if CONFIG == 1 || CONFIG == 2

#if !defined(__AVX512F__) && !defined(SLEEF_GENHEADER)
#error Please specify -mavx512f.
#endif

#else
#error CONFIG macro invalid or not defined
#endif

// NOTE(review): the //@ copies of each #define appear to be markers consumed
// by the SLEEF header generator (SLEEF_GENHEADER builds); keep them in sync
// with the plain #define directly above each one.

// Double precision: 2^3 = 8 lanes per 512-bit vector.
#define ENABLE_DP
//@#define ENABLE_DP
#define LOG2VECTLENDP 3
//@#define LOG2VECTLENDP 3
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)

// Single precision: twice as many lanes (16).
#define ENABLE_SP
//@#define ENABLE_SP
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#if CONFIG == 1
#define ENABLE_FMA_DP
//@#define ENABLE_FMA_DP
#define ENABLE_FMA_SP
//@#define ENABLE_FMA_SP
#endif

#define FULL_FP_ROUNDING
//@#define FULL_FP_ROUNDING
#define ACCURATE_SQRT
//@#define ACCURATE_SQRT
|
||||
|
||||
// Intrinsics headers are skipped when generating the public header.
#if !defined(SLEEF_GENHEADER)
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <x86intrin.h>
#endif

#include <stdint.h>
#include "misc.h"
#endif // #if !defined(SLEEF_GENHEADER)

// Vector type aliases for the AVX512F target.
typedef __m512i vmask;     // 512-bit raw bit mask
typedef __mmask16 vopmask; // k-register comparison mask (1 bit per lane)

typedef __m512d vdouble;   // 8 doubles
typedef __m256i vint;      // 8 x int32 (matches the 8 double lanes)

typedef __m512 vfloat;     // 16 floats
typedef __m512i vint2;     // 16 x int32 (matches the 16 float lanes)

typedef __m512i vint64;    // 8 x int64
typedef __m512i vuint64;   // 8 x uint64

// Quad-precision carrier: two 512-bit halves.
typedef struct {
  vmask x, y;
} vquad;

// Argument-passing alias for vquad.
typedef vquad vargquad;
|
||||
|
||||
// Runtime ISA detection (omitted from generated headers).

#if !defined(SLEEF_GENHEADER)

#ifndef __SLEEF_H__
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
#endif

// CPUID leaf 7, EBX bit 16 is the AVX-512 Foundation feature flag.
static INLINE int cpuSupportsAVX512F() {
  int32_t reg[4];
  Sleef_x86CpuID(reg, 7, 0);
  return (reg[1] & (1 << 16)) != 0;
}

// vavailability_i returns 3 when the ISA is usable, 0 otherwise.
#if CONFIG == 1 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX512F();
  return d ? 3 : 0;
}
#define ISANAME "AVX512F"
#define DFTPRIORITY 30
#endif

#if CONFIG == 2 && defined(__AVX512F__)
static INLINE int vavailability_i(int name) {
  int d = cpuSupportsAVX512F();
  return d ? 3 : 0;
}
#define ISANAME "AVX512FNOFMA"
#define DFTPRIORITY 0   // no-FMA variant has lowest dispatch priority
#endif

#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
// Prefetch into all cache levels.
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }

// Test whether every lane bit of a k-mask is set: 8 bits for 64-bit lanes,
// 16 bits for 32-bit lanes.  ICC needs _mm512_mask2int to read the mask.
#ifdef __INTEL_COMPILER
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2int(g) == 0xffff; }
#else
static INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }
static INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }
#endif
|
||||
|
||||
// Unaligned loads/stores for the 512-bit and 256-bit integer vectors.

static vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m512i const *)p); }
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm512_storeu_si512((__m512i *)p, v); }
static vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm256_storeu_si256((__m256i *)p, v); }
|
||||
|
||||
// Logical operations.

// Raw 512-bit bit masks; vandnot computes (~x) & y.
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_si512(x, y); }
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_andnot_si512(x, y); }
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si512(x, y); }
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_si512(x, y); }

// k-register (vopmask) logic; kandn computes (~x) & y.
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kand(x, y); }
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kandn(x, y); }
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kor(x, y); }
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kxor(x, y); }

// Masked combines over 64-bit lanes, expressed with masked and/or:
//   vand    -> o ? m  : 0
//   vandnot -> o ? 0  : m
//   vor     -> o ? ~0 : m
static INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_or_epi64(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }

// Same combines over 32-bit lanes.
static INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); }
static INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
static INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_or_epi32(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }

// k-masks carry one bit per lane regardless of lane width, so these casts
// are identity functions.
static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }

// All-ones mask when i is nonzero, all-zeros otherwise.
static INLINE vopmask vcast_vo_i(int i) { return i ? -1 : 0; }
|
||||
|
||||
// Conversions and rounding.

// Round/truncate 8 doubles to 8 x int32 with explicit rounding control.
static INLINE vint vrint_vi_vd(vdouble vd) {
  return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}

static INLINE vint vtruncate_vi_vd(vdouble vd) {
  return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}

static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }
static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }

// Rounding without leaving the FP domain (roundscale).
static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
  return _mm512_roundscale_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
}

static INLINE vdouble vrint_vd_vd(vdouble vd) {
  return _mm512_roundscale_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
}

// Spread the 8 int32 values into the upper dwords of the 64-bit lanes
// (mask 0xaaaa keeps only odd dword positions; lower dwords are zeroed).
static INLINE vmask vcastu_vm_vi(vint vi) {
  return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi));
}

// Inverse: gather the upper dwords of the 64-bit lanes into a 256-bit vint.
static INLINE vint vcastu_vi_vm(vmask vi) {
  return _mm512_castsi512_si256(_mm512_maskz_permutexvar_epi32(0x00ff, _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1), vi));
}

// Fill every 64-bit lane with the pair (i0:i1), i0 in the high dword.
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1); }

static INLINE vmask vcast_vm_i64(int64_t i) { return _mm512_set1_epi64(i); }
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm512_set1_epi64((uint64_t)i); }

static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); }
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_add_epi64(x, y); }
|
||||
|
||||
// Double-precision arithmetic, FMA, and comparisons.

static INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm512_castpd_si512(vd); }
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm512_castsi512_pd(vm); }

static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm512_add_pd(x, y); }
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm512_sub_pd(x, y); }
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm512_mul_pd(x, y); }
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm512_div_pd(x, y); }
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm512_set1_pd(1), x); } // exact reciprocal via divide
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }
// Clear / flip the sign bit via masking with -0.0.
static INLINE vdouble vabs_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_andnot_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(_mm512_xor_si512(vreinterpret_vm_vd(_mm512_set1_pd(-0.0)), vreinterpret_vm_vd(d))); }
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm512_max_pd(x, y); }
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm512_min_pd(x, y); }

// mla family: fused when CONFIG == 1, separate mul/add otherwise.
// NOTE(review): the non-FMA branch defines no vmlanp_vd_vd_vd_vd here --
// presumably provided elsewhere for CONFIG != 1; confirm against upstream.
#if CONFIG == 1
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
#else
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
#endif

// Explicit FMA family: pp = x*y+z, pn = x*y-z, np = -(x*y)+z, nn = -(x*y)-z.
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmsub_pd(x, y, z); }

// Comparisons produce k-masks; ordered-quiet except NEQ (true for NaN).
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); }
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_NEQ_UQ); }
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LT_OQ); }
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ); }
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); }
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); }
|
||||
|
||||
// 32-bit integer (vint, 256-bit) operations.

static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi32(x, y); }
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi32(x, y); }
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }

static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si256(x, y); }
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andnot_si256(x, y); } // (~x) & y

// k-mask / vint combines: the 256-bit vint is widened to 512 bits so the
// masked AVX-512 intrinsics can be used, then narrowed back.
//   vandnot -> o ? 0 : y,  vand -> o ? y : 0
static INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) {
  return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_castsi256_si512(y), o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)));
}
static INLINE vint vand_vi_vo_vi(vopmask o, vint y) {
  return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_set1_epi32(0), o, _mm512_castsi256_si512(y), _mm512_castsi256_si512(y)));
}

static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256(x, y); }
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si256(x, y); }
// Shifts as macros because the count must be an immediate.
// (//@ lines appear to be markers for the SLEEF header generator.)
#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)
//@#define vsll_vi_vi_i(x, c) _mm256_slli_epi32(x, c)
//@#define vsrl_vi_vi_i(x, c) _mm256_srli_epi32(x, c)
//@#define vsra_vi_vi_i(x, c) _mm256_srai_epi32(x, c)

// Compare results as vint lane masks.
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_epi32(x, y); }
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_epi32(x, y); }

// Compare results as k-masks; x > y is expressed as y < x.
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
  return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(x), _mm512_castsi256_si512(y), _MM_CMPINT_EQ);
}
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {
  return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(y), _mm512_castsi256_si512(x), _MM_CMPINT_LT);
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
|
||||
return _mm512_mask_blend_pd(mask, y, x);
|
||||
}
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
#if 1
|
||||
// Probably this is faster
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
__m512i v = _mm512_castpd_si512(vsel_vd_vo_vd_vd(o0, _mm512_castsi512_pd(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0)),
|
||||
vsel_vd_vo_vd_vd(o1, _mm512_castsi512_pd(_mm512_set_epi64(1, 1, 1, 1, 1, 1, 1, 1)),
|
||||
vsel_vd_vo_vd_vd(o2, _mm512_castsi512_pd(_mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2)),
|
||||
_mm512_castsi512_pd(_mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3))))));
|
||||
return _mm512_permutexvar_pd(v, _mm512_castpd256_pd512(_mm256_set_pd(d3, d2, d1, d0)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
|
||||
}
|
||||
#else
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) {
|
||||
return _mm512_cmp_pd_mask(vabs_vd_vd(d), _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);
|
||||
}
|
||||
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) {
|
||||
return _mm512_cmp_pd_mask(d, _mm512_set1_pd(SLEEF_INFINITY), _CMP_EQ_OQ);
|
||||
}
|
||||
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) {
|
||||
return _mm512_cmp_pd_mask(d, _mm512_set1_pd(-SLEEF_INFINITY), _CMP_EQ_OQ);
|
||||
}
|
||||
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) {
|
||||
return _mm512_cmp_pd_mask(d, d, _CMP_NEQ_UQ);
|
||||
}
|
||||
|
||||
static INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
|
||||
|
||||
// vilogb2k_vi_vd is similar to vilogbk_vi_vd, but the argument has to
|
||||
// be a normalized FP value.
|
||||
static INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
|
||||
|
||||
static INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd(d); }
|
||||
static INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d); }
|
||||
|
||||
static INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
|
||||
static INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
|
||||
|
||||
#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
|
||||
#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
|
||||
//@#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) _mm512_fixupimm_pd((a), (b), (c), (imm))
|
||||
//@#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) _mm512_fixupimm_ps((a), (b), (c), (imm))
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE double vcast_d_vd(vdouble v) {
|
||||
double s[VECTLENDP];
|
||||
_mm512_storeu_pd(s, v);
|
||||
return s[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load_pd(ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return _mm512_i32gather_pd(vi, ptr, 8); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
|
||||
return _mm512_castsi512_si256(_mm512_mask_blend_epi32(m, _mm512_castsi256_si512(y), _mm512_castsi256_si512(x)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm512_castps_si512(vf); }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm512_castsi512_ps(vm); }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return _mm512_castsi512_ps(vi); }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm512_castps_si512(vf); }
|
||||
|
||||
static INLINE vdouble vreinterpret_vd_vf(vfloat vf) { return _mm512_castps_pd(vf); }
|
||||
static INLINE vfloat vreinterpret_vf_vd(vdouble vd) { return _mm512_castpd_ps(vd); }
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
|
||||
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps(vcast_vm_vi2(vi)); }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvtps_epi32(vf)); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm512_cvttps_epi32(vf)); }
|
||||
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) {
|
||||
return _mm512_roundscale_ps(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
|
||||
}
|
||||
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) {
|
||||
return _mm512_roundscale_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
|
||||
}
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_add_ps(x, y); }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_sub_ps(x, y); }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_mul_ps(x, y); }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_div_ps(x, y); }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_max_ps(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_min_ps(x, y); }
|
||||
|
||||
#if CONFIG == 1
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
|
||||
#else
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmsub_ps(x, y, z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi32(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_sub_epi32(x, y); }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_and_si512(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_andnot_si512(x, y); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or_si512(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_xor_si512(x, y); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) {
|
||||
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m);
|
||||
}
|
||||
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) {
|
||||
return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0));
|
||||
}
|
||||
|
||||
#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)
|
||||
#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)
|
||||
#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)
|
||||
//@#define vsll_vi2_vi2_i(x, c) _mm512_slli_epi32(x, c)
|
||||
//@#define vsrl_vi2_vi2_i(x, c) _mm512_srli_epi32(x, c)
|
||||
//@#define vsra_vi2_vi2_i(x, c) _mm512_srai_epi32(x, c)
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi32_mask(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpgt_epi32_mask(x, y); }
|
||||
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
|
||||
__mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
|
||||
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
|
||||
}
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
|
||||
__mmask16 m = _mm512_cmp_epi32_mask(y, x, _MM_CMPINT_LT);
|
||||
return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
|
||||
}
|
||||
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
|
||||
return _mm512_mask_blend_epi32(m, y, x);
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) {
|
||||
return _mm512_mask_blend_ps(m, y, x);
|
||||
}
|
||||
|
||||
// At this point, the following three functions are implemented in a generic way,
|
||||
// but I will try target-specific optimization later on.
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
|
||||
static INLINE vint2 vilogb2k_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE float vcast_f_vf(vfloat v) {
|
||||
float s[VECTLENSP];
|
||||
_mm512_storeu_ps(s, v);
|
||||
return s[0];
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_ps(ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) { return _mm512_i32gather_ps(vi2, ptr, 4); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) {
|
||||
return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0xcccc, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));
|
||||
}
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) {
|
||||
return vreinterpret_vd_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vd(d), 0x3333, vreinterpret_vm_vd(d), vreinterpret_vm_vd(_mm512_set1_pd(-0.0))));
|
||||
}
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) {
|
||||
return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0xaaaa, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));
|
||||
}
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) {
|
||||
return vreinterpret_vf_vm(_mm512_mask_xor_epi32(vreinterpret_vm_vf(d), 0x5555, vreinterpret_vm_vf(d), vreinterpret_vm_vf(_mm512_set1_ps(-0.0f))));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
|
||||
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmaddsub_pd(x, y, z); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmaddsub_ps(x, y, z); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return _mm512_permute_pd(vd, 0x55); }
|
||||
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) {
|
||||
return vreinterpret_vd_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), vreinterpret_vm_vd(vd)));
|
||||
}
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_stream_pd(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_store_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));
|
||||
_mm_store_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));
|
||||
_mm_store_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));
|
||||
_mm_store_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
_mm_stream_pd(&ptr[(offset + step * 0)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 0)));
|
||||
_mm_stream_pd(&ptr[(offset + step * 1)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 1)));
|
||||
_mm_stream_pd(&ptr[(offset + step * 2)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 2)));
|
||||
_mm_stream_pd(&ptr[(offset + step * 3)*2], _mm_castps_pd(_mm512_extractf32x4_ps(vreinterpret_vf_vd(v), 3)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vf) { return _mm512_permute_ps(vf, 0xb1); }
|
||||
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vf) {
|
||||
return vreinterpret_vf_vm(_mm512_permutexvar_epi32(_mm512_set_epi32(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), vreinterpret_vm_vf(vf)));
|
||||
}
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_ps(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 0)));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 1)));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 4)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 5)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 2)));
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 6)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 7)*2), _mm_castps_pd(_mm512_extractf32x4_ps(v, 3)));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, VECTLENDP * 16);
|
||||
return aq;
|
||||
}
|
||||
|
||||
#ifdef __INTEL_COMPILER
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0; }
|
||||
#else
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) { return g == 0; }
|
||||
#endif
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask m, vmask x, vmask y) { return _mm512_mask_blend_epi64(m, y, x); }
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm512_sub_epi64(x, y); }
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm512_sub_epi64(vcast_vm_i_i(0, 0), x); }
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(y, x, _MM_CMPINT_LT); } // signed compare
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
|
||||
#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
|
||||
//@#define vsll64_vm_vm_i(x, c) _mm512_slli_epi64(x, c)
|
||||
//@#define vsrl64_vm_vm_i(x, c) _mm512_srli_epi64(x, c)
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
return _mm512_cvtepi32_epi64(vi);
|
||||
}
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return _mm512_cvtepi64_epi32(vm);
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
@@ -0,0 +1,297 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if !defined(__ARM_NEON) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -mfpu=neon.
|
||||
#endif
|
||||
|
||||
#ifdef __aarch64__
|
||||
#warning This implementation is for AARCH32.
|
||||
#endif
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP 2
|
||||
//@#define LOG2VECTLENSP 2
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#if CONFIG == 4
|
||||
#define ISANAME "AARCH32 NEON-VFPV4"
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
#else
|
||||
#define ISANAME "AARCH32 NEON"
|
||||
#endif
|
||||
#define DFTPRIORITY 10
|
||||
|
||||
#define ENABLE_RECSQRT_SP
|
||||
//@#define ENABLE_RECSQRT_SP
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "misc.h"
|
||||
|
||||
typedef uint32x4_t vmask;
|
||||
typedef uint32x4_t vopmask;
|
||||
|
||||
//typedef int32x4_t vint;
|
||||
|
||||
typedef float32x4_t vfloat;
|
||||
typedef int32x4_t vint2;
|
||||
|
||||
//
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) {
|
||||
uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
|
||||
uint32x2_t x1 = vpmin_u32(x0, x0);
|
||||
return vget_lane_u32(x1, 0);
|
||||
}
|
||||
|
||||
static vfloat vloaduf(float *p) { return vld1q_f32(p); }
|
||||
static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); }
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
|
||||
uint32x4_t t = vceqq_u32(x, y);
|
||||
return vandq_u32(t, vrev64q_u32(t));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat d) {
|
||||
return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f))));
|
||||
}
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
|
||||
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
|
||||
|
||||
static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
|
||||
#if CONFIG == 4
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return vfmaq_f32(z, x, y); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vfmsq_f32(z, x, y); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z)); }
|
||||
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) {
|
||||
float32x4_t t = vrecpeq_f32(y), u;
|
||||
t = vmulq_f32(t, vrecpsq_f32(y, t));
|
||||
t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
|
||||
u = vmulq_f32(x, t);
|
||||
return vfmaq_f32(u, vfmsq_f32(x, y, u), t);
|
||||
}
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrsqrteq_f32(d);
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
float32x4_t u = vmulq_f32(x, d);
|
||||
u = vfmaq_f32(u, vfmsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
|
||||
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));
|
||||
}
|
||||
|
||||
static INLINE vfloat vrec_vf_vf(vfloat y) {
|
||||
float32x4_t t = vrecpeq_f32(y);
|
||||
t = vmulq_f32(t, vrecpsq_f32(y, t));
|
||||
t = vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
|
||||
return vfmaq_f32(t, vfmsq_f32(vdupq_n_f32(1.0f), y, t), t);
|
||||
}
|
||||
|
||||
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrsqrteq_f32(d);
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
return vfmaq_f32(x, vfmsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
|
||||
}
|
||||
#else // #if CONFIG == 4
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vneg_vf_vf(vmlsq_f32(z, x, y)); }
|
||||
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
|
||||
float32x4_t x = vrecpeq_f32(d);
|
||||
x = vmulq_f32(x, vrecpsq_f32(d, x));
|
||||
float32x4_t t = vmulq_f32(n, x);
|
||||
return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d);
|
||||
}
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrsqrteq_f32(d);
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
float32x4_t u = vmulq_f32(x, d);
|
||||
u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
|
||||
return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(u), vceqq_f32(d, vdupq_n_f32(0.0f))));
|
||||
}
|
||||
|
||||
static INLINE vfloat vrec_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrecpeq_f32(d);
|
||||
x = vmulq_f32(x, vrecpsq_f32(d, x));
|
||||
return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d);
|
||||
}
|
||||
|
||||
static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
|
||||
float32x4_t x = vrsqrteq_f32(d);
|
||||
x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
|
||||
return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
|
||||
}
|
||||
#endif // #if CONFIG == 4
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); }
|
||||
|
||||
#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
|
||||
#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
|
||||
#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
|
||||
//@#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
|
||||
//@#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
|
||||
//@#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgtq_s32(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgtq_s32(x, y); }
|
||||
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
|
||||
return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y);
|
||||
}
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE float vcast_f_vf(vfloat v) {
|
||||
float p[4];
|
||||
vst1q_f32 (p, v);
|
||||
return p[0];
|
||||
}
|
||||
|
||||
static INLINE int vavailability_i(int name) {
|
||||
if (name != 2) return 0;
|
||||
return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0;
|
||||
}
|
||||
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
return ((vfloat) {
|
||||
ptr[vgetq_lane_s32(vi2, 0)],
|
||||
ptr[vgetq_lane_s32(vi2, 1)],
|
||||
ptr[vgetq_lane_s32(vi2, 2)],
|
||||
ptr[vgetq_lane_s32(vi2, 3)]
|
||||
});
|
||||
}
|
||||
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
|
||||
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
|
||||
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
|
||||
vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
|
||||
}
|
||||
@@ -0,0 +1,873 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 1 || CONFIG == 2 || CONFIG == 3 || CONFIG == 4
|
||||
|
||||
#ifndef __VSX__
|
||||
#error Please specify -mcpu=power8 or -mcpu=power9
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error CONFIG macro invalid or not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 1
|
||||
//@#define LOG2VECTLENDP 1
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#if CONFIG == 1 || CONFIG == 3
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
#endif
|
||||
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <altivec.h>
|
||||
// undef altivec types since CPP and C99 use them as compiler tokens
|
||||
// use __vector and __bool instead
|
||||
#undef vector
|
||||
#undef bool
|
||||
|
||||
#include <stdint.h>
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#if CONFIG == 1 || CONFIG == 2
|
||||
#define ISANAME "VSX"
|
||||
#else
|
||||
#define ISANAME "VSX-3"
|
||||
#endif
|
||||
|
||||
#define DFTPRIORITY 25
|
||||
|
||||
static INLINE int vavailability_i(int name) { return 3; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
/**********************************************
|
||||
** Types
|
||||
***********************************************/
|
||||
typedef __vector unsigned int vmask;
|
||||
// using __bool with typedef may cause ambiguous errors
|
||||
#define vopmask __vector __bool int
|
||||
//@#define vopmask __vector __bool int
|
||||
typedef __vector signed int vint;
|
||||
typedef __vector signed int vint2;
|
||||
typedef __vector float vfloat;
|
||||
typedef __vector double vdouble;
|
||||
|
||||
// internal use types
|
||||
typedef __vector unsigned int v__u32;
|
||||
typedef __vector unsigned char v__u8;
|
||||
typedef __vector signed long long v__i64;
|
||||
typedef __vector unsigned long long v__u64;
|
||||
#define v__b64 __vector __bool long long
|
||||
|
||||
typedef __vector long long vint64;
|
||||
typedef __vector unsigned long long vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
/**********************************************
|
||||
** Utilities
|
||||
***********************************************/
|
||||
#define vset__vi(v0, v1) ((vint) {v0, v1, v0, v1})
|
||||
#define vset__vi2(...) ((vint2) {__VA_ARGS__})
|
||||
#define vset__vm(...) ((vmask) {__VA_ARGS__})
|
||||
#define vset__vo(...) ((vopmask) {__VA_ARGS__})
|
||||
#define vset__vf(...) ((vfloat) {__VA_ARGS__})
|
||||
#define vset__vd(...) ((vdouble) {__VA_ARGS__})
|
||||
#define vset__u8(...) ((v__u8) {__VA_ARGS__})
|
||||
#define vset__u32(...) ((v__u32) {__VA_ARGS__})
|
||||
#define vset__s64(...) ((v__i64) {__VA_ARGS__})
|
||||
#define vset__u64(...) ((v__u64) {__VA_ARGS__})
|
||||
|
||||
#define vsetall__vi(v) vset__vi(v, v)
|
||||
#define vsetall__vi2(v) vset__vi2(v, v, v, v)
|
||||
#define vsetall__vm(v) vset__vm(v, v, v, v)
|
||||
#define vsetall__vo(v) vset__vo(v, v, v, v)
|
||||
#define vsetall__vf(v) vset__vf(v, v, v, v)
|
||||
#define vsetall__vd(v) vset__vd(v, v)
|
||||
#define vsetall__u8(v) vset__u8(v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v)
|
||||
#define vsetall__u32(v) vset__u32(v, v, v, v)
|
||||
#define vsetall__s64(v) vset__s64(v, v)
|
||||
#define vsetall__u64(v) vset__u64(v, v)
|
||||
|
||||
#define vzero__vi() vsetall__vi(0)
|
||||
#define vzero__vi2() vsetall__vi2(0)
|
||||
#define vzero__vm() vsetall__vm(0)
|
||||
#define vzero__vo() vsetall__vo(0)
|
||||
#define vzero__vf() vsetall__vf(0)
|
||||
#define vzero__vd() vsetall__vd(0)
|
||||
#define vzero__u8() vsetall__u8(0)
|
||||
#define vzero__u32() vsetall__u32(0)
|
||||
#define vzero__s64() vsetall__s64(0)
|
||||
#define vzero__u64() vsetall__u64(0)
|
||||
|
||||
//// Swap doubleword elements
|
||||
#if defined(__clang__) || __GNUC__ >= 7
|
||||
static INLINE v__u64 v__swapd_u64(v__u64 v)
|
||||
{ return vec_xxpermdi(v, v, 2); }
|
||||
#else
|
||||
static INLINE v__u64 v__swapd_u64(v__u64 v)
|
||||
{
|
||||
__asm__ __volatile__("xxswapd %x0,%x1" : "=wa" (v) : "wa" (v));
|
||||
return v;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**********************************************
|
||||
** Memory
|
||||
***********************************************/
|
||||
|
||||
////////////// Unaligned memory access //////////////
|
||||
/**
|
||||
* It's not safe to use vector assignment via (cast & dereference) for unaligned memory access
|
||||
* with almost all clang versions and gcc8 when VSX3 isn't enabled,
|
||||
* these compilers tends to generate instructions 'lvx/stvx' instead of 'lxvd2x/lxvw4x/stxvd2x/stxvw4x'
|
||||
* for more information check https://github.com/seiko2plus/vsx_mem_test
|
||||
*
|
||||
* TODO: check GCC(9, 10)
|
||||
*/
|
||||
//// load
|
||||
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
|
||||
static vint vloadu_vi_p(const int32_t *ptr)
|
||||
{ return *((vint*)ptr); }
|
||||
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr)
|
||||
{ return *((vint2*)ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr)
|
||||
{ return *((vfloat*)ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr)
|
||||
{ return *((vdouble*)ptr); }
|
||||
#else
|
||||
static vint vloadu_vi_p(const int32_t *ptr)
|
||||
{ return vec_vsx_ld(0, ptr); }
|
||||
static INLINE vint2 vloadu_vi2_p(const int32_t *ptr)
|
||||
{ return vec_vsx_ld(0, ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr)
|
||||
{ return vec_vsx_ld(0, ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr)
|
||||
{ return vec_vsx_ld(0, ptr); }
|
||||
#endif
|
||||
|
||||
//// store
|
||||
#if defined(__POWER9_VECTOR__) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8)
|
||||
static void vstoreu_v_p_vi(int32_t *ptr, vint v)
|
||||
{ *((vint*)ptr) = v; }
|
||||
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)
|
||||
{ *((vint2*)ptr) = v; }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)
|
||||
{ *((vfloat*)ptr) = v; }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)
|
||||
{ *((vdouble*)ptr) = v; }
|
||||
#else
|
||||
static void vstoreu_v_p_vi(int32_t *ptr, vint v)
|
||||
{ vec_vsx_st(v, 0, ptr); }
|
||||
static void vstoreu_v_p_vi2(int32_t *ptr, vint2 v)
|
||||
{ vec_vsx_st(v, 0, ptr); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v)
|
||||
{ vec_vsx_st(v, 0, ptr); }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v)
|
||||
{ vec_vsx_st(v, 0, ptr); }
|
||||
#endif
|
||||
|
||||
////////////// aligned memory access //////////////
|
||||
//// load
|
||||
static INLINE vfloat vload_vf_p(const float *ptr)
|
||||
{ return vec_ld(0, ptr); }
|
||||
static INLINE vdouble vload_vd_p(const double *ptr)
|
||||
{ return *((vdouble*)ptr); }
|
||||
|
||||
//// store
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v)
|
||||
{ vec_st(v, 0, ptr); }
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v)
|
||||
{ *((vdouble*)ptr) = v; }
|
||||
|
||||
////////////// non-temporal memory access //////////////
|
||||
//// store
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v)
|
||||
{ vstore_v_p_vf(ptr, v); }
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v)
|
||||
{ vstore_v_p_vd(ptr, v); }
|
||||
|
||||
////////////// LUT //////////////
|
||||
//// load
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi)
|
||||
{ return vset__vd(ptr[vec_extract(vi, 0)], ptr[vec_extract(vi, 1)]); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2)
|
||||
{
|
||||
return vset__vf(
|
||||
ptr[vec_extract(vi2, 0)], ptr[vec_extract(vi2, 1)],
|
||||
ptr[vec_extract(vi2, 2)], ptr[vec_extract(vi2, 3)]
|
||||
);
|
||||
}
|
||||
|
||||
//// store
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
|
||||
{
|
||||
const v__u64 vll = (v__u64)v;
|
||||
float *ptr_low = ptr + offset*2;
|
||||
float *ptr_high = ptr + (offset + step)*2;
|
||||
*((uint64_t*)ptr_low) = vec_extract(vll, 0);
|
||||
*((uint64_t*)ptr_high) = vec_extract(vll, 1);
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v)
|
||||
{ vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
|
||||
{ vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v)
|
||||
{ vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
|
||||
|
||||
/**********************************************
|
||||
** Misc
|
||||
**********************************************/
|
||||
|
||||
// vector with a specific value set to all lanes (Vector Splat)
|
||||
static INLINE vint vcast_vi_i(int i)
|
||||
{ return vsetall__vi(i); }
|
||||
static INLINE vint2 vcast_vi2_i(int i)
|
||||
{ return vsetall__vi2(i); }
|
||||
static INLINE vfloat vcast_vf_f(float f)
|
||||
{ return vsetall__vf(f); }
|
||||
static INLINE vdouble vcast_vd_d(double d)
|
||||
{ return vsetall__vd(d); }
|
||||
// cast
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm)
|
||||
{ return (vint2)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi)
|
||||
{ return (vmask)vi; }
|
||||
// get the first element
|
||||
static INLINE float vcast_f_vf(vfloat v)
|
||||
{ return vec_extract(v, 0); }
|
||||
static INLINE double vcast_d_vd(vdouble v)
|
||||
{ return vec_extract(v, 0); }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd)
|
||||
{ return (vmask)vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm)
|
||||
{ return (vdouble)vm; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf)
|
||||
{ return (vmask)vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm)
|
||||
{ return (vfloat)vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi)
|
||||
{ return (vfloat)vi; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf)
|
||||
{ return (vint2)vf; }
|
||||
|
||||
// per element select via mask (blend)
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y)
|
||||
{ return vec_sel(y, x, (v__b64)o); }
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y)
|
||||
{ return vec_sel(y, x, o); }
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y)
|
||||
{ return vec_sel(y, x, o); }
|
||||
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y)
|
||||
{ return vec_sel(y, x, o); }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0)
|
||||
{
|
||||
return vsel_vf_vo_vf_vf(o, vsetall__vf(v1), vsetall__vf(v0));
|
||||
}
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2)
|
||||
{
|
||||
return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3)
|
||||
{
|
||||
return vsel_vf_vo_vf_vf(o0, vsetall__vf(d0), vsel_vf_vo_vf_vf(o1, vsetall__vf(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0)
|
||||
{
|
||||
return vsel_vd_vo_vd_vd(o, vsetall__vd(v1), vsetall__vd(v0));
|
||||
}
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2)
|
||||
{
|
||||
return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3)
|
||||
{
|
||||
return vsel_vd_vo_vd_vd(o0, vsetall__vd(d0), vsel_vd_vo_vd_vd(o1, vsetall__vd(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g)
|
||||
{ return vec_all_ne((vint2)g, vzero__vi2()); }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g)
|
||||
{ return vec_all_ne((v__i64)g, vzero__s64()); }
|
||||
|
||||
/**********************************************
|
||||
** Conversions
|
||||
**********************************************/
|
||||
|
||||
////////////// Numeric //////////////
|
||||
// pack 64-bit mask to 32-bit
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m)
|
||||
{ return (vopmask)vec_pack((v__u64)m, (v__u64)m); }
|
||||
// clip 64-bit lanes to lower 32-bit
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2)
|
||||
{ return vec_mergeo(vi2, vec_splat(vi2, 3)); }
|
||||
static INLINE vint vcastu_vi_vm(vmask vi2)
|
||||
{ return vec_mergeo((vint2)vi2, vec_splat((vint2)vi2, 3)); }
|
||||
|
||||
|
||||
// expand lower 32-bit mask
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m)
|
||||
{ return vec_mergeh(m, m); }
|
||||
// unsigned expand lower 32-bit integer
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi)
|
||||
{ return vec_mergeh(vzero__vi(), vi); }
|
||||
static INLINE vmask vcastu_vm_vi(vint vi)
|
||||
{ return (vmask)vec_mergeh(vzero__vi(), vi); }
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) {
|
||||
i = i ? -1 : 0;
|
||||
return (vopmask) { i, i, i, i };
|
||||
}
|
||||
|
||||
// signed int to single-precision
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi)
|
||||
{
|
||||
vfloat ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
ret = __builtin_convertvector(vi, vfloat);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvsxwsp %x0,%x1" : "=wa" (ret) : "wa" (vi));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
// lower signed int to double-precision
|
||||
static INLINE vdouble vcast_vd_vi(vint vi)
|
||||
{
|
||||
vdouble ret;
|
||||
vint swap = vec_mergeh(vi, vi);
|
||||
#if defined(__clang__) || __GNUC__ >= 7
|
||||
ret = __builtin_vsx_xvcvsxwdp(swap);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvsxwdp %x0,%x1" : "=wa" (ret) : "wa" (swap));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
// zip two scalars
|
||||
static INLINE vmask vcast_vm_i_i(int l, int h)
|
||||
{ return (vmask)vec_mergeh(vsetall__vi2(h), vsetall__vi2(l)); }
|
||||
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) {
|
||||
return (vmask)vsetall__s64(i);
|
||||
}
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) {
|
||||
return (vmask)vsetall__u64(i);
|
||||
}
|
||||
|
||||
////////////// Truncation //////////////
|
||||
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf)
|
||||
{
|
||||
vint2 ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
ret = __builtin_convertvector(vf, vint2);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvspsxws %x0,%x1" : "=wa" (ret) : "wa" (vf));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd)
|
||||
{
|
||||
vint ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 7
|
||||
ret = __builtin_vsx_xvcvdpsxws(vd);
|
||||
#else
|
||||
__asm__ __volatile__("xvcvdpsxws %x0,%x1" : "=wa" (ret) : "wa" (vd));
|
||||
#endif
|
||||
return vec_mergeo(ret, vec_splat(ret, 3));
|
||||
}
|
||||
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd)
|
||||
{ return vec_trunc(vd); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vf)
|
||||
{ return vec_trunc(vf); }
|
||||
|
||||
////////////// Rounding //////////////
|
||||
|
||||
// towards the nearest even
|
||||
static INLINE vint vrint_vi_vd(vdouble vd)
|
||||
{ return vtruncate_vi_vd(vec_rint(vd)); }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf)
|
||||
{ return vtruncate_vi2_vf(vec_rint(vf)); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd)
|
||||
{ return vec_rint(vd); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vf)
|
||||
{ return vec_rint(vf); }
|
||||
|
||||
/**********************************************
|
||||
** Logical
|
||||
**********************************************/
|
||||
|
||||
////////////// And //////////////
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_and(x, y); }
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y)
|
||||
{ return vec_and((vint)x, y); }
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_and(x, y); }
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y)
|
||||
{ return (vint2)vec_and((vint2)x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y)
|
||||
{ return vec_and(x, y); }
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y)
|
||||
{ return vec_and((vmask)x, y); }
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y)
|
||||
{ return vec_and((vmask)x, y); }
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y)
|
||||
{ return vec_and(x, y); }
|
||||
|
||||
////////////// Or //////////////
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_or(x, y); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_or(x, y); }
|
||||
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y)
|
||||
{ return vec_or(x, y); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y)
|
||||
{ return vec_or((vmask)x, y); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y)
|
||||
{ return vec_or((vmask)x, y); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y)
|
||||
{ return vec_or(x, y); }
|
||||
|
||||
////////////// Xor //////////////
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_xor(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_xor(x, y); }
|
||||
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y)
|
||||
{ return vec_xor(x, y); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y)
|
||||
{ return vec_xor((vmask)x, y); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y)
|
||||
{ return vec_xor((vmask)x, y); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y)
|
||||
{ return vec_xor(x, y); }
|
||||
|
||||
////////////// Not //////////////
|
||||
static INLINE vopmask vnot_vo_vo(vopmask o)
|
||||
{ return vec_nor(o, o); }
|
||||
|
||||
////////////// And Not ((~x) & y) //////////////
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y)
|
||||
{ return vec_andc(y, (vint)x); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y)
|
||||
{ return vec_andc(y, x); }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y)
|
||||
{ return vec_andc(y, (vint2)x); }
|
||||
|
||||
/**********************************************
|
||||
** Comparison
|
||||
**********************************************/
|
||||
|
||||
////////////// Equal //////////////
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y)
|
||||
{ return (vint)vec_cmpeq(x, y); }
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y)
|
||||
{ return vec_cmpeq(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_cmpeq(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return (vint2)vec_cmpeq(x, y); }
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y)
|
||||
{ return (vopmask)vec_cmpeq((v__u64)x, (v__u64)y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmpeq(x, y); }
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmpeq(x, y); }
|
||||
|
||||
////////////// Not Equal //////////////
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vnot_vo_vo(vec_cmpeq(x, y)); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return vnot_vo_vo((vopmask)vec_cmpeq(x, y)); }
|
||||
|
||||
////////////// Less Than //////////////
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmplt(x, y); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmplt(x, y); }
|
||||
|
||||
////////////// Greater Than //////////////
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y)
|
||||
{ return (vint)vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y)
|
||||
{ return vec_cmpgt(x, y);}
|
||||
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return (vint2)vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_cmpgt(x, y); }
|
||||
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmpgt(x, y); }
|
||||
|
||||
////////////// Less Than Or Equal //////////////
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmple(x, y); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmple(x, y); }
|
||||
|
||||
////////////// Greater Than Or Equal //////////////
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_cmpge(x, y); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y)
|
||||
{ return (vopmask)vec_cmpge(x, y); }
|
||||
|
||||
////////////// Special Cases //////////////
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d)
|
||||
{ return vec_cmpeq(vec_abs(d), vsetall__vf(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d)
|
||||
{ return (vopmask)vec_cmpeq(vec_abs(d), vsetall__vd(SLEEF_INFINITY)); }
|
||||
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d)
|
||||
{ return vec_cmpeq(d, vsetall__vf(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d)
|
||||
{ return (vopmask)vec_cmpeq(d, vsetall__vd(SLEEF_INFINITY)); }
|
||||
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d)
|
||||
{ return vec_cmpeq(d, vsetall__vf(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d)
|
||||
{ return (vopmask)vec_cmpeq(d, vsetall__vd(-SLEEF_INFINITY)); }
|
||||
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d)
|
||||
{ return vnot_vo_vo(vec_cmpeq(d, d)); }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d)
|
||||
{ return vnot_vo_vo((vopmask)vec_cmpeq(d, d)); }
|
||||
|
||||
/**********************************************
|
||||
** Shift
|
||||
**********************************************/
|
||||
////////////// Left //////////////
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c)
|
||||
{ return vec_sl (x, vsetall__u32(c)); }
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c)
|
||||
{ return vec_sl(x, vsetall__u32(c)); }
|
||||
|
||||
////////////// Right //////////////
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c)
|
||||
{ return vec_sr(x, vsetall__u32(c)); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c)
|
||||
{ return vec_sr(x, vsetall__u32(c)); }
|
||||
|
||||
////////////// Algebraic Right //////////////
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c)
|
||||
{ return vec_sra(x, vsetall__u32(c)); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c)
|
||||
{ return vec_sra(x, vsetall__u32(c)); }
|
||||
|
||||
/**********************************************
|
||||
** Reorder
|
||||
**********************************************/
|
||||
|
||||
////////////// Reverse //////////////
|
||||
// Reverse elements order inside the lower and higher parts
|
||||
static INLINE vint2 vrev21_vi2_vi2(vint2 vi)
|
||||
{ return vec_mergee(vec_mergeo(vi, vi), vi); }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vf)
|
||||
{ return (vfloat)vrev21_vi2_vi2((vint2)vf); }
|
||||
|
||||
// Swap the lower and higher parts
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vf)
|
||||
{ return (vfloat)v__swapd_u64((v__u64)vf); }
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd)
|
||||
{ return (vdouble)v__swapd_u64((v__u64)vd); }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd)
|
||||
{ return vd; }
|
||||
|
||||
/**********************************************
|
||||
** Arithmetic
|
||||
**********************************************/
|
||||
|
||||
////////////// Negation //////////////
|
||||
static INLINE vint vneg_vi_vi(vint e) {
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
return vec_neg(e);
|
||||
#else
|
||||
return vec_sub(vzero__vi(), e);
|
||||
#endif
|
||||
}
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e)
|
||||
{ return vneg_vi_vi(e); }
|
||||
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d)
|
||||
{
|
||||
vfloat ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
ret = vec_neg(d);
|
||||
#else
|
||||
__asm__ __volatile__("xvnegsp %x0,%x1" : "=wa" (ret) : "wa" (d));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d)
|
||||
{
|
||||
vdouble ret;
|
||||
#if defined(__clang__) || __GNUC__ >= 9
|
||||
ret = vec_neg(d);
|
||||
#else
|
||||
__asm__ __volatile__("xvnegdp %x0,%x1" : "=wa" (ret) : "wa" (d));
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d)
|
||||
{ return vec_xor(d, vset__vf(+0.0f, -0.0f, +0.0f, -0.0f)); }
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d)
|
||||
{ return vec_xor(d, vset__vd(+0.0, -0.0)); }
|
||||
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d)
|
||||
{ return vec_xor(d, vset__vf(-0.0f, +0.0f, -0.0f, +0.0f)); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d)
|
||||
{ return vec_xor(d, vset__vd(-0.0, +0.0)); }
|
||||
|
||||
////////////// Addition //////////////
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_add(x, y); }
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_add(x, y); }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_add(x, y); }
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_add(x, y); }
|
||||
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y)
|
||||
{ return (vmask)vec_add((v__i64)x, (v__i64)y); }
|
||||
|
||||
////////////// Subtraction //////////////
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y)
|
||||
{ return vec_sub(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y)
|
||||
{ return vec_sub(x, y); }
|
||||
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_sub(x, y); }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_sub(x, y); }
|
||||
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_add(x, vnegpos_vd_vd(y)); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_add(x, vnegpos_vf_vf(y)); }
|
||||
|
||||
////////////// Multiplication //////////////
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_mul(x, y); }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_mul(x, y); }
|
||||
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_div(x, y); }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_div(x, y); }
|
||||
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x)
|
||||
{ return vec_div(vsetall__vf(1.0f), x); }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x)
|
||||
{ return vec_div(vsetall__vd(1.0), x); }
|
||||
|
||||
/**********************************************
|
||||
** Math
|
||||
**********************************************/
|
||||
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_max(x, y); }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_max(x, y); }
|
||||
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y)
|
||||
{ return vec_min(x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y)
|
||||
{ return vec_min(x, y); }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f)
|
||||
{ return vec_abs(f); }
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d)
|
||||
{ return vec_abs(d); }
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat f)
|
||||
{ return vec_sqrt(f); }
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble d)
|
||||
{ return vec_sqrt(d); }
|
||||
|
||||
|
||||
/**********************************************
|
||||
** FMA3
|
||||
**********************************************/
|
||||
#if CONFIG == 1 || CONFIG == 3
|
||||
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_msub(x, y, z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_msub(x, y, z); }
|
||||
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_nmsub(x, y, z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_nmsub(x, y, z); }
|
||||
|
||||
#else
|
||||
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_add(vec_mul(x, y), z); }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_add(vec_mul(x, y), z); }
|
||||
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_sub(vec_mul(x, y), z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_sub(vec_mul(x, y), z); }
|
||||
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_sub(z, vec_mul(x, y)); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_sub(z, vec_mul(x, y)); }
|
||||
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_madd(x, y, z); }
|
||||
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_msub(x, y, z); }
|
||||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_msub(x, y, z); }
|
||||
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_nmsub(x, y, z); }
|
||||
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_nmsub(x, y, z); }
|
||||
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vec_nmadd(x, y, z); }
|
||||
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vec_nmadd(x, y, z); }
|
||||
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z)
|
||||
{ return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z)
|
||||
{ return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, VECTLENDP * 16);
|
||||
return aq;
|
||||
}
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) {
|
||||
return vec_all_eq((__vector signed long long)g, vzero__s64());
|
||||
}
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
|
||||
return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (v__b64)o);
|
||||
}
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
|
||||
return (vmask)vec_sub((__vector signed long long)x, (__vector signed long long)y);
|
||||
}
|
||||
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) {
|
||||
return (vmask)vec_sub((__vector signed long long) {0, 0}, (__vector signed long long)x);
|
||||
}
|
||||
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
|
||||
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
|
||||
}
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) ((vmask)vec_sl((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
|
||||
#define vsrl64_vm_vm_i(x, c) ((vmask)vec_sr((__vector signed long long)x, (__vector unsigned long long)vsetall__vm(c)))
|
||||
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return (vint) { vm[0], vm[2] };
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
return (vmask) (__vector signed long long) { vi[0], vi[1] };
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }
|
||||
@@ -0,0 +1,561 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdint.h>
#include <math.h>
#include "misc.h"

// CONFIG selects the (log2) vector length and must come from the build.
#ifndef CONFIG
#error CONFIG macro not defined
#endif

// Both double- and single-precision kernels are built from this helper.
// NOTE(review): the "//@#define" duplicates appear to be consumed by
// SLEEF's own source-processing tooling -- keep them in sync with the
// real #defines and do not remove them.
#define ENABLE_DP
//@#define ENABLE_DP
#define ENABLE_SP
//@#define ENABLE_SP

// VECTLENDP doubles per vector; single precision packs twice as many.
#define LOG2VECTLENDP CONFIG
//@#define LOG2VECTLENDP CONFIG
#define VECTLENDP (1 << LOG2VECTLENDP)
//@#define VECTLENDP (1 << LOG2VECTLENDP)
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
#define VECTLENSP (1 << LOG2VECTLENSP)
//@#define VECTLENSP (1 << LOG2VECTLENSP)

#define ACCURATE_SQRT
//@#define ACCURATE_SQRT

#define DFTPRIORITY LOG2VECTLENDP
#define ISANAME "Pure C Array"
typedef union {
|
||||
uint32_t u[VECTLENDP*2];
|
||||
uint64_t x[VECTLENDP];
|
||||
double d[VECTLENDP];
|
||||
float f[VECTLENDP*2];
|
||||
int32_t i[VECTLENDP*2];
|
||||
} versatileVector;
|
||||
|
||||
typedef versatileVector vmask;
|
||||
typedef versatileVector vopmask;
|
||||
typedef versatileVector vdouble;
|
||||
typedef versatileVector vint;
|
||||
typedef versatileVector vfloat;
|
||||
typedef versatileVector vint2;
|
||||
|
||||
typedef union {
|
||||
uint8_t u[sizeof(long double)*VECTLENDP];
|
||||
long double ld[VECTLENDP];
|
||||
} longdoubleVector;
|
||||
|
||||
typedef longdoubleVector vmaskl;
|
||||
typedef longdoubleVector vlongdouble;
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
typedef union {
|
||||
uint8_t u[sizeof(Sleef_quad)*VECTLENDP];
|
||||
Sleef_quad q[VECTLENDP];
|
||||
} quadVector;
|
||||
|
||||
typedef quadVector vmaskq;
|
||||
typedef quadVector vquad;
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE int vavailability_i(int name) { return -1; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) {
|
||||
int ret = 1; for(int i=0;i<VECTLENDP;i++) ret = ret && g.x[i]; return ret;
|
||||
}
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) {
|
||||
int ret = 1; for(int i=0;i<VECTLENSP;i++) ret = ret && g.u[i]; return ret;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) {
|
||||
vint2 vi;
|
||||
for(int i=0;i<VECTLENSP;i++) vi.i[i] = p[i];
|
||||
return vi;
|
||||
}
|
||||
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
|
||||
for(int i=0;i<VECTLENSP;i++) p[i] = v.i[i];
|
||||
}
|
||||
|
||||
static vint vloadu_vi_p(int32_t *p) {
|
||||
vint vi;
|
||||
for(int i=0;i<VECTLENDP;i++) vi.i[i] = p[i];
|
||||
return vi;
|
||||
}
|
||||
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) {
|
||||
for(int i=0;i<VECTLENDP;i++) p[i] = v.i[i];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
|
||||
vopmask ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.u[i] = m.u[i*2+1];
|
||||
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.u[i] = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
|
||||
vopmask ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.u[i*2] = ret.u[i*2+1] = m.u[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) {
|
||||
vmask ret;
|
||||
for(int i=0;i<VECTLENDP;i++) {
|
||||
ret.u[i*2+0] = l;
|
||||
ret.u[i*2+1] = h;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENDP;i++) {
|
||||
ret.i[i*2+0] = 0;
|
||||
ret.i[i*2+1] = vi.i[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i*2+1];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi.i[i];
|
||||
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.i[i] = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.d[i*2+0] = d0.d[i*2+1];
|
||||
r.d[i*2+1] = d0.d[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.d[i*2+0] = d0.d[(VECTLENDP/2-1-i)*2+0];
|
||||
r.d[i*2+1] = d0.d[(VECTLENDP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r.f[i*2+0] = d0.f[i*2+1];
|
||||
r.f[i*2+1] = d0.f[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r.f[i*2+0] = d0.f[(VECTLENSP/2-1-i)*2+0];
|
||||
r.f[i*2+1] = d0.f[(VECTLENSP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = d; return ret; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
|
||||
static INLINE vopmask vor_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
|
||||
static INLINE vopmask vxor_vo_vo_vo (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
|
||||
static INLINE vmask vandnot_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
|
||||
static INLINE vmask vor_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
|
||||
static INLINE vmask vxor_vm_vm_vm (vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] & y.u[i]; return ret; }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] | y.u[i]; return ret; }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^ y.u[i]; return ret; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd (vopmask o, vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { vint2 ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = vi.i[i]; return ret; }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = (int)vd.d[i]; return ret; }
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = vd.d[i] > 0 ? (int)(vd.d[i] + 0.5) : (int)(vd.d[i] - 0.5); return ret; }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
|
||||
static INLINE vint vcast_vi_i(int j) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = j; return ret; }
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] == y.x[i] ? -1 : 0; return ret; }
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] + y.x[i]; return ret; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { union { vdouble vd; vmask vm; } cnv; cnv.vd = vd; return cnv.vm; }
|
||||
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { union { vdouble vd; vint2 vi2; } cnv; cnv.vd = vd; return cnv.vi2; }
|
||||
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { union { vint2 vi2; vdouble vd; } cnv; cnv.vi2 = vi; return cnv.vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { union { vmask vm; vdouble vd; } cnv; cnv.vm = vm; return cnv.vd; }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] + y.d[i]; return ret; }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] - y.d[i]; return ret; }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i]; return ret; }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] / y.d[i]; return ret; }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = 1.0 / x.d[i]; return ret; }
|
||||
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.x[i] & 0x7fffffffffffffffULL; return ret; }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = -d.d[i]; return ret; }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd (vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] + z.d[i]; return ret; }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] - z.d[i]; return ret; }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] > y.d[i] ? x.d[i] : y.d[i]; return ret; }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] < y.d[i] ? x.d[i] : y.d[i]; return ret; }
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? d.d[i] : -d.d[i]; return ret; }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? -d.d[i] : d.d[i]; return ret; }
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? x.d[i] - y.d[i] : x.d[i] + y.d[i]; return ret; }
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] == y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] != y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] < y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] <= y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] > y.d[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] >= y.d[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
|
||||
static INLINE vint vneg_vi_vi (vint x) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = -x.i[i]; return ret; }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] & y.i[i]; return ret; }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] | y.i[i]; return ret; }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] ^ y.i[i]; return ret; }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] << c; return ret; }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] >> c; return ret; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
|
||||
union { vopmask vo; vint2 vi2; } cnv;
|
||||
cnv.vo = m;
|
||||
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), x),
|
||||
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), y));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = (d.d[i] == SLEEF_INFINITY || d.d[i] == -SLEEF_INFINITY) ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == SLEEF_INFINITY ? -1 : 0; return ret; }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == -SLEEF_INFINITY ? -1 : 0; return ret; }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] != d.d[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = sqrt(d.d[i]); return ret; }
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
|
||||
vdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[vi.i[i]];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v.d[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v.d[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { union { vint2 vi2; vmask vm; } cnv; cnv.vm = vm; return cnv.vi2; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { union { vint2 vi2; vmask vm; } cnv; cnv.vi2 = vi; return cnv.vm; }
|
||||
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = vi.i[i]; return ret; }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = (int)vf.f[i]; return ret; }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = vf.f[i] > 0 ? (int)(vf.f[i] + 0.5) : (int)(vf.f[i] - 0.5); return ret; }
|
||||
static INLINE vint2 vcast_vi2_i(int j) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = j; return ret; }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
|
||||
|
||||
static INLINE vfloat vcast_vf_f(float f) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = f; return ret; }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { union { vfloat vf; vmask vm; } cnv; cnv.vf = vf; return cnv.vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { union { vfloat vf; vmask vm; } cnv; cnv.vm = vm; return cnv.vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { union { vfloat vf; vint2 vi2; } cnv; cnv.vi2 = vi; return cnv.vf; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { union { vfloat vf; vint2 vi2; } cnv; cnv.vf = vf; return cnv.vi2; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] + y.f[i]; return ret; }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] - y.f[i]; return ret; }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i]; return ret; }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] / y.f[i]; return ret; }
|
||||
static INLINE vfloat vrec_vf_vf (vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = 1.0 / x.f[i]; return ret; }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & 0x7fffffff; return ret; }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = -x.f[i]; return ret; }
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] + z.f[i]; return ret; }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] - z.f[i]; return ret; }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] > y.f[i] ? x.f[i] : y.f[i]; return ret; }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] < y.f[i] ? x.f[i] : y.f[i]; return ret; }
|
||||
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] : -x.f[i]; return ret; }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? -x.f[i] : x.f[i]; return ret; }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] - y.f[i] : x.f[i] + y.f[i]; return ret; }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] == y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] != y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] < y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] <= y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] > y.f[i]) ? -1 : 0); return ret; }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] >= y.f[i]) ? -1 : 0); return ret; }
|
||||
|
||||
static INLINE vint vadd_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
|
||||
static INLINE vint vsub_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
|
||||
static INLINE vint vneg_vi2_vi2(vint x) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = -x.i[i]; return ret; }
|
||||
|
||||
static INLINE vint vand_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & y.i[i]; return ret; }
|
||||
static INLINE vint vandnot_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
|
||||
static INLINE vint vor_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] | y.i[i]; return ret; }
|
||||
static INLINE vint vxor_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] ^ y.i[i]; return ret; }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
|
||||
union { vopmask vo; vint2 vi2; } cnv;
|
||||
cnv.vo = x;
|
||||
return vand_vi2_vi2_vi2(cnv.vi2, y);
|
||||
}
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(x, y); }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] << c; return ret; }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] >> c; return ret; }
|
||||
|
||||
static INLINE vopmask visinf_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (d.f[i] == SLEEF_INFINITYf || d.f[i] == -SLEEF_INFINITYf) ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == SLEEF_INFINITYf ? -1 : 0; return ret; }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == -SLEEF_INFINITYf ? -1 : 0; return ret; }
|
||||
static INLINE vopmask visnan_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] != d.f[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] > y.i[i] ? -1 : 0; return ret; }
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = sqrtf(x.f[i]); return ret; }
|
||||
|
||||
#ifdef _MSC_VER
|
||||
// This function is needed when debugging on MSVC.
|
||||
static INLINE float vcast_f_vf(vfloat v) { return v.f[0]; }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) {
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[i];
|
||||
return vf;
|
||||
}
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[vi2.i[i]];
|
||||
return vf;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
|
||||
for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v.f[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v.f[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = d; return ret; }
|
||||
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.ld[i*2+0] = d0.ld[i*2+1];
|
||||
r.ld[i*2+1] = d0.ld[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.ld[i*2+0] = d0.ld[(VECTLENDP/2-1-i)*2+0];
|
||||
r.ld[i*2+1] = d0.ld[(VECTLENDP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] + y.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] - y.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] * y.ld[i]; return ret; }
|
||||
|
||||
static INLINE vlongdouble vneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = -x.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] - y.ld[i] : x.ld[i] + y.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] : -x.ld[i]; return ret; }
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? -x.ld[i] : x.ld[i]; return ret; }
|
||||
|
||||
static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
|
||||
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
|
||||
vlongdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd.ld[i] = ptr[i];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
|
||||
for(int i=0;i<VECTLENDP;i++) ptr[i] = v.ld[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v.ld[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v.ld[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
static INLINE vquad vcast_vq_q(Sleef_quad d) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = d; return ret; }
|
||||
|
||||
static INLINE vquad vrev21_vq_vq(vquad d0) {
|
||||
vquad r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.q[i*2+0] = d0.q[i*2+1];
|
||||
r.q[i*2+1] = d0.q[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vquad vreva2_vq_vq(vquad d0) {
|
||||
vquad r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r.q[i*2+0] = d0.q[(VECTLENDP/2-1-i)*2+0];
|
||||
r.q[i*2+1] = d0.q[(VECTLENDP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] + y.q[i]; return ret; }
|
||||
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] - y.q[i]; return ret; }
|
||||
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] * y.q[i]; return ret; }
|
||||
|
||||
static INLINE vquad vneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = -x.q[i]; return ret; }
|
||||
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] - y.q[i] : x.q[i] + y.q[i]; return ret; }
|
||||
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }
|
||||
static INLINE vquad vposneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] : -x.q[i]; return ret; }
|
||||
static INLINE vquad vnegpos_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? -x.q[i] : x.q[i]; return ret; }
|
||||
|
||||
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
|
||||
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
|
||||
vquad vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd.q[i] = ptr[i];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
|
||||
for(int i=0;i<VECTLENDP;i++) ptr[i] = v.q[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v.q[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v.q[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
|
||||
#endif
|
||||
@@ -0,0 +1,487 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <stdint.h>
|
||||
#endif
|
||||
|
||||
#ifndef ENABLE_BUILTIN_MATH
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
#define SQRT sqrt
|
||||
#define SQRTF sqrtf
|
||||
#define FMA fma
|
||||
#define FMAF fmaf
|
||||
#define RINT rint
|
||||
#define RINTF rintf
|
||||
#define TRUNC trunc
|
||||
#define TRUNCF truncf
|
||||
|
||||
#else
|
||||
|
||||
#define SQRT __builtin_sqrt
|
||||
#define SQRTF __builtin_sqrtf
|
||||
#define FMA __builtin_fma
|
||||
#define FMAF __builtin_fmaf
|
||||
#define RINT __builtin_rint
|
||||
#define RINTF __builtin_rintf
|
||||
#define TRUNC __builtin_trunc
|
||||
#define TRUNCF __builtin_truncf
|
||||
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include "misc.h"
|
||||
#endif
|
||||
|
||||
#ifndef CONFIG
|
||||
#error CONFIG macro not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
|
||||
#if CONFIG == 2 || CONFIG == 3
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
|
||||
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__) || defined(__zarch__) || defined(__riscv) || CONFIG == 3
|
||||
#ifndef FP_FAST_FMA
|
||||
//@#ifndef FP_FAST_FMA
|
||||
#define FP_FAST_FMA
|
||||
//@#define FP_FAST_FMA
|
||||
#endif
|
||||
//@#endif
|
||||
#ifndef FP_FAST_FMAF
|
||||
//@#ifndef FP_FAST_FMAF
|
||||
#define FP_FAST_FMAF
|
||||
//@#define FP_FAST_FMAF
|
||||
#endif
|
||||
//@#endif
|
||||
#endif
|
||||
|
||||
#if (!defined(FP_FAST_FMA) || !defined(FP_FAST_FMAF)) && !defined(SLEEF_GENHEADER)
|
||||
#error FP_FAST_FMA or FP_FAST_FMAF not defined
|
||||
#endif
|
||||
|
||||
#define ISANAME "Pure C scalar with FMA"
|
||||
|
||||
#else // #if CONFIG == 2 || CONFIG == 3
|
||||
#define ISANAME "Pure C scalar"
|
||||
#endif // #if CONFIG == 2 || CONFIG == 3
|
||||
|
||||
#define LOG2VECTLENDP 0
|
||||
//@#define LOG2VECTLENDP 0
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
#define LOG2VECTLENSP 0
|
||||
//@#define LOG2VECTLENSP 0
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
|
||||
#if defined(__SSE4_1__) || defined(__aarch64__) || CONFIG == 3
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
#endif
|
||||
|
||||
#define DFTPRIORITY LOG2VECTLENDP
|
||||
|
||||
typedef uint64_t vmask;
|
||||
typedef uint32_t vopmask;
|
||||
typedef double vdouble;
|
||||
typedef int32_t vint;
|
||||
typedef float vfloat;
|
||||
typedef int32_t vint2;
|
||||
|
||||
typedef int64_t vint64;
|
||||
typedef uint64_t vuint64;
|
||||
|
||||
typedef Sleef_uint64_2t vquad;
|
||||
|
||||
#if CONFIG != 3
|
||||
typedef Sleef_quad vargquad;
|
||||
#else
|
||||
typedef Sleef_uint64_2t vargquad;
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE int vavailability_i(int name) { return -1; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) {}
|
||||
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return g; }
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return g; }
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return *p; }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { *p = v; }
|
||||
static vint vloadu_vi_p(int32_t *p) { return *p; }
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) { *p = v; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return m; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return m; }
|
||||
static INLINE vopmask vcast_vo_i(int i) { return i ? -1 : 0; }
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (((uint64_t)h) << 32) | (uint32_t)l; }
|
||||
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) { return (int64_t)i; }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return i; }
|
||||
|
||||
static INLINE vmask vcastu_vm_vi(vint vi) { return ((uint64_t)vi) << 32; }
|
||||
static INLINE vint vcastu_vi_vm(vmask vm) { return (int32_t)(vm >> 32); }
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) { return d; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo (vopmask x, vopmask y) { return x & y; }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
|
||||
static INLINE vopmask vor_vo_vo_vo (vopmask x, vopmask y) { return x | y; }
|
||||
static INLINE vopmask vxor_vo_vo_vo (vopmask x, vopmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm (vmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vm_vm (vmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vm_vm (vmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vm_vm (vmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vcast_vm_vo(vopmask o) { return (vmask)o | (((vmask)o) << 32); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) & y; }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~vcast_vm_vo(x); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) | y; }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vcast_vm_vo(x) ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) & y; }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~vcast_vm_vo(x); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) | y; }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vcast_vm_vo(x) ^ y; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd (vopmask o, vdouble x, vdouble y) { return o ? x : y; }
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return o ? x : y; }
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return o ? v1 : v0; }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return vi; }
|
||||
static INLINE vint vcast_vi_i(int j) { return j; }
|
||||
|
||||
#ifdef FULL_FP_ROUNDING
|
||||
static INLINE vint vrint_vi_vd(vdouble d) { return (int32_t)RINT(d); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return RINT(vd); }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return TRUNC(vd); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (int32_t)TRUNC(vd); }
|
||||
#else
|
||||
static INLINE vint vrint_vi_vd(vdouble a) {
|
||||
a += a > 0 ? 0.5 : -0.5;
|
||||
uint64_t vx;
|
||||
memcpy(&vx, &a, sizeof(vx));
|
||||
vx -= 1 & (int)a;
|
||||
memcpy(&a, &vx, sizeof(a));
|
||||
return a;
|
||||
}
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return vd; }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
|
||||
#endif
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return x == y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return x + y; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { vmask vm; memcpy(&vm, &vd, sizeof(vm)); return vm; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { vdouble vd; memcpy(&vd, &vm, sizeof(vd)); return vd; }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }
|
||||
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) {
|
||||
uint64_t vx;
|
||||
memcpy(&vx, &d, sizeof(vx));
|
||||
vx &= UINT64_C(0x7fffffffffffffff);
|
||||
memcpy(&d, &vx, sizeof(d));
|
||||
return d;
|
||||
}
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
|
||||
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return x > y ? x : y; }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return x < y ? x : y; }
|
||||
|
||||
#ifndef ENABLE_FMA_DP
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd (vdouble x, vdouble y, vdouble z) { return x * y + z; }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return -x * y + z; }
|
||||
#else
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, -z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, z); }
|
||||
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
|
||||
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, z); }
|
||||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(x, y, -z); }
|
||||
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, z); }
|
||||
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return FMA(-x, y, -z); }
|
||||
#endif
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return x == y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return x != y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return x < y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return x <= y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return x > y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return x >= y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
|
||||
static INLINE vint vneg_vi_vi (vint x) { return - x; }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return x & y; }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~x; }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (uint32_t)x << c; }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (uint32_t)x >> c; }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return x == y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return x > y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return m ? x : y; }
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) { return (d == SLEEF_INFINITY || d == -SLEEF_INFINITY) ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) { return d == SLEEF_INFINITY ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) { return d == -SLEEF_INFINITY ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) { return d != d ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return SQRT(d); }
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return SQRTF(x); }
|
||||
|
||||
static INLINE double vcast_d_vd(vdouble v) { return v; }
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return *ptr; }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { return *ptr; }
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) { return ptr[vi]; }
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *ptr = v; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (int32_t)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (uint32_t)vi; }
|
||||
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (int32_t)vi; }
|
||||
static INLINE vint2 vcast_vi2_i(int j) { return j; }
|
||||
|
||||
#ifdef FULL_FP_ROUNDING
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat d) { return (int)RINTF(d); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return RINTF(vd); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return TRUNCF(vd); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (int32_t)TRUNCF(vf); }
|
||||
#else
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat a) {
|
||||
a += a > 0 ? 0.5f : -0.5f;
|
||||
uint32_t vu[1];
|
||||
memcpy(vu, &a, sizeof(vu));
|
||||
vu[0] -= 1 & (int)a;
|
||||
memcpy(&a, vu, sizeof(a));
|
||||
return (int32_t)a;
|
||||
}
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vf; }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vcast_vf_f(float f) { return f; }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat f) { vfloat vf[2] = { f, 0 }; vmask vm; memcpy(&vm, &vf, sizeof(vm)); return vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { vfloat vf[2]; memcpy(&vf, &vm, sizeof(vf)); return vf[0]; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { vfloat vf; memcpy(&vf, &vi, sizeof(vf)); return vf; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat f) { vint2 vi2; memcpy(&vi2, &f, sizeof(vi2)); return vi2; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
|
||||
static INLINE vfloat vrec_vf_vf (vfloat x) { return 1 / x; }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat x) {
|
||||
int32_t vi[1];
|
||||
memcpy(vi, &x, sizeof(vi));
|
||||
vi[0] &= 0x7fffffff;
|
||||
memcpy(&x, vi, sizeof(x));
|
||||
return x;
|
||||
}
|
||||
static INLINE vfloat vneg_vf_vf(vfloat x) { return -x; }
|
||||
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return x > y ? x : y; }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return x < y ? x : y; }
|
||||
|
||||
#ifndef ENABLE_FMA_SP
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return x * y + z; }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return - x * y + z; }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x * y - z; }
|
||||
#else
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, z); }
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, z); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(x, y, -z); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, z); }
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return FMAF(-x, y, -z); }
|
||||
#endif
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return x == y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return x != y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return x < y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return x <= y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return x > y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return x >= y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 x) { return -x; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return o ? x : y; }
|
||||
static INLINE vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) { return o ? v1 : v0; }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vcast_vm_vo(x) & y; }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~vcast_vm_vo(x); }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
|
||||
return x << c;
|
||||
}
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
|
||||
return ((uint32_t)x) >> c;
|
||||
}
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
|
||||
return x >> c;
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf (vfloat d) { return (d == SLEEF_INFINITYf || d == -SLEEF_INFINITYf) ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return d == SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return d == -SLEEF_INFINITYf ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask visnan_vo_vf (vfloat d) { return d != d ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { return (int32_t)x == (int32_t)y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { return (int32_t)x > (int32_t)y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (int32_t)x == (int32_t)y ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (int32_t)x > (int32_t)y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE float vcast_f_vf(vfloat v) { return v; }
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return *ptr; }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return *ptr; }
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) { return ptr[vi]; }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *ptr = v; }
|
||||
|
||||
//
|
||||
|
||||
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(8 + (char *)&vq, p, 8);
|
||||
memcpy((char *)&vq, 8 + p, 8);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(8 + (char *)&vq, (char *)&aq, 8);
|
||||
memcpy((char *)&vq, 8 + (char *)&aq, 8);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(8 + (char *)&aq, (char *)&vq, 8);
|
||||
memcpy((char *)&aq, 8 + (char *)&vq, 8);
|
||||
return aq;
|
||||
}
|
||||
#else
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, sizeof(vq));
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, sizeof(vq));
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, sizeof(aq));
|
||||
return aq;
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) { return !g ? ~(uint32_t)0 : 0; }
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) { return o ? x : y; }
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return (int64_t)x - (int64_t)y; }
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) { return -(int64_t)x; }
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) ((uint64_t)(x) << (c))
|
||||
#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
|
||||
//@#define vsll64_vm_vm_i(x, c) ((uint64_t)(x) << (c))
|
||||
//@#define vsrl64_vm_vm_i(x, c) ((uint64_t)(x) >> (c))
|
||||
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) { return (int64_t)x > (int64_t)y ? ~(uint32_t)0 : 0; }
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) { return vi; }
|
||||
static INLINE vint vcast_vi_vm(vmask vm) { return vm; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,462 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 141 || CONFIG == 150 || CONFIG == 151
|
||||
|
||||
#if !defined(__VX__) && !defined(SLEEF_GENHEADER)
|
||||
#error This helper is for IBM s390x.
|
||||
#endif
|
||||
|
||||
#if __ARCH__ < 12 && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -march=z14 or higher.
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error CONFIG macro invalid or not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 1
|
||||
//@#define LOG2VECTLENDP 1
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 150
|
||||
#define ENABLE_FMA_DP
|
||||
//@#define ENABLE_FMA_DP
|
||||
#define ENABLE_FMA_SP
|
||||
//@#define ENABLE_FMA_SP
|
||||
#endif
|
||||
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#ifndef SLEEF_VECINTRIN_H_INCLUDED
|
||||
#include <vecintrin.h>
|
||||
#define SLEEF_VECINTRIN_H_INCLUDED
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
typedef __vector unsigned long long vmask;
|
||||
typedef __vector unsigned long long vopmask;
|
||||
|
||||
typedef __vector double vdouble;
|
||||
typedef __vector int vint;
|
||||
|
||||
typedef __vector float vfloat;
|
||||
typedef __vector int vint2;
|
||||
|
||||
typedef __vector long long vint64;
|
||||
typedef __vector unsigned long long vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
//
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE int vavailability_i(int n) {
|
||||
if (n == 1 || n == 2) {
|
||||
return vec_max((vdouble) {n, n}, (vdouble) {n, n})[0] != 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 141
|
||||
#define ISANAME "VXE"
|
||||
#else
|
||||
#define ISANAME "VXE2"
|
||||
#endif
|
||||
|
||||
#define DFTPRIORITY 14
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return (vint2) { p[0], p[1], p[2], p[3] }; }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
|
||||
static vint vloadu_vi_p(int32_t *p) { return (vint) { p[0], p[1] }; }
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) { p[0] = v[0]; p[1] = v[1]; }
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *p) { return (vdouble) { p[0], p[1] }; }
|
||||
static INLINE void vstore_v_p_vd(double *p, vdouble v) { p[0] = v[0]; p[1] = v[1]; }
|
||||
static INLINE vdouble vloadu_vd_p(const double *p) { return (vdouble) { p[0], p[1] }; }
|
||||
static INLINE void vstoreu_v_p_vd(double *p, vdouble v) { p[0] = v[0]; p[1] = v[1]; }
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *p) { return (vfloat) { p[0], p[1], p[2], p[3] }; }
|
||||
static INLINE void vstore_v_p_vf(float *p, vfloat v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *p, int offset, int step, vfloat v) {
|
||||
*(p+(offset + step * 0)*2 + 0) = v[0];
|
||||
*(p+(offset + step * 0)*2 + 1) = v[1];
|
||||
*(p+(offset + step * 1)*2 + 0) = v[2];
|
||||
*(p+(offset + step * 1)*2 + 1) = v[3];
|
||||
}
|
||||
|
||||
static INLINE vfloat vloadu_vf_p(const float *p) { return (vfloat) { p[0], p[1], p[2], p[3] }; }
|
||||
static INLINE void vstoreu_v_p_vf(float *p, vfloat v) { p[0] = v[0]; p[1] = v[1]; p[2] = v[2]; p[3] = v[3]; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *p, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&p[2*offset]), v); }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *p, vint vi) {
|
||||
return ((vdouble) { p[vi[0]], p[vi[1]] });
|
||||
}
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *p, vint2 vi2) {
|
||||
return ((vfloat) { p[vi2[0]], p[vi2[1]], p[vi2[2]], p[vi2[3]] });
|
||||
}
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return (vopmask) { i ? (long long)-1 : 0, i ? (long long)-1 : 0 }; }
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return (vdouble) { vi[0], vi[1] }; }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return (vfloat) { vi[0], vi[1], vi[2], vi[3] }; }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 5); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return __builtin_s390_vfidb(vd, 4, 4); }
|
||||
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) {
|
||||
vd = vrint_vd_vd(vd);
|
||||
return (vint) { vd[0], vd[1] };
|
||||
}
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return (vint) { vd[0], vd[1] }; }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return (vint) { vf[0], vf[1], vf[2], vf[3] }; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1 / x; }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return 1 / x; }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return vec_sel(y, x, o); }
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return vec_sel(y, x, (__vector unsigned int)o); }
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return vec_sel(y, x, (__vector unsigned int)o); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return vec_all_ne((vint2)g, (vint2 ) { 0, 0, 0, 0 }); }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return vec_all_ne((__vector unsigned long long)g, (__vector unsigned long long) { 0, 0 }); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask g) { return (vopmask)(vint) { g[0] != 0 ? -1 : 0, g[1] != 0 ? -1 : 0, 0, 0 }; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask g) { return (vopmask) { ((vint)g)[0] != 0 ? 0xffffffffffffffffLL : 0, ((vint)g)[1] != 0 ? 0xffffffffffffffffLL : 0 }; }
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask)(vint){ h, l, h, l }; }
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) { return (vmask)(vint64){ i, i }; }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return (vmask)(vuint64){ i, i }; }
|
||||
|
||||
static INLINE vmask vcastu_vm_vi(vint vi) { return (vmask)(vint2){ vi[0], 0, vi[1], 0 }; }
|
||||
static INLINE vint vcastu_vi_vm(vmask vi2) { return (vint){ vi2[0] >> 32, vi2[1] >> 32 }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
|
||||
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
|
||||
return (vopmask) { x[0] == y[0] ? 0xffffffffffffffffLL : 0, x[1] == y[1] ? 0xffffffffffffffffLL : 0 };
|
||||
}
|
||||
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
|
||||
return (vmask)((__vector long long)x + (__vector long long)y);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { return vec_abs(d); }
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 150
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
|
||||
#else
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
|
||||
static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_madd(x, y, z); }
|
||||
static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_msub(x, y, z); }
|
||||
static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmsub(x, y, z); }
|
||||
static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vec_nmadd(x, y, z); }
|
||||
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
|
||||
|
||||
#if CONFIG == 140 || CONFIG == 150
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmssb(x, y, z); }
|
||||
static INLINE vfloat vfma_vf_vf_vf_vf (vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
|
||||
static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmasb(x, y, z); }
|
||||
static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return __builtin_s390_vfmssb(x, y, z); }
|
||||
static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmsub(x, y, z); }
|
||||
static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vec_nmadd(x, y, z); }
|
||||
#else
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vopmask vnot_vo_vo(vopmask o) { return ~o; }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpeq(x, y); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vnot_vo_vo(vec_cmpeq(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmplt(x, y); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmple(x, y); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)vec_cmpge(x, y); }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
|
||||
static INLINE vint vneg_vi_vi(vint e) { return -e; }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vec_andc(y, vreinterpretFirstHalf_vi_vi2((vint2)x)); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return (vint)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> (__vector int){c, c, c, c}; }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return vec_cmpeq(x, y); }
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return vec_cmpgt(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(vec_cmpeq(x, y)); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(vec_cmpgt(x, y));}
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
|
||||
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
|
||||
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(vabs_vd_vd(d), vcast_vd_d(SLEEF_INFINITY))); }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(d, vcast_vd_d(SLEEF_INFINITY))); }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(vec_cmpeq(d, vcast_vd_d(-SLEEF_INFINITY))); }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(vnot_vo_vo(vec_cmpeq(d, d))); }
|
||||
|
||||
static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
|
||||
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *p, vdouble v) { vstore_v_p_vd(p, v); }
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *p, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(p, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) << (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (vint2)(((__vector unsigned int)x) >> (__vector unsigned int){c, c, c, c}); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> (__vector int){c, c, c, c}; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpeq(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)vec_cmpgt(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_cmpeq(x, y); }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return vec_cmpgt(x, y); }
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *p, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(p, offset, step, v); }
|
||||
static INLINE void vstream_v_p_vf(float *p, vfloat v) { vstore_v_p_vf(p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble d) { return vec_sqrt(d); }
|
||||
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vec_max(x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vec_min(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpeq(x, y); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vnot_vo_vo(vec_cmpeq(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmplt(x, y); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmple(x, y); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpgt(x, y); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)vec_cmpge(x, y); }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vec_abs(f); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vf) { return __builtin_s390_vfisb(vf, 4, 4); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return __builtin_s390_vfisb(vf, 4, 5); }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vec_max(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vec_min(x, y); }
|
||||
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat d) { return vec_sqrt(d); }
|
||||
|
||||
static INLINE vopmask visinf_vo_vf (vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf (vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) {
|
||||
vf = vrint_vf_vf(vf);
|
||||
return (vint) { vf[0], vf[1], vf[2], vf[3] };
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad m = { aq.y, aq.x };
|
||||
return m;
|
||||
}
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad a = { vq.y, vq.x };
|
||||
return a;
|
||||
}
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) {
|
||||
return vec_all_eq((__vector signed long long)g, (__vector signed long long){ 0, 0 });
|
||||
}
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
|
||||
return (vmask)vec_sel((__vector signed long long)y, (__vector signed long long)x, (__vector __bool long long)o);
|
||||
}
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) {
|
||||
return (vmask)((__vector signed long long)x - (__vector signed long long)y);
|
||||
}
|
||||
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) {
|
||||
return (vmask)((__vector signed long long) {0, 0} - (__vector signed long long)x);
|
||||
}
|
||||
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
|
||||
return (vopmask)vec_cmpgt((__vector signed long long)x, (__vector signed long long)y);
|
||||
}
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x << (__vector unsigned long long) { c, c }))
|
||||
#define vsrl64_vm_vm_i(x, c) ((vmask)((__vector unsigned long long)x >> (__vector unsigned long long) { c, c }))
|
||||
|
||||
static INLINE vint vcast_vi_vm(vmask vm) {
|
||||
return (vint) { vm[0], vm[1] };
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
return (vmask) (__vector signed long long) { vi[0], vi[1] };
|
||||
}
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return (vmask)v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return (vint64)m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return (vmask)v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return (vuint64)m; }
|
||||
@@ -0,0 +1,517 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if CONFIG == 2
|
||||
|
||||
#if !defined(__SSE2__) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -msse2.
|
||||
#endif
|
||||
|
||||
#elif CONFIG == 3
|
||||
|
||||
#if (!defined(__SSE2__) || !defined(__SSE3__)) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -msse2 and -msse3
|
||||
#endif
|
||||
|
||||
#elif CONFIG == 4
|
||||
|
||||
#if (!defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__)) && !defined(SLEEF_GENHEADER)
|
||||
#error Please specify -msse2, -msse3 and -msse4.1
|
||||
#endif
|
||||
|
||||
#else
|
||||
#error CONFIG macro invalid or not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
//@#define ENABLE_DP
|
||||
#define LOG2VECTLENDP 1
|
||||
//@#define LOG2VECTLENDP 1
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
//@#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
|
||||
#define ENABLE_SP
|
||||
//@#define ENABLE_SP
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
//@#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
//@#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#define ACCURATE_SQRT
|
||||
//@#define ACCURATE_SQRT
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
#include "misc.h"
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
typedef __m128i vmask;
|
||||
typedef __m128i vopmask;
|
||||
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128i vint;
|
||||
|
||||
typedef __m128 vfloat;
|
||||
typedef __m128i vint2;
|
||||
|
||||
typedef __m128i vint64;
|
||||
typedef __m128i vuint64;
|
||||
|
||||
typedef struct {
|
||||
vmask x, y;
|
||||
} vquad;
|
||||
|
||||
typedef vquad vargquad;
|
||||
|
||||
//
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef __SLEEF_H__
|
||||
void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx);
|
||||
#endif
|
||||
|
||||
static INLINE int cpuSupportsSSE2() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 1, 0);
|
||||
return (reg[3] & (1 << 26)) != 0;
|
||||
}
|
||||
|
||||
static INLINE int cpuSupportsSSE3() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 0)) != 0;
|
||||
}
|
||||
|
||||
static INLINE int cpuSupportsSSE4_1() {
|
||||
int32_t reg[4];
|
||||
Sleef_x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 19)) != 0;
|
||||
}
|
||||
|
||||
#if defined(__SSE2__) && defined(__SSE3__) && defined(__SSE4_1__)
|
||||
static INLINE int vavailability_i(int name) {
|
||||
//int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3") && __builtin_cpu_supports("sse4.1");
|
||||
int d = cpuSupportsSSE2() && cpuSupportsSSE3() && cpuSupportsSSE4_1();
|
||||
return d ? 3 : 0;
|
||||
}
|
||||
#define ISANAME "SSE4.1"
|
||||
#define DFTPRIORITY 12
|
||||
#elif defined(__SSE2__) && defined(__SSE3__)
|
||||
static INLINE int vavailability_i(int name) {
|
||||
//int d = __builtin_cpu_supports("sse2") && __builtin_cpu_supports("sse3");
|
||||
int d = cpuSupportsSSE2() && cpuSupportsSSE3();
|
||||
return d ? 3 : 0;
|
||||
}
|
||||
#define ISANAME "SSE3"
|
||||
#define DFTPRIORITY 11
|
||||
#else
|
||||
static INLINE int vavailability_i(int name) {
|
||||
int d = cpuSupportsSSE2();
|
||||
return d ? 3 : 0;
|
||||
}
|
||||
#define ISANAME "SSE2"
|
||||
#define DFTPRIORITY 10
|
||||
#endif
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
|
||||
|
||||
static INLINE vopmask vcast_vo_i(int i) { return _mm_set1_epi64x(i ? -1 : 0); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
|
||||
static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }
|
||||
static INLINE vint2 vcastu_vm_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
|
||||
static INLINE vint vcastu_vi_vm(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); }
|
||||
|
||||
#if CONFIG == 4
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
|
||||
#define FULL_FP_ROUNDING
|
||||
//@#define FULL_FP_ROUNDING
|
||||
#else
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
|
||||
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
|
||||
vmask t = _mm_cmpeq_epi32(x, y);
|
||||
return vand_vm_vm_vm(t, _mm_shuffle_epi32(t, 0xb1));
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
|
||||
|
||||
static INLINE vmask vcast_vm_i64(int64_t i) { return _mm_set1_epi64x(i); }
|
||||
static INLINE vmask vcast_vm_u64(uint64_t i) { return _mm_set1_epi64x((uint64_t)i); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }
|
||||
static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(z, vmul_vd_vd_vd(x, y)); }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpeq_pd(x, y)); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpneq_pd(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmplt_pd(x, y)); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmple_pd(x, y)); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpgt_pd(x, y)); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpge_pd(x, y)); }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
|
||||
static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
|
||||
|
||||
#if CONFIG == 4
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask m, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(m)); }
|
||||
#else
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); }
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask opmask, vdouble x, vdouble y) {
|
||||
return _mm_or_pd(_mm_and_pd(_mm_castsi128_pd(opmask), x), _mm_andnot_pd(_mm_castsi128_pd(opmask), y));
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set1_pd(SLEEF_INFINITY)));
|
||||
}
|
||||
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(SLEEF_INFINITY)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(-SLEEF_INFINITY)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) {
|
||||
return vreinterpret_vm_vd(_mm_cmpneq_pd(d, d));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
|
||||
int a[sizeof(vint)/sizeof(int)];
|
||||
vstoreu_v_p_vi(a, vi);
|
||||
return _mm_set_pd(ptr[a[1]], ptr[a[0]]);
|
||||
}
|
||||
|
||||
// This function is for debugging
|
||||
static INLINE double vcast_d_vd(vdouble v) {
|
||||
double a[VECTLENDP];
|
||||
vstoreu_v_p_vd(a, v);
|
||||
return a[0];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
|
||||
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); }
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return _mm_castsi128_ps(vm); }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm_castps_si128(vf); }
|
||||
|
||||
#if CONFIG != 4
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vf) { return vcast_vf_vi2(vrint_vi2_vf(vf)); }
|
||||
#endif
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
|
||||
static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpeq_ps(x, y)); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpneq_ps(x, y)); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmplt_ps(x, y)); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmple_ps(x, y)); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpgt_ps(x, y)); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpge_ps(x, y)); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi_vo_vi(x, y); }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi_vo_vi(x, y); }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); }
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); }
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
|
||||
|
||||
#if CONFIG == 4
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm_blendv_epi8(y, x, m); }
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(m)); }
|
||||
#else
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
|
||||
return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m, x), vandnot_vi2_vi2_vi2(m, y));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask opmask, vfloat x, vfloat y) {
|
||||
return _mm_or_ps(_mm_and_ps(_mm_castsi128_ps(opmask), x), _mm_andnot_ps(_mm_castsi128_ps(opmask), y));
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-SLEEF_INFINITYf)); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi) {
|
||||
int a[VECTLENSP];
|
||||
vstoreu_v_p_vi2(a, vi);
|
||||
return _mm_set_ps(ptr[a[3]], ptr[a[2]], ptr[a[1]], ptr[a[0]]);
|
||||
}
|
||||
|
||||
// This function is for debugging
|
||||
static INLINE float vcast_f_vf(vfloat v) {
|
||||
float a[VECTLENSP];
|
||||
vstoreu_v_p_vf(a, v);
|
||||
return a[0];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0 })
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
|
||||
|
||||
#if CONFIG >= 3
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
|
||||
#else
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
|
||||
#endif
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
|
||||
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
|
||||
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
_mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
|
||||
_mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static vquad loadu_vq_p(void *p) {
|
||||
vquad vq;
|
||||
memcpy(&vq, p, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vquad cast_vq_aq(vargquad aq) {
|
||||
vquad vq;
|
||||
memcpy(&vq, &aq, VECTLENDP * 16);
|
||||
return vq;
|
||||
}
|
||||
|
||||
static INLINE vargquad cast_aq_vq(vquad vq) {
|
||||
vargquad aq;
|
||||
memcpy(&aq, &vq, VECTLENDP * 16);
|
||||
return aq;
|
||||
}
|
||||
|
||||
static INLINE int vtestallzeros_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0; }
|
||||
|
||||
static INLINE vmask vsel_vm_vo64_vm_vm(vopmask o, vmask x, vmask y) {
|
||||
return vor_vm_vm_vm(vand_vm_vm_vm(o, x), vandnot_vm_vm_vm(o, y));
|
||||
}
|
||||
|
||||
static INLINE vmask vsub64_vm_vm_vm(vmask x, vmask y) { return _mm_sub_epi64(x, y); }
|
||||
static INLINE vmask vneg64_vm_vm(vmask x) { return _mm_sub_epi64(vcast_vm_i_i(0, 0), x); }
|
||||
|
||||
#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
|
||||
#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
|
||||
//@#define vsll64_vm_vm_i(x, c) _mm_slli_epi64(x, c)
|
||||
//@#define vsrl64_vm_vm_i(x, c) _mm_srli_epi64(x, c)
|
||||
|
||||
static INLINE vopmask vgt64_vo_vm_vm(vmask x, vmask y) {
|
||||
int64_t ax[2], ay[2];
|
||||
_mm_storeu_si128((__m128i *)ax, x);
|
||||
_mm_storeu_si128((__m128i *)ay, y);
|
||||
return _mm_set_epi64x(ax[1] > ay[1] ? -1 : 0, ax[0] > ay[0] ? -1 : 0);
|
||||
}
|
||||
|
||||
static INLINE vmask vcast_vm_vi(vint vi) {
|
||||
vmask m = _mm_and_si128(_mm_shuffle_epi32(vi, (0 << 6) | (1 << 4) | (0 << 2) | (0 << 0)), _mm_set_epi32(0, -1, 0, -1));
|
||||
return vor_vm_vm_vm(vcastu_vm_vi(vgt_vo_vi_vi(vcast_vi_i(0), vi)), m);
|
||||
}
|
||||
static INLINE vint vcast_vi_vm(vmask vm) { return _mm_shuffle_epi32(vm, 0x08); }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vi64(vint64 v) { return v; }
|
||||
static INLINE vint64 vreinterpret_vi64_vm(vmask m) { return m; }
|
||||
static INLINE vmask vreinterpret_vm_vu64(vuint64 v) { return v; }
|
||||
static INLINE vuint64 vreinterpret_vu64_vm(vmask m) { return m; }
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,871 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdint.h>
|
||||
#include "misc.h"
|
||||
|
||||
#ifndef CONFIG
|
||||
#error CONFIG macro not defined
|
||||
#endif
|
||||
|
||||
#define ENABLE_DP
|
||||
#define ENABLE_SP
|
||||
|
||||
#define LOG2VECTLENDP CONFIG
|
||||
#define VECTLENDP (1 << LOG2VECTLENDP)
|
||||
#define LOG2VECTLENSP (LOG2VECTLENDP+1)
|
||||
#define VECTLENSP (1 << LOG2VECTLENSP)
|
||||
|
||||
#define DFTPRIORITY LOG2VECTLENDP
|
||||
|
||||
#if defined(__clang__)
|
||||
#define ISANAME "Clang Vector Extension"
|
||||
|
||||
typedef uint32_t vmask __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
typedef uint32_t vopmask __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
|
||||
typedef double vdouble __attribute__((ext_vector_type(VECTLENDP)));
|
||||
typedef int32_t vint __attribute__((ext_vector_type(VECTLENDP)));
|
||||
|
||||
typedef float vfloat __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
typedef int32_t vint2 __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
typedef uint8_t vmaskl __attribute__((ext_vector_type(sizeof(long double)*VECTLENDP)));
|
||||
typedef long double vlongdouble __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#endif
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
typedef uint8_t vmaskq __attribute__((ext_vector_type(sizeof(Sleef_quad)*VECTLENDP)));
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
typedef Sleef_quad vquad __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#endif
|
||||
#endif
|
||||
#elif defined(__GNUC__)
|
||||
#define ISANAME "GCC Vector Extension"
|
||||
|
||||
typedef uint32_t vmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
|
||||
typedef uint32_t vopmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
|
||||
|
||||
typedef double vdouble __attribute__((vector_size(sizeof(double)*VECTLENDP)));
|
||||
typedef int32_t vint __attribute__((vector_size(sizeof(int32_t)*VECTLENDP)));
|
||||
|
||||
typedef float vfloat __attribute__((vector_size(sizeof(float)*VECTLENDP*2)));
|
||||
typedef int32_t vint2 __attribute__((vector_size(sizeof(int32_t)*VECTLENDP*2)));
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
typedef uint8_t vmaskl __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
|
||||
typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
|
||||
#endif
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
typedef uint8_t vmaskq __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
|
||||
typedef Sleef_quad vquad __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#if VECTLENDP == 2
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], 0, 0 }; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1] }; }
|
||||
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
|
||||
#endif
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
static INLINE vquad vcast_vq_q(Sleef_quad d) { return (vquad) { d, d }; }
|
||||
#endif
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h }; }
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1] }; }
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3] }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0] }; }
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return vd; }
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1] }; }
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1] }; }
|
||||
#endif
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
|
||||
static INLINE vquad vrev21_vq_vq(vquad vd) { return (vquad) { vd[1], vd[0] }; }
|
||||
static INLINE vquad vreva2_vq_vq(vquad vd) { return vd; }
|
||||
static INLINE vquad vposneg_vq_vq(vquad vd) { return (vquad) { +vd[0], -vd[1] }; }
|
||||
static INLINE vquad vnegpos_vq_vq(vquad vd) { return (vquad) { -vd[0], +vd[1] }; }
|
||||
#endif
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0 })
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
|
||||
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
|
||||
#elif VECTLENDP == 4
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], 0, 0, 0, 0 }; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3] }; }
|
||||
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d }; }
|
||||
#endif
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h }; }
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3] }; }
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7] }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], 0, 0, 0, 0 }; }
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
|
||||
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2] }; }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[2], vd[3], vd[0], vd[1] }; }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2] }; }
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[2], vd[3], vd[0], vd[1] }; }
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3] }; }
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3] }; }
|
||||
#endif
|
||||
#elif VECTLENDP == 8
|
||||
static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], m[9], m[11], m[13], m[15], 0, 0, 0, 0, 0, 0, 0, 0 }; }
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3], m[4], m[4], m[5], m[5], m[6], m[6], m[7], m[7] }; }
|
||||
|
||||
static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i, i, i, i, i }; }
|
||||
static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i }; }
|
||||
static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, f }; }
|
||||
static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d, d, d, d, d }; }
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d, d, d, d, d }; }
|
||||
#endif
|
||||
|
||||
static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h, l, h, l, h, l, h, l, h }; }
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3], 0, vi[4], 0, vi[5], 0, vi[6], 0, vi[7] }; }
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7], vi2[9], vi2[11], vi2[13], vi2[15] }; }
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3], vi2[4], vi2[5], vi2[6], vi2[7] }; }
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], vi[4], vi[5], vi[6], vi[7], 0, 0, 0, 0, 0, 0, 0, 0 }; }
|
||||
|
||||
#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0 })
|
||||
#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0 })
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
|
||||
|
||||
#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
|
||||
#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
|
||||
static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat vd) {
|
||||
return (vfloat) {
|
||||
vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6],
|
||||
vd[9], vd[8], vd[11], vd[10], vd[13], vd[12], vd[15], vd[14] };
|
||||
}
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat vd) {
|
||||
return (vfloat) {
|
||||
vd[14], vd[15], vd[12], vd[13], vd[10], vd[11], vd[8], vd[9],
|
||||
vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1]};
|
||||
}
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3], +vd[4], -vd[5], +vd[6], -vd[7] }; }
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3], -vd[4], +vd[5], -vd[6], +vd[7] }; }
|
||||
#endif
|
||||
#else
|
||||
static INLINE vint vcast_vi_i(int k) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = k;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint2 vcast_vi2_i(int k) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENSP;i++) ret[i] = k;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_d(double d) {
|
||||
vdouble ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = d;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vfloat vcast_vf_f(float f) {
|
||||
vfloat ret;
|
||||
for(int i=0;i<VECTLENSP;i++) ret[i] = f;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vcast_vl_l(long double d) {
|
||||
vlongdouble ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = d;
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Narrow a 64-bit-lane mask to a 32-bit-lane mask: the upper 32-bit half of
// each 64-bit lane is packed into the low VECTLENDP slots; the remaining
// slots are cleared.
static INLINE vopmask vcast_vo32_vo64(vopmask m) {
  vopmask result;
  int slot = 0;
  while (slot < VECTLENDP) {
    result[slot] = m[2 * slot + 1];
    slot++;
  }
  while (slot < 2 * VECTLENDP) {
    result[slot] = 0;
    slot++;
  }
  return result;
}
|
||||
|
||||
static INLINE vopmask vcast_vo64_vo32(vopmask m) {
|
||||
vopmask ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i*2] = ret[i*2+1] = m[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Broadcast a 64-bit pattern across the mask: every even 32-bit slot
// receives the low word l, every odd slot the high word h.
static INLINE vmask vcast_vm_i_i(int h, int l) {
  vmask m;
  for (int slot = 0; slot < 2 * VECTLENDP; slot++) {
    m[slot] = (slot & 1) ? h : l;
  }
  return m;
}
|
||||
|
||||
static INLINE vint2 vcastu_vi2_vi(vint vi) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENDP;i++) {
|
||||
ret[i*2+0] = 0;
|
||||
ret[i*2+1] = vi[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vcastu_vi_vi2(vint2 vi2) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i*2+1];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
|
||||
vint ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i];
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
|
||||
vint2 ret;
|
||||
for(int i=0;i<VECTLENDP;i++) ret[i] = vi[i];
|
||||
for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret[i] = 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static INLINE vdouble vrev21_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = d0[i*2+1];
|
||||
r[i*2+1] = d0[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// Reverse the order of the 2-element pairs of the vector while keeping each
// pair's internal element order intact.
static INLINE vdouble vreva2_vd_vd(vdouble d0) {
  vdouble out;
  const int npairs = VECTLENDP / 2;
  for (int p = 0; p < npairs; p++) {
    const int src = npairs - 1 - p;
    out[2 * p + 0] = d0[2 * src + 0];
    out[2 * p + 1] = d0[2 * src + 1];
  }
  return out;
}
|
||||
|
||||
static INLINE vfloat vrev21_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r[i*2+0] = d0[i*2+1];
|
||||
r[i*2+1] = d0[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vreva2_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r[i*2+0] = d0[(VECTLENSP/2-1-i)*2+0];
|
||||
r[i*2+1] = d0[(VECTLENSP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = d0[i*2+1];
|
||||
r[i*2+1] = d0[i*2+0];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = d0[(VECTLENDP/2-1-i)*2+0];
|
||||
r[i*2+1] = d0[(VECTLENDP/2-1-i)*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
|
||||
static INLINE vdouble vposneg_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = +d0[i*2+0];
|
||||
r[i*2+1] = -d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vdouble vnegpos_vd_vd(vdouble d0) {
|
||||
vdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = -d0[i*2+0];
|
||||
r[i*2+1] = +d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vposneg_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r[i*2+0] = +d0[i*2+0];
|
||||
r[i*2+1] = -d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vfloat vnegpos_vf_vf(vfloat d0) {
|
||||
vfloat r;
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
r[i*2+0] = -d0[i*2+0];
|
||||
r[i*2+1] = +d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vposneg_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = +d0[i*2+0];
|
||||
r[i*2+1] = -d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
static INLINE vlongdouble vnegpos_vl_vl(vlongdouble d0) {
|
||||
vlongdouble r;
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
r[i*2+0] = -d0[i*2+0];
|
||||
r[i*2+1] = +d0[i*2+1];
|
||||
}
|
||||
return r;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
static INLINE int vavailability_i(int name) { return -1; }
|
||||
static INLINE void vprefetch_v_p(const void *ptr) { }
|
||||
|
||||
static INLINE int vtestallones_i_vo64(vopmask g) {
|
||||
int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
|
||||
}
|
||||
|
||||
static INLINE int vtestallones_i_vo32(vopmask g) {
|
||||
int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static vint2 vloadu_vi2_p(int32_t *p) {
|
||||
vint2 vi;
|
||||
for(int i=0;i<VECTLENSP;i++) vi[i] = p[i];
|
||||
return vi;
|
||||
}
|
||||
|
||||
static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
|
||||
for(int i=0;i<VECTLENSP;i++) p[i] = v[i];
|
||||
}
|
||||
|
||||
static vint vloadu_vi_p(int32_t *p) {
|
||||
vint vi;
|
||||
for(int i=0;i<VECTLENDP;i++) vi[i] = p[i];
|
||||
return vi;
|
||||
}
|
||||
|
||||
static void vstoreu_v_p_vi(int32_t *p, vint v) {
|
||||
for(int i=0;i<VECTLENDP;i++) p[i] = v[i];
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
|
||||
static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
|
||||
static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
|
||||
static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
|
||||
|
||||
static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
|
||||
static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
|
||||
static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
|
||||
static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return (vdouble)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
|
||||
static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return (vint2)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
|
||||
|
||||
static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
|
||||
return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
|
||||
return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vdouble vcast_vd_vi(vint vi) {
|
||||
#if defined(__clang__)
|
||||
return __builtin_convertvector(vi, vdouble);
|
||||
#else
|
||||
vdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd[i] = vi[i];
|
||||
return vd;
|
||||
#endif
|
||||
}
|
||||
static INLINE vint vtruncate_vi_vd(vdouble vd) {
|
||||
#if defined(__clang__)
|
||||
return __builtin_convertvector(vd, vint);
|
||||
#else
|
||||
vint vi;
|
||||
for(int i=0;i<VECTLENDP;i++) vi[i] = vd[i];
|
||||
return vi;
|
||||
#endif
|
||||
}
|
||||
static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vsel_vd_vo_vd_vd((vopmask)(vd < 0.0), vd - 0.5, vd + 0.5)); }
|
||||
static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
|
||||
static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
|
||||
|
||||
// Lane-wise 64-bit equality of two masks. The operands are reinterpreted as
// vectors of int64_t (VECTLENDP lanes) before comparing, so each comparison
// result covers a full 64-bit lane (all-ones on equal, all-zeros otherwise).
// NOTE(review): this presumes vmask has narrower (32-bit) elements, hence the
// reinterpret — confirm against the vmask typedef earlier in this header.
static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
#if defined(__clang__)
  // Clang spells fixed-width vectors with ext_vector_type ...
  typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
  // ... while GCC uses vector_size in bytes.
  typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif
  return (vopmask)((vi64)x == (vi64)y);
}
|
||||
|
||||
static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
|
||||
#if defined(__clang__)
|
||||
typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#else
|
||||
typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
|
||||
#endif
|
||||
return (vmask)((vi64)x + (vi64)y);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
|
||||
static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return (vint2)vd; }
|
||||
static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return (vdouble)vi; }
|
||||
static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
|
||||
|
||||
static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
|
||||
static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
|
||||
static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
|
||||
static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
|
||||
static INLINE vdouble vrec_vd_vd(vdouble x) { return 1.0 / x; }
|
||||
|
||||
static INLINE vdouble vabs_vd_vd(vdouble d) { return (vdouble)((vmask)d & ~(vmask)vcast_vd_d(-0.0)); }
|
||||
static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
|
||||
static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y + z; }
|
||||
static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
|
||||
static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x > y), x, y); }
|
||||
static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x < y), x, y); }
|
||||
|
||||
static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
|
||||
static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x == y); }
|
||||
static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x != y); }
|
||||
static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x < y); }
|
||||
static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x <= y); }
|
||||
static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x > y); }
|
||||
static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x >= y); }
|
||||
|
||||
static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
|
||||
static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
|
||||
static INLINE vint vneg_vi_vi(vint e) { return -e; }
|
||||
|
||||
static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
|
||||
static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
|
||||
static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
|
||||
static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
|
||||
|
||||
static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
|
||||
static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~vreinterpretFirstHalf_vi_vi2((vint2)x); }
|
||||
|
||||
static INLINE vint vsll_vi_vi_i(vint x, int c) {
|
||||
#if defined(__clang__)
|
||||
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#else
|
||||
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
|
||||
#endif
|
||||
return (vint)(((vu)x) << c);
|
||||
}
|
||||
|
||||
static INLINE vint vsrl_vi_vi_i(vint x, int c) {
|
||||
#if defined(__clang__)
|
||||
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
|
||||
#else
|
||||
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
|
||||
#endif
|
||||
return (vint)(((vu)x) >> c);
|
||||
}
|
||||
|
||||
static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
|
||||
|
||||
static INLINE vint veq_vi_vi_vi(vint x, vint y) { return x == y; }
|
||||
static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return x > y; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x == y); }
|
||||
static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x > y);}
|
||||
|
||||
static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
|
||||
return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
|
||||
vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
|
||||
}
|
||||
|
||||
static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vabs_vd_vd(d) == SLEEF_INFINITY); }
|
||||
static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(d == SLEEF_INFINITY); }
|
||||
static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(d == -SLEEF_INFINITY); }
|
||||
static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(d != d); }
|
||||
|
||||
// Vector square root in portable C (no hardware sqrt): classic integer
// bit-hack reciprocal-square-root estimate followed by Newton-style
// refinement, with pre-scaling so tiny (denormal-range) inputs stay accurate.
static INLINE vdouble vsqrt_vd_vd(vdouble d) {
#if defined(__clang__)
  typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
#else
  typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
#endif

  vdouble q = vcast_vd_d(1);

  // Inputs below ~8.64e-78 are scaled up by ~1.16e77 so the estimate below
  // works in a safe range; q records the compensating factor (~2.94e-39,
  // i.e. roughly sqrt of the scale) applied to the result at the end.
  vopmask o = (vopmask)(d < 8.636168555094445E-78);
  d = (vdouble)((o & (vmask)(d * 1.157920892373162E77)) | (~o & (vmask)d));

  q = (vdouble)((o & (vmask)vcast_vd_d(2.9387358770557188E-39)) | (~o & (vmask)vcast_vd_d(1)));

  // Negative inputs: OR all comparison-mask bits into q so the final product
  // becomes NaN.
  q = (vdouble)vor_vm_vm_vm(vlt_vo_vd_vd(d, vcast_vd_d(0)), (vmask)q);

  // Initial 1/sqrt(d) estimate via the magic-constant bit trick; the tiny
  // +1e-320 bias keeps the exponent field nonzero for d == 0.
  vdouble x = (vdouble)(0x5fe6ec85e7de30daLL - ((vi64)(d + 1e-320) >> 1));
  // Three refinement steps with the halving constants folded into 3/12/768;
  // the combined scale is removed by the 1/2^13 factor below.
  x = x * ( 3 - d * x * x);
  x = x * ( 12 - d * x * x);
  x = x * (768 - d * x * x);
  x *= 1.0 / (1 << 13);
  // Final step: with y = d*x ~ sqrt(d), correct by (d - y^2) * x/2.
  x = (d - (d * x) * (d * x)) * (x * 0.5) + d * x;

  // Reapply the denormal compensation (or the NaN pattern for negatives).
  return x * q;
}
|
||||
|
||||
static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
|
||||
static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
|
||||
|
||||
static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
|
||||
static INLINE vdouble vloadu_vd_p(const double *ptr) {
|
||||
vdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE vdouble vgather_vd_p_vi(const double *ptr, vint vi) {
|
||||
vdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[vi[i]];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
|
||||
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return (vfloat)(((vmask)o & (vmask)x) | (~(vmask)o & (vmask)y)); }
|
||||
|
||||
static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
|
||||
return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
|
||||
}
|
||||
|
||||
static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
|
||||
return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
|
||||
}
|
||||
|
||||
static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
|
||||
static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
|
||||
|
||||
static INLINE vfloat vcast_vf_vi2(vint2 vi) {
|
||||
#if defined(__clang__)
|
||||
return __builtin_convertvector(vi, vfloat);
|
||||
#else
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENDP*2;i++) vf[i] = vi[i];
|
||||
return vf;
|
||||
#endif
|
||||
}
|
||||
|
||||
static INLINE vint2 vtruncate_vi2_vf(vfloat vf) {
|
||||
#if defined(__clang__)
|
||||
return __builtin_convertvector(vf, vint2);
|
||||
#else
|
||||
vint2 vi;
|
||||
for(int i=0;i<VECTLENDP*2;i++) vi[i] = vf[i];
|
||||
return vi;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Round-to-nearest (ties away from zero) float vector -> int vector:
// offset by +/-0.5 depending on sign, then truncate toward zero.
// Fix: use the float literal 0.5f on both branches — the original mixed
// 0.5f (negative branch) with the double literal 0.5 (positive branch),
// relying on implicit conversion of the scalar to the vector element type.
static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vsel_vf_vo_vf_vf((vopmask)(vf < 0), vf - 0.5f, vf + 0.5f)); }
|
||||
static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
|
||||
static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
|
||||
|
||||
static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
|
||||
static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
|
||||
static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
|
||||
static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
|
||||
|
||||
static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
|
||||
static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
|
||||
static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
|
||||
static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
|
||||
static INLINE vfloat vrec_vf_vf(vfloat x) { return 1.0f / x; }
|
||||
|
||||
static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
|
||||
static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
|
||||
static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x*y+z; }
|
||||
static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return z-x*y; }
|
||||
static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x > y), x, y); }
|
||||
static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x < y), x, y); }
|
||||
|
||||
static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
|
||||
static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
|
||||
|
||||
static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x == y); }
|
||||
static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x != y); }
|
||||
static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x < y); }
|
||||
static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x <= y); }
|
||||
static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x > y); }
|
||||
static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x >= y); }
|
||||
|
||||
static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
|
||||
static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
|
||||
static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return y & ~x; }
|
||||
static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
|
||||
static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
|
||||
|
||||
static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
|
||||
static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
|
||||
|
||||
static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
|
||||
#if defined(__clang__)
|
||||
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
#else
|
||||
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
|
||||
#endif
|
||||
return (vint2)(((vu)x) << c);
|
||||
}
|
||||
static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
|
||||
#if defined(__clang__)
|
||||
typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
|
||||
#else
|
||||
typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
|
||||
#endif
|
||||
return (vint2)(((vu)x) >> c);
|
||||
}
|
||||
static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> c; }
|
||||
|
||||
static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x == y); }
|
||||
static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x > y); }
|
||||
static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return x == y; }
|
||||
static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return x > y; }
|
||||
|
||||
static INLINE vopmask visinf_vo_vf(vfloat d) { return (vopmask)(vabs_vf_vf(d) == SLEEF_INFINITYf); }
|
||||
static INLINE vopmask vispinf_vo_vf(vfloat d) { return (vopmask)(d == SLEEF_INFINITYf); }
|
||||
static INLINE vopmask visminf_vo_vf(vfloat d) { return (vopmask)(d == -SLEEF_INFINITYf); }
|
||||
static INLINE vopmask visnan_vo_vf(vfloat d) { return (vopmask)(d != d); }
|
||||
|
||||
// Single-precision vector square root in portable C: bit-hack 1/sqrt
// estimate plus Newton-style refinement, with pre-scaling for tiny inputs.
// Fix: the final correction step used the double literal 0.5 in an
// otherwise all-float computation; changed to 0.5f for consistency with
// the rest of the function (avoids an implicit double-precision constant).
static INLINE vfloat vsqrt_vf_vf(vfloat d) {
  vfloat q = vcast_vf_f(1);

  // Inputs below 2^-64 are scaled up by 2^64; q records the compensating
  // factor 2^-32 multiplied back into the result at the end.
  vopmask o = (vopmask)(d < 5.4210108624275221700372640043497e-20f); // 2^-64
  d = (vfloat)((o & (vmask)(d * vcast_vf_f(18446744073709551616.0f))) | (~o & (vmask)d)); // 2^64
  q = (vfloat)((o & (vmask)vcast_vf_f(0.00000000023283064365386962890625f)) | (~o & (vmask)vcast_vf_f(1))); // 2^-32
  // Negative inputs: OR all mask bits into q so the final product is NaN.
  q = (vfloat)vor_vm_vm_vm(vlt_vo_vf_vf(d, vcast_vf_f(0)), (vmask)q);

  // Initial 1/sqrt(d) estimate via the magic-constant bit trick.
  vfloat x = (vfloat)(0x5f330de2 - (((vint2)d) >> 1));
  // Two refinement steps with the halving constants folded into 3/12;
  // the combined scale is removed by the 1/16 factor below.
  x = x * ( 3.0f - d * x * x);
  x = x * (12.0f - d * x * x);
  x *= 0.0625f;
  // Final step: with y = d*x ~ sqrt(d), correct by (d - y^2) * x/2.
  x = (d - (d * x) * (d * x)) * (x * 0.5f) + d * x;

  // Reapply the denormal compensation (or the NaN pattern for negatives).
  return x * q;
}
|
||||
|
||||
static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
|
||||
static INLINE vfloat vloadu_vf_p(const float *ptr) {
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[i];
|
||||
return vf;
|
||||
}
|
||||
|
||||
static INLINE vfloat vgather_vf_p_vi2(const float *ptr, vint2 vi2) {
|
||||
vfloat vf;
|
||||
for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[vi2[i]];
|
||||
return vf;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
|
||||
for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
|
||||
for(int i=0;i<VECTLENSP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
|
||||
|
||||
//
|
||||
|
||||
#ifdef ENABLE_LONGDOUBLE
|
||||
static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return x + y; }
|
||||
static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { return x - y; }
|
||||
static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { return x * y; }
|
||||
|
||||
static INLINE vlongdouble vneg_vl_vl(vlongdouble d) { return -d; }
|
||||
static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return vadd_vl_vl_vl(x, vnegpos_vl_vl(y)); }
|
||||
static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
|
||||
|
||||
static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
|
||||
static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
|
||||
vlongdouble vd;
|
||||
for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
|
||||
return vd;
|
||||
}
|
||||
|
||||
static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
|
||||
static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
|
||||
for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
|
||||
}
|
||||
static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
|
||||
|
||||
static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
|
||||
for(int i=0;i<VECTLENDP/2;i++) {
|
||||
*(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
|
||||
*(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
|
||||
}
|
||||
}
|
||||
|
||||
static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
|
||||
#endif
|
||||
|
||||
#if defined(Sleef_quad2_DEFINED) && defined(ENABLEFLOAT128)
// Elementwise float128 vector arithmetic (GCC vector-extension operators).
static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { return x + y; }
static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { return x - y; }
static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { return x * y; }

static INLINE vquad vneg_vq_vq(vquad d) { return -d; }
// x + (y with alternating signs flipped), and the fused mul-subadd built on it.
static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { return vadd_vq_vq_vq(x, vnegpos_vq_vq(y)); }
static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }

// Aligned load, and an element-by-element unaligned load.
static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
  vquad result;
  for(int lane = 0; lane < VECTLENDP; lane++) result[lane] = ptr[lane];
  return result;
}

// Aligned store, element-by-element unaligned store, and the "streaming"
// store (no non-temporal form for float128, so it is a plain store).
static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
  for(int lane = 0; lane < VECTLENDP; lane++) ptr[lane] = v[lane];
}
static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }

// Scatter the vector as VECTLENDP/2 adjacent pairs, strided by `step`.
static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
  for(int pair = 0; pair < VECTLENDP/2; pair++) {
    Sleef_quad *dst = ptr + (offset + step * pair) * 2;
    dst[0] = v[pair*2 + 0];
    dst[1] = v[pair*2 + 1];
  }
}

// Streaming scatter; forwards to the ordinary scatter.
static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
#endif
|
||||
@@ -0,0 +1,25 @@
|
||||
# Compiler properties

# Properties applied to every target in this directory.
set(COMMON_TARGET_PROPERTIES
  C_STANDARD 99 # -std=gnu99
)

# Object libraries only need PIC when they end up inside a shared library.
if (BUILD_SHARED_LIBS)
  list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
endif()

# This is a workaround of appveyor bug
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SLEEF_C_FLAGS}")

# Target TARGET_LIBCOMMON_OBJ
# Object library with the shared runtime helpers (common.c).

add_library(${TARGET_LIBCOMMON_OBJ} OBJECT common.c)
set_target_properties(${TARGET_LIBCOMMON_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})

# Target TARGET_LIBARRAYMAP_OBJ
# Object library with the uint64->pointer map used by the dispatcher.

add_library(${TARGET_LIBARRAYMAP_OBJ} OBJECT arraymap.c)
set_target_properties(${TARGET_LIBARRAYMAP_OBJ} PROPERTIES ${COMMON_TARGET_PROPERTIES})

# addSuffix is a build-time tool, so it is compiled for the host even when
# cross-compiling.
add_host_executable("addSuffix" addSuffix.c)
set_target_properties("addSuffix" PROPERTIES C_STANDARD 99)
||||
@@ -0,0 +1,234 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define N 1000
|
||||
|
||||
// fopen() wrapper tolerant of Cygwin-style paths when built with MinGW:
// if the path cannot be opened directly, it is converted with `cygpath -m`
// and reopened. On every other platform it is plain fopen().
// Returns NULL on failure, like fopen().
FILE *cygopen(const char *path, const char *mode) {
#if defined(__MINGW64__) || defined(__MINGW32__)
  FILE *fp = fopen(path, mode);
  if (fp != NULL) return fp;

  // Buffer holds first the cygpath command line, then the converted path.
  size_t bufsize = strlen(path) + N + 1;
  char *buf = malloc(bufsize);
  if (buf == NULL) return NULL;          // fix: malloc result was unchecked
  // fix: pass the real buffer size (was strlen(path) + N, one short)
  snprintf(buf, bufsize, "cygpath -m '%s'", path);

  FILE *pfp = popen(buf, "r");

  if (pfp == NULL || fgets(buf, N, pfp) == NULL) {
    if (pfp != NULL) pclose(pfp);
    free(buf);
    return NULL;
  }

  pclose(pfp);

  // Strip the trailing newline cygpath prints.
  int len = (int)strlen(buf);
  if (0 < len && len < N && buf[len-1] == '\n') buf[len-1] = '\0';

  fp = fopen(buf, mode);

  free(buf);

  return fp;
#else
  return fopen(path, mode);
#endif
}
|
||||
|
||||
// Keyword table and the suffix to emit after each keyword; filled by main().
int nkeywords = 0, nalloc = 0;
char **keywords = NULL, *suffix = NULL;

// Identifiers that must never receive the suffix.
int nIgnore = 0;
char **ignore = NULL;

// If `buf` is a registered keyword and not on the ignore list, emit the
// suffix on stdout (the identifier itself has already been copied through).
void insert(char *buf) {
  int k;

  for(k = 0; k < nIgnore; k++) {
    if (strcmp(ignore[k], buf) == 0) return;
  }

  for(k = 0; k < nkeywords; k++) {
    if (strcmp(keywords[k], buf) == 0) printf("%s", suffix);
  }
}
|
||||
|
||||
void doit(FILE *fp) {
|
||||
int state = 0;
|
||||
bool nl = true;
|
||||
char buf[N+10], *p = buf;
|
||||
|
||||
for(;;) {
|
||||
int c = getc(fp);
|
||||
if (c == EOF) break;
|
||||
switch(state) {
|
||||
case 0:
|
||||
if (isalnum(c) || c == '_') {
|
||||
ungetc(c, fp);
|
||||
p = buf;
|
||||
state = 1;
|
||||
break;
|
||||
}
|
||||
if (c == '/') {
|
||||
int c2 = getc(fp);
|
||||
if (c2 == '*') {
|
||||
putc(c, stdout);
|
||||
putc(c2, stdout);
|
||||
state = 4;
|
||||
break;
|
||||
} else if (c2 == '/') {
|
||||
putc(c, stdout);
|
||||
putc(c2, stdout);
|
||||
do {
|
||||
c = getc(fp);
|
||||
putc(c, stdout);
|
||||
} while(c != '\n');
|
||||
break;
|
||||
}
|
||||
ungetc(c2, fp);
|
||||
}
|
||||
if (nl && c == '#') {
|
||||
putc(c, stdout);
|
||||
do {
|
||||
c = getc(fp);
|
||||
putc(c, stdout);
|
||||
} while(c != '\n');
|
||||
break;
|
||||
}
|
||||
putc(c, stdout);
|
||||
if (!isspace(c)) nl = false;
|
||||
if (c == '\n') nl = true;
|
||||
if (c == '\"') state = 2;
|
||||
if (c == '\'') state = 3;
|
||||
break;
|
||||
|
||||
case 1: // Identifier
|
||||
if (isalnum(c) || c == '_') {
|
||||
if (p - buf < N) { *p++ = c; *p = '\0'; }
|
||||
putc(c, stdout);
|
||||
} else if (c == '\"') {
|
||||
insert(buf);
|
||||
putc(c, stdout);
|
||||
state = 2;
|
||||
} else if (c == '\'') {
|
||||
insert(buf);
|
||||
putc(c, stdout);
|
||||
state = 3;
|
||||
} else {
|
||||
insert(buf);
|
||||
putc(c, stdout);
|
||||
state = 0;
|
||||
}
|
||||
break;
|
||||
|
||||
case 2: // String
|
||||
if (c == '\\') {
|
||||
putc(c, stdout);
|
||||
putc(getc(fp), stdout);
|
||||
} else if (c == '\"') {
|
||||
putc(c, stdout);
|
||||
state = 0;
|
||||
} else {
|
||||
putc(c, stdout);
|
||||
}
|
||||
break;
|
||||
|
||||
case 3: // Character
|
||||
if (c == '\\') {
|
||||
putc(c, stdout);
|
||||
putc(getc(fp), stdout);
|
||||
} else if (c == '\'') {
|
||||
putc(c, stdout);
|
||||
state = 0;
|
||||
} else {
|
||||
putc(c, stdout);
|
||||
}
|
||||
break;
|
||||
|
||||
case 4: // Comment
|
||||
if (c == '*') {
|
||||
int c2 = getc(fp);
|
||||
if (c2 == '/') {
|
||||
putc(c, stdout);
|
||||
putc(c2, stdout);
|
||||
state = 0;
|
||||
break;
|
||||
}
|
||||
ungetc(c2, fp);
|
||||
}
|
||||
putc(c, stdout);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
nalloc = 1;
|
||||
keywords = malloc(sizeof(char *) * nalloc);
|
||||
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "%s <input file>\n", argv[0]);
|
||||
fprintf(stderr, "Print the file on the standard output\n");
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s <input file> <keywords file> <suffix> [<keywords to ignore> ... ]\n", argv[0]);
|
||||
fprintf(stderr, "Add the suffix to keywords\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
char buf[N];
|
||||
|
||||
if (argc == 2) {
|
||||
FILE *fp = cygopen(argv[1], "r");
|
||||
if (fp == NULL) {
|
||||
fprintf(stderr, "Cannot open %s\n", argv[1]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
while(fgets(buf, N, fp) != NULL) {
|
||||
fputs(buf, stdout);
|
||||
}
|
||||
fclose(fp);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
FILE *fp = cygopen(argv[2], "r");
|
||||
if (fp == NULL) {
|
||||
fprintf(stderr, "Cannot open %s\n", argv[2]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
while(fgets(buf, N, fp) != NULL) {
|
||||
if (strlen(buf) >= 1) buf[strlen(buf)-1] = '\0';
|
||||
keywords[nkeywords] = malloc(sizeof(char) * (strlen(buf) + 1));
|
||||
strcpy(keywords[nkeywords], buf);
|
||||
nkeywords++;
|
||||
if (nkeywords >= nalloc) {
|
||||
nalloc *= 2;
|
||||
keywords = realloc(keywords, sizeof(char *) * nalloc);
|
||||
}
|
||||
}
|
||||
|
||||
fclose(fp);
|
||||
|
||||
nIgnore = argc - 4;
|
||||
ignore = argv + 4;
|
||||
|
||||
suffix = argv[3];
|
||||
|
||||
fp = cygopen(argv[1], "r");
|
||||
if (fp == NULL) {
|
||||
fprintf(stderr, "Cannot open %s\n", argv[1]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
doit(fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// cat sleef*inline*.h | egrep -o '[a-zA-Z_][0-9a-zA-Z_]*' | sort | uniq > cand.txt
|
||||
@@ -0,0 +1,347 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
//
|
||||
|
||||
#if !(defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER))
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/file.h>
|
||||
|
||||
static void FLOCK(FILE *fp) { flock(fileno(fp), LOCK_EX); }
|
||||
static void FUNLOCK(FILE *fp) { flock(fileno(fp), LOCK_UN); }
|
||||
static void FTRUNCATE(FILE *fp, off_t z) {
|
||||
if (ftruncate(fileno(fp), z))
|
||||
;
|
||||
}
|
||||
static FILE *OPENTMPFILE() { return tmpfile(); }
|
||||
static void CLOSETMPFILE(FILE *fp) { fclose(fp); }
|
||||
#else
|
||||
#include <windows.h>
|
||||
#include <io.h>
|
||||
|
||||
static void FLOCK(FILE *fp) { }
|
||||
static void FUNLOCK(FILE *fp) { }
|
||||
static void FTRUNCATE(FILE *fp, long z) {
|
||||
fseek(fp, 0, SEEK_SET);
|
||||
SetEndOfFile((HANDLE)_get_osfhandle(_fileno(fp)));
|
||||
}
|
||||
static FILE *OPENTMPFILE() { return fopen("tmpfile.txt", "w+"); }
|
||||
static void CLOSETMPFILE(FILE *fp) {
|
||||
fclose(fp);
|
||||
remove("tmpfile.txt");
|
||||
}
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
// Magic tags stored in live objects to detect corruption and use-after-free.
#define MAGIC_ARRAYMAPNODE 0xf73130fa
#define MAGIC_ARRAYMAP 0x8693bd21
// The map is split into NBUCKETS (256) independent arrays.
#define LOGNBUCKETS 8
#define NBUCKETS (1 << LOGNBUCKETS)

// Bucket index for a key: XOR-fold four byte-shifted copies of the key,
// then keep the low LOGNBUCKETS bits.
static int hash(uint64_t key) {
  uint64_t folded = key;
  folded ^= key >> LOGNBUCKETS;
  folded ^= key >> (LOGNBUCKETS*2);
  folded ^= key >> (LOGNBUCKETS*3);
  return (int)(folded & (NBUCKETS-1));
}
|
||||
|
||||
// Strip leading and trailing whitespace from `str` in place.
static void String_trim(char *str) {
  char *out = str;
  const char *in = str;
  char *endmark = str;   // one past the last non-space byte written

  // Skip leading whitespace.
  while (*in != '\0' && isspace((int)*in)) in++;

  // Shift the remainder down, tracking where the last non-space ended.
  while (*in != '\0') {
    *out++ = *in;
    if (!isspace((int)*in)) endmark = out;
    in++;
  }

  // Cut the string right after the last non-space character.
  *endmark = '\0';
}
|
||||
|
||||
typedef struct ArrayMapNode {
|
||||
uint32_t magic;
|
||||
uint64_t key;
|
||||
void *value;
|
||||
} ArrayMapNode;
|
||||
|
||||
typedef struct ArrayMap {
|
||||
uint32_t magic;
|
||||
ArrayMapNode *array[NBUCKETS];
|
||||
int size[NBUCKETS], capacity[NBUCKETS], totalSize;
|
||||
} ArrayMap;
|
||||
|
||||
ArrayMap *initArrayMap() {
|
||||
ArrayMap *thiz = (ArrayMap *)calloc(1, sizeof(ArrayMap));
|
||||
thiz->magic = MAGIC_ARRAYMAP;
|
||||
|
||||
for(int i=0;i<NBUCKETS;i++) {
|
||||
thiz->capacity[i] = 8;
|
||||
thiz->array[i] = (ArrayMapNode *)malloc(thiz->capacity[i] * sizeof(ArrayMapNode));
|
||||
thiz->size[i] = 0;
|
||||
}
|
||||
|
||||
thiz->totalSize = 0;
|
||||
return thiz;
|
||||
}
|
||||
|
||||
void ArrayMap_dispose(ArrayMap *thiz) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
|
||||
for(int j=0;j<NBUCKETS;j++) {
|
||||
for(int i=0;i<thiz->size[j];i++) {
|
||||
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
thiz->array[j][i].magic = 0;
|
||||
}
|
||||
free(thiz->array[j]);
|
||||
}
|
||||
|
||||
thiz->magic = 0;
|
||||
free(thiz);
|
||||
}
|
||||
|
||||
int ArrayMap_size(ArrayMap *thiz) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
return thiz->totalSize;
|
||||
}
|
||||
|
||||
uint64_t *ArrayMap_keyArray(ArrayMap *thiz) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
uint64_t *a = (uint64_t *)malloc(sizeof(uint64_t) * thiz->totalSize);
|
||||
int p = 0;
|
||||
for(int j=0;j<NBUCKETS;j++) {
|
||||
for(int i=0;i<thiz->size[j];i++) {
|
||||
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
a[p++] = thiz->array[j][i].key;
|
||||
}
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
void **ArrayMap_valueArray(ArrayMap *thiz) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
void **a = (void **)malloc(sizeof(void *) * thiz->totalSize);
|
||||
int p = 0;
|
||||
for(int j=0;j<NBUCKETS;j++) {
|
||||
for(int i=0;i<thiz->size[j];i++) {
|
||||
assert(thiz->array[j][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
a[p++] = thiz->array[j][i].value;
|
||||
}
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
|
||||
int h = hash(key);
|
||||
for(int i=0;i<thiz->size[h];i++) {
|
||||
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
if (thiz->array[h][i].key == key) {
|
||||
void *old = thiz->array[h][i].value;
|
||||
thiz->array[h][i].key = thiz->array[h][thiz->size[h]-1].key;
|
||||
thiz->array[h][i].value = thiz->array[h][thiz->size[h]-1].value;
|
||||
thiz->array[h][thiz->size[h]-1].magic = 0;
|
||||
thiz->size[h]--;
|
||||
thiz->totalSize--;
|
||||
return old;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value) {
|
||||
if (value == NULL) return ArrayMap_remove(thiz, key);
|
||||
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
|
||||
int h = hash(key);
|
||||
for(int i=0;i<thiz->size[h];i++) {
|
||||
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
if (thiz->array[h][i].key == key) {
|
||||
void *old = thiz->array[h][i].value;
|
||||
thiz->array[h][i].value = value;
|
||||
return old;
|
||||
}
|
||||
}
|
||||
|
||||
if (thiz->size[h] >= thiz->capacity[h]) {
|
||||
thiz->capacity[h] *= 2;
|
||||
thiz->array[h] = (ArrayMapNode *)realloc(thiz->array[h], thiz->capacity[h] * sizeof(ArrayMapNode));
|
||||
}
|
||||
|
||||
ArrayMapNode *n = &(thiz->array[h][thiz->size[h]++]);
|
||||
n->magic = MAGIC_ARRAYMAPNODE;
|
||||
n->key = key;
|
||||
n->value = value;
|
||||
|
||||
thiz->totalSize++;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *ArrayMap_get(ArrayMap *thiz, uint64_t key) {
|
||||
assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);
|
||||
|
||||
int h = hash(key);
|
||||
for(int i=0;i<thiz->size[h];i++) {
|
||||
assert(thiz->array[h][i].magic == MAGIC_ARRAYMAPNODE);
|
||||
if (thiz->array[h][i].key == key) {
|
||||
return thiz->array[h][i].value;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define LINELEN (1024*1024)
|
||||
|
||||
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock) {
|
||||
const int idstrlen = (int)strlen(idstr);
|
||||
int prefixLen = (int)strlen(prefix) + 3;
|
||||
|
||||
if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return NULL;
|
||||
|
||||
FILE *fp = fopen(fn, "r");
|
||||
if (fp == NULL) return NULL;
|
||||
|
||||
if (doLock) FLOCK(fp);
|
||||
|
||||
ArrayMap *thiz = initArrayMap();
|
||||
|
||||
char *prefix2 = malloc(prefixLen+10);
|
||||
strcpy(prefix2, prefix);
|
||||
String_trim(prefix2);
|
||||
for(char *p = prefix2;*p != '\0';p++) {
|
||||
if (*p == ':') *p = ';';
|
||||
if (*p == ' ') *p = '_';
|
||||
}
|
||||
strcat(prefix2, " : ");
|
||||
prefixLen = (int)strlen(prefix2);
|
||||
|
||||
char *line = malloc(sizeof(char) * (LINELEN+10));
|
||||
line[idstrlen] = '\0';
|
||||
|
||||
if (fread(line, sizeof(char), idstrlen, fp) != idstrlen ||
|
||||
strcmp(idstr, line) != 0) {
|
||||
if (doLock) FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
free(prefix2);
|
||||
free(line);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for(;;) {
|
||||
line[LINELEN] = '\0';
|
||||
if (fgets(line, LINELEN, fp) == NULL) break;
|
||||
if (strncmp(line, prefix2, prefixLen) != 0) continue;
|
||||
|
||||
uint64_t key;
|
||||
char *value = malloc(sizeof(char) * LINELEN);
|
||||
|
||||
if (sscanf(line + prefixLen, "%" SCNx64 " : %s\n", &key, value) == 2) {
|
||||
ArrayMap_put(thiz, (uint64_t)key, (void *)value);
|
||||
} else {
|
||||
free(value);
|
||||
}
|
||||
}
|
||||
|
||||
if (doLock) FUNLOCK(fp);
|
||||
fclose(fp);
|
||||
|
||||
free(prefix2);
|
||||
free(line);
|
||||
|
||||
return thiz;
|
||||
}
|
||||
|
||||
// Persist the map into `fn` under the normalized `prefix`, keeping lines
// belonging to other prefixes intact. The whole operation happens under
// FLOCK: existing foreign lines are staged in a temp file, the original file
// is truncated and rewritten with `idstr` + staged lines + this map's
// entries. Returns 0 on success, -1 on failure.
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr) {
  assert(thiz != NULL && thiz->magic == MAGIC_ARRAYMAP);

  const int idstrlen = (int)strlen(idstr);
  int prefixLen = (int)strlen(prefix) + 3;

  if (prefixLen >= LINELEN-10 || idstrlen >= LINELEN-10) return -1;

  // Generate prefix2: trimmed, with ':' and ' ' replaced (they would collide
  // with the field separators), then the " : " separator appended.

  char *prefix2 = malloc(prefixLen+10);
  strcpy(prefix2, prefix);
  String_trim(prefix2);
  for(char *p = prefix2;*p != '\0';p++) {
    if (*p == ':') *p = ';';
    if (*p == ' ') *p = '_';
  }
  strcat(prefix2, " : ");
  prefixLen = (int)strlen(prefix2);

  //

  FILE *fp = fopen(fn, "a+");
  if (fp == NULL) {
    free(prefix2);   // fix: prefix2 was leaked on this path
    return -1;
  }

  FLOCK(fp);
  fseek(fp, 0, SEEK_SET);

  // Copy the file specified by fn to tmpfile

  FILE *tmpfp = OPENTMPFILE();
  if (tmpfp == NULL) {
    FUNLOCK(fp);
    fclose(fp);
    free(prefix2);   // fix: prefix2 was leaked on this path
    return -1;
  }

  char *line = malloc(sizeof(char) * (LINELEN+10));
  line[idstrlen] = '\0';

  // Keep only lines that do NOT belong to our prefix (they will be written
  // back, followed by our fresh entries). A file with a wrong/missing header
  // is discarded wholesale.
  if (fread(line, sizeof(char), idstrlen, fp) == idstrlen && strcmp(idstr, line) == 0) {
    for(;;) {
      line[LINELEN] = '\0';
      if (fgets(line, LINELEN, fp) == NULL) break;
      if (strncmp(line, prefix2, prefixLen) != 0) fputs(line, tmpfp);
    }
  }

  // Write the contents in the map into tmpfile

  uint64_t *keys = ArrayMap_keyArray(thiz);
  int nkeys = ArrayMap_size(thiz);

  for(int i = 0; i < nkeys; i++) {
    char *value = ArrayMap_get(thiz, keys[i]);
    if (strlen(value) + prefixLen >= LINELEN-10) continue;  // skip oversize rows
    fprintf(tmpfp, "%s %" PRIx64 " : %s\n", prefix2, keys[i], value);
  }

  free(keys);

  // Rewrite fn from scratch: header first, then the staged temp file.
  fseek(fp, 0, SEEK_SET);
  FTRUNCATE(fp, 0);
  fwrite(idstr, sizeof(char), strlen(idstr), fp);

  fseek(tmpfp, 0, SEEK_SET);

  for(;;) {
    // fix: renamed from `s`, which shadowed the entry count above
    size_t nread = fread(line, 1, LINELEN, tmpfp);
    if (nread == 0) break;
    fwrite(line, 1, nread, fp);
  }

  FUNLOCK(fp);
  fclose(fp);

  CLOSETMPFILE(tmpfp);
  free(prefix2);
  free(line);
  return 0;
}
|
||||
@@ -0,0 +1,21 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef __ARRAYMAP_H__
#define __ARRAYMAP_H__

#include <stdint.h>  // fix: uint64_t is used below, but the header did not
                     // include <stdint.h>, forcing every includer to do so

// Opaque hash map from uint64_t keys to void * values (see arraymap.c).
typedef struct ArrayMap ArrayMap;

// Lifecycle and basic operations. Putting a NULL value removes the key;
// get/remove return NULL when the key is absent.
ArrayMap *initArrayMap();
void ArrayMap_dispose(ArrayMap *thiz);
int ArrayMap_size(ArrayMap *thiz);
void *ArrayMap_remove(ArrayMap *thiz, uint64_t key);
void *ArrayMap_put(ArrayMap *thiz, uint64_t key, void *value);
void *ArrayMap_get(ArrayMap *thiz, uint64_t key);

// Snapshots (caller frees) and text-file persistence.
uint64_t *ArrayMap_keyArray(ArrayMap *thiz);
void **ArrayMap_valueArray(ArrayMap *thiz);
int ArrayMap_save(ArrayMap *thiz, const char *fn, const char *prefix, const char *idstr);
ArrayMap *ArrayMap_load(const char *fn, const char *prefix, const char *idstr, int doLock);
#endif
|
||||
@@ -0,0 +1,98 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "misc.h"
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#include <sys/timeb.h>

// Windows: 256-byte-aligned allocation through the CRT aligned heap.
EXPORT void *Sleef_malloc(size_t z) { return _aligned_malloc(z, 256); }
EXPORT void Sleef_free(void *ptr) { _aligned_free(ptr); }

// Wall-clock microseconds (millisecond resolution from _ftime64).
EXPORT uint64_t Sleef_currentTimeMicros() {
  struct __timeb64 t;
  _ftime64(&t);
  return t.time * INT64_C(1000000) + t.millitm*1000;
}
#elif defined(__APPLE__)
#include <sys/time.h>

// macOS: 256-byte alignment via posix_memalign. NOTE(review): the
// posix_memalign return value is ignored; on failure ptr stays NULL and
// NULL is returned, like malloc.
EXPORT void *Sleef_malloc(size_t z) { void *ptr = NULL; posix_memalign(&ptr, 256, z); return ptr; }
EXPORT void Sleef_free(void *ptr) { free(ptr); }

// Wall-clock microseconds via gettimeofday.
EXPORT uint64_t Sleef_currentTimeMicros() {
  struct timeval time;
  gettimeofday(&time, NULL);
  return (uint64_t)((time.tv_sec * INT64_C(1000000)) + time.tv_usec);
}
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
#include <time.h>
#include <unistd.h>
#if defined(__FreeBSD__) || defined(__OpenBSD__)
#include <stdlib.h>
#else
#include <malloc.h>
#endif

// Other POSIX systems: page-size (4096-byte) alignment.
EXPORT void *Sleef_malloc(size_t z) { void *ptr = NULL; posix_memalign(&ptr, 4096, z); return ptr; }
EXPORT void Sleef_free(void *ptr) { free(ptr); }

// CLOCK_MONOTONIC microseconds — unlike the Windows/macOS branches above,
// this is not wall-clock time.
EXPORT uint64_t Sleef_currentTimeMicros() {
  struct timespec tp;
  clock_gettime(CLOCK_MONOTONIC, &tp);
  return (uint64_t)tp.tv_sec * INT64_C(1000000) + ((uint64_t)tp.tv_nsec/1000);
}
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
|
||||
|
||||
#ifdef _MSC_VER
#include <intrin.h>
// CPUID via the MSVC intrinsic; out receives EAX, EBX, ECX, EDX.
EXPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
  __cpuidex(out, eax, ecx);
}
#else
#if defined(__x86_64__) || defined(__i386__)
// CPUID via inline assembly on GCC/Clang x86 targets; out receives
// EAX, EBX, ECX, EDX for leaf `eax`, subleaf `ecx`.
EXPORT void Sleef_x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
  uint32_t a, b, c, d;
  __asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
  out[0] = a; out[1] = b; out[2] = c; out[3] = d;
}
#endif
#endif
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER)
|
||||
static char x86BrandString[256];
|
||||
|
||||
EXPORT char *Sleef_getCpuIdString() {
|
||||
union {
|
||||
int32_t info[4];
|
||||
uint8_t str[16];
|
||||
} u;
|
||||
int i,j;
|
||||
char *p;
|
||||
|
||||
p = x86BrandString;
|
||||
|
||||
for(i=0;i<3;i++) {
|
||||
Sleef_x86CpuID(u.info, i + 0x80000002, 0);
|
||||
|
||||
for(j=0;j<16;j++) {
|
||||
*p++ = u.str[j];
|
||||
}
|
||||
}
|
||||
|
||||
*p++ = '\n';
|
||||
|
||||
return x86BrandString;
|
||||
}
|
||||
#else
|
||||
EXPORT char *Sleef_getCpuIdString() {
|
||||
return "Unknown architecture";
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,9 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef __COMMON_H__
#define __COMMON_H__
// Returns a human-readable CPU identification string (static buffer on x86,
// a fixed placeholder elsewhere). Defined in common.c.
char *Sleef_getCpuIdString();
#endif
|
||||
@@ -0,0 +1,438 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// On SVE/RVV these aggregate types and accessors come from the target
// headers instead (sizeless vector types cannot be struct members there).
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
// Triple of double vectors (x, y, z) — used elsewhere as a triple-double
// number; getters/setters below keep that usage uniform across targets.
typedef struct {
  vdouble x, y, z;
} vdouble3;

static INLINE CONST VECTOR_CC vdouble vd3getx_vd_vd3(vdouble3 v) { return v.x; }
static INLINE CONST VECTOR_CC vdouble vd3gety_vd_vd3(vdouble3 v) { return v.y; }
static INLINE CONST VECTOR_CC vdouble vd3getz_vd_vd3(vdouble3 v) { return v.z; }
static INLINE CONST VECTOR_CC vdouble3 vd3setxyz_vd3_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
  vdouble3 v = { x, y, z };
  return v;
}
static INLINE CONST VECTOR_CC vdouble3 vd3setx_vd3_vd3_vd(vdouble3 v, vdouble d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vdouble3 vd3sety_vd3_vd3_vd(vdouble3 v, vdouble d) { v.y = d; return v; }
static INLINE CONST VECTOR_CC vdouble3 vd3setz_vd3_vd3_vd(vdouble3 v, vdouble d) { v.z = d; return v; }

//

// Pair of double-double values (a, b).
typedef struct {
  vdouble2 a, b;
} dd2;

static dd2 dd2setab_dd2_vd2_vd2(vdouble2 a, vdouble2 b) {
  dd2 r = { a, b };
  return r;
}
static vdouble2 dd2geta_vd2_dd2(dd2 d) { return d.a; }
static vdouble2 dd2getb_vd2_dd2(dd2 d) { return d.b; }

//

// Extended value: a mask `e` alongside a vdouble3 `d3` — presumably a
// separate exponent plus triple-double significand; confirm against users.
typedef struct {
  vmask e;
  vdouble3 d3;
} tdx;

static INLINE CONST VECTOR_CC vmask tdxgete_vm_tdx(tdx t) { return t.e; }
static INLINE CONST VECTOR_CC vdouble3 tdxgetd3_vd3_tdx(tdx t) { return t.d3; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3x_vd_tdx(tdx t) { return t.d3.x; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3y_vd_tdx(tdx t) { return t.d3.y; }
static INLINE CONST VECTOR_CC vdouble tdxgetd3z_vd_tdx(tdx t) { return t.d3.z; }
static INLINE CONST VECTOR_CC tdx tdxsete_tdx_tdx_vm(tdx t, vmask e) { t.e = e; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetd3_tdx_tdx_vd3(tdx t, vdouble3 d3) { t.d3 = d3; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetx_tdx_tdx_vd(tdx t, vdouble x) { t.d3.x = x; return t; }
static INLINE CONST VECTOR_CC tdx tdxsety_tdx_tdx_vd(tdx t, vdouble y) { t.d3.y = y; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetz_tdx_tdx_vd(tdx t, vdouble z) { t.d3.z = z; return t; }
static INLINE CONST VECTOR_CC tdx tdxsetxyz_tdx_tdx_vd_vd_vd(tdx t, vdouble x, vdouble y, vdouble z) {
  t.d3 = (vdouble3) { x, y, z };
  return t;
}

static INLINE CONST VECTOR_CC tdx tdxseted3_tdx_vm_vd3(vmask e, vdouble3 d3) { return (tdx) { e, d3 }; }
static INLINE CONST VECTOR_CC tdx tdxsetexyz_tdx_vm_vd_vd_vd(vmask e, vdouble x, vdouble y, vdouble z) {
  return (tdx) { e, (vdouble3) { x, y, z } };
}

// vquad accessors: a quad-width value carried as two masks x (low), y (high).
static INLINE CONST VECTOR_CC vmask vqgetx_vm_vq(vquad v) { return v.x; }
static INLINE CONST VECTOR_CC vmask vqgety_vm_vq(vquad v) { return v.y; }
static INLINE CONST VECTOR_CC vquad vqsetxy_vq_vm_vm(vmask x, vmask y) { return (vquad) { x, y }; }
static INLINE CONST VECTOR_CC vquad vqsetx_vq_vq_vm(vquad v, vmask x) { v.x = x; return v; }
static INLINE CONST VECTOR_CC vquad vqsety_vq_vq_vm(vquad v, vmask y) { v.y = y; return v; }

//

// Double vector paired with an int vector.
typedef struct {
  vdouble d;
  vint i;
} di_t;

static INLINE CONST VECTOR_CC vdouble digetd_vd_di(di_t d) { return d.d; }
static INLINE CONST VECTOR_CC vint digeti_vi_di(di_t d) { return d.i; }
static INLINE CONST VECTOR_CC di_t disetdi_di_vd_vi(vdouble d, vint i) {
  di_t r = { d, i };
  return r;
}

//

// Double-double vector paired with an int vector.
typedef struct {
  vdouble2 dd;
  vint i;
} ddi_t;

static INLINE CONST VECTOR_CC vdouble2 ddigetdd_vd2_ddi(ddi_t d) { return d.dd; }
static INLINE CONST VECTOR_CC vint ddigeti_vi_ddi(ddi_t d) { return d.i; }
static INLINE CONST VECTOR_CC ddi_t ddisetddi_ddi_vd2_vi(vdouble2 v, vint i) {
  ddi_t r = { v, i };
  return r;
}
static INLINE CONST VECTOR_CC ddi_t ddisetdd_ddi_ddi_vd2(ddi_t ddi, vdouble2 v) {
  ddi.dd = v;
  return ddi;
}

//

// Triple-double vector paired with an int vector.
typedef struct {
  vdouble3 td;
  vint i;
} tdi_t;

static INLINE CONST VECTOR_CC vdouble3 tdigettd_vd3_tdi(tdi_t d) { return d.td; }
static INLINE CONST VECTOR_CC vdouble tdigetx_vd_tdi(tdi_t d) { return d.td.x; }
static INLINE CONST VECTOR_CC vint tdigeti_vi_tdi(tdi_t d) { return d.i; }
static INLINE CONST VECTOR_CC tdi_t tdisettdi_tdi_vd3_vi(vdouble3 v, vint i) {
  tdi_t r = { v, i };
  return r;
}
#endif
|
||||
|
||||
#if defined(ENABLE_MAIN)
|
||||
// Functions for debugging
|
||||
#include <stdio.h>
|
||||
#include <wchar.h>
|
||||
|
||||
static void printvmask(char *mes, vmask g) {
|
||||
uint64_t u[VECTLENDP];
|
||||
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(g));
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP;i++) printf("%016lx : ", (unsigned long)u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
#if !defined(ENABLE_SVE)
|
||||
static void printvopmask(char *mes, vopmask g) {
|
||||
union {
|
||||
vopmask g;
|
||||
uint8_t u[sizeof(vopmask)];
|
||||
} cnv = { .g = g };
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<sizeof(vopmask);i++) printf("%02x", cnv.u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
#else
|
||||
static void printvopmask(char *mes, vopmask g) {
|
||||
vmask m = vand_vm_vo64_vm(g, vcast_vm_i64(-1));
|
||||
printvmask(mes, m);
|
||||
}
|
||||
#endif
|
||||
|
||||
static void printvdouble(char *mes, vdouble vd) {
|
||||
double u[VECTLENDP];
|
||||
vstoreu_v_p_vd((double *)u, vd);
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP;i++) printf("%.20g : ", u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void printvint(char *mes, vint vi) {
|
||||
uint32_t u[VECTLENDP];
|
||||
vstoreu_v_p_vi((int32_t *)u, vi);
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP;i++) printf("%08x : ", (unsigned)u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void printvint64(char *mes, vint64 vi) {
|
||||
uint64_t u[VECTLENDP*2];
|
||||
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vreinterpret_vm_vi64(vi)));
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP;i++) printf("%016lx : ", (unsigned long)u[i]);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
static void printvquad(char *mes, vquad g) {
|
||||
uint64_t u[VECTLENDP*2];
|
||||
vstoreu_v_p_vd((double *)u, vreinterpret_vd_vm(vqgetx_vm_vq(g)));
|
||||
vstoreu_v_p_vd((double *)&u[VECTLENDP], vreinterpret_vd_vm(vqgety_vm_vq(g)));
|
||||
printf("%s ", mes);
|
||||
for(int i=0;i<VECTLENDP*2;i++) printf("%016lx : ", (unsigned long)(u[i]));
|
||||
printf("\n");
|
||||
}
|
||||
#endif // #if defined(ENABLE_MAIN)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// vdouble functions
|
||||
|
||||
// True where d is exactly -0.0 (bitwise compare against the -0.0 pattern).
static INLINE CONST VECTOR_CC vopmask visnegzero_vo_vd(vdouble d) {
  return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// True where x is finite (not inf, and x == x rules out NaN).
static INLINE CONST VECTOR_CC vopmask visnumber_vo_vd(vdouble x) {
  return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x));
}

// True where x is inf or NaN: the exponent field (0x7ff0...) is all ones.
static INLINE CONST vopmask visnonfinite_vo_vd(vdouble x) {
  return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(INT64_C(0x7ff0000000000000))), vcast_vm_i64(INT64_C(0x7ff0000000000000)));
}

// Isolate the sign bit of d as a mask (-0.0 has only the sign bit set).
static INLINE CONST vmask vsignbit_vm_vd(vdouble d) {
  return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// True where the sign bit of d is set.
static INLINE CONST vopmask vsignbit_vo_vd(vdouble d) {
  return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
}

// Clear the lowest n bits of d's representation (rounds the mantissa down).
static INLINE CONST vdouble vclearlsb_vd_vd_i(vdouble d, int n) {
  return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_u64((~UINT64_C(0)) << n)));
}

static INLINE CONST VECTOR_CC vdouble vtoward0_vd_vd(vdouble x) { // returns nextafter(x, 0)
  // Subtracting 1 from the integer representation moves one ulp toward 0;
  // zero must be special-cased (its representation minus 1 would wrap).
  vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i64(-1)));
  return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t);
}

// RVV targets supply their own vmulsign/vorsign/vcopysign.
#if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
// x with its sign flipped where y is negative (XOR of y's sign bit).
static INLINE CONST vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}
#endif

// +1.0 or -1.0 with the sign of d.
static INLINE CONST VECTOR_CC vdouble vsign_vd_vd(vdouble d) {
  return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d);
}

#if !(defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
// x with its sign bit ORed with y's sign bit.
static INLINE CONST VECTOR_CC vdouble vorsign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
}

// Magnitude of x combined with the sign of y (bitwise copysign).
static INLINE CONST VECTOR_CC vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) {
  return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)),
					  vand_vm_vm_vm (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y))));
}
#endif

// Truncation emulated with int32 conversion: x is split at 2^31 so the
// fractional part fits an int; |x| >= 2^52 (already integral) and infinities
// pass through unchanged, and the sign is restored with copysign so -0.0 is
// preserved.
static INLINE CONST VECTOR_CC vdouble vtruncate2_vd_vd(vdouble x) {
#ifdef FULL_FP_ROUNDING
  return vtruncate_vd_vd(x);
#else
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
#endif
}

// floor() by the same 2^31-split scheme: a negative fractional part is
// shifted into [0,1) so subtracting it rounds toward -inf.
static INLINE CONST VECTOR_CC vdouble vfloor2_vd_vd(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}

// ceil() by the same scheme: a positive fractional part is shifted into
// (-1,0] so subtracting it rounds toward +inf.
static INLINE CONST VECTOR_CC vdouble vceil2_vd_vd(vdouble x) {
  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
  fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0)));
  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
}
|
||||
|
||||
static INLINE CONST VECTOR_CC vdouble vround2_vd_vd(vdouble d) {
|
||||
vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
|
||||
vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(INT64_C(1) << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (INT64_C(1) << 31)))))));
|
||||
fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
|
||||
x = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vle_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(fr, vcast_vd_d(0))), vsub_vd_vd_vd(x, vcast_vd_d(1.0)), x);
|
||||
fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
|
||||
x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.49999999999999994449)), vcast_vd_d(0), x);
|
||||
return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
|
||||
}
|
||||
|
||||
static INLINE CONST VECTOR_CC vdouble vrint2_vd_vd(vdouble d) {
|
||||
#ifdef FULL_FP_ROUNDING
|
||||
return vrint_vd_vd(d);
|
||||
#else
|
||||
vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), d);
|
||||
return vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(INT64_C(1) << 52)),
|
||||
d, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(d, c), c), d));
|
||||
#endif
|
||||
}
|
||||
|
||||
// True where d is an integer (rounding to nearest leaves it unchanged).
static INLINE CONST VECTOR_CC vopmask visint_vo_vd(vdouble d) {
  return veq_vo_vd_vd(vrint2_vd_vd(d), d);
}

// True where d is an odd integer (d/2 is not an integer). Meaningful only
// for integral d; for non-integers the mask is simply "d/2 not integral".
static INLINE CONST VECTOR_CC vopmask visodd_vo_vd(vdouble d) {
  vdouble x = vmul_vd_vd_vd(d, vcast_vd_d(0.5));
  return vneq_vo_vd_vd(vrint2_vd_vd(x), x);
}
|
||||
|
||||
// ilogb
|
||||
|
||||
#if !defined(ENABLE_AVX512F) && !defined(ENABLE_AVX512FNOFMA)
// ilogb for normal handling: extracts the unbiased binary exponent.
// Subnormal-range inputs (|d| < 2^-300 ~ 4.909e-91) are first scaled by
// 2^300 ~ 2.037e90 so their exponent field is valid, then 300 is
// subtracted back along with the bias.
static INLINE CONST VECTOR_CC vint vilogbk_vi_vd(vdouble d) {
  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91));
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
  vint q = vcastu_vi_vm(vreinterpret_vm_vd(d));      // upper 32 bits of each lane
  q = vand_vi_vi_vi(q, vcast_vi_i((int)(((1U << 12) - 1) << 20)));  // isolate exponent+sign field
  q = vsrl_vi_vi_i(q, 20);
  q = vsub_vi_vi_vi(q, vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vcast_vi_i(300 + 0x3ff), vcast_vi_i(0x3ff)));
  return q;
}

// Fast ilogb: raw biased exponent minus bias; no subnormal pre-scaling.
static INLINE CONST VECTOR_CC vint vilogb2k_vi_vd(vdouble d) {
  vint q = vcastu_vi_vm(vreinterpret_vm_vd(d));
  q = vsrl_vi_vi_i(q, 20);
  q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff));
  q = vsub_vi_vi_vi(q, vcast_vi_i(0x3ff));
  return q;
}
#endif

// Same as vilogb2k but produces the exponent in a 64-bit mask lane.
static INLINE CONST vmask vilogb2k_vm_vd(vdouble d) {
  vmask m = vreinterpret_vm_vd(d);
  m = vsrl64_vm_vm_i(m, 20 + 32);           // move exponent field to the bottom
  m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff));
  m = vsub64_vm_vm_vm(m, vcast_vm_i64(0x3ff));
  return m;
}

// Raw biased exponent (bias NOT removed), in a 64-bit mask lane.
static INLINE CONST vmask vilogb3k_vm_vd(vdouble d) {
  vmask m = vreinterpret_vm_vd(d);
  m = vsrl64_vm_vm_i(m, 20 + 32);
  m = vand_vm_vm_vm(m, vcast_vm_i64(0x7ff));
  return m;
}
|
||||
|
||||
// ldexp
|
||||
|
||||
// 2^q as a double, built directly in the exponent field. q is assumed to
// be within the normal exponent range; no clamping is performed here.
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vi(vint q) {
  q = vadd_vi_vi_vi(vcast_vi_i(0x3ff), q);          // apply bias
  vmask r = vcastu_vm_vi(vsll_vi_vi_i(q, 20));      // position into bits 62..52
  return vreinterpret_vd_vm(r);
}

// 2^q with the exponent supplied in 64-bit mask lanes.
static INLINE CONST VECTOR_CC vdouble vpow2i_vd_vm(vmask q) {
  q = vadd64_vm_vm_vm(vcast_vm_i64(0x3ff), q);
  return vreinterpret_vd_vm(vsll64_vm_vm_i(q, 52));
}

// x * 2^q, robust for large |q|: the exponent is applied as five separate
// factors (four equal parts plus a remainder) so no single intermediate
// product can overflow or underflow prematurely. The per-part exponent is
// clamped to the valid biased range [0, 0x7ff].
static INLINE CONST VECTOR_CC vdouble vldexp_vd_vd_vi(vdouble x, vint q) {
  vint m = vsra_vi_vi_i(q, 31);                                        // sign of q
  m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7);  // m ~ (q/512)*128
  q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2));                            // remainder exponent
  m = vadd_vi_vi_vi(vcast_vi_i(0x3ff), m);
  m = vandnot_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), m), m);             // clamp below at 0
  m = vsel_vi_vo_vi_vi(vgt_vo_vi_vi(m, vcast_vi_i(0x7ff)), vcast_vi_i(0x7ff), m); // clamp above
  vmask r = vcastu_vm_vi(vsll_vi_vi_i(m, 20));
  vdouble y = vreinterpret_vd_vm(r);                                   // y = 2^(m part)
  return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q));
}

// d * 2^e applied as two half-exponent factors; tolerates moderately
// large |e| without clamping.
static INLINE CONST VECTOR_CC vdouble vldexp2_vd_vd_vi(vdouble d, vint e) {
  return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1))));
}

// Fastest ldexp: adds q directly to the exponent field. Only valid when
// the result stays normal (no overflow/underflow handling).
static INLINE CONST VECTOR_CC vdouble vldexp3_vd_vd_vi(vdouble d, vint q) {
  return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vcastu_vm_vi(vsll_vi_vi_i(q, 20))));
}

// 64-bit-exponent variants of the above.
// d * 2^e applied as four factors (three quarters plus remainder).
static INLINE CONST vdouble vldexp1_vd_vd_vm(vdouble d, vmask e) {
  vmask m = vsrl64_vm_vm_i(e, 2);                    // quarter of the exponent
  e = vsub64_vm_vm_vm(vsub64_vm_vm_vm(vsub64_vm_vm_vm(e, m), m), m);   // remainder
  d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
  d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
  d = vmul_vd_vd_vd(d, vpow2i_vd_vm(m));
  d = vmul_vd_vd_vd(d, vpow2i_vd_vm(e));
  return d;
}

// d * 2^e applied as two half-exponent factors.
static INLINE CONST vdouble vldexp2_vd_vd_vm(vdouble d, vmask e) {
  return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vm(vsrl64_vm_vm_i(e, 1))), vpow2i_vd_vm(vsub64_vm_vm_vm(e, vsrl64_vm_vm_i(e, 1))));
}

// Direct exponent-field addition; valid only for normal-range results.
static INLINE CONST vdouble vldexp3_vd_vd_vm(vdouble d, vmask q) {
  return vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(d), vsll64_vm_vm_i(q, 52)));
}
|
||||
|
||||
// vmask functions
|
||||
|
||||
// Conversions between vmask integer lanes and vdouble.
static INLINE CONST vdouble vcast_vd_vm(vmask m) { return vcast_vd_vi(vcast_vi_vm(m)); } // 32 bit only
static INLINE CONST vmask vtruncate_vm_vd(vdouble d) { return vcast_vm_vi(vtruncate_vi_vd(d)); }

// Signed 64-bit less-than, expressed via the available greater-than.
static INLINE CONST vopmask vlt64_vo_vm_vm(vmask x, vmask y) { return vgt64_vo_vm_vm(y, x); }

// Logical NOT of a 64-bit opmask: XOR against an all-ones mask
// (0 == 0 is true in every lane).
static INLINE CONST vopmask vnot_vo64_vo64(vopmask x) {
  return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i64(0), vcast_vm_i64(0)));
}

static INLINE CONST vopmask vugt64_vo_vm_vm(vmask x, vmask y) { // unsigned compare
  // Biasing both operands by 2^63 maps unsigned order onto signed order.
  x = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), x);
  y = vxor_vm_vm_vm(vcast_vm_u64(UINT64_C(0x8000000000000000)), y);
  return vgt64_vo_vm_vm(x, y);
}

// ilogb with subnormal pre-scaling (cf. vilogbk_vi_vd), result in 64-bit
// mask lanes. Note the compare uses |d|, unlike the vint variant.
static INLINE CONST vmask vilogbk_vm_vd(vdouble d) {
  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(4.9090934652977266E-91)); // |d| < 2^-300
  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d); // scale by 2^300
  vmask q = vreinterpret_vm_vd(d);
  q = vsrl64_vm_vm_i(q, 20 + 32);
  q = vand_vm_vm_vm(q, vcast_vm_i64(0x7ff));
  q = vsub64_vm_vm_vm(q, vsel_vm_vo64_vm_vm(o, vcast_vm_i64(300 + 0x3ff), vcast_vm_i64(0x3ff)));
  return q;
}
|
||||
|
||||
// vquad functions
|
||||
|
||||
// Lane-wise select on a 128-bit pair (x and y halves selected independently
// by the same mask).
static INLINE CONST vquad sel_vq_vo_vq_vq(vopmask o, vquad x, vquad y) {
  return vqsetxy_vq_vm_vm(vsel_vm_vo64_vm_vm(o, vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vsel_vm_vo64_vm_vm(o, vqgety_vm_vq(x), vqgety_vm_vq(y)));
}

// 128-bit add: add both 64-bit halves, then propagate the carry — the low
// half wrapped iff (unsigned) x.low > result.low.
static INLINE CONST vquad add128_vq_vq_vq(vquad x, vquad y) {
  vquad r = vqsetxy_vq_vm_vm(vadd64_vm_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(y)), vadd64_vm_vm_vm(vqgety_vm_vq(x), vqgety_vm_vq(y)));
  r = vqsety_vq_vq_vm(r, vadd64_vm_vm_vm(vqgety_vm_vq(r), vand_vm_vo64_vm(vugt64_vo_vm_vm(vqgetx_vm_vq(x), vqgetx_vm_vq(r)), vcast_vm_i64(1))));
  return r;
}

// Construct a vquad from its two 64-bit halves.
static INLINE CONST vquad imdvq_vq_vm_vm(vmask x, vmask y) { vquad r = vqsetxy_vq_vm_vm(x, y); return r; }

// 128-bit logical right shift. imm must be smaller than 64
#define srl128_vq_vq_i(m, imm) \
  imdvq_vq_vm_vm(vor_vm_vm_vm(vsrl64_vm_vm_i(vqgetx_vm_vq(m), imm), vsll64_vm_vm_i(vqgety_vm_vq(m), 64-imm)), vsrl64_vm_vm_i(vqgety_vm_vq(m), imm))

// This function is equivalent to :
// di_t ret = { x - rint(4 * x) * 0.25, (int32_t)(rint(4 * x) - rint(x) * 4) };
// Used by the Payne-Hanek style argument reduction (rempi).
static INLINE CONST di_t rempisub(vdouble x) {
#ifdef FULL_FP_ROUNDING
  vdouble y = vrint_vd_vd(vmul_vd_vd_vd(x, vcast_vd_d(4)));
  vint vi = vtruncate_vi_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vrint_vd_vd(x), vcast_vd_d(4))));
  return disetdi_di_vd_vi(vsub_vd_vd_vd(x, vmul_vd_vd_vd(y, vcast_vd_d(0.25))), vi);
#else
  // Emulate rint via the (v + c) - c trick with c = copysign(2^52, x);
  // values already >= 2^52 in magnitude are integral and used as-is.
  vdouble c = vmulsign_vd_vd_vd(vcast_vd_d(INT64_C(1) << 52), x);
  vdouble rint4x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(vmul_vd_vd_vd(vcast_vd_d(4), x)), vcast_vd_d(INT64_C(1) << 52)),
                                    vmul_vd_vd_vd(vcast_vd_d(4), x),
                                    vorsign_vd_vd_vd(vsub_vd_vd_vd(vmla_vd_vd_vd_vd(vcast_vd_d(4), x, c), c), x));
  vdouble rintx  = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(INT64_C(1) << 52)),
                                    x, vorsign_vd_vd_vd(vsub_vd_vd_vd(vadd_vd_vd_vd(x, c), c), x));
  return disetdi_di_vd_vi(vmla_vd_vd_vd_vd(vcast_vd_d(-0.25), rint4x, x),
                          vtruncate_vi_vd(vmla_vd_vd_vd_vd(vcast_vd_d(-4), rintx, rint4x)));
#endif
}
|
||||
// ==== boundary: a new source file begins here (patch hunk header "@@ -0,0 +1,324 @@") ====
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Double-double type: a value represented as an unevaluated sum x + y,
// where x holds the leading bits and y the trailing error term.
// SVE and RVV targets define vdouble2 in their own headers.
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
  vdouble x, y;   // x: high part, y: low (error) part
} vdouble2;
#else
typedef double2 vdouble2;   // CUDA's built-in double2 is layout-compatible
#endif

// Accessors/constructors so the same code works with struct and CUDA types.
static INLINE CONST VECTOR_CC vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static INLINE CONST VECTOR_CC vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
static INLINE CONST VECTOR_CC vdouble2 vd2setxy_vd2_vd_vd(vdouble x, vdouble y) { vdouble2 v; v.x = x; v.y = y; return v; }
static INLINE CONST VECTOR_CC vdouble2 vd2setx_vd2_vd2_vd(vdouble2 v, vdouble d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vdouble2 vd2sety_vd2_vd2_vd(vdouble2 v, vdouble d) { v.y = d; return v; }
#endif

// Scalar double-double pair (CUDA already provides double2).
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
  double x, y;
} double2;
#endif

// Build a scalar double-double from high and low parts.
static INLINE CONST VECTOR_CC double2 dd(double h, double l) {
  double2 ret = { h, l };
  return ret;
}
|
||||
|
||||
// Dekker splitting helper: clears the low 27 bits of the mantissa so that
// products of two "upper" halves are exact in double precision.
static INLINE CONST VECTOR_CC vdouble vupper_vd_vd(vdouble d) {
  return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000)));
}

// Assemble a vdouble2 from vector high/low parts.
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {
  return vd2setxy_vd2_vd_vd(h, l);
}

// Broadcast scalar high/low parts into a vdouble2.
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d_d(double h, double l) {
  return vd2setxy_vd2_vd_vd(vcast_vd_d(h), vcast_vd_d(l));
}

// Broadcast a scalar double-double constant into a vdouble2.
static INLINE CONST VECTOR_CC vdouble2 vcast_vd2_d2(double2 dd) {
  return vd2setxy_vd2_vd_vd(vcast_vd_d(dd.x), vcast_vd_d(dd.y));
}

// Lane-wise select between two vdouble2 values (both halves use the same mask).
static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {
  return vd2setxy_vd2_vd_vd(vsel_vd_vo_vd_vd(m, vd2getx_vd_vd2(x), vd2getx_vd_vd2(y)),
                            vsel_vd_vo_vd_vd(m, vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}

// Lane-wise select between two scalar double-double constants (x1,y1) / (x0,y0).
static INLINE CONST VECTOR_CC vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) {
  return vd2setxy_vd2_vd_vd(vsel_vd_vo_d_d(o, x1, x0),
                            vsel_vd_vo_d_d(o, y1, y0));
}
|
||||
|
||||
// N-ary add/sub chains. The association order is fixed left-to-right and
// is relied upon by the double-double routines below: terms are passed in
// decreasing magnitude so each partial sum stays exact enough.
static INLINE CONST VECTOR_CC vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
  return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}

static INLINE CONST VECTOR_CC vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
  return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6);
}

// ((v0 - v1) - v2) ...
static INLINE CONST VECTOR_CC vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
  return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
  return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
  return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
  return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5);
}
|
||||
|
||||
//
|
||||
|
||||
// Negate a double-double: negate both components.
static INLINE CONST VECTOR_CC vdouble2 ddneg_vd2_vd2(vdouble2 x) {
  return vcast_vd2_vd_vd(vneg_vd_vd(vd2getx_vd_vd2(x)), vneg_vd_vd(vd2gety_vd_vd2(x)));
}

// Absolute value of a double-double: |x.high|, and x.low has its sign
// flipped exactly when x.high was negative (the pair's sign lives in the
// high part).
static INLINE CONST VECTOR_CC vdouble2 ddabs_vd2_vd2(vdouble2 x) {
  return vcast_vd2_vd_vd(vabs_vd_vd(vd2getx_vd_vd2(x)),
                         vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(vd2gety_vd_vd2(x)),
                                                          vand_vm_vm_vm(vreinterpret_vm_vd(vd2getx_vd_vd2(x)),
                                                                        vreinterpret_vm_vd(vcast_vd_d(-0.0))))));
}

// Renormalize so that the high part is the correctly-rounded sum and the
// low part is the Fast2Sum residual (requires the pair to be "close").
static INLINE CONST VECTOR_CC vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(t), vd2gety_vd_vd2(t));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(t), s), vd2gety_vd_vd2(t)));
}

// Multiply both components by s. Exact when s is a power of two.
static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {
  return vd2setxy_vd2_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(d), s), vmul_vd_vd_vd(vd2gety_vd_vd2(d), s));
}

static INLINE CONST VECTOR_CC vdouble2 ddscale_vd2_vd2_d(vdouble2 d, double s) { return ddscale_vd2_vd2_vd(d, vcast_vd_d(s)); }
|
||||
|
||||
// Double-double addition/subtraction.
// "ddadd"  variants use Fast2Sum (Dekker): cheaper, but require |x| >= |y|.
// "ddadd2" variants use 2Sum (Knuth/Moller): no ordering requirement.
// The exact statement order below realizes these error-free
// transformations and must not be re-associated.

// Fast2Sum of two doubles; caller guarantees |x| >= |y|.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}

// 2Sum of two doubles; any magnitudes.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(x, y);
  vdouble v = vsub_vd_vd_vd(s, x);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v)));
}

// (double-double) + double, Fast2Sum based; requires |x| >= |y|.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y, vd2gety_vd_vd2(x)));
}

// (double-double) - double, Fast2Sum based; requires |x| >= |y|.
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), y), vd2gety_vd_vd2(x)));
}

// (double-double) + double, 2Sum based; any magnitudes.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), y);
  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
  vdouble w = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(y, v));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(w, vd2gety_vd_vd2(x)));
}

// double + (double-double), Fast2Sum based; requires |x| >= |y|.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_3vd(vsub_vd_vd_vd(x, s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(y)));
}

// double + (double-double), 2Sum based; any magnitudes.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(x, vd2getx_vd_vd2(y));
  vdouble v = vsub_vd_vd_vd(s, x);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(s, v)),
                                                           vsub_vd_vd_vd(vd2getx_vd_vd2(y), v)), vd2gety_vd_vd2(y)));
}

// (double-double) + (double-double), Fast2Sum based.
static INLINE CONST VECTOR_CC vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  // |x| >= |y|
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_4vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), s), vd2getx_vd_vd2(y), vd2gety_vd_vd2(x), vd2gety_vd_vd2(y)));
}

// (double-double) + (double-double), 2Sum based; any magnitudes.
static INLINE CONST VECTOR_CC vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble s = vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble v = vsub_vd_vd_vd(s, vd2getx_vd_vd2(x));
  vdouble t = vadd_vd_vd_vd(vsub_vd_vd_vd(vd2getx_vd_vd2(x), vsub_vd_vd_vd(s, v)), vsub_vd_vd_vd(vd2getx_vd_vd2(y), v));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_vd_vd(t, vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(y))));
}

// Exact difference of two doubles, Fast2Sum based.
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {
  // |x| >= |y|
  vdouble s = vsub_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(vsub_vd_vd_vd(x, s), y));
}

// (double-double) - (double-double), Fast2Sum based.
static INLINE CONST VECTOR_CC vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  // |x| >= |y|
  vdouble s = vsub_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  vdouble t = vsub_vd_vd_vd(vd2getx_vd_vd2(x), s);
  t = vsub_vd_vd_vd(t, vd2getx_vd_vd2(y));
  t = vadd_vd_vd_vd(t, vd2gety_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vsub_vd_vd_vd(t, vd2gety_vd_vd2(y)));
}
|
||||
|
||||
// Double-double multiply / divide / reciprocal / square.
// With FMA (ENABLE_FMA_DP) the exact product residual comes from a single
// fused op: vfmapn(a, b, c) computes a*b - c and vfmanp(a, b, c) computes
// c - a*b, each with one rounding. Without FMA, operands are split with
// vupper (Dekker splitting) and the residual is accumulated term by term.
#ifdef ENABLE_FMA_DP
// n / d: s is the first quotient estimate; u is its product residual and
// v the residual of t ~ 1/d.high, combined into one Newton-style correction.
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);
  vdouble u = vfmapn_vd_vd_vd_vd(t, vd2getx_vd_vd2(n), s);
  vdouble v = vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), t, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), t, vcast_vd_d(1)));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(s, v, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(n), t, u)));
}

// Exact product of two doubles (TwoProdFMA).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble s = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vfmapn_vd_vd_vd_vd(x, y, s));
}

// Square of a double-double.
static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x)), vd2gety_vd_vd2(x), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), s)));
}

// Product of two double-doubles (x.low*y.low term dropped as negligible).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), s))));
}

// Product of two double-doubles collapsed to a single double.
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y), vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y))));
}

// Square of a double-double collapsed to a single double.
static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
  return vfma_vd_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x), vadd_vd_vd_vd(vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x)), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(x))));
}

// (double-double) * double.
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vfma_vd_vd_vd_vd(vd2gety_vd_vd2(x), y, vfmapn_vd_vd_vd_vd(vd2getx_vd_vd2(x), y, s)));
}

// 1/d as a double-double; the low part is one Newton correction step.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
  vdouble s = vrec_vd_vd(d);
  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(d, s, vcast_vd_d(1))));
}

// 1/(double-double), same scheme with the low divisor term included.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
  vdouble s = vrec_vd_vd(vd2getx_vd_vd2(d));
  return vd2setxy_vd2_vd_vd(s, vmul_vd_vd_vd(s, vfmanp_vd_vd_vd_vd(vd2gety_vd_vd2(d), s, vfmanp_vd_vd_vd_vd(vd2getx_vd_vd2(d), s, vcast_vd_d(1)))));
}
#else // #ifdef ENABLE_FMA_DP
// Non-FMA path: exact products via Dekker splitting (vupper zeroes the low
// mantissa bits so the partial products hh, hl, lh, ll are exact).
static INLINE CONST VECTOR_CC vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble dh  = vupper_vd_vd(vd2getx_vd_vd2(d)), dl  = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
  vdouble th  = vupper_vd_vd(t                ), tl  = vsub_vd_vd_vd(t                , th);
  vdouble nhh = vupper_vd_vd(vd2getx_vd_vd2(n)), nhl = vsub_vd_vd_vd(vd2getx_vd_vd2(n), nhh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(n), t);

  // Residual of n.high*t plus correction for t being only approximately 1/d.high.
  vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), s), vmul_vd_vd_vd(nhh, tl), vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl),
                          vmul_vd_vd_vd(s, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));

  return vd2setxy_vd2_vd_vd(s, vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(vd2gety_vd_vd2(n), vmul_vd_vd_vd(s, vd2gety_vd_vd2(d))), u));
}

// Exact product of two doubles (Dekker TwoProduct).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
  vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh);
  vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);

  vdouble s = vmul_vd_vd_vd(x, y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl)));
}

// (double-double) * double.
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(y               ),  yl = vsub_vd_vd_vd(y               , yh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), y);
  return vd2setxy_vd2_vd_vd(s, vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2gety_vd_vd2(x), y)));
}

// Product of two double-doubles (x.low*y.low term dropped as negligible).
static INLINE CONST VECTOR_CC vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(y));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(s), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2gety_vd_vd2(y)), vmul_vd_vd_vd(vd2gety_vd_vd2(x), vd2getx_vd_vd2(y))));
}

// Product of two double-doubles collapsed to a single double; summed from
// smallest to largest term.
static INLINE CONST VECTOR_CC vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);
  vdouble yh = vupper_vd_vd(vd2getx_vd_vd2(y)), yl = vsub_vd_vd_vd(vd2getx_vd_vd2(y), yh);

  return vadd_vd_6vd(vmul_vd_vd_vd(vd2gety_vd_vd2(x), yh), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(y)), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yh));
}

// Square of a double-double.
static INLINE CONST VECTOR_CC vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);

  vdouble s = vmul_vd_vd_vd(vd2getx_vd_vd2(x), vd2getx_vd_vd2(x));
  return vd2setxy_vd2_vd_vd(s, vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(s), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(vd2getx_vd_vd2(x), vadd_vd_vd_vd(vd2gety_vd_vd2(x), vd2gety_vd_vd2(x)))));
}

// Square of a double-double collapsed to a single double.
static INLINE CONST VECTOR_CC vdouble ddsqu_vd_vd2(vdouble2 x) {
  vdouble xh = vupper_vd_vd(vd2getx_vd_vd2(x)), xl = vsub_vd_vd_vd(vd2getx_vd_vd2(x), xh);

  return vadd_vd_5vd(vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xh, vd2gety_vd_vd2(x)), vmul_vd_vd_vd(xl, xl), vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)), vmul_vd_vd_vd(xh, xh));
}

// 1/d as a double-double; residual 1 - d*t accumulated via split products.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd(vdouble d) {
  vdouble t = vrec_vd_vd(d);
  vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh);
  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);

  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));
}

// 1/(double-double), with the low divisor term included in the residual.
static INLINE CONST VECTOR_CC vdouble2 ddrec_vd2_vd2(vdouble2 d) {
  vdouble t = vrec_vd_vd(vd2getx_vd_vd2(d));
  vdouble dh = vupper_vd_vd(vd2getx_vd_vd2(d)), dl = vsub_vd_vd_vd(vd2getx_vd_vd2(d), dh);
  vdouble th = vupper_vd_vd(t                ), tl = vsub_vd_vd_vd(t                , th);

  return vd2setxy_vd2_vd_vd(t, vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl), vmul_vd_vd_vd(vd2gety_vd_vd2(d), t))));
}
#endif // #ifdef ENABLE_FMA_DP
|
||||
|
||||
// sqrt of a double-double: one refinement of the hardware sqrt t via
// (d + t*t) / t * 0.5 evaluated in double-double arithmetic.
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {
  vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(vd2getx_vd_vd2(d), vd2gety_vd_vd2(d)));
  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}

// sqrt of a plain double, returned as a double-double.
static INLINE CONST VECTOR_CC vdouble2 ddsqrt_vd2_vd(vdouble d) {
  vdouble t = vsqrt_vd_vd(d);
  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
}

// Fused multiply-add in double-double arithmetic: x*y + z.
// Uses the Fast2Sum-based add, so z is expected to dominate x*y in magnitude.
static INLINE CONST VECTOR_CC vdouble2 ddmla_vd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y, vdouble2 z) {
  return ddadd_vd2_vd2_vd2(z, ddmul_vd2_vd2_vd2(x, y));
}
|
||||
// ==== boundary: a new source file begins here (patch hunk header "@@ -0,0 +1,369 @@") ====
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// Float-float ("double-float") type: a value represented as an unevaluated
// sum x + y, with x the leading bits and y the trailing error term.
// SVE and RVV targets define vfloat2 in their own headers.
#if !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA))
#if !defined(SLEEF_ENABLE_CUDA)
typedef struct {
  vfloat x, y;   // x: high part, y: low (error) part
} vfloat2;
#else
typedef float2 vfloat2;   // CUDA's built-in float2 is layout-compatible
#endif

// Accessors/constructors so the same code works with struct and CUDA types.
static INLINE CONST VECTOR_CC vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static INLINE CONST VECTOR_CC vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
static INLINE CONST VECTOR_CC vfloat2 vf2setxy_vf2_vf_vf(vfloat x, vfloat y) { vfloat2 v; v.x = x; v.y = y; return v; }
static INLINE CONST VECTOR_CC vfloat2 vf2setx_vf2_vf2_vf(vfloat2 v, vfloat d) { v.x = d; return v; }
static INLINE CONST VECTOR_CC vfloat2 vf2sety_vf2_vf2_vf(vfloat2 v, vfloat d) { v.y = d; return v; }
#endif
|
||||
|
||||
// Dekker splitting helper for floats: clears the low 12 mantissa bits so
// products of two "upper" halves are exact in single precision.
static INLINE CONST VECTOR_CC vfloat vupper_vf_vf(vfloat d) {
  return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000)));
}

// Assemble a vfloat2 from vector high/low parts.
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {
  return vf2setxy_vf2_vf_vf(h, l);
}

// Broadcast scalar high/low parts into a vfloat2.
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_f_f(float h, float l) {
  return vf2setxy_vf2_vf_vf(vcast_vf_f(h), vcast_vf_f(l));
}

// Split a double constant into head float plus the rounding remainder,
// preserving (most of) its extra precision across the two float parts.
static INLINE CONST VECTOR_CC vfloat2 vcast_vf2_d(double d) {
  return vf2setxy_vf2_vf_vf(vcast_vf_f(d), vcast_vf_f(d - (float)d));
}

// Lane-wise select between two vfloat2 values (both halves use the same mask).
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {
  return vf2setxy_vf2_vf_vf(vsel_vf_vo_vf_vf(m, vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), vsel_vf_vo_vf_vf(m, vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}

// Lane-wise select between two scalar float-float constants (x1,y1) / (x0,y0).
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float x0, float y0) {
  return vf2setxy_vf2_vf_vf(vsel_vf_vo_f_f(o, x1, x0), vsel_vf_vo_f_f(o, y1, y0));
}

// Three-way select of double constants (split to float-float): o0 ? d0 : (o1 ? d1 : d2).
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2)));
}

// Four-way select of double constants: o0 ? d0 : (o1 ? d1 : (o2 ? d2 : d3)).
static INLINE CONST VECTOR_CC vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3))));
}
|
||||
|
||||
// Absolute value of a float-float pair: both components have their sign
// flipped exactly where the high part was negative (the pair's sign is
// carried by the high part, so the low part must follow it).
static INLINE CONST VECTOR_CC vfloat2 vabs_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2getx_vf_vf2(x)))),
                         vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(vf2getx_vf_vf2(x))), vreinterpret_vm_vf(vf2gety_vf_vf2(x)))));
}
|
||||
|
||||
// Multi-operand sums, evaluated strictly left to right:
// ((v0 + v1) + v2) + ...  The association order is significant for
// floating-point rounding and must not be changed.
static INLINE CONST VECTOR_CC vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
  return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
  return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
  return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) {
  return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5);
}

static INLINE CONST VECTOR_CC vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) {
  return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6);
}
|
||||
|
||||
// Multi-operand differences, evaluated strictly left to right:
// ((v0 - v1) - v2) - ...  Association order matters for FP rounding.
static INLINE CONST VECTOR_CC vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
  return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
  return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3);
}

static INLINE CONST VECTOR_CC vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
  return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4);
}
|
||||
|
||||
//
|
||||
|
||||
// Negation of a double-float: both components change sign.
static INLINE CONST VECTOR_CC vfloat2 dfneg_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vneg_vf_vf(vf2getx_vf_vf2(x)), vneg_vf_vf(vf2gety_vf_vf2(x)));
}

// Absolute value of a double-float: the high part gets |x.x|, and the
// sign bit of the original high part is XOR-ed into the low part so the
// low part's orientation relative to the high part is preserved.
static INLINE CONST VECTOR_CC vfloat2 dfabs_vf2_vf2(vfloat2 x) {
  return vcast_vf2_vf_vf(vabs_vf_vf(vf2getx_vf_vf2(x)),
			 vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vf2gety_vf_vf2(x)), vand_vm_vm_vm(vreinterpret_vm_vf(vf2getx_vf_vf2(x)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))))));
}
|
||||
|
||||
// Renormalize a double-float so that the high part is the rounded sum of
// both components and the low part is the remaining error (Fast2Sum on
// the two components).
static INLINE CONST VECTOR_CC vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(t), vf2gety_vf_vf2(t));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(t), s), vf2gety_vf_vf2(t)));
}

// Scale a double-float by a plain float; exact when s is a power of two.
static INLINE CONST VECTOR_CC vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {
  return vf2setxy_vf2_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(d), s), vmul_vf_vf_vf(vf2gety_vf_vf2(d), s));
}
|
||||
|
||||
// float + float -> double-float, Fast2Sum variant.  Like the other dfadd_
// (non-"2") routines it relies on the "|x| >= |y|" convention noted on the
// sibling dfadd/dfsub helpers below; the caller must ensure the ordering.
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}

// float + float -> double-float, Knuth TwoSum variant: no magnitude
// ordering requirement, at the cost of a few extra operations.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(x, y);
  vfloat v = vsub_vf_vf_vf(s, x);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v)));
}

// float + double-float, TwoSum on the high parts; y's low part is folded
// into the error term.  No magnitude ordering requirement.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
  vfloat v = vsub_vf_vf_vf(s, x);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v)), vf2gety_vf_vf2(y)));
}
|
||||
|
||||
// double-float + float, Fast2Sum variant ("|x| >= |y|" convention, see
// the commented siblings below).
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y, vf2gety_vf_vf2(x)));
}

// double-float - float, Fast2Sum-style (same ordering convention).
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), y), vf2gety_vf_vf2(x)));
}

// double-float + float, TwoSum variant: no ordering requirement.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), y);
  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(y, v));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vf2gety_vf_vf2(x)));
}

// float + double-float, Fast2Sum variant (same ordering convention).
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(x, vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_3vf(vsub_vf_vf_vf(x, s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(y)));
}
|
||||
|
||||
// double-float + double-float, Fast2Sum variant.
static INLINE CONST VECTOR_CC vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  // |x| >= |y| -- precondition for the error term to be exact
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_4vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), s), vf2getx_vf_vf2(y), vf2gety_vf_vf2(x), vf2gety_vf_vf2(y)));
}

// double-float + double-float, TwoSum variant: no ordering requirement.
static INLINE CONST VECTOR_CC vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat s = vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  vfloat v = vsub_vf_vf_vf(s, vf2getx_vf_vf2(x));
  vfloat t = vadd_vf_vf_vf(vsub_vf_vf_vf(vf2getx_vf_vf2(x), vsub_vf_vf_vf(s, v)), vsub_vf_vf_vf(vf2getx_vf_vf2(y), v));
  return vf2setxy_vf2_vf_vf(s, vadd_vf_vf_vf(t, vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(y))));
}
|
||||
|
||||
// float - float -> double-float (Fast2Sum subtraction).
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {
  // |x| >= |y| -- precondition for the error term to be exact
  vfloat s = vsub_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(vsub_vf_vf_vf(x, s), y));
}

// double-float - double-float (Fast2Sum-style; sequential accumulation of
// the error term, order significant).
static INLINE CONST VECTOR_CC vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  // |x| >= |y| -- precondition for the error term to be exact
  vfloat s = vsub_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  vfloat t = vsub_vf_vf_vf(vf2getx_vf_vf2(x), s);
  t = vsub_vf_vf_vf(t, vf2getx_vf_vf2(y));
  t = vadd_vf_vf_vf(t, vf2gety_vf_vf2(x));
  return vf2setxy_vf2_vf_vf(s, vsub_vf_vf_vf(t, vf2gety_vf_vf2(y)));
}
|
||||
|
||||
#ifdef ENABLE_FMA_SP

// FMA implementations of the double-float products/quotients: the rounding
// error of each product is recovered with a single fused multiply-add
// instead of Dekker splitting.
// NOTE(review): the math below is consistent with the SLEEF helper
// convention vfmapn(a,b,c) = a*b - c and vfmanp(a,b,c) = c - a*b --
// confirm against the per-ISA helper headers.

// Double-float division n / d.
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));        // reciprocal estimate
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);  // quotient estimate
  vfloat u = vfmapn_vf_vf_vf_vf(t, vf2getx_vf_vf2(n), s);
  vfloat v = vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), t, vcast_vf_f(1)));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(s, v, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(n), t, u)));
}

// Exact product of two floats (TwoProdFMA): low part is the FMA-computed
// rounding error of the high part.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat s = vmul_vf_vf_vf(x, y);
  return vf2setxy_vf2_vf_vf(s, vfmapn_vf_vf_vf_vf(x, y, s));
}

// Square of a double-float.
static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), vf2gety_vf_vf2(x), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), s)));
}

// Square of a double-float, returning only a single float result.
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x), vadd_vf_vf_vf(vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x)), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(x))));
}

// Product of two double-floats.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y));
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), s))));
}

// Product of two double-floats, returning only a single float result.
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
  return vfma_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y), vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y))));
}

// Product of a double-float and a float.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y);
  return vf2setxy_vf2_vf_vf(s, vfma_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, vfmapn_vf_vf_vf_vf(vf2getx_vf_vf2(x), y, s)));
}

// Reciprocal of a float as a double-float (one correction step).
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
  vfloat s = vrec_vf_vf(d);
  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(d, s, vcast_vf_f(1))));
}

// Reciprocal of a double-float (one correction step on both components).
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
  vfloat s = vrec_vf_vf(vf2getx_vf_vf2(d));
  return vf2setxy_vf2_vf_vf(s, vmul_vf_vf_vf(s, vfmanp_vf_vf_vf_vf(vf2gety_vf_vf2(d), s, vfmanp_vf_vf_vf_vf(vf2getx_vf_vf2(d), s, vcast_vf_f(1)))));
}
|
||||
#else

// Non-FMA implementations: operands are split into high/low halves with
// vupper_vf_vf (presumably masking off low mantissa bits, Dekker-style
// splitting -- confirm in the per-ISA helper headers) so that partial
// products of the halves can be accumulated with vmla without losing the
// rounding error.

// Double-float division n / d.
static INLINE CONST VECTOR_CC vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));  // reciprocal estimate
  vfloat dh  = vupper_vf_vf(vf2getx_vf_vf2(d)), dl  = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
  vfloat th  = vupper_vf_vf(t                ), tl  = vsub_vf_vf_vf(t                , th);
  vfloat nhh = vupper_vf_vf(vf2getx_vf_vf2(n)), nhl = vsub_vf_vf_vf(vf2getx_vf_vf2(n), nhh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(n), t);  // quotient estimate

  vfloat u, w;
  // w = 1 - d.x*t, accumulated from the exact half-products
  w = vcast_vf_f(-1);
  w = vmla_vf_vf_vf_vf(dh, th, w);
  w = vmla_vf_vf_vf_vf(dh, tl, w);
  w = vmla_vf_vf_vf_vf(dl, th, w);
  w = vmla_vf_vf_vf_vf(dl, tl, w);
  w = vneg_vf_vf(w);

  // u = n.x*t - s (rounding error of the quotient estimate), plus s*w
  u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(s));
  u = vmla_vf_vf_vf_vf(nhh, tl, u);
  u = vmla_vf_vf_vf_vf(nhl, th, u);
  u = vmla_vf_vf_vf_vf(nhl, tl, u);
  u = vmla_vf_vf_vf_vf(s, w, u);

  return vf2setxy_vf2_vf_vf(s, vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(vf2gety_vf_vf2(n), vmul_vf_vf_vf(s, vf2gety_vf_vf2(d))), u));
}

// Exact product of two floats via Dekker splitting (TwoProd).
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
  vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh);
  vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);

  vfloat s = vmul_vf_vf_vf(x, y), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);

  return vf2setxy_vf2_vf_vf(s, t);
}

// Product of a double-float and a float.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(y                 ), yl = vsub_vf_vf_vf(y                 , yh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), y), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);
  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), y, t);  // fold in x's low part

  return vf2setxy_vf2_vf_vf(s, t);
}

// Product of two double-floats.
static INLINE CONST VECTOR_CC vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(y)), t;

  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(xl, yh, t);
  t = vmla_vf_vf_vf_vf(xh, yl, t);
  t = vmla_vf_vf_vf_vf(xl, yl, t);
  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vf2gety_vf_vf2(y), t);  // cross terms
  t = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(x), vf2getx_vf_vf2(y), t);

  return vf2setxy_vf2_vf_vf(s, t);
}

// Product of two double-floats, returning only a single float result.
// Partial products are summed smallest-first via vadd_vf_6vf.
static INLINE CONST VECTOR_CC vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);
  vfloat yh = vupper_vf_vf(vf2getx_vf_vf2(y)), yl = vsub_vf_vf_vf(vf2getx_vf_vf2(y), yh);

  return vadd_vf_6vf(vmul_vf_vf_vf(vf2gety_vf_vf2(x), yh), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(y)), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yh));
}

// Square of a double-float.
static INLINE CONST VECTOR_CC vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);

  vfloat s = vmul_vf_vf_vf(vf2getx_vf_vf2(x), vf2getx_vf_vf2(x)), t;

  t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(s));
  t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t);  // 2*xh*xl
  t = vmla_vf_vf_vf_vf(xl, xl, t);
  t = vmla_vf_vf_vf_vf(vf2getx_vf_vf2(x), vadd_vf_vf_vf(vf2gety_vf_vf2(x), vf2gety_vf_vf2(x)), t);  // 2*x.x*x.y

  return vf2setxy_vf2_vf_vf(s, t);
}

// Square of a double-float, returning only a single float result.
static INLINE CONST VECTOR_CC vfloat dfsqu_vf_vf2(vfloat2 x) {
  vfloat xh = vupper_vf_vf(vf2getx_vf_vf2(x)), xl = vsub_vf_vf_vf(vf2getx_vf_vf2(x), xh);

  return vadd_vf_5vf(vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xh, vf2gety_vf_vf2(x)), vmul_vf_vf_vf(xl, xl), vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)), vmul_vf_vf_vf(xh, xh));
}

// Reciprocal of a float as a double-float (one Newton-style correction;
// u accumulates d*t - 1 from exact half-products).
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf(vfloat d) {
  vfloat t = vrec_vf_vf(d);
  vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh);
  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);

  vfloat u = vcast_vf_f(-1);
  u = vmla_vf_vf_vf_vf(dh, th, u);
  u = vmla_vf_vf_vf_vf(dh, tl, u);
  u = vmla_vf_vf_vf_vf(dl, th, u);
  u = vmla_vf_vf_vf_vf(dl, tl, u);

  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}

// Reciprocal of a double-float (same scheme; d's low part is folded in).
static INLINE CONST VECTOR_CC vfloat2 dfrec_vf2_vf2(vfloat2 d) {
  vfloat t = vrec_vf_vf(vf2getx_vf_vf2(d));
  vfloat dh = vupper_vf_vf(vf2getx_vf_vf2(d)), dl = vsub_vf_vf_vf(vf2getx_vf_vf2(d), dh);
  vfloat th = vupper_vf_vf(t                ), tl = vsub_vf_vf_vf(t                , th);

  vfloat u = vcast_vf_f(-1);
  u = vmla_vf_vf_vf_vf(dh, th, u);
  u = vmla_vf_vf_vf_vf(dh, tl, u);
  u = vmla_vf_vf_vf_vf(dl, th, u);
  u = vmla_vf_vf_vf_vf(dl, tl, u);
  u = vmla_vf_vf_vf_vf(vf2gety_vf_vf2(d), t, u);

  return vf2setxy_vf2_vf_vf(t, vmul_vf_vf_vf(vneg_vf_vf(t), u));
}
#endif
|
||||
|
||||
// Square root of a double-float.
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {
#ifdef ENABLE_RECSQRT_SP
  // Reciprocal-sqrt path: refine x ~= 1/sqrt(d) and form
  // -0.5 * r * (r*x - 3), where r = d*x.
  vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
  vfloat2 r = dfmul_vf2_vf2_vf(d, x);
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x), vcast_vf_f(-3.0))), vcast_vf_f(-0.5));
#else
  // Heron-style refinement of the hardware sqrt of (d.x + d.y):
  // 0.5 * (d + t*t) / t.
  vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(vf2getx_vf_vf2(d), vf2gety_vf_vf2(d)));
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5));
#endif
}

// Square root of a plain float as a double-float (same Heron refinement).
static INLINE CONST VECTOR_CC vfloat2 dfsqrt_vf2_vf(vfloat d) {
  vfloat t = vsqrt_vf_vf(d);
  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5f));
}
|
||||
@@ -0,0 +1,40 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// These are macros for evaluating polynomials using Estrin's method
|
||||
|
||||
// POLYn(x, ..., c_{n-1}, ..., c0) evaluates the degree-(n-1) polynomial
// c_{n-1}*x^{n-1} + ... + c1*x + c0 with Estrin's scheme.  The caller must
// precompute and pass the powers x2 = x^2, x4 = x^4, x8 = x^8, x16 = x^16
// as required by each macro.  MLA(a, b, c) is the multiply-add a*b + c and
// C2V broadcasts a scalar coefficient to a vector (both defined by the
// including translation unit).  Coefficients are listed highest degree
// first.
#define POLY2(x, c1, c0) MLA(x, C2V(c1), C2V(c0))
#define POLY3(x, x2, c2, c1, c0) MLA(x2, C2V(c2), MLA(x, C2V(c1), C2V(c0)))
#define POLY4(x, x2, c3, c2, c1, c0) MLA(x2, MLA(x, C2V(c3), C2V(c2)), MLA(x, C2V(c1), C2V(c0)))
#define POLY5(x, x2, x4, c4, c3, c2, c1, c0) MLA(x4, C2V(c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY6(x, x2, x4, c5, c4, c3, c2, c1, c0) MLA(x4, POLY2(x, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY7(x, x2, x4, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY3(x, x2, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0) MLA(x4, POLY4(x, x2, c7, c6, c5, c4), POLY4(x, x2, c3, c2, c1, c0))
#define POLY9(x, x2, x4, x8, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, C2V(c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY10(x, x2, x4, x8, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY2(x, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY11(x, x2, x4, x8, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY3(x, x2, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY12(x, x2, x4, x8, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY4(x, x2, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY13(x, x2, x4, x8, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY5(x, x2, x4, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY14(x, x2, x4, x8, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY6(x, x2, x4, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY15(x, x2, x4, x8, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY7(x, x2, x4, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x8, POLY8(x, x2, x4, cf, ce, cd, cc, cb, ca, c9, c8), POLY8(x, x2, x4, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY17(x, x2, x4, x8, x16, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, C2V(d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY18(x, x2, x4, x8, x16, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY2(x, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY19(x, x2, x4, x8, x16, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY3(x, x2, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY20(x, x2, x4, x8, x16, d3, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY4(x, x2, d3, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
#define POLY21(x, x2, x4, x8, x16, d4, d3, d2, d1, d0, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0)\
  MLA(x16, POLY5(x, x2, x4, d4, d3, d2, d1, d0), POLY16(x, x2, x4, x8, cf, ce, cd, cc, cb, ca, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0))
|
||||
@@ -0,0 +1,92 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <quadmath.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
// Convert an MPFR value to __float128 by peeling off three successive
// double approximations of the fraction (d0 + d1 + d2 carries well over
// the 113 bits of a binary128 significand) and rescaling with ldexpq.
// NaN is detected up front via the double conversion.  Note: the 'rnd'
// parameter is accepted for signature symmetry but the conversions below
// all use GMP_RNDN.
static __float128 mpfr_get_f128(mpfr_t m, mpfr_rnd_t rnd) {
  if (isnan(mpfr_get_d(m, GMP_RNDN))) return __builtin_nan("");

  mpfr_t frr, frd;
  mpfr_inits(frr, frd, NULL);

  // Split m into fraction frr in [0.5, 1) and exponent e.
  mpfr_exp_t e;
  mpfr_frexp(&e, frr, m, GMP_RNDN);

  // First double approximation and its residual.
  double d0 = mpfr_get_d(frr, GMP_RNDN);
  mpfr_set_d(frd, d0, GMP_RNDN);
  mpfr_sub(frr, frr, frd, GMP_RNDN);

  // Second double approximation of the residual.
  double d1 = mpfr_get_d(frr, GMP_RNDN);
  mpfr_set_d(frd, d1, GMP_RNDN);
  mpfr_sub(frr, frr, frd, GMP_RNDN);

  // Third (final) residual approximation.
  double d2 = mpfr_get_d(frr, GMP_RNDN);

  mpfr_clears(frr, frd, NULL);
  // Sum smallest-first, then restore the exponent.
  return ldexpq((__float128)d2 + (__float128)d1 + (__float128)d0, e);
}
|
||||
|
||||
// Set an MPFR value from a __float128 by round-tripping through a decimal
// string with 50 significant digits (more than binary128's ~36 decimal
// digits, so the conversion is value-preserving).
static void mpfr_set_f128(mpfr_t frx, __float128 f, mpfr_rnd_t rnd) {
  char s[128];
  quadmath_snprintf(s, 120, "%.50Qg", f);
  mpfr_set_str(frx, s, 10, rnd);
}

// Print a __float128 to stdout with 50 significant digits (no newline).
static void printf128(__float128 f) {
  char s[128];
  quadmath_snprintf(s, 120, "%.50Qg", f);
  printf("%s", s);
}
|
||||
|
||||
// Rotating pool of result buffers: up to 16 strings returned by toBC /
// toBCq may be live at once (e.g. several calls in one printf argument
// list).  Not thread-safe.
static char frstr[16][1000];
static int frstrcnt = 0;

// Render a double exactly as "[-]mantissa*2^exponent" -- presumably for
// feeding an arbitrary-precision calculator such as bc (TODO confirm
// against callers).  Returns a pointer into the rotating frstr pool.
static char *toBC(double d) {
  union {
    double d;
    uint64_t u64;
    int64_t i64;
  } cnv;

  cnv.d = d;

  int64_t l = cnv.i64;
  int e = (int)((l >> 52) & ~(-1L << 11));               // biased 11-bit exponent
  int s = (int)(l >> 63);                                // sign (non-zero if negative)
  // 52-bit fraction with the implicit leading bit restored (0 stays 0).
  l = d == 0 ? 0 : ((l & ~((-1L) << 52)) | (1L << 52));

  char *ptr = frstr[(frstrcnt++) & 15];

  // e - 0x3ff removes the bias; -52 accounts for the fraction width.
  sprintf(ptr, "%s%lld*2^%d", s != 0 ? "-" : "", (long long int)l, (e-0x3ff-52));
  return ptr;
}
|
||||
|
||||
// Render a __float128 exactly as "[-]mantissa*2^exponent" (binary128
// counterpart of toBC).  The 113-bit mantissa does not fit in a uint64_t,
// so it is printed as two decimal chunks: high part and a zero-padded
// 19-digit low part (10^19 base).  Returns a pointer into the rotating
// frstr pool; not thread-safe.
static char *toBCq(__float128 d) {
  union {
    __float128 d;
    __uint128_t u128;
  } cnv;

  cnv.d = d;

  __uint128_t m = cnv.u128;
  int e = (int)((m >> 112) & ~(-1L << 15));  // biased 15-bit exponent
  int s = (int)(m >> 127);                   // sign bit
  // 112-bit fraction with the implicit leading bit restored (0 stays 0).
  m = d == 0 ? 0 : ((m & ((((__uint128_t)1) << 112)-1)) | ((__uint128_t)1 << 112));

  // Split the mantissa into decimal chunks of 10^19.
  uint64_t h = m / UINT64_C(10000000000000000000);
  uint64_t l = m % UINT64_C(10000000000000000000);

  char *ptr = frstr[(frstrcnt++) & 15];

  // e - 0x3fff removes the bias; -112 accounts for the fraction width.
  sprintf(ptr, "%s%" PRIu64 "%019" PRIu64 "*2^%d", s != 0 ? "-" : "", h, l, (e-0x3fff-112));

  return ptr;
}
|
||||
|
||||
// NaN test for Sleef_quad: NaN is the only IEEE value not equal to itself.
static int xisnanq(Sleef_quad x) { return x != x; }
// Infinity test: matches both +inf and -inf.
static int xisinfq(Sleef_quad x) { return x == (Sleef_quad)__builtin_inf() || x == -(Sleef_quad)__builtin_inf(); }
// Finite test: neither NaN nor infinite.  Uses the local xisinfq wrapper
// for consistency (the original called libquadmath's isinfq directly,
// bypassing the Sleef_quad-typed sibling defined just above).
static int xisfiniteq(Sleef_quad x) { return !xisnanq(x) && !xisinfq(x); }
|
||||
@@ -0,0 +1,683 @@
|
||||
double2
|
||||
double3
|
||||
float2
|
||||
atan2k
|
||||
atan2kf
|
||||
atan2kf_u1
|
||||
atan2k_u1
|
||||
cospik
|
||||
cospifk
|
||||
dd
|
||||
dd2
|
||||
dd2geta_vd2_dd2
|
||||
dd2getb_vd2_dd2
|
||||
dd2setab_dd2_vd2_vd2
|
||||
ddabs_vd2_vd2
|
||||
ddadd2_vd2_vd2_vd
|
||||
ddadd2_vd2_vd2_vd2
|
||||
ddadd2_vd2_vd_vd
|
||||
ddadd2_vd2_vd_vd2
|
||||
ddadd_vd2_vd2_vd
|
||||
ddadd_vd2_vd2_vd2
|
||||
ddadd_vd2_vd_vd
|
||||
ddadd_vd2_vd_vd2
|
||||
dddiv_vd2_vd2_vd2
|
||||
ddi
|
||||
ddi_t
|
||||
ddigetdd_vd2_ddi
|
||||
ddigeti_vi_ddi
|
||||
ddisetdd_ddi_ddi_vd2
|
||||
ddisetddi_ddi_vd2_vi
|
||||
ddmla_vd2_vd2_vd2_vd2
|
||||
ddmla_vd2_vd_vd2_vd2
|
||||
ddmul_vd2_vd2_vd
|
||||
ddmul_vd2_vd2_vd2
|
||||
ddmul_vd2_vd_vd
|
||||
ddmul_vd_vd2_vd2
|
||||
ddneg_vd2_vd2
|
||||
ddnormalize_vd2_vd2
|
||||
ddrec_vd2_vd
|
||||
ddrec_vd2_vd2
|
||||
ddscale_vd2_vd2_d
|
||||
ddscale_vd2_vd2_vd
|
||||
ddsqrt_vd2_vd
|
||||
ddsqrt_vd2_vd2
|
||||
ddsqu_vd2_vd2
|
||||
ddsqu_vd_vd2
|
||||
ddsub_vd2_vd2_vd
|
||||
ddsub_vd2_vd2_vd2
|
||||
ddsub_vd2_vd_vd
|
||||
df
|
||||
df2
|
||||
df2geta_vf2_df2
|
||||
df2getb_vf2_df2
|
||||
df2setab_df2_vf2_vf2
|
||||
dfabs_vf2_vf2
|
||||
dfadd2_vf2_vf2_vf
|
||||
dfadd2_vf2_vf2_vf2
|
||||
dfadd2_vf2_vf_vf
|
||||
dfadd2_vf2_vf_vf2
|
||||
dfadd_vf2_vf2_vf
|
||||
dfadd_vf2_vf2_vf2
|
||||
dfadd_vf2_vf_vf
|
||||
dfadd_vf2_vf_vf2
|
||||
dfdiv_vf2_vf2_vf2
|
||||
dfi
|
||||
dfi_t
|
||||
dfigetdf_vf2_dfi
|
||||
dfigeti_vi2_dfi
|
||||
dfisetdf_dfi_dfi_vf2
|
||||
dfisetdfi_dfi_vf2_vi2
|
||||
dfmla_vf2_vf_vf2_vf2
|
||||
dfmul_vf2_vf2_vf
|
||||
dfmul_vf2_vf2_vf2
|
||||
dfmul_vf2_vf_vf
|
||||
dfmul_vf_vf2_vf2
|
||||
dfneg_vf2_vf2
|
||||
dfnormalize_vf2_vf2
|
||||
dfrec_vf2_vf
|
||||
dfrec_vf2_vf2
|
||||
dfscale_vf2_vf2_vf
|
||||
dfsqrt_vf2_vf
|
||||
dfsqrt_vf2_vf2
|
||||
dfsqu_vf2_vf2
|
||||
dfsqu_vf_vf2
|
||||
dfsub_vf2_vf2_vf
|
||||
dfsub_vf2_vf2_vf2
|
||||
dfsub_vf2_vf_vf
|
||||
di_t
|
||||
digetd_vd_di
|
||||
digeti_vi_di
|
||||
disetdi_di_vd_vi
|
||||
expk
|
||||
expk2
|
||||
expk2f
|
||||
expk3f
|
||||
expkf
|
||||
expm1fk
|
||||
expm1k
|
||||
fi_t
|
||||
figetd_vf_di
|
||||
figeti_vi2_di
|
||||
fisetdi_fi_vf_vi2
|
||||
gammafk
|
||||
gammak
|
||||
imdvq_vq_vm_vm
|
||||
logk
|
||||
logk2
|
||||
logk2f
|
||||
logk3f
|
||||
logkf
|
||||
poly2dd
|
||||
poly2dd_b
|
||||
poly2df
|
||||
poly2df_b
|
||||
poly4dd
|
||||
poly4df
|
||||
pragma
|
||||
rempi
|
||||
rempif
|
||||
rempisub
|
||||
rempisubf
|
||||
sinpifk
|
||||
sinpik
|
||||
td
|
||||
tdi_t
|
||||
tdigeti_vi_tdi
|
||||
tdigettd_vd3_tdi
|
||||
tdigetx_vd_tdi
|
||||
tdisettdi_tdi_vd3_vi
|
||||
tdx
|
||||
tdxgetd3_vd3_tdx
|
||||
tdxgetd3x_vd_tdx
|
||||
tdxgetd3y_vd_tdx
|
||||
tdxgetd3z_vd_tdx
|
||||
tdxgete_vm_tdx
|
||||
tdxsetd3_tdx_tdx_vd3
|
||||
tdxsete_tdx_tdx_vm
|
||||
tdxseted3_tdx_vm_vd3
|
||||
tdxsetexyz_tdx_vm_vd_vd_vd
|
||||
tdxsetx_tdx_tdx_vd
|
||||
tdxsetxyz_tdx_tdx_vd_vd_vd
|
||||
tdxsety_tdx_tdx_vd
|
||||
tdxsetz_tdx_tdx_vd
|
||||
vabs_vd_vd
|
||||
vabs_vf2_vf2
|
||||
vabs_vf_vf
|
||||
add128_vq_vq_vq
|
||||
vadd64_vm_vm_vm
|
||||
vadd_vd_3vd
|
||||
vadd_vd_4vd
|
||||
vadd_vd_5vd
|
||||
vadd_vd_6vd
|
||||
vadd_vd_7vd
|
||||
vadd_vd_vd_vd
|
||||
vadd_vf_3vf
|
||||
vadd_vf_4vf
|
||||
vadd_vf_5vf
|
||||
vadd_vf_6vf
|
||||
vadd_vf_7vf
|
||||
vadd_vf_vf_vf
|
||||
vadd_vi2_vi2_vi2
|
||||
vadd_vi_vi_vi
|
||||
vand_vi2_vi2_vi2
|
||||
vand_vi2_vo_vi2
|
||||
vand_vi_vi_vi
|
||||
vand_vi_vo_vi
|
||||
vand_vm_vm_vm
|
||||
vand_vm_vo32_vm
|
||||
vand_vm_vo64_vm
|
||||
vand_vo_vo_vo
|
||||
vandnot_vi2_vi2_vi2
|
||||
vandnot_vi2_vo_vi2
|
||||
vandnot_vi_vi_vi
|
||||
vandnot_vi_vo_vi
|
||||
vandnot_vm_vm_vm
|
||||
vandnot_vm_vo32_vm
|
||||
vandnot_vm_vo64_vm
|
||||
vandnot_vo_vo_vo
|
||||
vargquad
|
||||
vavailability_i
|
||||
cast_aq_vq
|
||||
vcast_d_vd
|
||||
vcast_f_vf
|
||||
vcast_vd2_d2
|
||||
vcast_vd2_d_d
|
||||
vcast_vd2_vd_vd
|
||||
vcast_vd_d
|
||||
vcast_vd_vi
|
||||
vcast_vd_vm
|
||||
vcast_vf2_d
|
||||
vcast_vf2_f_f
|
||||
vcast_vf2_vf_vf
|
||||
vcast_vf_f
|
||||
vcast_vf_vi2
|
||||
vcast_vi2_i
|
||||
vcast_vi2_i_i
|
||||
vcast_vi2_vm
|
||||
vcast_vi_i
|
||||
vcast_vi_vm
|
||||
vcast_vm_i64
|
||||
vcast_vm_i_i
|
||||
vcast_vm_u64
|
||||
vcast_vm_vi
|
||||
vcast_vm_vi2
|
||||
vcast_vm_vo
|
||||
vcast_vo_i
|
||||
vcast_vo32_vo64
|
||||
vcast_vo64_vo32
|
||||
cast_vq_aq
|
||||
vclearlsb_vd_vd_i
|
||||
vcopysign_vd_vd_vd
|
||||
vcopysign_vf_vf_vf
|
||||
vd
|
||||
vd2getx_vd_vd2
|
||||
vd2gety_vd_vd2
|
||||
vd2setx_vd2_vd2_vd
|
||||
vd2setxy_vd2_vd_vd
|
||||
vd2sety_vd2_vd2_vd
|
||||
vd3getx_vd_vd3
|
||||
vd3gety_vd_vd3
|
||||
vd3getz_vd_vd3
|
||||
vd3setx_vd3_vd3_vd
|
||||
vd3setxyz_vd3_vd_vd_vd
|
||||
vd3sety_vd3_vd3_vd
|
||||
vd3setz_vd3_vd3_vd
|
||||
vdiv_vd_vd_vd
|
||||
vdiv_vf_vf_vf
|
||||
vdouble
|
||||
vdouble2
|
||||
vdouble3
|
||||
veq64_vo_vm_vm
|
||||
veq_vi2_vi2_vi2
|
||||
veq_vi_vi_vi
|
||||
veq_vo_vd_vd
|
||||
veq_vo_vf_vf
|
||||
veq_vo_vi2_vi2
|
||||
veq_vo_vi_vi
|
||||
versatileVector
|
||||
vf2getx_vf_vf2
|
||||
vf2gety_vf_vf2
|
||||
vf2setx_vf2_vf2_vf
|
||||
vf2setxy_vf2_vf_vf
|
||||
vf2sety_vf2_vf2_vf
|
||||
vfloat
|
||||
vfloat2
|
||||
vfma_vd_vd_vd_vd
|
||||
vfma_vf_vf_vf_vf
|
||||
vfmann_vd_vd_vd_vd
|
||||
vfmann_vf_vf_vf_vf
|
||||
vfmanp_vd_vd_vd_vd
|
||||
vfmanp_vf_vf_vf_vf
|
||||
vfmapn_vd_vd_vd_vd
|
||||
vfmapn_vf_vf_vf_vf
|
||||
vfmapp_vd_vd_vd_vd
|
||||
vfmapp_vf_vf_vf_vf
|
||||
vgather_vd_p_vi
|
||||
vgather_vf_p_vi2
|
||||
vge_vo_vd_vd
|
||||
vge_vo_vf_vf
|
||||
vgetexp_vd_vd
|
||||
vgetexp_vf_vf
|
||||
vgetmant_vd_vd
|
||||
vgetmant_vf_vf
|
||||
vgt64_vo_vm_vm
|
||||
vgt_vi2_vi2_vi2
|
||||
vgt_vi_vi_vi
|
||||
vgt_vo_vd_vd
|
||||
vgt_vo_vf_vf
|
||||
vgt_vo_vi2_vi2
|
||||
vgt_vo_vi_vi
|
||||
vilogb2k_vi2_vf
|
||||
vilogb2k_vi_vd
|
||||
vilogb2k_vm_vd
|
||||
vilogb3k_vm_vd
|
||||
vilogbk_vi2_vf
|
||||
vilogbk_vi_vd
|
||||
vilogbk_vm_vd
|
||||
vint
|
||||
vint2
|
||||
vint64
|
||||
visinf2_vd_vd_vd
|
||||
visinf2_vf_vf_vf
|
||||
visinf_vo_vd
|
||||
visinf_vo_vf
|
||||
visint_vo_vd
|
||||
visint_vo_vf
|
||||
visminf_vo_vd
|
||||
visminf_vo_vf
|
||||
visnan_vo_vd
|
||||
visnan_vo_vf
|
||||
visnegzero_vo_vd
|
||||
visnegzero_vo_vf
|
||||
visnonfinite_vo_vd
|
||||
visnumber_vo_vd
|
||||
visnumber_vo_vf
|
||||
visodd_vo_vd
|
||||
vispinf_vo_vd
|
||||
vispinf_vo_vf
|
||||
vldexp1_vd_vd_vm
|
||||
vldexp2_vd_vd_vi
|
||||
vldexp2_vd_vd_vm
|
||||
vldexp2_vf_vf_vi2
|
||||
vldexp3_vd_vd_vi
|
||||
vldexp3_vd_vd_vm
|
||||
vldexp3_vf_vf_vi2
|
||||
vldexp_vd_vd_vi
|
||||
vldexp_vf_vf_vi2
|
||||
vle_vo_vd_vd
|
||||
vle_vo_vf_vf
|
||||
vload_vd_p
|
||||
vload_vf_p
|
||||
vloadu_vd_p
|
||||
vloadu_vf_p
|
||||
vloadu_vi2_p
|
||||
vloadu_vi_p
|
||||
loadu_vq_p
|
||||
vlt64_vo_vm_vm
|
||||
vlt_vo_vd_vd
|
||||
vlt_vo_vf_vf
|
||||
vmask
|
||||
vmax_vd_vd_vd
|
||||
vmax_vf_vf_vf
|
||||
vmin_vd_vd_vd
|
||||
vmin_vf_vf_vf
|
||||
vmla_vd_vd_vd_vd
|
||||
vmla_vf_vf_vf_vf
|
||||
vmlanp_vd_vd_vd_vd
|
||||
vmlanp_vf_vf_vf_vf
|
||||
vmlapn_vd_vd_vd_vd
|
||||
vmlapn_vf_vf_vf_vf
|
||||
vmlsubadd_vd_vd_vd_vd
|
||||
vmlsubadd_vf_vf_vf_vf
|
||||
vmul_vd_vd_vd
|
||||
vmul_vf_vf_vf
|
||||
vmulsign_vd_vd_vd
|
||||
vmulsign_vf_vf_vf
|
||||
vneg64_vm_vm
|
||||
vneg_vd_vd
|
||||
vneg_vf_vf
|
||||
vneg_vi2_vi2
|
||||
vneg_vi_vi
|
||||
vnegpos_vd_vd
|
||||
vnegpos_vf_vf
|
||||
vneq_vo_vd_vd
|
||||
vneq_vo_vf_vf
|
||||
vnot_vo32_vo32
|
||||
vnot_vo64_vo64
|
||||
vopmask
|
||||
vor_vi2_vi2_vi2
|
||||
vor_vi_vi_vi
|
||||
vor_vm_vm_vm
|
||||
vor_vm_vo32_vm
|
||||
vor_vm_vo64_vm
|
||||
vor_vo_vo_vo
|
||||
vorsign_vd_vd_vd
|
||||
vorsign_vf_vf_vf
|
||||
vposneg_vd_vd
|
||||
vposneg_vf_vf
|
||||
vpow2i_vd_vi
|
||||
vpow2i_vd_vm
|
||||
vpow2i_vf_vi2
|
||||
vprefetch_v_p
|
||||
vptrunc_vd_vd
|
||||
vptrunc_vf_vf
|
||||
vqgetx_vm_vq
|
||||
vqgety_vm_vq
|
||||
vqsetx_vq_vq_vm
|
||||
vqsetxy_vq_vm_vm
|
||||
vqsety_vq_vq_vm
|
||||
vquad
|
||||
vrec_vd_vd
|
||||
vrec_vf_vf
|
||||
vreinterpret_vd_vf
|
||||
vreinterpret_vd_vm
|
||||
vreinterpret_vf_vd
|
||||
vreinterpret_vf_vi2
|
||||
vreinterpret_vf_vm
|
||||
vreinterpret_vi2_vf
|
||||
vreinterpret_vi64_vm
|
||||
vreinterpret_vm_vd
|
||||
vreinterpret_vm_vf
|
||||
vreinterpret_vm_vi64
|
||||
vreinterpret_vm_vu64
|
||||
vreinterpret_vu64_vm
|
||||
vrev21_vd_vd
|
||||
vrev21_vf_vf
|
||||
vreva2_vd_vd
|
||||
vreva2_vf_vf
|
||||
vrint_vd_vd
|
||||
vrint2_vd_vd
|
||||
vrint_vf_vf
|
||||
vrint_vi2_vf
|
||||
vrint_vi_vd
|
||||
vrintfk2_vf_vf
|
||||
vrintk2_vd_vd
|
||||
vscatter2_v_p_i_i_vd
|
||||
vscatter2_v_p_i_i_vf
|
||||
vsel_vd2_vo_d_d_d_d
|
||||
vsel_vd2_vo_vd2_vd2
|
||||
vsel_vd_vo_d_d
|
||||
vsel_vd_vo_vd_vd
|
||||
vsel_vd_vo_vo_d_d_d
|
||||
vsel_vd_vo_vo_vo_d_d_d_d
|
||||
vsel_vf2_vo_f_f_f_f
|
||||
vsel_vf2_vo_vf2_vf2
|
||||
vsel_vf2_vo_vo_d_d_d
|
||||
vsel_vf2_vo_vo_vo_d_d_d_d
|
||||
vsel_vf_vo_f_f
|
||||
vsel_vf_vo_vf_vf
|
||||
vsel_vf_vo_vo_f_f_f
|
||||
vsel_vf_vo_vo_vo_f_f_f_f
|
||||
vsel_vi2_vf_vf_vi2_vi2
|
||||
vsel_vi2_vf_vi2
|
||||
vsel_vi2_vo_vi2_vi2
|
||||
vsel_vi_vd_vd_vi_vi
|
||||
vsel_vi_vd_vi
|
||||
vsel_vi_vo_vi_vi
|
||||
vsel_vm_vo64_vm_vm
|
||||
sel_vq_vo_vq_vq
|
||||
vsign_vd_vd
|
||||
vsign_vf_vf
|
||||
vsignbit_vm_vd
|
||||
vsignbit_vm_vf
|
||||
vsignbit_vo_vd
|
||||
vsignbit_vo_vf
|
||||
vsll_vi2_vi2_i
|
||||
vsll_vi_vi_i
|
||||
vsqrt_vd_vd
|
||||
vsqrt_vf_vf
|
||||
vsra_vi2_vi2_i
|
||||
vsra_vi_vi_i
|
||||
vsrl_vi2_vi2_i
|
||||
vsrl_vi_vi_i
|
||||
vsscatter2_v_p_i_i_vd
|
||||
vsscatter2_v_p_i_i_vf
|
||||
vstore_v_p_vd
|
||||
vstore_v_p_vf
|
||||
vstoreu_v_p_vd
|
||||
vstoreu_v_p_vf
|
||||
vstoreu_v_p_vi
|
||||
vstoreu_v_p_vi2
|
||||
storeu_v_p_vq
|
||||
vstream_v_p_vd
|
||||
vstream_v_p_vf
|
||||
vsub64_vm_vm_vm
|
||||
vsub_vd_3vd
|
||||
vsub_vd_4vd
|
||||
vsub_vd_5vd
|
||||
vsub_vd_6vd
|
||||
vsub_vd_vd_vd
|
||||
vsub_vf_3vf
|
||||
vsub_vf_4vf
|
||||
vsub_vf_5vf
|
||||
vsub_vf_vf_vf
|
||||
vsub_vi2_vi2_vi2
|
||||
vsub_vi_vi_vi
|
||||
vsubadd_vd_vd_vd
|
||||
vsubadd_vf_vf_vf
|
||||
vtestallones_i_vo32
|
||||
vtestallones_i_vo64
|
||||
vtestallzeros_i_vo64
|
||||
vtoward0_vd_vd
|
||||
vtoward0_vf_vf
|
||||
vtruncate_vd_vd
|
||||
vtruncate2_vd_vd
|
||||
vtruncate_vf_vf
|
||||
vtruncate_vi2_vf
|
||||
vtruncate_vi_vd
|
||||
vtruncate_vm_vd
|
||||
vugt64_vo_vm_vm
|
||||
vuint64
|
||||
vupper_vd_vd
|
||||
vupper_vf_vf
|
||||
vxor_vi2_vi2_vi2
|
||||
vxor_vi_vi_vi
|
||||
vxor_vm_vm_vm
|
||||
vxor_vm_vo32_vm
|
||||
vxor_vm_vo64_vm
|
||||
vxor_vo_vo_vo
|
||||
#
|
||||
abs_tdx_tdx
|
||||
abs_vd3_vd3
|
||||
acos_tdx_tdx
|
||||
acosh_tdx_tdx
|
||||
add2_vd3_vd2_vd3
|
||||
add2_vd3_vd3_vd3
|
||||
add2_vd3_vd_vd3
|
||||
add_tdx_tdx_tdx
|
||||
add_vd3_vd2_vd3
|
||||
add_vd3_vd_vd3
|
||||
asin_tdx_tdx
|
||||
asinh_tdx_tdx
|
||||
atan2_tdx_tdx_tdx
|
||||
atan_tdx_tdx
|
||||
atanh_tdx_tdx
|
||||
cast_tdx_d
|
||||
cast_tdx_d_d_d
|
||||
cast_tdx_vd
|
||||
cast_tdx_vd3
|
||||
cast_tdx_vq
|
||||
cast_vd3_d3
|
||||
cast_vd3_d_d_d
|
||||
cast_vd3_tdx
|
||||
cast_vd3_vd_vd_vd
|
||||
cast_vd_tdx
|
||||
cast_vq_tdx
|
||||
cmp_vm_tdx_tdx
|
||||
cmpcnv_vq_vq
|
||||
cos_tdx_tdx
|
||||
cosh_tdx_tdx
|
||||
div2_vd3_vd3_vd3
|
||||
div_tdx_tdx_tdx
|
||||
div_vd3_vd3_vd3
|
||||
eq_vo_tdx_tdx
|
||||
exp10_tdx_tdx
|
||||
exp10i
|
||||
exp10tab
|
||||
exp2_tdx_tdx
|
||||
exp_tdx_tdx
|
||||
expm1_tdx_tdx
|
||||
fastcast_tdx_vd3
|
||||
fastcast_tdx_vq
|
||||
fastcast_vq_tdx
|
||||
ge_vo_tdx_tdx
|
||||
gt_vo_tdx_tdx
|
||||
ilogb_vm_tdx
|
||||
isinf_vo_vq
|
||||
isint_vo_tdx
|
||||
isminf_vo_vq
|
||||
isnan_vo_tdx
|
||||
isnan_vo_vq
|
||||
isnonfinite_vo_vq
|
||||
isnonfinite_vo_vq_vq
|
||||
isnonfinite_vo_vq_vq_vq
|
||||
isodd_vo_tdx
|
||||
ispinf_vo_vq
|
||||
iszero_vo_tdx
|
||||
iszero_vo_vq
|
||||
le_vo_tdx_tdx
|
||||
log10_tdx_tdx
|
||||
log1p_tdx_tdx
|
||||
log2_tdx_tdx
|
||||
log_tdx_tdx
|
||||
logk_tdx_tdx
|
||||
lt_vo_tdx_tdx
|
||||
mla_vd3_vd3_vd3_vd3
|
||||
modf_tdx_tdx_ptdx
|
||||
mul2_vd3_vd3_vd3
|
||||
mul_tdx_tdx_tdx
|
||||
mul_vd3_vd2_vd2
|
||||
mul_vd3_vd2_vd3
|
||||
mul_vd3_vd3_vd
|
||||
mul_vd3_vd3_vd2
|
||||
mul_vd3_vd3_vd3
|
||||
mulsign_tdx_tdx_vd
|
||||
mulsign_vd3_vd3_vd
|
||||
mulsign_vq_vq_vq
|
||||
neg_tdx_tdx
|
||||
neg_vd3_vd3
|
||||
neq_vo_tdx_tdx
|
||||
normalize_vd3_vd3
|
||||
poly10dd
|
||||
poly10dd_b
|
||||
poly11dd
|
||||
poly11dd_b
|
||||
poly12dd
|
||||
poly12dd_b
|
||||
poly13dd
|
||||
poly13dd_b
|
||||
poly14dd
|
||||
poly14dd_b
|
||||
poly15dd
|
||||
poly15dd_b
|
||||
poly16dd
|
||||
poly16dd_b
|
||||
poly17dd
|
||||
poly17dd_b
|
||||
poly18dd
|
||||
poly18dd_b
|
||||
poly19dd
|
||||
poly19dd_b
|
||||
poly20dd
|
||||
poly20dd_b
|
||||
poly21dd
|
||||
poly21dd_b
|
||||
poly22dd
|
||||
poly22dd_b
|
||||
poly23dd
|
||||
poly23dd_b
|
||||
poly24dd
|
||||
poly24dd_b
|
||||
poly25dd
|
||||
poly25dd_b
|
||||
poly26dd
|
||||
poly26dd_b
|
||||
poly27dd
|
||||
poly27dd_b
|
||||
poly2d
|
||||
poly2td
|
||||
poly2td_b
|
||||
poly3d
|
||||
poly3dd
|
||||
poly3dd_b
|
||||
poly3td
|
||||
poly3td_b
|
||||
poly4d
|
||||
poly4dd_b
|
||||
poly4td
|
||||
poly4td_b
|
||||
poly5d
|
||||
poly5dd
|
||||
poly5dd_b
|
||||
poly5td
|
||||
poly5td_b
|
||||
poly6d
|
||||
poly6dd
|
||||
poly6dd_b
|
||||
poly6td
|
||||
poly6td_b
|
||||
poly7d
|
||||
poly7dd
|
||||
poly7dd_b
|
||||
poly7td
|
||||
poly7td_b
|
||||
poly8d
|
||||
poly8dd
|
||||
poly8dd_b
|
||||
poly8td
|
||||
poly8td_b
|
||||
poly9dd
|
||||
poly9dd_b
|
||||
pow_tdx_tdx_tdx
|
||||
quickrenormalize_vd3_vd3
|
||||
quicktwosum_vd2_vd_vd
|
||||
rec_vd3_vd2
|
||||
rec_vd3_vd3
|
||||
rempio2q
|
||||
scale_vd3_vd3_d
|
||||
scale_vd3_vd3_vd
|
||||
scaleadd2_vd3_vd3_vd3_vd
|
||||
scalesub2_vd3_vd3_vd3_vd
|
||||
sel_tdx_vo_tdx_tdx
|
||||
sel_vd3_vo_vd3_vd3
|
||||
signbit_vo_tdx
|
||||
sin_tdx_tdx
|
||||
sinh_tdx_tdx
|
||||
slowcast_vq_tdx
|
||||
snprintquad
|
||||
snprintquadhex
|
||||
sqrt_tdx_tdx
|
||||
sqrt_vd3_vd3
|
||||
squ_vd3_vd3
|
||||
sub2_vd3_vd3_vd3
|
||||
sub_tdx_tdx_tdx
|
||||
tan_tdx_tdx
|
||||
tanh_tdx_tdx
|
||||
twoprod_vd2_vd_vd
|
||||
twosub_vd2_vd_vd
|
||||
twosubx_vd2_vd_vd_vd
|
||||
twosum_vd2_vd_vd
|
||||
twosumx_vd2_vd_vd_vd
|
||||
vtruncate2_vd_vd
|
||||
vfloor2_vd_vd
|
||||
vceil2_vd_vd
|
||||
vround2_vd_vd
|
||||
isinf_vo_tdx
|
||||
trunc_tdx_tdx
|
||||
rint_tdx_tdx
|
||||
fmod_tdx_tdx_tdx
|
||||
remainder_tdx_tdx_tdx
|
||||
cbrt_tdx_tdx
|
||||
frexp_tdx_tdx_pvi
|
||||
fma_tdx_tdx_tdx_tdx
|
||||
hypot_tdx_tdx_tdx
|
||||
ilogb_vi_tdx
|
||||
ldexp_tdx_tdx_vi
|
||||
Sleef_rempitabsp
|
||||
Sleef_rempitabdp
|
||||
Sleef_rempitabqp
|
||||
vcastu_vm_vi
|
||||
vcastu_vi_vm
|
||||
rvv_sp_vopmask
|
||||
rvv_dp_vopmask
|
||||
@@ -0,0 +1,50 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <signal.h>
|
||||
#include <setjmp.h>
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__) || defined(__MINGW64__)
|
||||
static jmp_buf sigjmp;
|
||||
#define SETJMP(x) setjmp(x)
|
||||
#define LONGJMP longjmp
|
||||
#else
|
||||
static sigjmp_buf sigjmp;
|
||||
#define SETJMP(x) sigsetjmp(x, 1)
|
||||
#define LONGJMP siglongjmp
|
||||
#endif
|
||||
|
||||
int main2(int argc, char **argv);
|
||||
int check_feature(double, float);
|
||||
|
||||
static void sighandler(int signum) {
|
||||
LONGJMP(sigjmp, 1);
|
||||
}
|
||||
|
||||
int detectFeature() {
|
||||
signal(SIGILL, sighandler);
|
||||
|
||||
if (SETJMP(sigjmp) == 0) {
|
||||
int r = check_feature(1.0, 1.0f);
|
||||
signal(SIGILL, SIG_DFL);
|
||||
return r;
|
||||
} else {
|
||||
signal(SIGILL, SIG_DFL);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (!detectFeature()) {
|
||||
printf("0\n");
|
||||
fclose(stdout);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
return main2(argc, argv);
|
||||
}
|
||||
@@ -0,0 +1,332 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
//
|
||||
|
||||
#ifndef __MISC_H__
|
||||
#define __MISC_H__
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#endif
|
||||
|
||||
#ifndef M_PI
|
||||
#define M_PI 3.141592653589793238462643383279502884
|
||||
#endif
|
||||
|
||||
#ifndef M_PIl
|
||||
#define M_PIl 3.141592653589793238462643383279502884L
|
||||
#endif
|
||||
|
||||
#ifndef M_1_PI
|
||||
#define M_1_PI 0.318309886183790671537767526745028724
|
||||
#endif
|
||||
|
||||
#ifndef M_1_PIl
|
||||
#define M_1_PIl 0.318309886183790671537767526745028724L
|
||||
#endif
|
||||
|
||||
#ifndef M_2_PI
|
||||
#define M_2_PI 0.636619772367581343075535053490057448
|
||||
#endif
|
||||
|
||||
#ifndef M_2_PIl
|
||||
#define M_2_PIl 0.636619772367581343075535053490057448L
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#ifndef SLEEF_FP_ILOGB0
|
||||
#define SLEEF_FP_ILOGB0 ((int)0x80000000)
|
||||
#endif
|
||||
|
||||
#ifndef SLEEF_FP_ILOGBNAN
|
||||
#define SLEEF_FP_ILOGBNAN ((int)2147483647)
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#define SLEEF_SNAN (((union { long long int i; double d; }) { .i = INT64_C(0x7ff0000000000001) }).d)
|
||||
#define SLEEF_SNANf (((union { long int i; float f; }) { .i = 0xff800001 }).f)
|
||||
|
||||
#define SLEEF_FLT_MIN 0x1p-126
|
||||
#define SLEEF_DBL_MIN 0x1p-1022
|
||||
#define SLEEF_INT_MAX 2147483647
|
||||
#define SLEEF_DBL_DENORM_MIN 4.9406564584124654e-324
|
||||
#define SLEEF_FLT_DENORM_MIN 1.40129846e-45F
|
||||
|
||||
//
|
||||
|
||||
/*
|
||||
PI_A to PI_D are constants that satisfy the following two conditions.
|
||||
|
||||
* For PI_A, PI_B and PI_C, the last 28 bits are zero.
|
||||
* PI_A + PI_B + PI_C + PI_D is close to PI as much as possible.
|
||||
|
||||
The argument of a trig function is multiplied by 1/PI, and the
|
||||
integral part is divided into two parts, each has at most 28
|
||||
bits. So, the maximum argument that could be correctly reduced
|
||||
should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
|
||||
double precision calculation, the actual maximum argument that can
|
||||
be correctly reduced is around 2^47.
|
||||
*/
|
||||
|
||||
#define PI_A 3.1415926218032836914
|
||||
#define PI_B 3.1786509424591713469e-08
|
||||
#define PI_C 1.2246467864107188502e-16
|
||||
#define PI_D 1.2736634327021899816e-24
|
||||
#define TRIGRANGEMAX 1e+14
|
||||
|
||||
/*
|
||||
PI_A2 and PI_B2 are constants that satisfy the following two conditions.
|
||||
|
||||
* The last 3 bits of PI_A2 are zero.
|
||||
* PI_A2 + PI_B2 is close to PI as much as possible.
|
||||
|
||||
The argument of a trig function is multiplied by 1/PI, and the
|
||||
integral part is multiplied by PI_A2. So, the maximum argument that
|
||||
could be correctly reduced should be 2^(3-1) PI = 12.6. By testing,
|
||||
we confirmed that it correctly reduces the argument up to around 15.
|
||||
*/
|
||||
|
||||
#define PI_A2 3.141592653589793116
|
||||
#define PI_B2 1.2246467991473532072e-16
|
||||
#define TRIGRANGEMAX2 15
|
||||
|
||||
#define M_2_PI_H 0.63661977236758138243
|
||||
#define M_2_PI_L -3.9357353350364971764e-17
|
||||
|
||||
#define SQRT_DBL_MAX 1.3407807929942596355e+154
|
||||
|
||||
#define TRIGRANGEMAX3 1e+9
|
||||
|
||||
#define M_4_PI 1.273239544735162542821171882678754627704620361328125
|
||||
|
||||
#define L2U .69314718055966295651160180568695068359375
|
||||
#define L2L .28235290563031577122588448175013436025525412068e-12
|
||||
#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
|
||||
|
||||
#define L10U 0.30102999566383914498 // log 2 / log 10
|
||||
#define L10L 1.4205023227266099418e-13
|
||||
#define LOG10_2 3.3219280948873623478703194294893901758648313930
|
||||
|
||||
#define L10Uf 0.3010253906f
|
||||
#define L10Lf 4.605038981e-06f
|
||||
|
||||
//
|
||||
|
||||
#define PI_Af 3.140625f
|
||||
#define PI_Bf 0.0009670257568359375f
|
||||
#define PI_Cf 6.2771141529083251953e-07f
|
||||
#define PI_Df 1.2154201256553420762e-10f
|
||||
#define TRIGRANGEMAXf 39000
|
||||
|
||||
#define PI_A2f 3.1414794921875f
|
||||
#define PI_B2f 0.00011315941810607910156f
|
||||
#define PI_C2f 1.9841872589410058936e-09f
|
||||
#define TRIGRANGEMAX2f 125.0f
|
||||
|
||||
#define TRIGRANGEMAX4f 8e+6f
|
||||
|
||||
#define SQRT_FLT_MAX 18446743523953729536.0
|
||||
|
||||
#define L2Uf 0.693145751953125f
|
||||
#define L2Lf 1.428606765330187045e-06f
|
||||
|
||||
#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
|
||||
#ifndef M_PIf
|
||||
# define M_PIf ((float)M_PI)
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#ifndef MIN
|
||||
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef MAX
|
||||
#define MAX(x, y) ((x) > (y) ? (x) : (y))
|
||||
#endif
|
||||
|
||||
#ifndef ABS
|
||||
#define ABS(x) ((x) < 0 ? -(x) : (x))
|
||||
#endif
|
||||
|
||||
#define stringify(s) stringify_(s)
|
||||
#define stringify_(s) #s
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
typedef long double longdouble;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_double2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_double2_DEFINED
|
||||
typedef struct {
|
||||
double x, y;
|
||||
} Sleef_double2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_float2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_float2_DEFINED
|
||||
typedef struct {
|
||||
float x, y;
|
||||
} Sleef_float2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_longdouble2_DEFINED) && !defined(SLEEF_GENHEADER)
|
||||
#define Sleef_longdouble2_DEFINED
|
||||
typedef struct {
|
||||
long double x, y;
|
||||
} Sleef_longdouble2;
|
||||
#endif
|
||||
|
||||
#if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#define LIKELY(condition) __builtin_expect(!!(condition), 1)
|
||||
#define UNLIKELY(condition) __builtin_expect(!!(condition), 0)
|
||||
#define RESTRICT __restrict__
|
||||
|
||||
#ifndef __arm__
|
||||
#define ALIGNED(x) __attribute__((aligned(x)))
|
||||
#else
|
||||
#define ALIGNED(x)
|
||||
#endif
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE SLEEF_ALWAYS_INLINE
|
||||
#define EXPORT SLEEF_INLINE
|
||||
#define CONST SLEEF_CONST
|
||||
#define NOEXPORT
|
||||
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define CONST __attribute__((const))
|
||||
#define INLINE __attribute__((always_inline))
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __stdcall __declspec(dllexport)
|
||||
#define NOEXPORT
|
||||
#else // #ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT
|
||||
#define NOEXPORT
|
||||
#endif // #ifndef SLEEF_STATIC_LIBS
|
||||
#else // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
#define EXPORT __attribute__((visibility("default")))
|
||||
#define NOEXPORT __attribute__ ((visibility ("hidden")))
|
||||
#endif // #if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
|
||||
|
||||
#endif // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define SLEEF_NAN __builtin_nan("")
|
||||
#define SLEEF_NANf __builtin_nanf("")
|
||||
#define SLEEF_NANl __builtin_nanl("")
|
||||
#define SLEEF_INFINITY __builtin_inf()
|
||||
#define SLEEF_INFINITYf __builtin_inff()
|
||||
#define SLEEF_INFINITYl __builtin_infl()
|
||||
|
||||
#if defined(__INTEL_COMPILER) || defined (__clang__)
|
||||
#define SLEEF_INFINITYq __builtin_inf()
|
||||
#define SLEEF_NANq __builtin_nan("")
|
||||
#else
|
||||
#define SLEEF_INFINITYq __builtin_infq()
|
||||
#define SLEEF_NANq (SLEEF_INFINITYq - SLEEF_INFINITYq)
|
||||
#endif
|
||||
|
||||
#elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE SLEEF_ALWAYS_INLINE
|
||||
#define CONST SLEEF_CONST
|
||||
#define EXPORT SLEEF_INLINE
|
||||
#define NOEXPORT
|
||||
|
||||
#else // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define INLINE __forceinline
|
||||
#define CONST
|
||||
#ifndef SLEEF_STATIC_LIBS
|
||||
#define EXPORT __declspec(dllexport)
|
||||
#define NOEXPORT
|
||||
#else
|
||||
#define EXPORT
|
||||
#define NOEXPORT
|
||||
#endif
|
||||
|
||||
#endif // #if defined(SLEEF_GENHEADER)
|
||||
|
||||
#define RESTRICT
|
||||
#define ALIGNED(x)
|
||||
#define LIKELY(condition) (condition)
|
||||
#define UNLIKELY(condition) (condition)
|
||||
|
||||
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__)) && !defined(SLEEF_GENHEADER)
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#define SLEEF_INFINITY (1e+300 * 1e+300)
|
||||
#define SLEEF_NAN (SLEEF_INFINITY - SLEEF_INFINITY)
|
||||
#define SLEEF_INFINITYf ((float)SLEEF_INFINITY)
|
||||
#define SLEEF_NANf ((float)SLEEF_NAN)
|
||||
#define SLEEF_INFINITYl ((long double)SLEEF_INFINITY)
|
||||
#define SLEEF_NANl ((long double)SLEEF_NAN)
|
||||
|
||||
#if (defined(_M_AMD64) || defined(_M_X64))
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#define __SSE3__
|
||||
#define __SSE4_1__
|
||||
#endif
|
||||
#elif _M_IX86_FP == 2
|
||||
#ifndef __SSE2__
|
||||
#define __SSE2__
|
||||
#define __SSE3__
|
||||
#define __SSE4_1__
|
||||
#endif
|
||||
#elif _M_IX86_FP == 1
|
||||
#ifndef __SSE__
|
||||
#define __SSE__
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif // #elif defined(_MSC_VER) // #if (defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)) && !defined(_MSC_VER)
|
||||
|
||||
#if !defined(__linux__)
|
||||
#define isinff(x) ((x) == SLEEF_INFINITYf || (x) == -SLEEF_INFINITYf)
|
||||
#define isinfl(x) ((x) == SLEEF_INFINITYl || (x) == -SLEEF_INFINITYl)
|
||||
#define isnanf(x) ((x) != (x))
|
||||
#define isnanl(x) ((x) != (x))
|
||||
#endif
|
||||
|
||||
#endif // #ifndef __MISC_H__
|
||||
|
||||
#ifdef ENABLE_AAVPCS
|
||||
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
|
||||
#else
|
||||
#define VECTOR_CC
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
#if defined (__GNUC__) && !defined(__INTEL_COMPILER)
|
||||
#pragma GCC diagnostic ignored "-Wpragmas"
|
||||
#pragma GCC diagnostic ignored "-Wunknown-pragmas"
|
||||
#if !defined (__clang__)
|
||||
#pragma GCC diagnostic ignored "-Wattribute-alias"
|
||||
#pragma GCC diagnostic ignored "-Wlto-type-mismatch"
|
||||
#pragma GCC diagnostic ignored "-Wstringop-overflow"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#pragma warning(disable:4101) // warning C4101: 'v': unreferenced local variable
|
||||
#pragma warning(disable:4116) // warning C4116: unnamed type definition in parentheses
|
||||
#pragma warning(disable:4244) // warning C4244: 'function': conversion from 'vopmask' to '__mmask8', possible loss of data
|
||||
#pragma warning(disable:4267) // warning C4267: 'initializing': conversion from 'size_t' to 'const int', possible loss of data
|
||||
#pragma warning(disable:4305) // warning C4305: 'function': truncation from 'double' to 'float'
|
||||
#endif
|
||||
@@ -0,0 +1,99 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#if !defined(SLEEF_GENHEADER)
|
||||
|
||||
#if (defined(__SIZEOF_FLOAT128__) && __SIZEOF_FLOAT128__ == 16) || (defined(__linux__) && defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) || (defined(__PPC64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 8)
|
||||
#define SLEEF_FLOAT128_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(SLEEF_FLOAT128_IS_IEEEQP) && defined(__SIZEOF_LONG_DOUBLE__) && __SIZEOF_LONG_DOUBLE__ == 16 && (defined(__aarch64__) || defined(__zarch__))
|
||||
#define SLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad_DEFINED)
|
||||
#define Sleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
#if defined(SLEEF_FLOAT128_IS_IEEEQP) || defined(ENABLEFLOAT128)
|
||||
typedef __float128 Sleef_quad;
|
||||
#define SLEEF_QUAD_C(x) (x ## Q)
|
||||
#elif defined(SLEEF_LONGDOUBLE_IS_IEEEQP)
|
||||
typedef long double Sleef_quad;
|
||||
#define SLEEF_QUAD_C(x) (x ## L)
|
||||
#else
|
||||
typedef Sleef_uint64_2t Sleef_quad;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad1_DEFINED)
|
||||
#define Sleef_quad1_DEFINED
|
||||
typedef union {
|
||||
struct {
|
||||
Sleef_quad x;
|
||||
};
|
||||
Sleef_quad s[1];
|
||||
} Sleef_quad1;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad2_DEFINED)
|
||||
#define Sleef_quad2_DEFINED
|
||||
typedef union {
|
||||
struct {
|
||||
Sleef_quad x, y;
|
||||
};
|
||||
Sleef_quad s[2];
|
||||
} Sleef_quad2;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad4_DEFINED)
|
||||
#define Sleef_quad4_DEFINED
|
||||
typedef union {
|
||||
struct {
|
||||
Sleef_quad x, y, z, w;
|
||||
};
|
||||
Sleef_quad s[4];
|
||||
} Sleef_quad4;
|
||||
#endif
|
||||
|
||||
#if !defined(Sleef_quad8_DEFINED)
|
||||
#define Sleef_quad8_DEFINED
|
||||
typedef union {
|
||||
Sleef_quad s[8];
|
||||
} Sleef_quad8;
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE) && !defined(Sleef_quadx_DEFINED)
|
||||
#define Sleef_quadx_DEFINED
|
||||
typedef union {
|
||||
Sleef_quad s[32];
|
||||
} Sleef_quadx;
|
||||
#endif
|
||||
|
||||
|
||||
#else // #if !defined(SLEEF_GENHEADER)
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXX__NVCC__) && ((defined(SLEEFXXX__SIZEOF_FLOAT128__) && SLEEFXXX__SIZEOF_FLOAT128__ == 16) || (defined(SLEEFXXX__linux__) && defined(SLEEFXXX__GNUC__) && (defined(SLEEFXXX__i386__) || defined(SLEEFXXX__x86_64__))) || (defined(SLEEFXXX__PPC64__) && defined(SLEEFXXX__GNUC__) && !defined(SLEEFXXX__clang__) && SLEEFXXX__GNUC__ >= 8))
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP
|
||||
SLEEFSHARPendif
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP) && !defined(SLEEFXXX__NVCC__) && defined(SLEEFXXX__SIZEOF_LONG_DOUBLE__) && SLEEFXXX__SIZEOF_LONG_DOUBLE__ == 16 && (defined(SLEEFXXX__aarch64__) || defined(SLEEFXXX__zarch__))
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP
|
||||
SLEEFSHARPendif
|
||||
|
||||
SLEEFSHARPif !defined(SLEEFXXXSleef_quad_DEFINED)
|
||||
SLEEFSHARPdefine SLEEFXXXSleef_quad_DEFINED
|
||||
typedef struct { uint64_t x, y; } Sleef_uint64_2t;
|
||||
SLEEFSHARPif defined(SLEEFXXXSLEEF_FLOAT128_IS_IEEEQP)
|
||||
typedef __float128 Sleef_quad;
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_QUAD_C(x) (x ## Q)
|
||||
SLEEFSHARPelif defined(SLEEFXXXSLEEF_LONGDOUBLE_IS_IEEEQP)
|
||||
typedef long double Sleef_quad;
|
||||
SLEEFSHARPdefine SLEEFXXXSLEEF_QUAD_C(x) (x ## L)
|
||||
SLEEFSHARPelse
|
||||
typedef Sleef_uint64_2t Sleef_quad;
|
||||
SLEEFSHARPendif
|
||||
SLEEFSHARPendif
|
||||
|
||||
#endif // #if !defined(SLEEF_GENHEADER)
|
||||
@@ -0,0 +1,201 @@
|
||||
# Compiler properties
|
||||
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
|
||||
|
||||
set(COMMON_TARGET_PROPERTIES
|
||||
C_STANDARD 99 # -std=gnu99
|
||||
)
|
||||
|
||||
#
|
||||
|
||||
function(add_test_dft TESTNAME)
|
||||
if (ARMIE_COMMAND)
|
||||
add_test(NAME ${TESTNAME} COMMAND ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS} ${ARGN})
|
||||
elseif (NOT EMULATOR AND NOT SDE_COMMAND)
|
||||
add_test(NAME ${TESTNAME} COMMAND ${ARGN})
|
||||
elseif(NOT EMULATOR)
|
||||
add_test(NAME ${TESTNAME} COMMAND ${SDE_COMMAND} "--" ${ARGN})
|
||||
else()
|
||||
add_test(NAME ${TESTNAME} COMMAND ${EMULATOR} ${ARGN})
|
||||
endif()
|
||||
set_tests_properties(${TESTNAME} PROPERTIES COST 0.1)
|
||||
endfunction()
|
||||
|
||||
# Include directories
|
||||
|
||||
include_directories(${PROJECT_SOURCE_DIR}/include) # sleefdft.h
|
||||
include_directories(${sleef_BINARY_DIR}/include) # sleef.h
|
||||
if (FFTW3_INCLUDE_DIR)
|
||||
include_directories(${FFTW3_INCLUDE_DIR}) # fftw3.h
|
||||
endif()
|
||||
|
||||
# Link directories
|
||||
|
||||
link_directories(${sleef_BINARY_DIR}/lib) # libsleef, libsleefdft
|
||||
|
||||
# Link libraries
|
||||
|
||||
set(COMMON_LINK_LIBRARIES ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
|
||||
if (COMPILER_SUPPORTS_OPENMP)
|
||||
set(COMMON_LINK_LIBRARIES ${COMMON_LINK_LIBRARIES} ${OpenMP_C_FLAGS})
|
||||
endif()
|
||||
|
||||
if((NOT MSVC) AND NOT SLEEF_CLANG_ON_WINDOWS)
|
||||
# Target executable naivetestdp
|
||||
set(TARGET_NAIVETESTDP "naivetestdp")
|
||||
add_executable(${TARGET_NAIVETESTDP} naivetest.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_NAIVETESTDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_NAIVETESTDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_NAIVETESTDP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_NAIVETESTDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable naivetestsp
|
||||
set(TARGET_NAIVETESTSP "naivetestsp")
|
||||
add_executable(${TARGET_NAIVETESTSP} naivetest.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_NAIVETESTSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_NAIVETESTSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_NAIVETESTSP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_NAIVETESTSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Test naivetestdp
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_1 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 1)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_2 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 2)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_3 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 3)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_4 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 4)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_5 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 5)
|
||||
add_test_dft(${TARGET_NAIVETESTDP}_10 $<TARGET_FILE:${TARGET_NAIVETESTDP}> 10)
|
||||
|
||||
# Test naivetestsp
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_1 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 1)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_2 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 2)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_3 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 3)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_4 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 4)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_5 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 5)
|
||||
add_test_dft(${TARGET_NAIVETESTSP}_10 $<TARGET_FILE:${TARGET_NAIVETESTSP}> 10)
|
||||
endif()
|
||||
|
||||
# Target executable roundtriptest1ddp
|
||||
set(TARGET_ROUNDTRIPTEST1DDP "roundtriptest1ddp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST1DDP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_ROUNDTRIPTEST1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_ROUNDTRIPTEST1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_ROUNDTRIPTEST1DDP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_ROUNDTRIPTEST1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable roundtriptest1dsp
|
||||
set(TARGET_ROUNDTRIPTEST1DSP "roundtriptest1dsp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST1DSP} roundtriptest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_ROUNDTRIPTEST1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_ROUNDTRIPTEST1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_ROUNDTRIPTEST1DSP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_ROUNDTRIPTEST1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable roundtriptest2ddp
|
||||
set(TARGET_ROUNDTRIPTEST2DDP "roundtriptest2ddp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST2DDP} roundtriptest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_ROUNDTRIPTEST2DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_ROUNDTRIPTEST2DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_ROUNDTRIPTEST2DDP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_ROUNDTRIPTEST2DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable roundtriptest2dsp
|
||||
set(TARGET_ROUNDTRIPTEST2DSP "roundtriptest2dsp")
|
||||
add_executable(${TARGET_ROUNDTRIPTEST2DSP} roundtriptest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_ROUNDTRIPTEST2DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_ROUNDTRIPTEST2DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_ROUNDTRIPTEST2DSP} ${COMMON_LINK_LIBRARIES})
|
||||
set_target_properties(${TARGET_ROUNDTRIPTEST2DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
if (LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
|
||||
# Target executable fftwtest1ddp
|
||||
set(TARGET_FFTWTEST1DDP "fftwtest1ddp")
|
||||
add_executable(${TARGET_FFTWTEST1DDP} fftwtest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_FFTWTEST1DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_FFTWTEST1DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_FFTWTEST1DDP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
|
||||
set_target_properties(${TARGET_FFTWTEST1DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable fftwtest1dsp
|
||||
set(TARGET_FFTWTEST1DSP "fftwtest1dsp")
|
||||
add_executable(${TARGET_FFTWTEST1DSP} fftwtest1d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_FFTWTEST1DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_FFTWTEST1DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_FFTWTEST1DSP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
|
||||
set_target_properties(${TARGET_FFTWTEST1DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable fftwtest2ddp
|
||||
set(TARGET_FFTWTEST2DDP "fftwtest2ddp")
|
||||
add_executable(${TARGET_FFTWTEST2DDP} fftwtest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_FFTWTEST2DDP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_FFTWTEST2DDP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=1)
|
||||
target_link_libraries(${TARGET_FFTWTEST2DDP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
|
||||
set_target_properties(${TARGET_FFTWTEST2DDP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Target executable fftwtest2dsp
|
||||
set(TARGET_FFTWTEST2DSP "fftwtest2dsp")
|
||||
add_executable(${TARGET_FFTWTEST2DSP} fftwtest2d.c ${PROJECT_SOURCE_DIR}/include/sleefdft.h)
|
||||
add_dependencies(${TARGET_FFTWTEST2DSP} ${TARGET_HEADERS} ${TARGET_LIBSLEEF} ${TARGET_LIBDFT})
|
||||
target_compile_definitions(${TARGET_FFTWTEST2DSP} PRIVATE ${COMMON_TARGET_DEFINITIONS} BASETYPEID=2)
|
||||
target_link_libraries(${TARGET_FFTWTEST2DSP} ${COMMON_LINK_LIBRARIES} ${LIBFFTW3})
|
||||
set_target_properties(${TARGET_FFTWTEST2DSP} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Test fftwtest1ddp
|
||||
add_test_dft(${TARGET_FFTWTEST1DDP}_12 $<TARGET_FILE:${TARGET_FFTWTEST1DDP}> 12)
|
||||
add_test_dft(${TARGET_FFTWTEST1DDP}_16 $<TARGET_FILE:${TARGET_FFTWTEST1DDP}> 16)
|
||||
|
||||
# Test fftwtest1dsp
|
||||
add_test_dft(${TARGET_FFTWTEST1DSP}_12 $<TARGET_FILE:${TARGET_FFTWTEST1DSP}> 12)
|
||||
add_test_dft(${TARGET_FFTWTEST1DSP}_16 $<TARGET_FILE:${TARGET_FFTWTEST1DSP}> 16)
|
||||
|
||||
# Test fftwtest2ddp
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_2_2 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 2 2)
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_4_4 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 4 4)
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 8 8)
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 10 10)
|
||||
add_test_dft(${TARGET_FFTWTEST2DDP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DDP}> 5 15)
|
||||
|
||||
# Test fftwtest2dsp
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_2_2 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 2 2)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_4_4 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 4 4)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 8 8)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 10 10)
|
||||
add_test_dft(${TARGET_FFTWTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_FFTWTEST2DSP}> 5 15)
|
||||
else(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
|
||||
if(MSVC OR SLEEF_CLANG_ON_WINDOWS)
|
||||
# Test roundtriptestdp
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_1 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 1 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 2 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_3 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 3 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 4 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_5 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 5 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 10 10)
|
||||
|
||||
# Test roundtriptestsp
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_1 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 1 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 2 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_3 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 3 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 4 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_5 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 5 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 10 10)
|
||||
endif()
|
||||
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_12 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 12 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DDP}_16 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DDP}> 16 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_12 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 12 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST1DSP}_16 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST1DSP}> 16 10)
|
||||
|
||||
# Test roundtriptest2ddp
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_2_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 2 2 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_4_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 4 4 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_8_8 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 8 8 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_10_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 10 10 2)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DDP}_5_15 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DDP}> 5 15 2)
|
||||
|
||||
# Test roundtriptest2dsp
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_2_2 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 2 2 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_4_4 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 4 4 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_8_8 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 8 8 10)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_10_10 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 10 10 2)
|
||||
add_test_dft(${TARGET_ROUNDTRIPTEST2DSP}_5_15 $<TARGET_FILE:${TARGET_ROUNDTRIPTEST2DSP}> 5 15 2)
|
||||
endif(LIBFFTW3 AND NOT SLEEF_DISABLE_FFTW)
|
||||
@@ -0,0 +1,116 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define _DEFAULT_SOURCE
|
||||
#define _XOPEN_SOURCE 700
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#ifdef USEFFTW
|
||||
#include <fftw3.h>
|
||||
#include <omp.h>
|
||||
#else
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
#endif
|
||||
|
||||
typedef double real;
|
||||
|
||||
static uint64_t gettime() {
|
||||
struct timespec tp;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tp);
|
||||
return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
|
||||
}
|
||||
|
||||
#define REPEAT 8
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc == 1) {
|
||||
fprintf(stderr, "%s <log2n>\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int backward = 0;
|
||||
|
||||
int log2n = atoi(argv[1]);
|
||||
if (log2n < 0) {
|
||||
backward = 1;
|
||||
log2n = -log2n;
|
||||
}
|
||||
|
||||
const int n = 1 << log2n;
|
||||
const int64_t niter = (int)(100000000000.0 / n / log2n);
|
||||
|
||||
printf("Number of iterations = %lld\n", (long long int)niter);
|
||||
|
||||
#ifdef USEFFTW
|
||||
fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
|
||||
|
||||
#if 0
|
||||
int fftw_init_threads(void);
|
||||
fftw_plan_with_nthreads(omp_get_max_threads());
|
||||
#endif
|
||||
|
||||
fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_MEASURE);
|
||||
//fftw_plan w = fftw_plan_dft_1d(n, in, out, backward ? FFTW_BACKWARD : FFTW_FORWARD, FFTW_PATIENT);
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
|
||||
}
|
||||
|
||||
for(int64_t i=0;i<niter/2;i++) fftw_execute(w);
|
||||
#else
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET);
|
||||
|
||||
real *in = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *out = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
|
||||
int mode = SLEEF_MODE_MEASURE | SLEEF_MODE_VERBOSE; // | SLEEF_MODE_NO_MT;
|
||||
if (argc >= 3) mode = SLEEF_MODE_VERBOSE | SLEEF_MODE_ESTIMATE;
|
||||
|
||||
if (backward) mode |= SLEEF_MODE_BACKWARD;
|
||||
struct SleefDFT *p = SleefDFT_double_init1d(n, in, out, mode);
|
||||
|
||||
if (argc >= 3) SleefDFT_setPath(p, argv[2]);
|
||||
|
||||
for(int i=0;i<n*2;i++) {
|
||||
in[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
}
|
||||
|
||||
for(int64_t i=0;i<niter/2;i++) SleefDFT_double_execute(p, in, out);
|
||||
#endif
|
||||
|
||||
for(int rep=0;rep<REPEAT;rep++) {
|
||||
uint64_t tm0 = gettime();
|
||||
for(int64_t i=0;i<niter;i++) {
|
||||
#ifdef USEFFTW
|
||||
fftw_execute(w);
|
||||
#else
|
||||
SleefDFT_double_execute(p, in, out);
|
||||
#endif
|
||||
}
|
||||
uint64_t tm1 = gettime();
|
||||
|
||||
printf("Actual time = %g ns\n", (double)(tm1 - tm0) / niter);
|
||||
double timeus = (tm1 - tm0) / ((double)niter * 1000);
|
||||
|
||||
double mflops = 5 * n * log2n / timeus;
|
||||
|
||||
printf("%g Mflops\n", mflops);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,230 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#include <fftw3.h>
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE SLEEF_MODE_DEBUG
|
||||
#endif
|
||||
|
||||
#if BASETYPEID == 1
|
||||
#define THRES 1e-30
|
||||
#define SleefDFT_init1d SleefDFT_double_init1d
|
||||
#define SleefDFT_execute SleefDFT_double_execute
|
||||
typedef double real;
|
||||
#elif BASETYPEID == 2
|
||||
#define THRES 1e-13
|
||||
#define SleefDFT_init1d SleefDFT_float_init1d
|
||||
#define SleefDFT_execute SleefDFT_float_execute
|
||||
typedef float real;
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
// Square of x; helper for accumulating RMS-error terms.
static double squ(double x) {
  double sq = x * x;
  return sq;
}
|
||||
|
||||
// complex forward
|
||||
// Complex forward 1-D transform of size n: run SleefDFT and FFTW on the same
// random input and return the relative RMS error (noise power / signal power).
double check_cf(int n) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
  fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

  // SLEEF side works on interleaved re/im pairs of type `real`
  // (double or float depending on BASETYPEID).
  real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
  real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
  struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, MODE);

  // Same random values feed both libraries.
  for(int i=0;i<n;i++) {
    real re = (2.0 * random() - 1) / (real)RAND_MAX;
    real im = (2.0 * random() - 1) / (real)RAND_MAX;
    sx[(i*2+0)] = re;
    sx[(i*2+1)] = im;
    in[i] = re + im * _Complex_I;
  }

  // NULL in/out: the plan uses the buffers given at init time —
  // TODO confirm against the SleefDFT API.
  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  // Accumulate squared error (rmsn) and squared reference magnitude (rmsd).
  for(int i=0;i<n;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// complex backward
|
||||
// Complex backward 1-D transform of size n: SleefDFT vs FFTW on identical
// random input; returns the relative RMS error against FFTW's result.
double check_cb(int n) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n);
  fftw_plan w = fftw_plan_dft_1d(n, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);

  real *sx = (real *)Sleef_malloc(n*2*sizeof(real));
  real *sy = (real *)Sleef_malloc(n*2*sizeof(real));
  struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_BACKWARD | MODE);

  // Identical random data for both libraries (interleaved for SLEEF).
  for(int i=0;i<n;i++) {
    real re = (2.0 * random() - 1) / (real)RAND_MAX;
    real im = (2.0 * random() - 1) / (real)RAND_MAX;
    sx[(i*2+0)] = re;
    sx[(i*2+1)] = im;
    in[i] = re + im * _Complex_I;
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  for(int i=0;i<n;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// real forward
|
||||
// Real-to-complex forward transform of size n: SleefDFT (SLEEF_MODE_REAL)
// vs FFTW r2c. The spectrum has n/2+1 complex bins; returns relative RMS error.
double check_rf(int n) {
  double *in = (double *) fftw_malloc(sizeof(double) * n);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
  fftw_plan w = fftw_plan_dft_r2c_1d(n, in, out, FFTW_ESTIMATE);

  real *sx = (real *)Sleef_malloc(n*sizeof(real));
  real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
  struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | MODE);

  // Same real-valued random input for both libraries.
  for(int i=0;i<n;i++) {
    real re = (2.0 * random() - 1) / (real)RAND_MAX;
    sx[i] = re;
    in[i] = re;
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  // Compare the n/2+1 non-redundant bins of the half-spectrum.
  for(int i=0;i<n/2+1;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// real backward
|
||||
// Complex-to-real backward transform of size n: SleefDFT vs FFTW c2r.
// Input is a half-spectrum of n/2+1 bins (bins 0 and n/2 carry only real
// parts, as required for a real time-domain result); returns relative RMS error.
double check_rb(int n) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * (n/2+1));
  double *out = (double *) fftw_malloc(sizeof(double) * n);
  fftw_plan w = fftw_plan_dft_c2r_1d(n, in, out, FFTW_ESTIMATE);

  real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
  real *sy = (real *)Sleef_malloc(sizeof(real)*n);
  struct SleefDFT *p = SleefDFT_init1d(n, sx, sy, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);

  // Fill bins 1..n/2-1 with random complex values; the DC and Nyquist
  // bins (0 and n/2) get pure real values on the i == 0 iteration.
  for(int i=0;i<n/2;i++) {
    if (i == 0) {
      in[0  ] = (2.0 * (rand() / (real)RAND_MAX) - 1);
      in[n/2] = (2.0 * (rand() / (real)RAND_MAX) - 1);
    } else {
      in[i  ] = (2.0 * (rand() / (real)RAND_MAX) - 1) + (2.0 * (rand() / (real)RAND_MAX) - 1) * _Complex_I;
    }
  }

  // Mirror the half-spectrum into SLEEF's interleaved layout.
  for(int i=0;i<n/2+1;i++) {
    sx[2*i+0] = creal(in[i]);
    sx[2*i+1] = cimag(in[i]);
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  // Both outputs are length-n real signals; compare directly.
  for(int i=0;i<n;i++) {
    rmsn += squ(sy[i] - out[i]);
    rmsd += squ(out[i]);
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// Entry point: runs all four SleefDFT-vs-FFTW comparisons for size 2^log2n.
// Each check returns a relative RMS error, compared against the THRES bound
// selected by BASETYPEID. Exits 0 on success, -1 on any failure.
int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "%s <log2n>\n", argv[0]);
    exit(-1);
  }

  const int n = 1 << atoi(argv[1]);

  srand((unsigned int)time(NULL));

  // Discard any stale plan cache; READONLY prevents writing a new one.
  SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);

  //

  int success = 1;
  double e;

  e = check_cf(n);
  success = success && e < THRES;
  printf("complex forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
  e = check_cb(n);
  success = success && e < THRES;
  printf("complex backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
  e = check_rf(n);
  success = success && e < THRES;
  printf("real forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
  e = check_rb(n);
  success = success && e < THRES;
  printf("real backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);

  exit(success ? 0 : -1);
}
|
||||
@@ -0,0 +1,143 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#include <fftw3.h>
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE SLEEF_MODE_DEBUG
|
||||
#endif
|
||||
|
||||
#if BASETYPEID == 1
|
||||
#define THRES 1e-30
|
||||
#define SleefDFT_init2d SleefDFT_double_init2d
|
||||
#define SleefDFT_execute SleefDFT_double_execute
|
||||
typedef double real;
|
||||
#elif BASETYPEID == 2
|
||||
#define THRES 1e-13
|
||||
#define SleefDFT_init2d SleefDFT_float_init2d
|
||||
#define SleefDFT_execute SleefDFT_float_execute
|
||||
typedef float real;
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
// Square of x; helper for accumulating RMS-error terms.
static double squ(double x) { return x * x; }
|
||||
|
||||
// complex forward
|
||||
// Complex forward 2-D transform (n x m): SleefDFT vs FFTW on identical
// random input; returns the relative RMS error against FFTW's result.
double check_cf(int n, int m) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
  fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_FORWARD, FFTW_ESTIMATE);

  // SLEEF side: row-major n*m elements, interleaved re/im.
  real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
  real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
  struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, MODE);

  for(int i=0;i<n*m;i++) {
    double re = (2.0 * random() - 1) / (double)RAND_MAX;
    double im = (2.0 * random() - 1) / (double)RAND_MAX;
    sx[(i*2+0)] = re;
    sx[(i*2+1)] = im;
    in[i] = re + im * _Complex_I;
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  for(int i=0;i<n*m;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// complex backward
|
||||
// Complex backward 2-D transform (n x m): SleefDFT vs FFTW on identical
// random input; returns the relative RMS error against FFTW's result.
double check_cb(int n, int m) {
  fftw_complex *in = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
  fftw_complex *out = (fftw_complex*) fftw_malloc(sizeof(fftw_complex) * n * m);
  fftw_plan w = fftw_plan_dft_2d(n, m, in, out, FFTW_BACKWARD, FFTW_ESTIMATE);

  real *sx = (real *)Sleef_malloc(n*m*2*sizeof(real));
  real *sy = (real *)Sleef_malloc(n*m*2*sizeof(real));
  struct SleefDFT *p = SleefDFT_init2d(n, m, sx, sy, SLEEF_MODE_BACKWARD | MODE);

  for(int i=0;i<n*m;i++) {
    double re = (2.0 * random() - 1) / (double)RAND_MAX;
    double im = (2.0 * random() - 1) / (double)RAND_MAX;
    sx[(i*2+0)] = re;
    sx[(i*2+1)] = im;
    in[i] = re + im * _Complex_I;
  }

  SleefDFT_execute(p, NULL, NULL);
  fftw_execute(w);

  double rmsn = 0, rmsd = 0;

  for(int i=0;i<n*m;i++) {
    rmsn += squ(sy[i*2+0] - creal(out[i])) + squ(sy[i*2+1] - cimag(out[i]));
    rmsd += squ(creal(out[i])) + squ(cimag(out[i]));
  }

  fftw_destroy_plan(w);
  fftw_free(in);
  fftw_free(out);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  return rmsn / rmsd;
}
|
||||
|
||||
// Entry point: 2-D SleefDFT-vs-FFTW comparison for a 2^log2n x 2^log2m
// transform. Exits 0 on success, -1 if either direction exceeds THRES.
int main(int argc, char **argv) {
  if (argc != 3) {
    fprintf(stderr, "%s <log2n> <log2m>\n", argv[0]);
    exit(-1);
  }

  const int n = 1 << atoi(argv[1]);
  const int m = 1 << atoi(argv[2]);

  srand((unsigned int)time(NULL));

  // Discard any stale plan cache; READONLY prevents writing a new one.
  SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);

  //

  int success = 1;
  double e;

  e = check_cf(n, m);
  success = success && e < THRES;
  printf("complex forward : %s (%g)\n", e < THRES ? "OK" : "NG", e);
  e = check_cb(n, m);
  success = success && e < THRES;
  printf("complex backward : %s (%g)\n", e < THRES ? "OK" : "NG", e);

  exit(success ? 0 : -1);
}
|
||||
@@ -0,0 +1,175 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define _DEFAULT_SOURCE
|
||||
#define _XOPEN_SOURCE 700
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
// Monotonic timestamp in nanoseconds; used to time benchmark loops.
static uint64_t gettime() {
  struct timespec tp;
  clock_gettime(CLOCK_MONOTONIC, &tp);
  return (uint64_t)tp.tv_sec * 1000000000 + ((uint64_t)tp.tv_nsec);
}
|
||||
|
||||
// The two plan modes benchmarked for every size: single-threaded first,
// then multi-threaded (both with MEASURE planning).
int mode[] = { SLEEF_MODE_MEASURE | SLEEF_MODE_NO_MT, SLEEF_MODE_MEASURE};

// Compile-time benchmark knobs; toggle by (un)commenting.
#define ENABLE_SP        // also benchmark the single-precision path
//#define ROUNDTRIP      // time forward+backward pairs instead of forward only
#define REPEAT 2         // timing repetitions; the best (minimum) is kept
//#define ENABLE_SLEEP   // sleep between phases (thermal settling)
//#define WARMUP         // run half the iterations untimed first
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int start = 1, end = 18;
|
||||
if (argc > 1) start = atoi(argv[1]);
|
||||
if (argc > 2) end = atoi(argv[2]);
|
||||
|
||||
double *din = (double *)Sleef_malloc((1 << 18)*2 * sizeof(double));
|
||||
double *dout = (double *)Sleef_malloc((1 << 18)*2 * sizeof(double));
|
||||
float *sin = (float *)Sleef_malloc((1 << 18)*2 * sizeof(float));
|
||||
float *sout = (float *)Sleef_malloc((1 << 18)*2 * sizeof(float));
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
|
||||
|
||||
for(int log2n=start;log2n<=end;log2n++) {
|
||||
const int n = 1 << log2n;
|
||||
int64_t niter = (int64_t)(1000000000.0 / REPEAT / n / log2n);
|
||||
|
||||
printf("%d ", n);
|
||||
|
||||
for(int m=0;m<2;m++) {
|
||||
#ifdef ENABLE_SLEEP
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
struct SleefDFT *pf = SleefDFT_double_init1d(n, NULL, NULL, mode[m]);
|
||||
#ifdef ROUNDTRIP
|
||||
struct SleefDFT *pb = SleefDFT_double_init1d(n, NULL, NULL, mode[m] | SLEEF_MODE_BACKWARD);
|
||||
#endif
|
||||
|
||||
for(int i=0;i<n*2;i++) {
|
||||
din[i] = 0;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_SLEEP
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
#ifdef WARMUP
|
||||
for(int64_t i=0;i<niter/2;i++) {
|
||||
SleefDFT_double_execute(pf, din, dout);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_double_execute(pb, dout, din);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t best = 1LL << 62;
|
||||
|
||||
//printf("\n");
|
||||
for(int rep=0;rep<REPEAT;rep++) {
|
||||
uint64_t tm0 = gettime();
|
||||
for(int64_t i=0;i<niter;i++) {
|
||||
SleefDFT_double_execute(pf, din, dout);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_double_execute(pb, dout, din);
|
||||
#endif
|
||||
}
|
||||
uint64_t tm1 = gettime();
|
||||
if (tm1 - tm0 < best) best = tm1 - tm0;
|
||||
//printf("%g\n", (double)(tm1 - tm0));
|
||||
}
|
||||
|
||||
SleefDFT_dispose(pf);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_dispose(pb);
|
||||
#endif
|
||||
|
||||
double timeus = best / ((double)niter * 1000);
|
||||
|
||||
#ifdef ROUNDTRIP
|
||||
double mflops = 10 * n * log2n / timeus;
|
||||
#else
|
||||
double mflops = 5 * n * log2n / timeus;
|
||||
#endif
|
||||
|
||||
printf("%g ", mflops);
|
||||
}
|
||||
|
||||
#ifdef ENABLE_SP
|
||||
for(int m=0;m<2;m++) {
|
||||
#ifdef ENABLE_SLEEP
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
struct SleefDFT *pf = SleefDFT_float_init1d(n, NULL, NULL, mode[m]);
|
||||
#ifdef ROUNDTRIP
|
||||
struct SleefDFT *pb = SleefDFT_float_init1d(n, NULL, NULL, mode[m] | SLEEF_MODE_BACKWARD);
|
||||
#endif
|
||||
|
||||
for(int i=0;i<n*2;i++) {
|
||||
sin[i] = 0;
|
||||
}
|
||||
|
||||
#ifdef ENABLE_SLEEP
|
||||
sleep(1);
|
||||
#endif
|
||||
|
||||
#ifdef WARMUP
|
||||
for(int64_t i=0;i<niter/2;i++) {
|
||||
SleefDFT_float_execute(pf, sin, sout);
|
||||
#ifdef OUNDTRIP
|
||||
SleefDFT_float_execute(pb, sout, sin);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t best = 1LL << 62;
|
||||
|
||||
for(int rep=0;rep<REPEAT;rep++) {
|
||||
uint64_t tm0 = gettime();
|
||||
for(int64_t i=0;i<niter;i++) {
|
||||
SleefDFT_float_execute(pf, sin, sout);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_float_execute(pb, sout, sin);
|
||||
#endif
|
||||
}
|
||||
uint64_t tm1 = gettime();
|
||||
if (tm1 - tm0 < best) best = tm1 - tm0;
|
||||
}
|
||||
|
||||
SleefDFT_dispose(pf);
|
||||
#ifdef ROUNDTRIP
|
||||
SleefDFT_dispose(pb);
|
||||
#endif
|
||||
|
||||
double timeus = best / ((double)niter * 1000);
|
||||
|
||||
#ifdef ROUNDTRIP
|
||||
double mflops = 10 * n * log2n / timeus;
|
||||
#else
|
||||
double mflops = 5 * n * log2n / timeus;
|
||||
#endif
|
||||
|
||||
printf("%g ", mflops);
|
||||
}
|
||||
#endif
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,484 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
#include "misc.h"
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE SLEEF_MODE_DEBUG
|
||||
#endif
|
||||
|
||||
#define THRES 1e-4
|
||||
|
||||
// Select the SleefDFT entry points and the element type under test based on
// the BASETYPEID macro supplied by the build. Each branch also defines the
// reference-arithmetic complex type and the twiddle factor omega().
#if BASETYPEID == 1
#define SleefDFT_init SleefDFT_double_init1d
#define SleefDFT_execute SleefDFT_double_execute
typedef double real;

typedef double complex cmpl;

// Twiddle factor exp(-2*pi*i*kn/n); M_PIl is glibc's long-double pi.
cmpl omega(double n, double kn) {
  return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 2
#define SleefDFT_init SleefDFT_float_init1d
#define SleefDFT_execute SleefDFT_float_execute
typedef float real;

typedef double complex cmpl;

cmpl omega(double n, double kn) {
  return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 3
#define SleefDFT_init SleefDFT_longdouble_init1d
#define SleefDFT_execute SleefDFT_longdouble_execute
// NOTE(review): 'real' is double here even though the long-double API is
// selected — confirm this is intended upstream.
typedef double real;

typedef double complex cmpl;

cmpl omega(double n, double kn) {
  return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#elif BASETYPEID == 4
#include <quadmath.h>

#define SleefDFT_init SleefDFT_quad_init1d
#define SleefDFT_execute SleefDFT_quad_execute
typedef Sleef_quad real;

typedef double complex cmpl;

cmpl omega(double n, double kn) {
  return cexp((-2 * M_PIl * _Complex_I / n) * kn);
}
#else
#error No BASETYPEID specified
#endif
|
||||
|
||||
// Naive O(len^2) reference DFT: time-domain ts -> frequency-domain fs.
void forward(cmpl *ts, cmpl *fs, int len) {
  for(int bin = 0; bin < len; bin++) {
    fs[bin] = 0;
    for(int t = 0; t < len; t++)
      fs[bin] += ts[t] * omega(len, t*bin);
  }
}
|
||||
|
||||
// Naive O(len^2) reference inverse DFT (unnormalized):
// frequency-domain fs -> time-domain ts. The negated length flips the
// sign of the exponent in omega().
void backward(cmpl *fs, cmpl *ts, int len) {
  for(int bin = 0; bin < len; bin++) {
    ts[bin] = 0;
    for(int t = 0; t < len; t++)
      ts[bin] += fs[t] * omega(-len, t*bin);
  }
}
|
||||
|
||||
// complex forward
|
||||
// Complex forward transform of size n compared element-wise against the
// naive reference DFT; returns 1 if every bin is within THRES, else 0.
int check_cf(int n) {
  int i;

  real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
  real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  // Random complex input scaled to [-0.5, 0.5); mirrored into the
  // interleaved SLEEF buffer.
  for(i=0;i<n;i++) {
    ts[i] = 0.5 * ((2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I);
    sx[(i*2+0)] = creal(ts[i]);
    sx[(i*2+1)] = cimag(ts[i]);
  }

  //

  forward(ts, fs, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, MODE | SLEEF_MODE_VERBOSE);

  // NOTE(review): sx/sy/ts/fs leak on this early-return path (same pattern
  // as the sibling checks).
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;
  // rmsn/rmsd are accumulated but never reported or returned — dead except
  // as a debugging aid.
  double rmsn = 0, rmsd = 0;

  for(i=0;i<n;i++) {
    if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
        (fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
      success = 0;
    }

    double t;
    t = (sy[(i*2+0)] - creal(fs[i]));
    rmsn += t*t;
    t = (sy[(i*2+1)] - cimag(fs[i]));
    rmsn += t*t;
    rmsd += creal(fs[i]) * creal(fs[i]) + cimag(fs[i]) * cimag(fs[i]);
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
// complex backward
|
||||
// Complex backward transform of size n compared element-wise against the
// naive reference inverse DFT; returns 1 if within THRES everywhere, else 0.
int check_cb(int n) {
  int i;

  real *sx = (real *)Sleef_malloc(sizeof(real)*n*2);
  real *sy = (real *)Sleef_malloc(sizeof(real)*n*2);

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  for(i=0;i<n;i++) {
    fs[i] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
    sx[(i*2+0)] = creal(fs[i]);
    sx[(i*2+1)] = cimag(fs[i]);
  }

  backward(fs, ts, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_BACKWARD | MODE);

  // NOTE(review): buffers leak on this early-return path.
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;

  for(i=0;i<n;i++) {
    if ((fabs(sy[(i*2+0)] - creal(ts[i])) > THRES) ||
        (fabs(sy[(i*2+1)] - cimag(ts[i])) > THRES)) {
      success = 0;
    }
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
// real forward
|
||||
// Real forward transform of size n compared against the naive DFT of the
// same real input; checks the n/2+1 non-redundant bins. Returns 1 on pass.
int check_rf(int n) {
  int i;

  real *sx = (real *)Sleef_malloc(n * sizeof(real));
  real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  for(i=0;i<n;i++) {
    ts[i] = (2.0 * (rand() / (double)RAND_MAX) - 1);
    sx[i] = creal(ts[i]);
  }

  //

  forward(ts, fs, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_NO_MT | SLEEF_MODE_REAL | MODE);

  // NOTE(review): buffers leak on this early-return path.
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;

  for(i=0;i<n/2+1;i++) {
    if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
    if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
// real backward
|
||||
// Real backward transform of size n: build a Hermitian-symmetric spectrum
// (so the reference inverse DFT yields a real signal), run SleefDFT on the
// half-spectrum, and compare. Returns 1 on pass.
int check_rb(int n) {
  int i;

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  // DC and Nyquist bins are pure real; every other bin gets its conjugate
  // mirrored to index n-i, enforcing Hermitian symmetry.
  for(i=0;i<n/2;i++) {
    if (i == 0) {
      fs[0  ] = (2.0 * (rand() / (double)RAND_MAX) - 1);
      fs[n/2] = (2.0 * (rand() / (double)RAND_MAX) - 1);
    } else {
      fs[i  ] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
      fs[n-i] = conj(fs[i]);
    }
  }

  real *sx = (real *)Sleef_malloc((n/2+1) * sizeof(real)*2);
  real *sy = (real *)Sleef_malloc(sizeof(real)*n);

  // SLEEF consumes only the half-spectrum, interleaved.
  for(i=0;i<n/2+1;i++) {
    sx[2*i+0] = creal(fs[i]);
    sx[2*i+1] = cimag(fs[i]);
  }

  //

  backward(fs, ts, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);

  // NOTE(review): buffers leak on this early-return path.
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;

  for(i=0;i<n;i++) {
    // The reference result must be (numerically) real.
    if (fabs(cimag(ts[i])) > THRES) {
      success = 0;
    }

    if ((fabs(sy[i] - creal(ts[i])) > THRES)) {
      success = 0;
    }
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
int check_arf(int n) {
|
||||
int i;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n * sizeof(real));
|
||||
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
//
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
ts[i] = 2 * (rand() / (real)RAND_MAX) - 1;
|
||||
sx[i] = creal(ts[i]);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
backward(ts, fs, n);
|
||||
|
||||
struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_ALT | MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
|
||||
for(i=0;i<n/2;i++) {
|
||||
if (i == 0) {
|
||||
if (fabs(sy[(2*0+0)] - creal(fs[0 ])) > THRES) success = 0;
|
||||
if (fabs(sy[(2*0+1)] - creal(fs[n/2])) > THRES) success = 0;
|
||||
} else {
|
||||
if (fabs(sy[(2*i+0)] - creal(fs[i])) > THRES) success = 0;
|
||||
if (fabs(sy[(2*i+1)] - cimag(fs[i])) > THRES) success = 0;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
// Real backward transform in SLEEF's "alt" packing: build a Hermitian
// spectrum, pack it alt-style (DC and Nyquist in bin 0's two slots), and
// compare SleefDFT's output (times 2) against the naive forward reference.
int check_arb(int n) {
  int i;

  real *sx = (real *)Sleef_malloc(n * sizeof(real));
  real *sy = (real *)Sleef_malloc(n * sizeof(real));

  cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
  cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);

  //

  // Hermitian-symmetric spectrum: real DC/Nyquist, conjugate mirror elsewhere.
  for(i=0;i<n/2;i++) {
    if (i == 0) {
      fs[0  ] = (2.0 * (rand() / (double)RAND_MAX) - 1);
      fs[n/2] = (2.0 * (rand() / (double)RAND_MAX) - 1);
    } else {
      fs[i  ] = (2.0 * (rand() / (double)RAND_MAX) - 1) + (2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
      fs[n-i] = conj(fs[i]);
    }
  }

  // Alt packing of the half-spectrum into sx.
  for(i=0;i<n/2;i++) {
    if (i == 0) {
      sx[2*0+0] = creal(fs[0  ]);
      sx[2*0+1] = creal(fs[n/2]);
    } else {
      sx[2*i+0] = creal(fs[i]);
      sx[2*i+1] = cimag(fs[i]);
    }
  }

  //

  forward(fs, ts, n);

  struct SleefDFT *p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | SLEEF_MODE_ALT | MODE);

  // NOTE(review): buffers leak on this early-return path.
  if (p == NULL) {
    printf("SleefDFT initialization failed\n");
    return 0;
  }

  SleefDFT_execute(p, sx, sy);

  //

  int success = 1;

  for(i=0;i<n;i++) {
    // Reference result must be (numerically) real.
    if (fabs(cimag(ts[i])) > THRES) {
      success = 0;
    }

    // Factor-of-2 scaling between alt-mode output and the reference —
    // presumably part of the alt convention; confirm against SleefDFT docs.
    if ((fabs(sy[i]*2 - creal(ts[i])) > THRES)) {
      success = 0;
    }
  }

  //

  free(fs);
  free(ts);

  Sleef_free(sx);
  Sleef_free(sy);
  SleefDFT_dispose(p);

  //

  return success;
}
|
||||
|
||||
// Entry point: runs all six naive-reference checks for size 2^log2n and
// exits 0 only if every check passed (note `exit(!success)`).
int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "%s <log2n>\n", argv[0]);
    exit(-1);
  }

  const int n = 1 << atoi(argv[1]);

  srand((unsigned int)time(NULL));

  // Discard any stale plan cache; READONLY prevents writing a new one.
  SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);

  //

  int success = 1;

  // `&=` both records and reports each result; a single failure makes the
  // whole run fail.
  printf("complex forward : %s\n", (success &= check_cf(n)) ? "OK" : "NG");
  printf("complex backward : %s\n", (success &= check_cb(n)) ? "OK" : "NG");
  printf("real forward : %s\n", (success &= check_rf(n)) ? "OK" : "NG");
  printf("real backward : %s\n", (success &= check_rb(n)) ? "OK" : "NG");
  printf("real alt forward : %s\n", (success &= check_arf(n)) ? "OK" : "NG");
  printf("real alt backward : %s\n", (success &= check_arb(n)) ? "OK" : "NG");

  exit(!success);
}
|
||||
@@ -0,0 +1,174 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE (SLEEF_MODE_DEBUG | SLEEF_MODE_VERBOSE)
|
||||
#endif
|
||||
|
||||
#if BASETYPEID == 1
|
||||
#define THRES 1e-30
|
||||
#define SleefDFT_init SleefDFT_double_init1d
|
||||
#define SleefDFT_execute SleefDFT_double_execute
|
||||
typedef double real;
|
||||
#elif BASETYPEID == 2
|
||||
#define THRES 1e-13
|
||||
#define SleefDFT_init SleefDFT_float_init1d
|
||||
#define SleefDFT_execute SleefDFT_float_execute
|
||||
typedef float real;
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
static double squ(double x) { return x * x; }
|
||||
|
||||
// complex transforms
|
||||
double check_c(int n) {
|
||||
struct SleefDFT *p;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
real *sz = (real *)Sleef_malloc(n*2 * sizeof(real));
|
||||
|
||||
for(int i=0;i<n*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init(n, NULL, NULL, MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init(n, NULL, NULL, MODE | SLEEF_MODE_BACKWARD);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sy, sz);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
double rmsn = 0, rmsd = 0, scale = 1 / (double)n;
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
|
||||
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
Sleef_free(sz);
|
||||
|
||||
//
|
||||
|
||||
return rmsn / rmsd;
|
||||
}
|
||||
|
||||
// real transforms
|
||||
double check_r(int n) {
|
||||
struct SleefDFT *p;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc((n/2+1)*sizeof(real)*2);
|
||||
real *sz = (real *)Sleef_malloc(n * sizeof(real));
|
||||
|
||||
for(int i=0;i<n;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init(n, NULL, NULL, SLEEF_MODE_REAL | SLEEF_MODE_BACKWARD | MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sy, sz);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
double rmsn = 0, rmsd = 0, scale = 1 / (double)n;
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
rmsn += squ(scale * sz[i] - sx[i]);
|
||||
rmsd += squ( sx[i]);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
Sleef_free(sz);
|
||||
|
||||
//
|
||||
|
||||
return rmsn / rmsd;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "%s <log2n> [<nloop>]\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const int n = 1 << atoi(argv[1]);
|
||||
const int nloop = argc >= 3 ? atoi(argv[2]) : 1;
|
||||
|
||||
srand((unsigned int)time(NULL));
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
double e;
|
||||
|
||||
for(int i=0;(nloop < 0 || i < nloop) && success;i++) {
|
||||
e = check_c(n);
|
||||
success = success && e < THRES;
|
||||
printf("complex : %s (%g)\n", e < THRES ? "OK" : "NG", e);
|
||||
e = check_r(n);
|
||||
success = success && e < THRES;
|
||||
printf("real : %s (%g)\n", e < THRES ? "OK" : "NG", e);
|
||||
}
|
||||
|
||||
exit(!success);
|
||||
}
|
||||
@@ -0,0 +1,118 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <time.h>
|
||||
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#ifndef MODE
|
||||
#define MODE (SLEEF_MODE_DEBUG | SLEEF_MODE_VERBOSE)
|
||||
#endif
|
||||
|
||||
#if BASETYPEID == 1
|
||||
#define THRES 1e-30
|
||||
#define SleefDFT_init2d SleefDFT_double_init2d
|
||||
#define SleefDFT_execute SleefDFT_double_execute
|
||||
typedef double real;
|
||||
#elif BASETYPEID == 2
|
||||
#define THRES 1e-13
|
||||
#define SleefDFT_init2d SleefDFT_float_init2d
|
||||
#define SleefDFT_execute SleefDFT_float_execute
|
||||
typedef float real;
|
||||
#else
|
||||
#error BASETYPEID not set
|
||||
#endif
|
||||
|
||||
static double squ(double x) { return x * x; }
|
||||
|
||||
// complex transforms
|
||||
double check_c(int n, int m) {
|
||||
struct SleefDFT *p;
|
||||
|
||||
real *sx = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
real *sy = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
real *sz = (real *)Sleef_malloc(n*m*2 * sizeof(real));
|
||||
|
||||
for(int i=0;i<n*m*2;i++) sx[i] = (real)(2.0 * (rand() / (double)RAND_MAX) - 1);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init2d(n, m, NULL, NULL, MODE);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sx, sy);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
p = SleefDFT_init2d(n, m, NULL, NULL, MODE | SLEEF_MODE_BACKWARD);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
SleefDFT_execute(p, sy, sz);
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
//
|
||||
|
||||
double rmsn = 0, rmsd = 0, scale = 1 / (n*(double)m);
|
||||
|
||||
for(int i=0;i<n*m;i++) {
|
||||
rmsn += squ(scale * sz[i*2+0] - sx[i*2+0]) + squ(scale * sz[i*2+1] - sx[i*2+1]);
|
||||
rmsd += squ( sx[i*2+0]) + squ( sx[i*2+1]);
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
Sleef_free(sx);
|
||||
Sleef_free(sy);
|
||||
Sleef_free(sz);
|
||||
|
||||
//
|
||||
|
||||
return rmsn / rmsd;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 3) {
|
||||
fprintf(stderr, "%s <log2n> <log2m> [<nloop>]\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const int n = 1 << atoi(argv[1]);
|
||||
const int m = 1 << atoi(argv[2]);
|
||||
const int nloop = argc >= 4 ? atoi(argv[3]) : 1;
|
||||
|
||||
srand((unsigned int)time(NULL));
|
||||
|
||||
SleefDFT_setPlanFilePath(NULL, NULL, SLEEF_PLAN_RESET | SLEEF_PLAN_READONLY);
|
||||
|
||||
//
|
||||
|
||||
int success = 1;
|
||||
double e;
|
||||
|
||||
for(int i=0;(nloop < 0 || i < nloop) && success;i++) {
|
||||
e = check_c(n, m);
|
||||
success = success && e < THRES;
|
||||
printf("complex : %s (%g)\n", e < THRES ? "OK" : "NG", e);
|
||||
}
|
||||
|
||||
exit(!success);
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
// gcc tutorial.c -lsleef -lsleefdft -lm
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <complex.h>
|
||||
|
||||
#include "sleef.h"
|
||||
#include "sleefdft.h"
|
||||
|
||||
#define THRES 1e-4
|
||||
|
||||
typedef double complex cmpl;
|
||||
|
||||
cmpl omega(double n, double kn) {
|
||||
return cexp((-2 * M_PI * _Complex_I / n) * kn);
|
||||
}
|
||||
|
||||
void forward(cmpl *ts, cmpl *fs, int len) {
|
||||
for(int k=0;k<len;k++) {
|
||||
fs[k] = 0;
|
||||
for(int n=0;n<len;n++) fs[k] += ts[n] * omega(len, n*k);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int n = 256;
|
||||
if (argc == 2) n = 1 << atoi(argv[1]);
|
||||
|
||||
SleefDFT_setPlanFilePath("plan.txt", NULL, SLEEF_PLAN_AUTOMATIC);
|
||||
|
||||
double *sx = (double *)Sleef_malloc(n*2 * sizeof(double));
|
||||
double *sy = (double *)Sleef_malloc(n*2 * sizeof(double));
|
||||
|
||||
struct SleefDFT *p = SleefDFT_double_init1d(n, sx, sy, SLEEF_MODE_FORWARD);
|
||||
|
||||
if (p == NULL) {
|
||||
printf("SleefDFT initialization failed\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
cmpl *ts = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
cmpl *fs = (cmpl *)malloc(sizeof(cmpl)*n);
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
ts[i] =
|
||||
(2.0 * (rand() / (double)RAND_MAX) - 1) * 1.0 +
|
||||
(2.0 * (rand() / (double)RAND_MAX) - 1) * _Complex_I;
|
||||
|
||||
sx[(i*2+0)] = creal(ts[i]);
|
||||
sx[(i*2+1)] = cimag(ts[i]);
|
||||
}
|
||||
|
||||
forward(ts, fs, n);
|
||||
|
||||
SleefDFT_double_execute(p, NULL, NULL);
|
||||
|
||||
int success = 1;
|
||||
|
||||
for(int i=0;i<n;i++) {
|
||||
if ((fabs(sy[(i*2+0)] - creal(fs[i])) > THRES) ||
|
||||
(fabs(sy[(i*2+1)] - cimag(fs[i])) > THRES)) {
|
||||
success = 0;
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s\n", success ? "OK" : "NG");
|
||||
|
||||
free(fs); free(ts);
|
||||
Sleef_free(sy); Sleef_free(sx);
|
||||
|
||||
SleefDFT_dispose(p);
|
||||
|
||||
exit(success);
|
||||
}
|
||||
@@ -0,0 +1,425 @@
|
||||
|
||||
# Options
|
||||
|
||||
if (COMPILER_SUPPORTS_SVE)
|
||||
set(SLEEFDFT_MAXBUTWIDTH 6 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
|
||||
else()
|
||||
set(SLEEFDFT_MAXBUTWIDTH 4 CACHE STRING "Log_2 (Maximum butterfly length) of butterflies")
|
||||
endif()
|
||||
|
||||
if (SLEEFDFT_MAXBUTWIDTH GREATER 7)
|
||||
message(FATAL_ERROR "SLEEFDFT_MAXBUTWIDTH has to be smaller than 8." )
|
||||
endif()
|
||||
|
||||
option(SLEEFDFT_ENABLE_STREAM "Streaming instructions are utilized in DFT." OFF)
|
||||
|
||||
# Settings
|
||||
|
||||
# Constants definition
|
||||
|
||||
set(LISTSHORTTYPENAME "dp" "sp")
|
||||
set(LISTLONGTYPENAME "double" "float")
|
||||
set(LISTTYPEID "1" "2")
|
||||
|
||||
set(MACRODEF_vecextdp BASETYPEID=1 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextdp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextsp BASETYPEID=2 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextsp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextld BASETYPEID=3 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextld ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_vecextqp BASETYPEID=4 ENABLE_VECEXT CONFIG=1)
|
||||
set(CFLAGS_vecextqp ${FLAGS_ENABLE_VECEXT})
|
||||
set(MACRODEF_purecdp BASETYPEID=1 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecdp ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_purecsp BASETYPEID=2 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecsp ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_purecld BASETYPEID=3 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecld ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_purecqp BASETYPEID=4 ENABLE_PUREC CONFIG=1)
|
||||
set(CFLAGS_purecqp ${FLAGS_ENABLE_PUREC})
|
||||
set(MACRODEF_sse2dp BASETYPEID=1 ENABLE_SSE2 CONFIG=4)
|
||||
set(CFLAGS_sse2dp ${FLAGS_ENABLE_SSE4})
|
||||
set(MACRODEF_sse2sp BASETYPEID=2 ENABLE_SSE2 CONFIG=4)
|
||||
set(CFLAGS_sse2sp ${FLAGS_ENABLE_SSE4})
|
||||
set(MACRODEF_avxdp BASETYPEID=1 ENABLE_AVX CONFIG=1)
|
||||
set(CFLAGS_avxdp ${FLAGS_ENABLE_AVX})
|
||||
set(MACRODEF_avxsp BASETYPEID=2 ENABLE_AVX CONFIG=1)
|
||||
set(CFLAGS_avxsp ${FLAGS_ENABLE_AVX})
|
||||
set(MACRODEF_avx2dp BASETYPEID=1 ENABLE_AVX2 CONFIG=1)
|
||||
set(CFLAGS_avx2dp ${FLAGS_ENABLE_AVX2})
|
||||
set(MACRODEF_avx2sp BASETYPEID=2 ENABLE_AVX2 CONFIG=1)
|
||||
set(CFLAGS_avx2sp ${FLAGS_ENABLE_AVX2})
|
||||
set(MACRODEF_avx512fdp BASETYPEID=1 ENABLE_AVX512F CONFIG=1)
|
||||
set(CFLAGS_avx512fdp ${FLAGS_ENABLE_AVX512F})
|
||||
set(MACRODEF_avx512fsp BASETYPEID=2 ENABLE_AVX512F CONFIG=1)
|
||||
set(CFLAGS_avx512fsp ${FLAGS_ENABLE_AVX512F})
|
||||
set(MACRODEF_advsimddp BASETYPEID=1 ENABLE_ADVSIMD CONFIG=1)
|
||||
set(CFLAGS_advsimddp ${FLAGS_ENABLE_ADVSIMD})
|
||||
set(MACRODEF_advsimdsp BASETYPEID=2 ENABLE_ADVSIMD CONFIG=1)
|
||||
set(CFLAGS_advsimdsp ${FLAGS_ENABLE_ADVSIMD})
|
||||
set(MACRODEF_neon32sp BASETYPEID=2 ENABLE_NEON32 CONFIG=1)
|
||||
set(CFLAGS_neon32sp ${FLAGS_ENABLE_NEON32})
|
||||
set(MACRODEF_sve256dp BASETYPEID=1 ENABLE_SVE CONFIG=8)
|
||||
set(CFLAGS_sve256dp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve256sp BASETYPEID=2 ENABLE_SVE CONFIG=8)
|
||||
set(CFLAGS_sve256sp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve512dp BASETYPEID=1 ENABLE_SVE CONFIG=9)
|
||||
set(CFLAGS_sve512dp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve512sp BASETYPEID=2 ENABLE_SVE CONFIG=9)
|
||||
set(CFLAGS_sve512sp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve1024dp BASETYPEID=1 ENABLE_SVE CONFIG=10)
|
||||
set(CFLAGS_sve1024dp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve1024sp BASETYPEID=2 ENABLE_SVE CONFIG=10)
|
||||
set(CFLAGS_sve1024sp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve2048dp BASETYPEID=1 ENABLE_SVE CONFIG=11)
|
||||
set(CFLAGS_sve2048dp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_sve2048sp BASETYPEID=2 ENABLE_SVE CONFIG=11)
|
||||
set(CFLAGS_sve2048sp ${FLAGS_ENABLE_SVE})
|
||||
set(MACRODEF_rvvm1128dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=7)
|
||||
set(CFLAGS_rvvm1128dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1128sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=7)
|
||||
set(CFLAGS_rvvm1128sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1256dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=8)
|
||||
set(CFLAGS_rvvm1256dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1256sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=8)
|
||||
set(CFLAGS_rvvm1256sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1512dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=9)
|
||||
set(CFLAGS_rvvm1512dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm1512sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=9)
|
||||
set(CFLAGS_rvvm1512sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm11024dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=10)
|
||||
set(CFLAGS_rvvm11024dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm11024sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=10)
|
||||
set(CFLAGS_rvvm11024sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm12048dp BASETYPEID=1 ENABLE_RVVM1 CONFIG=11)
|
||||
set(CFLAGS_rvvm12048dp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm12048sp BASETYPEID=2 ENABLE_RVVM1 CONFIG=11)
|
||||
set(CFLAGS_rvvm12048sp ${FLAGS_ENABLE_RVVM1})
|
||||
set(MACRODEF_rvvm2128dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=7)
|
||||
set(CFLAGS_rvvm2128dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2128sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=7)
|
||||
set(CFLAGS_rvvm2128sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2256dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=8)
|
||||
set(CFLAGS_rvvm2256dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2256sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=8)
|
||||
set(CFLAGS_rvvm2256sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2512dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=9)
|
||||
set(CFLAGS_rvvm2512dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm2512sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=9)
|
||||
set(CFLAGS_rvvm2512sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm21024dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=10)
|
||||
set(CFLAGS_rvvm21024dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm21024sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=10)
|
||||
set(CFLAGS_rvvm21024sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm22048dp BASETYPEID=1 ENABLE_RVVM2 CONFIG=11)
|
||||
set(CFLAGS_rvvm22048dp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_rvvm22048sp BASETYPEID=2 ENABLE_RVVM2 CONFIG=11)
|
||||
set(CFLAGS_rvvm22048sp ${FLAGS_ENABLE_RVVM2})
|
||||
set(MACRODEF_vsxdp BASETYPEID=1 ENABLE_VSX CONFIG=1)
|
||||
set(CFLAGS_vsxdp ${FLAGS_ENABLE_VSX})
|
||||
set(MACRODEF_vsxsp BASETYPEID=2 ENABLE_VSX CONFIG=1)
|
||||
set(CFLAGS_vsxsp ${FLAGS_ENABLE_VSX})
|
||||
set(MACRODEF_vsx3dp BASETYPEID=1 ENABLE_VSX3 CONFIG=1)
|
||||
set(CFLAGS_vsx3dp ${FLAGS_ENABLE_VSX3})
|
||||
set(MACRODEF_vsx3sp BASETYPEID=2 ENABLE_VSX3 CONFIG=1)
|
||||
set(CFLAGS_vsx3sp ${FLAGS_ENABLE_VSX3})
|
||||
set(MACRODEF_vxedp BASETYPEID=1 ENABLE_VXE CONFIG=140)
|
||||
set(CFLAGS_vxedp ${FLAGS_ENABLE_VXE})
|
||||
set(MACRODEF_vxesp BASETYPEID=2 ENABLE_VXE CONFIG=140)
|
||||
set(CFLAGS_vxesp ${FLAGS_ENABLE_VXE})
|
||||
set(MACRODEF_vxe2dp BASETYPEID=1 ENABLE_VXE2 CONFIG=150)
|
||||
set(CFLAGS_vxe2dp ${FLAGS_ENABLE_VXE2})
|
||||
set(MACRODEF_vxe2sp BASETYPEID=2 ENABLE_VXE2 CONFIG=150)
|
||||
set(CFLAGS_vxe2sp ${FLAGS_ENABLE_VXE2})
|
||||
|
||||
# List all available scalar data types
|
||||
|
||||
set(ISALIST_SP purecsp)
|
||||
set(ISALIST_DP purecdp)
|
||||
|
||||
set(LIST_SUPPORTED_FPTYPE 0 1)
|
||||
if(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
set(ISALIST_SP vecextsp)
|
||||
set(ISALIST_DP vecextdp)
|
||||
endif(CMAKE_C_COMPILER_ID MATCHES "(GNU|Clang)")
|
||||
|
||||
# List all available vector data types
|
||||
|
||||
if (COMPILER_SUPPORTS_SSE4)
|
||||
set(ISALIST_SP ${ISALIST_SP} sse2sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} sse2dp)
|
||||
endif(COMPILER_SUPPORTS_SSE4)
|
||||
|
||||
if (COMPILER_SUPPORTS_AVX)
|
||||
set(ISALIST_SP ${ISALIST_SP} avxsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} avxdp)
|
||||
endif(COMPILER_SUPPORTS_AVX)
|
||||
|
||||
if (COMPILER_SUPPORTS_AVX2)
|
||||
set(ISALIST_SP ${ISALIST_SP} avx2sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} avx2dp)
|
||||
endif(COMPILER_SUPPORTS_AVX2)
|
||||
|
||||
if (COMPILER_SUPPORTS_AVX512F)
|
||||
set(ISALIST_SP ${ISALIST_SP} avx512fsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} avx512fdp)
|
||||
endif(COMPILER_SUPPORTS_AVX512F)
|
||||
|
||||
if (COMPILER_SUPPORTS_ADVSIMD)
|
||||
set(ISALIST_SP ${ISALIST_SP} advsimdsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} advsimddp)
|
||||
endif(COMPILER_SUPPORTS_ADVSIMD)
|
||||
|
||||
if (COMPILER_SUPPORTS_SVE)
|
||||
set(ISALIST_SP ${ISALIST_SP} sve256sp sve512sp sve1024sp sve2048sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} sve256dp sve512dp sve1024dp sve2048dp)
|
||||
endif(COMPILER_SUPPORTS_SVE)
|
||||
|
||||
if (COMPILER_SUPPORTS_NEON32)
|
||||
set(ISALIST_SP ${ISALIST_SP} neon32sp)
|
||||
endif(COMPILER_SUPPORTS_NEON32)
|
||||
|
||||
if (COMPILER_SUPPORTS_RVVM1)
|
||||
set(ISALIST_SP ${ISALIST_SP} rvvm1128sp rvvm1256sp rvvm1512sp rvvm11024sp rvvm12048sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} rvvm1128dp rvvm1256dp rvvm1512dp rvvm11024dp rvvm12048dp)
|
||||
endif(COMPILER_SUPPORTS_RVVM1)
|
||||
|
||||
if (COMPILER_SUPPORTS_RVVM2)
|
||||
set(ISALIST_SP ${ISALIST_SP} rvvm2128sp rvvm2256sp rvvm2512sp rvvm21024sp rvvm22048sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} rvvm2128dp rvvm2256dp rvvm2512dp rvvm21024dp rvvm22048dp)
|
||||
endif(COMPILER_SUPPORTS_RVVM2)
|
||||
|
||||
if (COMPILER_SUPPORTS_VSX)
|
||||
set(ISALIST_SP ${ISALIST_SP} vsxsp)
|
||||
set(ISALIST_DP ${ISALIST_DP} vsxdp)
|
||||
endif(COMPILER_SUPPORTS_VSX)
|
||||
|
||||
if (COMPILER_SUPPORTS_VSX3)
|
||||
set(ISALIST_SP ${ISALIST_SP} vsx3sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} vsx3dp)
|
||||
endif(COMPILER_SUPPORTS_VSX3)
|
||||
|
||||
if (COMPILER_SUPPORTS_VXE)
|
||||
set(ISALIST_SP ${ISALIST_SP} vxesp)
|
||||
set(ISALIST_DP ${ISALIST_DP} vxedp)
|
||||
endif(COMPILER_SUPPORTS_VXE)
|
||||
|
||||
if (COMPILER_SUPPORTS_VXE2)
|
||||
set(ISALIST_SP ${ISALIST_SP} vxe2sp)
|
||||
set(ISALIST_DP ${ISALIST_DP} vxe2dp)
|
||||
endif(COMPILER_SUPPORTS_VXE2)
|
||||
|
||||
if(SLEEFDFT_ENABLE_STREAM)
|
||||
set(NLIST 0 1 2 3)
|
||||
else()
|
||||
set(NLIST 0 2)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
# Compiler properties
|
||||
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${DFT_C_FLAGS}")
|
||||
set(COMMON_TARGET_PROPERTIES
|
||||
C_STANDARD 99 # -std=gnu99
|
||||
)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
list(APPEND COMMON_TARGET_PROPERTIES POSITION_INDEPENDENT_CODE ON) # -fPIC
|
||||
endif()
|
||||
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} MAXBUTWIDTH=${SLEEFDFT_MAXBUTWIDTH})
|
||||
|
||||
if (SLEEFDFT_ENABLE_STREAM)
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=1)
|
||||
else()
|
||||
set(COMMON_TARGET_DEFINITIONS ${COMMON_TARGET_DEFINITIONS} ENABLE_STREAM=0)
|
||||
endif()
|
||||
|
||||
if(COMPILER_SUPPORTS_OPENMP)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
|
||||
endif(COMPILER_SUPPORTS_OPENMP)
|
||||
|
||||
|
||||
# Include directories
|
||||
|
||||
include_directories(${PROJECT_SOURCE_DIR}/include)
|
||||
include_directories(${PROJECT_BINARY_DIR}/include)
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
# Target mkunroll
|
||||
|
||||
set(TARGET_MKUNROLL "mkunroll")
|
||||
add_host_executable(${TARGET_MKUNROLL} mkunroll.c)
|
||||
set_target_properties(${TARGET_MKUNROLL} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
target_compile_definitions(${TARGET_MKUNROLL} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
endif()
|
||||
|
||||
# Target mkdispatch
|
||||
|
||||
set(TARGET_MKDISPATCH "mkdispatch")
|
||||
add_host_executable(${TARGET_MKDISPATCH} mkdispatch.c)
|
||||
set_target_properties(${TARGET_MKDISPATCH} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
target_compile_definitions(${TARGET_MKDISPATCH} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
endif()
|
||||
|
||||
# Target dispatchparam.h
|
||||
|
||||
add_custom_command(OUTPUT dispatchparam.h
|
||||
COMMENT "Generating dispatchparam.h"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> paramonly ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_DP} > ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h
|
||||
DEPENDS ${TARGET_MKDISPATCH}
|
||||
)
|
||||
add_custom_target(dispatchparam.h_generated SOURCES ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h)
|
||||
|
||||
# Target dispatch*.h
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
|
||||
list(GET LISTTYPEID ${T} ID) # ID is 1
|
||||
|
||||
string(CONCAT S "dispatch" ${ST} ".h") # S is dispatchdp.h
|
||||
add_custom_command(OUTPUT ${S}
|
||||
COMMENT "Generating ${S}"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKDISPATCH}> ${LT} ${SLEEFDFT_MAXBUTWIDTH} ${ISALIST_${CST}} > ${S}
|
||||
DEPENDS ${TARGET_MKDISPATCH}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||
)
|
||||
|
||||
string(CONCAT G ${S} "_generated") # G is dispatchdp.h_generated
|
||||
add_custom_target(${G} SOURCES ${S})
|
||||
endforeach()
|
||||
|
||||
# Target dftcommon.o
|
||||
|
||||
add_library(dftcommon_obj OBJECT dftcommon.c dftcommon.h ${CMAKE_CURRENT_BINARY_DIR}/dispatchparam.h ${sleef_BINARY_DIR}/include/sleef.h)
|
||||
add_dependencies(dftcommon_obj ${TARGET_HEADERS} dispatchparam.h_generated)
|
||||
set_source_files_properties(${sleef_BINARY_DIR}/include/sleef.h PROPERTIES GENERATED TRUE)
|
||||
set_target_properties(dftcommon_obj PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_compile_definitions(dftcommon_obj PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
|
||||
# Target dft*.o
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
|
||||
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
|
||||
string(CONCAT S "dispatch" ${ST} ".h") # S is "dispatchdp.h"
|
||||
add_library(${G} OBJECT dft.c dftcommon.h ${S})
|
||||
string(CONCAT SG ${S} "_generated") # SG is "dispatchdp.h_generated"
|
||||
add_dependencies(${G} ${SG} ${TARGET_HEADERS})
|
||||
set_target_properties(${G} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
list(GET LISTTYPEID ${T} ID) # ID is 1
|
||||
target_compile_definitions(${G} PRIVATE BASETYPEID=${ID} ${COMMON_TARGET_DEFINITIONS})
|
||||
endforeach()
|
||||
|
||||
# Copy unroll0.org to ${CMAKE_CURRENT_BINARY_DIR}
|
||||
|
||||
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org
|
||||
COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unroll0.org)
|
||||
add_custom_target(unroll0.org.copied DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/unroll0.org)
|
||||
|
||||
# Target unroll*.c
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT UC unroll_ ${N} _ ${E} ".c") # UC is "unroll_0_sse2dp.c"
|
||||
set(UNROLL_TARGET_${CST} ${UNROLL_TARGET_${CST}} ${UC})
|
||||
endforeach()
|
||||
endforeach()
|
||||
message(STATUS "Unroll target for ${CST} : ${UNROLL_TARGET_${CST}}")
|
||||
|
||||
if(UNROLL_TARGET_${CST})
|
||||
add_custom_command(OUTPUT ${UNROLL_TARGET_${CST}}
|
||||
COMMENT "Generating ${UNROLL_TARGET_${CST}}"
|
||||
COMMAND $<TARGET_FILE:${TARGET_MKUNROLL}> ${LT} ${ISALIST_${CST}}
|
||||
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
|
||||
DEPENDS ${TARGET_MKUNROLL} unroll0.org.copied
|
||||
)
|
||||
add_custom_target(unroll_target_${ST} DEPENDS ${UNROLL_TARGET_${CST}})
|
||||
endif()
|
||||
endforeach()
|
||||
|
||||
# Target unroll*.o
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
list(GET LISTLONGTYPENAME ${T} LT) # LT is "double"
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT U unroll_ ${N} _ ${E}) # U is "unroll_0_sse2dp"
|
||||
string(CONCAT UG ${U} "_obj") # UG is "unroll_0_sse2dp_obj"
|
||||
string(CONCAT UC ${U} ".c") # UC is "unroll_0_sse2dp.c"
|
||||
add_library(${UG} OBJECT ${UC})
|
||||
set_target_properties(${UG} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_include_directories(${UG} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_compile_definitions(${UG} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${MACRODEF_${E}})
|
||||
target_compile_options(${UG} PRIVATE ${CFLAGS_${E}})
|
||||
add_dependencies(${UG} ${TARGET_HEADERS} unroll_target_${ST})
|
||||
endforeach()
|
||||
endforeach()
|
||||
endforeach()
|
||||
|
||||
# Target libdft
|
||||
|
||||
add_library(${TARGET_LIBDFT} $<TARGET_OBJECTS:dftcommon_obj> $<TARGET_OBJECTS:${TARGET_LIBARRAYMAP_OBJ}>)
|
||||
target_link_libraries(${TARGET_LIBDFT} ${TARGET_LIBSLEEF} ${LIBM})
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
|
||||
string(CONCAT G "dft" ${ST} "_obj") # G is "dftdp_obj"
|
||||
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${G}>)
|
||||
endforeach()
|
||||
|
||||
foreach(T ${LIST_SUPPORTED_FPTYPE})
|
||||
list(GET LISTSHORTTYPENAME ${T} ST) # ST is "dp", for example
|
||||
string(TOUPPER ${ST} CST) # CST is "DP"
|
||||
|
||||
foreach(E ${ISALIST_${CST}}) # E is "sse2dp"
|
||||
foreach(N ${NLIST})
|
||||
string(CONCAT UG unroll_ ${N} _ ${E} "_obj") # U is "unroll_0_sse2dp_obj"
|
||||
target_sources(${TARGET_LIBDFT} PRIVATE $<TARGET_OBJECTS:${UG}>)
|
||||
endforeach()
|
||||
endforeach()
|
||||
endforeach()
|
||||
|
||||
set_target_properties(${TARGET_LIBDFT} PROPERTIES
|
||||
VERSION ${SLEEF_VERSION}
|
||||
SOVERSION ${SLEEF_SOVERSION}
|
||||
PUBLIC_HEADER ${PROJECT_SOURCE_DIR}/include/sleefdft.h
|
||||
${COMMON_TARGET_PROPERTIES}
|
||||
)
|
||||
|
||||
# Install
|
||||
install(
|
||||
TARGETS ${TARGET_LIBDFT}
|
||||
EXPORT sleefTargets
|
||||
PUBLIC_HEADER #
|
||||
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
|
||||
COMPONENT sleef_Development
|
||||
LIBRARY #
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}"
|
||||
COMPONENT sleef_Runtime
|
||||
NAMELINK_COMPONENT sleef_Development
|
||||
ARCHIVE #
|
||||
DESTINATION "${CMAKE_INSTALL_LIBDIR}"
|
||||
COMPONENT sleef_Development
|
||||
RUNTIME #
|
||||
DESTINATION "${CMAKE_INSTALL_BINDIR}"
|
||||
COMPONENT sleef_Runtime
|
||||
INCLUDES #
|
||||
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,423 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef _OPENMP
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
#include "misc.h"
|
||||
#include "sleef.h"
|
||||
|
||||
#define IMPORT_IS_EXPORT
|
||||
#include "sleefdft.h"
|
||||
#include "dispatchparam.h"
|
||||
#include "dftcommon.h"
|
||||
#include "common.h"
|
||||
#include "arraymap.h"
|
||||
|
||||
#define MAGIC_FLOAT 0x31415926
|
||||
#define MAGIC_DOUBLE 0x27182818
|
||||
|
||||
#define MAGIC2D_FLOAT 0x22360679
|
||||
#define MAGIC2D_DOUBLE 0x17320508
|
||||
|
||||
const char *configStr[] = { "ST", "ST stream", "MT", "MT stream" };
|
||||
|
||||
static int parsePathStr(char *p, int *path, int *config, int pathLenMax, int log2len) {
|
||||
int pathLen = 0, l2l = 0;
|
||||
|
||||
for(;;) {
|
||||
while(*p == ' ') p++;
|
||||
if (*p == '\0') break;
|
||||
if (!isdigit((int)*p)) return -1;
|
||||
|
||||
pathLen++;
|
||||
if (pathLen >= pathLenMax) return -2;
|
||||
|
||||
int n = 0;
|
||||
while(isdigit((int)*p)) n = n * 10 + *p++ - '0';
|
||||
|
||||
if (n > MAXBUTWIDTH) return -6;
|
||||
path[pathLen-1] = n;
|
||||
l2l += n;
|
||||
config[pathLen-1] = 0;
|
||||
|
||||
if (*p != '(') continue;
|
||||
|
||||
int c;
|
||||
for(c=3;c>=0;c--) if (strncmp(p+1, configStr[c], strlen(configStr[c])) == 0) break;
|
||||
if (c == -1) return -3;
|
||||
p += strlen(configStr[c]) + 1;
|
||||
if (*p != ')') return -4;
|
||||
p++;
|
||||
|
||||
config[pathLen-1] = c;
|
||||
}
|
||||
|
||||
if (l2l != log2len) return -5;
|
||||
|
||||
return pathLen;
|
||||
}
|
||||
|
||||
EXPORT void SleefDFT_setPath(SleefDFT *p, char *pathStr) {
|
||||
assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));
|
||||
|
||||
int path[32], config[32];
|
||||
int pathLen = parsePathStr(pathStr, path, config, 31, p->log2len);
|
||||
|
||||
if (pathLen < 0) {
|
||||
if ((p->mode & SLEEF_MODE_VERBOSE) != 0) printf("Error %d in parsing path string : %s\n", pathLen, pathStr);
|
||||
return;
|
||||
}
|
||||
|
||||
for(uint32_t j = 0;j <= p->log2len;j++) p->bestPath[j] = 0;
|
||||
|
||||
for(int level = p->log2len, j=0;level > 0 && j < pathLen;) {
|
||||
p->bestPath[level] = path[j];
|
||||
p->bestPathConfig[level] = config[j];
|
||||
level -= path[j];
|
||||
j++;
|
||||
}
|
||||
|
||||
p->pathLen = 0;
|
||||
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++;
|
||||
|
||||
if ((p->mode & SLEEF_MODE_VERBOSE) != 0) {
|
||||
printf("Set path : ");
|
||||
for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) printf("%d(%s) ", p->bestPath[j], configStr[p->bestPathConfig[j]]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Release all twiddle tables owned by plan *p: for each butterfly width N,
// every per-level table (levels N..log2len) and then the pointer array.
void freeTables(SleefDFT *p) {
  for(int N=1;N<=MAXBUTWIDTH;N++) {
    for(uint32_t lv=N;lv<=p->log2len;lv++) {
      Sleef_free(p->tbl[N][lv]);
    }
    free(p->tbl[N]);
    p->tbl[N] = NULL;   // avoid dangling pointer on double dispose
  }
}
|
||||
|
||||
// Destroy a plan created by the SleefDFT initializers, releasing all
// buffers it owns.  Tolerates NULL like free() — previously a NULL
// argument tripped the magic-number assertion (or was UB with NDEBUG).
EXPORT void SleefDFT_dispose(SleefDFT *p) {
  if (p == NULL) return;

  if (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE) {
    // 2D plan: release the transpose buffer and the 1D sub-plans.
    Sleef_free(p->tBuf);
    SleefDFT_dispose(p->instH);
    // When the transform is square only one sub-plan is disposed here;
    // presumably instV aliases or is unused in that case — TODO confirm.
    if (p->hlen != p->vlen) SleefDFT_dispose(p->instV);

    p->magic = 0;
    free(p);
    return;
  }

  assert(p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE);

  if (p->log2len <= 1) {
    // Trivial-size plans allocate no tables or permutations.
    p->magic = 0;
    free(p);
    return;
  }

  if ((p->mode & SLEEF_MODE_REAL) != 0) {
    // Real-transform coefficient tables exist only in SLEEF_MODE_REAL.
    Sleef_free(p->rtCoef1);
    Sleef_free(p->rtCoef0);
    p->rtCoef0 = p->rtCoef1 = NULL;
  }

  // Per-level permutation tables, then the table of pointers itself.
  for(int level = p->log2len;level >= 1;level--) {
    Sleef_free(p->perm[level]);
  }
  free(p->perm);
  p->perm = NULL;

  freeTables(p);

  p->magic = 0;   // poison the tag so stale pointers fail the assert
  free(p);
}
|
||||
|
||||
// floor(log2(q)) for q >= 1; returns (uint32_t)-1 for q == 0, exactly
// like the original table-driven implementation.  Written as a plain
// shift loop — this is only used during plan setup, not in hot paths.
uint32_t ilog2(uint32_t q) {
  if (q == 0) return ~(uint32_t)0;
  uint32_t r = 0;
  while (q >>= 1) r++;
  return r;
}
|
||||
|
||||
//
|
||||
|
||||
// Shared planner state.  dftPlanFilePath/archID identify the on-disk plan
// cache; planMap is the in-memory key/value store mirroring it.
char *dftPlanFilePath = NULL;   // plan-file path; NULL when unset
char *archID = NULL;            // CPU identifier tagging plan entries
uint64_t planMode = SLEEF_PLAN_REFERTOENVVAR;   // SLEEF_PLAN_* flag word
ArrayMap *planMap = NULL;       // lazily created by loadPlanFromFile()
int planFilePathSet = 0, planFileLoaded = 0;    // one-shot latches
#ifdef _OPENMP
omp_lock_t planMapLock;         // guards planMap and the latches above
int planMapLockInitialized = 0; // set once inside an omp critical section
#endif
|
||||
|
||||
// Lazily initialize planMapLock exactly once.  The omp critical section
// serializes concurrent first calls; the flag prevents re-initialization
// afterwards.  A no-op when compiled without OpenMP.
static void initPlanMapLock() {
#ifdef _OPENMP
#pragma omp critical
  {
    if (!planMapLockInitialized) {
      planMapLockInitialized = 1;
      omp_init_lock(&planMapLock);
    }
  }
#endif
}
|
||||
|
||||
static void planMap_clear() {
|
||||
if (planMap != NULL) ArrayMap_dispose(planMap);
|
||||
planMap = NULL;
|
||||
}
|
||||
|
||||
EXPORT void SleefDFT_setPlanFilePath(const char *path, const char *arch, uint64_t mode) {
|
||||
initPlanMapLock();
|
||||
|
||||
if ((mode & SLEEF_PLAN_RESET) != 0) {
|
||||
planMap_clear();
|
||||
planFileLoaded = 0;
|
||||
planFilePathSet = 0;
|
||||
}
|
||||
|
||||
if (dftPlanFilePath != NULL) free(dftPlanFilePath);
|
||||
if (path != NULL) {
|
||||
dftPlanFilePath = malloc(strlen(path)+10);
|
||||
strcpy(dftPlanFilePath, path);
|
||||
} else {
|
||||
dftPlanFilePath = NULL;
|
||||
}
|
||||
|
||||
if (archID != NULL) free(archID);
|
||||
if (arch == NULL) arch = Sleef_getCpuIdString();
|
||||
archID = malloc(strlen(arch)+10);
|
||||
strcpy(archID, arch);
|
||||
|
||||
planMode = mode;
|
||||
planFilePathSet = 1;
|
||||
}
|
||||
|
||||
static void loadPlanFromFile() {
|
||||
if (planFilePathSet == 0 && (planMode & SLEEF_PLAN_REFERTOENVVAR) != 0) {
|
||||
char *s = getenv(ENVVAR);
|
||||
if (s != NULL) SleefDFT_setPlanFilePath(s, NULL, planMode);
|
||||
}
|
||||
|
||||
if (planMap != NULL) ArrayMap_dispose(planMap);
|
||||
|
||||
if (dftPlanFilePath != NULL && (planMode & SLEEF_PLAN_RESET) == 0) {
|
||||
planMap = ArrayMap_load(dftPlanFilePath, archID, PLANFILEID, (planMode & SLEEF_PLAN_NOLOCK) == 0);
|
||||
}
|
||||
|
||||
if (planMap == NULL) planMap = initArrayMap();
|
||||
|
||||
planFileLoaded = 1;
|
||||
}
|
||||
|
||||
static void savePlanToFile() {
|
||||
assert(planFileLoaded);
|
||||
if ((planMode & SLEEF_PLAN_READONLY) == 0 && dftPlanFilePath != NULL) {
|
||||
ArrayMap_save(planMap, dftPlanFilePath, archID, PLANFILEID);
|
||||
}
|
||||
}
|
||||
|
||||
// Bit widths of the fields packed into 64-bit plan-cache keys
// (see keyButStat/keyTrans/keyPath/keyPathConfig below).
#define CATBIT 8         // entry category
#define BASETYPEIDBIT 2  // base element type id
#define LOG2LENBIT 8     // log2 of transform length
#define DIRBIT 1         // transform direction (1 = forward)

#define BUTSTATBIT 16    // butterfly-statistics payload
|
||||
|
||||
static uint64_t keyButStat(int baseTypeID, int log2len, int dir, int butStat) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 0;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTSTATBIT) | (butStat & ~(~(uint64_t)0 << BUTSTATBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
// Additional key-field widths used by keyTrans/keyPath/keyPathConfig.
#define LEVELBIT LOG2LENBIT  // butterfly level within the transform
#define BUTCONFIGBIT 8       // butterfly configuration
#define TRANSCONFIGBIT 8     // transpose configuration
|
||||
|
||||
static uint64_t keyTrans(int baseTypeID, int hlen, int vlen, int transConfig) {
|
||||
int max = MAX(hlen, vlen), min = MIN(hlen, vlen);
|
||||
int cat = 2;
|
||||
uint64_t k = 0;
|
||||
k = (k << TRANSCONFIGBIT) | (transConfig & ~(~(uint64_t)0 << TRANSCONFIGBIT));
|
||||
k = (k << LOG2LENBIT) | (max & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << LOG2LENBIT) | (min & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t keyPath(int baseTypeID, int log2len, int dir, int level, int config) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 3;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
|
||||
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t keyPathConfig(int baseTypeID, int log2len, int dir, int level, int config) {
|
||||
dir = (dir & SLEEF_MODE_BACKWARD) == 0;
|
||||
int cat = 4;
|
||||
uint64_t k = 0;
|
||||
k = (k << BUTCONFIGBIT) | (config & ~(~(uint64_t)0 << BUTCONFIGBIT));
|
||||
k = (k << LEVELBIT) | (level & ~(~(uint64_t)0 << LEVELBIT));
|
||||
k = (k << LOG2LENBIT) | (log2len & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << DIRBIT) | (dir & ~(~(uint64_t)0 << LOG2LENBIT));
|
||||
k = (k << BASETYPEIDBIT) | (baseTypeID & ~(~(uint64_t)0 << BASETYPEIDBIT));
|
||||
k = (k << CATBIT) | (cat & ~(~(uint64_t)0 << CATBIT));
|
||||
return k;
|
||||
}
|
||||
|
||||
static uint64_t planMap_getU64(uint64_t key) {
|
||||
char *s = ArrayMap_get(planMap, key);
|
||||
if (s == NULL) return 0;
|
||||
uint64_t ret;
|
||||
if (sscanf(s, "%" SCNx64, &ret) != 1) return 0;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void planMap_putU64(uint64_t key, uint64_t value) {
|
||||
char *s = malloc(100);
|
||||
sprintf(s, "%" PRIx64, value);
|
||||
s = ArrayMap_put(planMap, key, s);
|
||||
if (s != NULL) free(s);
|
||||
}
|
||||
|
||||
// Restore a previously measured execution path for plan *p from the plan
// cache.  Returns 1 when a complete, valid path was restored, 0 when no
// results exist for this category or an entry is corrupted.
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat) {
  assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  // The butStat entry (pathCat+10) acts as a "results present" marker.
  // Fix: keep the full 64-bit value — the original narrowed it to int,
  // which could in principle truncate a nonzero marker to 0.
  uint64_t stat = planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10));
  if (stat == 0) {
#ifdef _OPENMP
    omp_unset_lock(&planMapLock);
#endif
    return 0;
  }

  int ret = 1;

  // Restore per-level butterfly width and configuration.
  for(int j = p->log2len;j >= 0;j--) {
    p->bestPath[j] = planMap_getU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat));
    p->bestPathConfig[j] = planMap_getU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat));
    if (p->bestPath[j] > MAXBUTWIDTH) ret = 0;   // corrupted entry
  }

  p->pathLen = 0;
  for(int j = p->log2len;j >= 0;j--) if (p->bestPath[j] != 0) p->pathLen++;

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
  return ret;
}
|
||||
|
||||
// Persist the execution path currently selected in *p under the given
// path category, then flush the plan cache to disk when permitted.
// Skips writing if results for this category are already recorded.
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat) {
  assert(p != NULL && (p->magic == MAGIC_FLOAT || p->magic == MAGIC_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  // The butStat entry (pathCat+10) is the "results present" marker;
  // bail out if this category was already stored.
  if (planMap_getU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10)) != 0) {
#ifdef _OPENMP
    omp_unset_lock(&planMapLock);
#endif
    return;
  }

  // Store the butterfly width and configuration chosen for every level.
  for(int j = p->log2len;j >= 0;j--) {
    planMap_putU64(keyPath(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPath[j]);
    planMap_putU64(keyPathConfig(p->baseTypeID, p->log2len, p->mode, j, pathCat), p->bestPathConfig[j]);
  }

  // Mark this category as measured.
  planMap_putU64(keyButStat(p->baseTypeID, p->log2len, p->mode, pathCat+10), 1);

  if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
}
|
||||
|
||||
// Restore cached 2D transpose timings (with and without multithreading)
// into *p.  Returns nonzero when a cached measurement was found.
int PlanManager_loadMeasurementResultsT(SleefDFT *p) {
  assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  const uint64_t keyNoMT = keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0);
  const uint64_t keyMT   = keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1);
  p->tmNoMT = planMap_getU64(keyNoMT);
  p->tmMT   = planMap_getU64(keyMT);

  const int found = p->tmNoMT != 0;

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
  return found;
}
|
||||
|
||||
// Persist the 2D transpose timings held in *p, then flush the plan cache
// to disk when the planner is not read-only.
void PlanManager_saveMeasurementResultsT(SleefDFT *p) {
  assert(p != NULL && (p->magic == MAGIC2D_FLOAT || p->magic == MAGIC2D_DOUBLE));

  initPlanMapLock();

#ifdef _OPENMP
  omp_set_lock(&planMapLock);
#endif
  if (!planFileLoaded) loadPlanFromFile();

  // Config 0 = single-threaded timing, config 1 = multithreaded timing.
  planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 0), p->tmNoMT);
  planMap_putU64(keyTrans(p->baseTypeID, p->log2hlen, p->log2vlen, 1), p->tmMT  );

  if ((planMode & SLEEF_PLAN_READONLY) == 0) savePlanToFile();

#ifdef _OPENMP
  omp_unset_lock(&planMapLock);
#endif
}
|
||||
@@ -0,0 +1,69 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define CONFIGMAX 4
|
||||
#define CONFIG_STREAM 1
|
||||
#define CONFIG_MT 2
|
||||
|
||||
#define MAXLOG2LEN 32
|
||||
|
||||
// A DFT execution plan.  The magic tag selects which union arm is live:
// MAGIC_FLOAT/MAGIC_DOUBLE -> the 1D arm, MAGIC2D_FLOAT/MAGIC2D_DOUBLE
// -> the 2D arm (see SleefDFT_dispose).
typedef struct SleefDFT {
  uint32_t magic;              // plan-kind tag; zeroed on dispose
  uint64_t mode, mode2, mode3; // SLEEF_MODE* flag words
  int baseTypeID;              // base element type id (float vs double)
  const void *in;              // input buffer
  void *out;                   // output buffer

  union {
    // 1D transform state.
    struct {
      uint32_t log2len;        // log2 of transform length

      void **tbl[MAXBUTWIDTH+1]; // twiddle tables per butterfly width and level (freed by freeTables)
      void *rtCoef0, *rtCoef1;   // real-transform coefficients (SLEEF_MODE_REAL only)
      uint32_t **perm;           // per-level permutation tables (levels 1..log2len)

      void **x0, **x1;           // work buffers -- TODO confirm role

      int isa;                   // selected ISA index
      int planMode;

      int vecwidth, log2vecwidth;
      int nThread;

      // Timing measurements -- presumably indexed by config and
      // width/level; verify against the measurement code.
      uint64_t tm[CONFIGMAX][(MAXBUTWIDTH+1)*32];
      uint64_t bestTime;
      // Chosen butterfly path, indexed by level (see SleefDFT_setPath).
      int16_t bestPath[32], bestPathConfig[32], pathLen;
    };

    // 2D transform state.
    struct {
      int32_t hlen, vlen;          // horizontal / vertical lengths
      int32_t log2hlen, log2vlen;
      uint64_t tmNoMT, tmMT;       // transpose timings without / with MT
      struct SleefDFT *instH, *instV; // 1D sub-plans; instV disposed only when hlen != vlen
      void *tBuf;                  // transpose buffer (Sleef_free'd on dispose)
    };
  };
} SleefDFT;
|
||||
|
||||
#define SLEEF_MODE2_MT1D (1 << 0)
|
||||
#define SLEEF_MODE3_MT2D (1 << 0)
|
||||
|
||||
#define PLANFILEID "SLEEFDFT0\n"
|
||||
#define ENVVAR "SLEEFDFTPLAN"
|
||||
|
||||
#define SLEEF_MODE_MEASUREBITS (3 << 20)
|
||||
|
||||
void freeTables(SleefDFT *p);
|
||||
uint32_t ilog2(uint32_t q);
|
||||
|
||||
//int PlanManager_loadMeasurementResultsB(SleefDFT *p);
|
||||
//void PlanManager_saveMeasurementResultsB(SleefDFT *p, int butStat);
|
||||
int PlanManager_loadMeasurementResultsT(SleefDFT *p);
|
||||
void PlanManager_saveMeasurementResultsT(SleefDFT *p);
|
||||
int PlanManager_loadMeasurementResultsP(SleefDFT *p, int pathCat);
|
||||
void PlanManager_saveMeasurementResultsP(SleefDFT *p, int pathCat);
|
||||
|
||||
#define GETINT_VECWIDTH 100
|
||||
#define GETINT_DFTPRIORITY 101
|
||||
@@ -0,0 +1,193 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifndef ENABLE_STREAM
|
||||
#error ENABLE_STREAM not defined
|
||||
#endif
|
||||
|
||||
// Emit one CONFIGMAX x ISAMAX x (MAXBUTWIDTH+1) dispatch table named
// <tblName>_<basetype>.  Entry [config][isa][width] is the kernel
// <funcName><2^width><dir>_<config>_<isa>, or NULL for streaming configs
// when streaming support is compiled out.  When firstIsForward is set,
// the width-1 entry uses the forward ('f') kernel regardless of dirChar
// (the backward dft table reuses the forward width-2 kernel).
static void emitTable(const char *tblName, const char *basetype, const char *argList,
                      const char *funcName, char dirChar, int firstIsForward,
                      int maxbutwidth, int enable_stream, int isastart,
                      int argc, char **argv) {
  printf("void (*%s_%s[CONFIGMAX][ISAMAX][MAXBUTWIDTH+1])%s = {\n", tblName, basetype, argList);
  for(int config=0;config<4;config++) {
    printf(" {\n");
    for(int k=isastart;k<argc;k++) {
      printf(" {NULL, ");
      for(int i=1;i<=maxbutwidth;i++) {
        if (enable_stream || (config & 1) == 0) {
          char dir = (firstIsForward && i == 1) ? 'f' : dirChar;
          printf("%s%d%c_%d_%s, ", funcName, 1 << i, dir, config, argv[k]);
        } else {
          printf("NULL, ");
        }
      }
      printf("},\n");
    }
    printf("},\n");
  }
  printf("};\n\n");
}

// Generate (on stdout) the kernel prototypes and dispatch tables for the
// given base type, maximum butterfly width and list of ISA names.
int main(int argc, char **argv) {
  if (argc < 3) {
    // Fix: the usage line now matches what is actually parsed below
    // (argv[1] = base type, argv[2] = max butterfly width, rest = ISAs);
    // it previously mentioned nonexistent <unrollmax> arguments.
    fprintf(stderr, "Usage : %s <basetype> <maxbutwidth> <isa> ...\n", argv[0]);
    exit(-1);
  }

  const char *basetype = argv[1];
  const int maxbutwidth = atoi(argv[2]);
  const int isastart = 3;
  const int isamax = argc - isastart;

#if ENABLE_STREAM == 1
  const int enable_stream = 1;
#else
  const int enable_stream = 0;
#endif

  printf("#define MAXBUTWIDTH %d\n", maxbutwidth);
  printf("\n");

  // "paramonly" mode emits just the width macro above.
  if (strcmp(basetype, "paramonly") == 0) exit(0);

  printf("#define ISAMAX %d\n", isamax);
  printf("#define CONFIGMAX 4\n");

  // Prototypes for every kernel referenced by the tables below.
  for(int k=isastart;k<argc;k++) {
    for(int config=0;config<4;config++) {
#if ENABLE_STREAM == 0
      if ((config & 1) != 0) continue;
#endif
      for(int j=1;j<=maxbutwidth;j++) {
        printf("void dft%df_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void dft%db_%d_%s(real *, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void tbut%df_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void tbut%db_%d_%s(real *, uint32_t *, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void but%df_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
        printf("void but%db_%d_%s(real *, uint32_t *, const int, const real *, const int, const real *, const int);\n", 1 << j, config, argv[k]);
      }
    }
    printf("void realSub0_%s(real *, const real *, const int, const real *, const real *);\n", argv[k]);
    printf("void realSub1_%s(real *, const real *, const int, const real *, const real *, const int);\n", argv[k]);
    printf("int getInt_%s(int);\n", argv[k]);
    printf("const void *getPtr_%s(int);\n", argv[k]);
  }

  printf("\n");

  // Argument lists shared by the kernel families.
  static const char *dftArgs  = "(real *, const real *, const int)";
  static const char *tbutArgs = "(real *, uint32_t *, const real *, const int, const real *, const int)";
  static const char *butArgs  = "(real *, uint32_t *, const int, const real *, const int, const real *, const int)";

  // The six dispatch tables differ only in name, argument list, kernel
  // prefix, direction letter and the dftb width-1 special case.
  emitTable("dftf",  basetype, dftArgs,  "dft",  'f', 0, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("dftb",  basetype, dftArgs,  "dft",  'b', 1, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("tbutf", basetype, tbutArgs, "tbut", 'f', 0, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("tbutb", basetype, tbutArgs, "tbut", 'b', 0, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("butf",  basetype, butArgs,  "but",  'f', 0, maxbutwidth, enable_stream, isastart, argc, argv);
  emitTable("butb",  basetype, butArgs,  "but",  'b', 0, maxbutwidth, enable_stream, isastart, argc, argv);

  //

  // Per-ISA helper tables.
  printf("void (*realSub0_%s[ISAMAX])(real *, const real *, const int, const real *, const real *) = {\n ", basetype);
  for(int k=isastart;k<argc;k++) printf("realSub0_%s, ", argv[k]);
  printf("\n};\n\n");

  printf("void (*realSub1_%s[ISAMAX])(real *, const real *, const int, const real *, const real *, const int) = {\n ", basetype);
  for(int k=isastart;k<argc;k++) printf("realSub1_%s, ", argv[k]);
  printf("\n};\n\n");

  // Fixed-size 16-entry tables, NULL-padded past the last ISA.
  printf("int (*getInt_%s[16])(int) = {\n ", basetype);
  for(int k=isastart;k<argc;k++) printf("getInt_%s, ", argv[k]);
  for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
  printf("\n};\n\n");

  printf("const void *(*getPtr_%s[16])(int) = {\n ", basetype);
  for(int k=isastart;k<argc;k++) printf("getPtr_%s, ", argv[k]);
  for(int k=0;k<16-(argc-isastart);k++) printf("NULL, ");
  printf("\n};\n\n");

  return 0;
}
|
||||
@@ -0,0 +1,104 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#define CONFIGMAX 4
|
||||
|
||||
// Return a freshly malloc'd copy of `in` with every occurrence of `pat`
// replaced by `replace`.  The scan restarts from the beginning of the
// string after each substitution (so replacements whose result recreates
// the pattern are substituted again).  Caller frees the result.
char *replaceAll(const char *in, const char *pat, const char *replace) {
  const int repLen = (int)strlen(replace);
  const int patLen = (int)strlen(pat);

  char *cur = malloc(strlen(in)+1);
  strcpy(cur, in);

  for(;;) {
    char *hit = strstr(cur, pat);
    if (hit == NULL) break;

    const int prefixLen = (int)(hit - cur);
    const int tailLen = (int)strlen(hit + patLen);

    // Build prefix + replacement + tail (incl. NUL) in a new buffer.
    char *next = malloc(strlen(cur) + (repLen - patLen) + 1);
    memcpy(next, cur, prefixLen);
    memcpy(next + prefixLen, replace, repLen);
    memcpy(next + prefixLen + repLen, cur + prefixLen + patLen, tailLen + 1);

    free(cur);
    cur = next;
  }

  return cur;
}
|
||||
|
||||
#define LEN 1024
|
||||
char line[LEN+10];
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc < 2) {
|
||||
fprintf(stderr, "Usage : %s <Base type> <ISA> ...\n", argv[0]);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
const char *baseType = argv[1];
|
||||
const int isastart = 2;
|
||||
|
||||
for(int config=0;config<CONFIGMAX;config++) {
|
||||
#if ENABLE_STREAM == 0
|
||||
if ((config & 1) != 0) continue;
|
||||
#endif
|
||||
for(int isa=isastart;isa<argc;isa++) {
|
||||
char *isaString = argv[isa];
|
||||
char configString[100];
|
||||
sprintf(configString, "%d", config);
|
||||
|
||||
FILE *fpin = fopen("unroll0.org", "r");
|
||||
|
||||
sprintf(line, "unroll_%d_%s.c", config, isaString);
|
||||
FILE *fpout = fopen(line, "w");
|
||||
fputs("#include \"vectortype.h\"\n\n", fpout);
|
||||
fprintf(fpout, "extern %s ctbl_%s[];\n", baseType, baseType);
|
||||
fprintf(fpout, "#define ctbl ctbl_%s\n\n", baseType);
|
||||
|
||||
for(;;) {
|
||||
if (fgets(line, LEN, fpin) == NULL) break;
|
||||
char *s;
|
||||
if ((config & 1) == 0) {
|
||||
char *s0 = replaceAll(line, "%ISA%", isaString);
|
||||
s = replaceAll(s0, "%CONFIG%", configString);
|
||||
free(s0);
|
||||
} else {
|
||||
char *s0 = replaceAll(line, "%ISA%", isaString);
|
||||
char *s1 = replaceAll(s0, "%CONFIG%", configString);
|
||||
char *s2 = replaceAll(s1, "store(", "stream(");
|
||||
s = replaceAll(s2, "scatter(", "scstream(");
|
||||
free(s0); free(s1); free(s2);
|
||||
}
|
||||
|
||||
if ((config & 2) == 0) {
|
||||
char *s0 = replaceAll(s, "#pragma", "//");
|
||||
free(s);
|
||||
s = s0;
|
||||
}
|
||||
|
||||
if (config == 0) {
|
||||
char *s0 = replaceAll(s, "#undef EMITREALSUB", "#define EMITREALSUB");
|
||||
free(s);
|
||||
s = s0;
|
||||
}
|
||||
|
||||
fputs(s, fpout);
|
||||
free(s);
|
||||
}
|
||||
|
||||
fclose(fpin);
|
||||
fclose(fpout);
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,145 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#ifndef __VECTORTYPE_H__
|
||||
#define __VECTORTYPE_H__
|
||||
|
||||
#include <math.h>
|
||||
#include "sleef.h"
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#include "helpersse2.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX
|
||||
#include "helperavx.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX2
|
||||
#include "helperavx2.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX512F
|
||||
#include "helperavx512f.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_NEON32
|
||||
#include "helperneon32.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_ADVSIMD
|
||||
#include "helperadvsimd.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SVE
|
||||
#include "helpersve.h"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_RVVM1) || defined(ENABLE_RVVM2)
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX
|
||||
#include "helperpower_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX3
|
||||
#include "helperpower_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE
|
||||
#include "helpers390x_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE2
|
||||
#include "helpers390x_128.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VECEXT
|
||||
#include "helpervecext.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PUREC
|
||||
#include "helperpurec.h"
|
||||
#endif
|
||||
|
||||
#define IMPORT_IS_EXPORT
|
||||
#include "sleefdft.h"
|
||||
|
||||
#if BASETYPEID == 1
// Double-precision configuration: "real" is double, "real2" is one SIMD
// vector of doubles.  LOG2VECWIDTH is one less than the vector's log2
// lane count — presumably because lanes are used in (re,im) pairs; see
// the 2*offset addressing below.
#define LOG2VECWIDTH (LOG2VECTLENDP-1)
#define VECWIDTH (1 << LOG2VECWIDTH)

typedef double real;
typedef vdouble real2;

// Runtime availability check for the ISA the helper header was built for.
static int available(int name) { return vavailability_i(name); }

// Sign manipulation wrappers (helper-defined lane patterns).
static INLINE real2 uminus(real2 d0) { return vneg_vd_vd(d0); }
static INLINE real2 uplusminus(real2 d0) { return vposneg_vd_vd(d0); }
static INLINE real2 uminusplus(real2 d0) { return vnegpos_vd_vd(d0); }

// Arithmetic wrappers; the c* variants broadcast a scalar constant.
static INLINE real2 plus(real2 d0, real2 d1) { return vadd_vd_vd_vd(d0, d1); }
static INLINE real2 minus(real2 d0, real2 d1) { return vsub_vd_vd_vd(d0, d1); }
static INLINE real2 minusplus(real2 d0, real2 d1) { return vsubadd_vd_vd_vd(d0, d1); }
static INLINE real2 times(real2 d0, real2 d1) { return vmul_vd_vd_vd(d0, d1); }
static INLINE real2 timesminusplus(real2 d0, real2 d2, real2 d1) { return vmlsubadd_vd_vd_vd_vd(d0, d2, d1); }
static INLINE real2 ctimes(real2 d0, real d) { return vmul_vd_vd_vd(d0, vcast_vd_d(d)); }
static INLINE real2 ctimesminusplus(real2 d0, real c, real2 d1) { return vmlsubadd_vd_vd_vd_vd(d0, vcast_vd_d(c), d1); }

// In-vector element reordering (semantics defined by the helper header).
static INLINE real2 reverse(real2 d0) { return vrev21_vd_vd(d0); }
static INLINE real2 reverse2(real2 d0) { return vreva2_vd_vd(d0); }

// Broadcast a scalar constant into all lanes.
static INLINE real2 loadc(real c) { return vcast_vd_d(c); }

// Memory access; offsets are scaled by 2 reals per element.
// "stream"/"scstream" map to the helper's vstream/vsscatter variants —
// presumably non-temporal stores; see the helper headers.
static INLINE real2 load(const real *ptr, int offset) { return vload_vd_p(&ptr[2*offset]); }
static INLINE real2 loadu(const real *ptr, int offset) { return vloadu_vd_p(&ptr[2*offset]); }
static INLINE void store(real *ptr, int offset, real2 v) { vstore_v_p_vd(&ptr[2*offset], v); }
static INLINE void storeu(real *ptr, int offset, real2 v) { vstoreu_v_p_vd(&ptr[2*offset], v); }
static INLINE void stream(real *ptr, int offset, real2 v) { vstream_v_p_vd(&ptr[2*offset], v); }
static INLINE void scatter(real *ptr, int offset, int step, real2 v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
static INLINE void scstream(real *ptr, int offset, int step, real2 v) { vsscatter2_v_p_i_i_vd(ptr, offset, step, v); }

static INLINE void prefetch(real *ptr, int offset) { vprefetch_v_p(&ptr[2*offset]); }
|
||||
#elif BASETYPEID == 2
// Single-precision configuration: "real" is float, "real2" is one SIMD
// vector of floats.  Mirrors the double-precision branch above with the
// _vf_ helper intrinsics; offsets are likewise scaled by 2 reals.
#define LOG2VECWIDTH (LOG2VECTLENSP-1)
#define VECWIDTH (1 << LOG2VECWIDTH)

typedef float real;
typedef vfloat real2;

// Runtime availability check for the ISA the helper header was built for.
static int available(int name) { return vavailability_i(name); }

// Sign manipulation wrappers (helper-defined lane patterns).
static INLINE real2 uminus(real2 d0) { return vneg_vf_vf(d0); }
static INLINE real2 uplusminus(real2 d0) { return vposneg_vf_vf(d0); }
static INLINE real2 uminusplus(real2 d0) { return vnegpos_vf_vf(d0); }

// Arithmetic wrappers; the c* variants broadcast a scalar constant.
static INLINE real2 plus(real2 d0, real2 d1) { return vadd_vf_vf_vf(d0, d1); }
static INLINE real2 minus(real2 d0, real2 d1) { return vsub_vf_vf_vf(d0, d1); }
static INLINE real2 minusplus(real2 d0, real2 d1) { return vsubadd_vf_vf_vf(d0, d1); }
static INLINE real2 times(real2 d0, real2 d1) { return vmul_vf_vf_vf(d0, d1); }
static INLINE real2 ctimes(real2 d0, real d) { return vmul_vf_vf_vf(d0, vcast_vf_f(d)); }
static INLINE real2 timesminusplus(real2 d0, real2 d2, real2 d1) { return vmlsubadd_vf_vf_vf_vf(d0, d2, d1); }
static INLINE real2 ctimesminusplus(real2 d0, real c, real2 d1) { return vmlsubadd_vf_vf_vf_vf(d0, vcast_vf_f(c), d1); }

// In-vector element reordering (semantics defined by the helper header).
static INLINE real2 reverse(real2 d0) { return vrev21_vf_vf(d0); }
static INLINE real2 reverse2(real2 d0) { return vreva2_vf_vf(d0); }

// Broadcast a scalar constant into all lanes.
static INLINE real2 loadc(real c) { return vcast_vf_f(c); }

// Memory access wrappers; "stream"/"scstream" use the helper's
// vstream/vsscatter variants — presumably non-temporal stores.
static INLINE real2 load(const real *ptr, int offset) { return vload_vf_p(&ptr[2*offset]); }
static INLINE real2 loadu(const real *ptr, int offset) { return vloadu_vf_p(&ptr[2*offset]); }
static INLINE void store(real *ptr, int offset, real2 v) { vstore_v_p_vf(&ptr[2*offset], v); }
static INLINE void storeu(real *ptr, int offset, real2 v) { vstoreu_v_p_vf(&ptr[2*offset], v); }
static INLINE void stream(real *ptr, int offset, real2 v) { vstream_v_p_vf(&ptr[2*offset], v); }
static INLINE void scatter(real *ptr, int offset, int step, real2 v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
static INLINE void scstream(real *ptr, int offset, int step, real2 v) { vsscatter2_v_p_i_i_vf(ptr, offset, step, v); }

static INLINE void prefetch(real *ptr, int offset) { vprefetch_v_p(&ptr[2*offset]); }
|
||||
#else
|
||||
#error No BASETYPEID specified
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,16 @@
|
||||
# Standalone tool makefile for the coefficient/table generators.
# All tools link against GNU MPFR (arbitrary-precision arithmetic).
.PHONY: all
all : gencoef mkrempitab mkrempitabqp

# Polynomial coefficient generator; the precision preset headers are
# prerequisites so editing them triggers a rebuild.
gencoef : gencoef.c simplexfr.c sp.h dp.h ld.h qp.h
	gcc -O gencoef.c simplexfr.c -o gencoef -lmpfr -lm

mkrempitab : mkrempitab.c
	gcc -O mkrempitab.c -o mkrempitab -lmpfr

mkrempitabqp : mkrempitabqp.c
	gcc -O mkrempitabqp.c -o mkrempitabqp -lmpfr

.PHONY: clean
clean :
	rm -f gencoef gencoefdp gencoefld mkrempitab mkrempitabqp a.out *~
	rm -f *.obj *.lib *.dll *.exp *.exe
|
||||
@@ -0,0 +1,196 @@
|
||||
// This is part of SLEEF, written by Naoki
|
||||
// Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// The code in this file is distributed under the Creative Commons
|
||||
// Attribution 4.0 International License.
|
||||
|
||||
// Working precision of the generated coefficients, in bits (53 = double).
#define PREC_TARGET 53

#if 0  // disabled preset: minimax polynomial for sin(x) on [0, pi/4]
#define N 8 // Degree of equation
#define S 40 // Number of samples for phase 1
#define L 4 // Number of high precision coefficients
#define MIN 0.0 // Min argument
#define MAX (M_PI/4) // Max argument
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
#define PADD 1

// Function to approximate, evaluated in MPFR arbitrary precision.
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
// Argument transformation applied before evaluation (identity here).
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
#endif
|
||||
|
||||
#if 0
|
||||
#define N 10
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x;
|
||||
mpfr_init(x);
|
||||
mpfr_cos(ret, a, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clear(x);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#define FIXCOEF0 (-0.5)
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define S 40
|
||||
#define N 8
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 8
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_cos(ret, x, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4
|
||||
#define N 7
|
||||
#define S 40
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
|
||||
#if 0
|
||||
#define N 17
|
||||
#define S 60
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 11
|
||||
#define S 35
|
||||
#define L 2
|
||||
#define MIN 1 //0.75
|
||||
#define MAX 1.5
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t frd, mpfr_t fra) {
|
||||
mpfr_t tmp, one;
|
||||
mpfr_inits(tmp, one, NULL);
|
||||
mpfr_set_d(one, 1, GMP_RNDN);
|
||||
mpfr_add(tmp, fra, one, GMP_RNDN);
|
||||
mpfr_sub(frd, fra, one, GMP_RNDN);
|
||||
mpfr_div(frd, frd, tmp, GMP_RNDN);
|
||||
mpfr_clears(tmp, one, NULL);
|
||||
}
|
||||
#define FIXCOEF0 2.0
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define N 12
|
||||
#define S 50
|
||||
#define L 2
|
||||
#define MIN -0.347
|
||||
#define MAX 0.347 // 0.5 log 2
|
||||
#define PMUL 1
|
||||
#define PADD 0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#define FIXCOEF1 1.0
|
||||
//#define FIXCOEF2 0.5
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 21
|
||||
#define S 100
|
||||
#define L 1
|
||||
#define P 1.1
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 20
|
||||
#define S 100
|
||||
#define L 0
|
||||
#define P 1.54
|
||||
#define MIN 0.0
|
||||
#define MAX 0.708
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_asin(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
@@ -0,0 +1,375 @@
|
||||
// This is part of SLEEF, written by Naoki Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// Since the original code for simplex algorithm is developed by Haruhiko Okumura and
|
||||
// the code is distributed under the Creative Commons Attribution 4.0 International License,
|
||||
// the contents under this directory are also distributed under the same license.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <time.h>
|
||||
#include <mpfr.h>
|
||||
|
||||
//#include "sp.h"
|
||||
#include "dp.h"
|
||||
//#include "ld.h"
|
||||
//#include "qp.h"
|
||||
|
||||
#undef VERBOSE
|
||||
|
||||
#define PREC 4096
|
||||
|
||||
#define EPS 1e-50
|
||||
|
||||
#define PREC2 (PREC_TARGET*4)
|
||||
|
||||
#ifndef P
|
||||
#define P 1
|
||||
#endif
|
||||
|
||||
#ifndef Q
|
||||
#define Q 10000
|
||||
#endif
|
||||
|
||||
void mpfr_zinit(mpfr_t m);
|
||||
void regressMinRelError_fr(int n, int m, mpfr_t **x, mpfr_t *result);
|
||||
|
||||
char *mpfrToStr(mpfr_t m) {
|
||||
mpfr_t fra;
|
||||
mpfr_init2(fra, mpfr_get_prec(m));
|
||||
|
||||
mpfr_abs(fra, m, GMP_RNDN);
|
||||
mpfr_exp_t e;
|
||||
char *s = mpfr_get_str(NULL, &e, 10, 0, fra, GMP_RNDN);
|
||||
|
||||
char *ret = malloc(strlen(s) + 20);
|
||||
|
||||
if (mpfr_sgn(m) == -1) ret[0] = '-'; else ret[0] = '+';
|
||||
ret[1] = '0';
|
||||
ret[2] = '.';
|
||||
|
||||
strcpy(&ret[3], s);
|
||||
mpfr_free_str(s);
|
||||
|
||||
char estr[10];
|
||||
sprintf(estr, "e%+d", (int)e);
|
||||
strcat(ret, estr);
|
||||
|
||||
mpfr_clears(fra, NULL);
|
||||
return ret;
|
||||
}
|
||||
|
||||
double countULP(mpfr_t d, mpfr_t c) {
|
||||
mpfr_t fry, frw;
|
||||
mpfr_inits(fry, frw, NULL);
|
||||
|
||||
double c2 = mpfr_get_d(c, GMP_RNDN);
|
||||
if (c2 == 0 && mpfr_cmp_d(d, 0) != 0) return 10000;
|
||||
|
||||
long e;
|
||||
mpfr_get_d_2exp(&e, c, GMP_RNDN);
|
||||
mpfr_set_ui_2exp(frw, 1, e-PREC_TARGET, GMP_RNDN);
|
||||
|
||||
mpfr_sub(fry, d, c, GMP_RNDN);
|
||||
mpfr_div(fry, fry, frw, GMP_RNDN);
|
||||
double u = fabs(mpfr_get_d(fry, GMP_RNDN));
|
||||
|
||||
mpfr_clears(fry, frw, NULL);
|
||||
|
||||
return u;
|
||||
}
|
||||
|
||||
void func(mpfr_t s, mpfr_t x, mpfr_t *coef, int n) {
|
||||
mpfr_set_prec(s, PREC_TARGET);
|
||||
mpfr_set(s, coef[n-1], GMP_RNDN);
|
||||
|
||||
for(int i=n-1;i>0;i--) {
|
||||
if (i == L-1) {
|
||||
mpfr_t t;
|
||||
mpfr_init2(t, PREC2);
|
||||
mpfr_set(t, s, GMP_RNDN);
|
||||
mpfr_set_prec(s, PREC2);
|
||||
mpfr_set(s, t, GMP_RNDN);
|
||||
mpfr_clear(t);
|
||||
}
|
||||
mpfr_mul(s, s, x, GMP_RNDN);
|
||||
mpfr_add(s, s, coef[i-1], GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int i, j;
|
||||
int n, m;
|
||||
double p;
|
||||
|
||||
mpfr_set_default_prec(PREC);
|
||||
|
||||
#if 0
|
||||
{
|
||||
mpfr_t a, b;
|
||||
mpfr_inits(a, b, NULL);
|
||||
|
||||
float x = M_PI;
|
||||
mpfr_set_d(a, x, GMP_RNDN);
|
||||
x = nexttowardf(x, 100);
|
||||
x = nexttowardf(x, 100);
|
||||
x = nexttowardf(x, 100);
|
||||
mpfr_set_d(b, x, GMP_RNDN);
|
||||
|
||||
printf("%g\n", countULP(b, a));
|
||||
mpfr_clears(a, b, NULL);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
{
|
||||
mpfr_t a, b;
|
||||
mpfr_inits(a, b, NULL);
|
||||
|
||||
double x = M_PI;
|
||||
mpfr_set_d(a, x, GMP_RNDN);
|
||||
x = nexttoward(x, 100);
|
||||
x = nexttoward(x, 100);
|
||||
x = nexttoward(x, 100);
|
||||
mpfr_set_d(b, x, GMP_RNDN);
|
||||
|
||||
printf("%g\n", countULP(b, a));
|
||||
mpfr_clears(a, b, NULL);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
{
|
||||
mpfr_t a, b;
|
||||
mpfr_inits(a, b, NULL);
|
||||
|
||||
long double x = M_PI;
|
||||
mpfr_set_ld(a, x, GMP_RNDN);
|
||||
x = nexttowardl(x, 100);
|
||||
x = nexttowardl(x, 100);
|
||||
x = nexttowardl(x, 100);
|
||||
mpfr_set_ld(b, x, GMP_RNDN);
|
||||
|
||||
printf("%g\n", countULP(b, a));
|
||||
mpfr_clears(a, b, NULL);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
{
|
||||
mpfr_t a, b;
|
||||
mpfr_inits(a, b, NULL);
|
||||
|
||||
__float128 x = M_PI;
|
||||
mpfr_set_f128(a, x, GMP_RNDN);
|
||||
x = nextafterq(x, 100);
|
||||
x = nextafterq(x, 100);
|
||||
x = nextafterq(x, 100);
|
||||
mpfr_set_f128(b, x, GMP_RNDN);
|
||||
|
||||
printf("%g\n", countULP(b, a));
|
||||
mpfr_clears(a, b, NULL);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
m = N+1;
|
||||
n = argc >= 2 ? atoi(argv[1]) : S;
|
||||
p = argc >= 3 ? atof(argv[2]) : P;
|
||||
|
||||
mpfr_t **x, *result; // x[m][n], result[m]
|
||||
|
||||
x = calloc(sizeof(mpfr_t *), m);
|
||||
result = calloc(sizeof(mpfr_t), m);
|
||||
for(i=0;i<m;i++) {
|
||||
x[i] = calloc(sizeof(mpfr_t), n);
|
||||
for(j=0;j<n;j++) mpfr_zinit(x[i][j]);
|
||||
mpfr_zinit(result[i]);
|
||||
}
|
||||
|
||||
mpfr_t fra, frb, frc, frd, fre;
|
||||
|
||||
mpfr_zinit(fra);
|
||||
mpfr_zinit(frb);
|
||||
mpfr_zinit(frc);
|
||||
mpfr_zinit(frd);
|
||||
mpfr_zinit(fre);
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
double b = 1.0 - pow((double)i / (n-1), p);
|
||||
double a = ((double)MAX - MIN) * b + MIN;
|
||||
mpfr_set_d(fra, a, GMP_RNDN);
|
||||
CFUNC(frd, fra);
|
||||
|
||||
for(j=0;j<m-1;j++) {
|
||||
mpfr_set_d(frb, (double)j*PMUL+PADD, GMP_RNDN);
|
||||
mpfr_pow(x[j][i], frd, frb, GMP_RNDN);
|
||||
//printf("%g ", mpfr_get_d(x[j][i], GMP_RNDN));
|
||||
}
|
||||
|
||||
TARGET(x[m-1][i], fra);
|
||||
//printf(" : %g\n", mpfr_get_d(x[m-1][i], GMP_RNDN));
|
||||
}
|
||||
|
||||
for(i=0;i<m-1;i++) mpfr_set_d(result[i], 0, GMP_RNDN);
|
||||
|
||||
regressMinRelError_fr(n, m-1, x, result);
|
||||
|
||||
for(i=m-2;i>=0;i--) {
|
||||
mpfr_set_prec(fra, PREC_TARGET+4);
|
||||
mpfr_set(fra, result[i], GMP_RNDN);
|
||||
|
||||
char *s;
|
||||
printf("%s, \n", s = mpfrToStr(fra));
|
||||
free(s);
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
mpfr_set_prec(fra, PREC);
|
||||
|
||||
double emax = 0;
|
||||
|
||||
for(i=0;i<=n*10;i++) {
|
||||
double a = i * (double)(MAX - MIN) / (n*10.0) + MIN;
|
||||
mpfr_set_d(fra, a, GMP_RNDN);
|
||||
|
||||
CFUNC(frd, fra);
|
||||
|
||||
mpfr_set_d(frb, 0, GMP_RNDN);
|
||||
|
||||
for(j=m-1;j>=0;j--) {
|
||||
mpfr_set_d(frc, (double)j*PMUL+PADD, GMP_RNDN);
|
||||
mpfr_pow(frc, frd, frc, GMP_RNDN);
|
||||
mpfr_mul(frc, frc, result[j], GMP_RNDN);
|
||||
mpfr_add(frb, frb, frc, GMP_RNDN);
|
||||
}
|
||||
|
||||
TARGET(frc, fra);
|
||||
double u = countULP(frb, frc);
|
||||
|
||||
if (u > emax) emax = u;
|
||||
}
|
||||
|
||||
printf("Phase 1 : Max error = %g ULP\n\n", emax);
|
||||
fflush(stdout);
|
||||
|
||||
//
|
||||
|
||||
mpfr_t bestcoef[N], curcoef[N];
|
||||
|
||||
for(i=0;i<N;i++) {
|
||||
mpfr_init2(bestcoef[i], i >= L ? PREC_TARGET : PREC2);
|
||||
mpfr_set(bestcoef[i], result[i], GMP_RNDN);
|
||||
|
||||
mpfr_init2(curcoef[i], i >= L ? PREC_TARGET : PREC2);
|
||||
mpfr_set(curcoef[i], result[i], GMP_RNDN);
|
||||
}
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
mpfr_set_default_prec(PREC2);
|
||||
|
||||
static mpfr_t a[Q], v[Q], am[Q], aa[Q];
|
||||
|
||||
for(i=0;i<Q;i++) {
|
||||
mpfr_inits(a[i], v[i], am[i], aa[i], NULL);
|
||||
|
||||
mpfr_set_d(fra, ((double)MAX - (double)MIN) * i / (double)(Q-1) + (double)MIN, GMP_RNDN);
|
||||
|
||||
TARGET(v[i], fra);
|
||||
CFUNC(a[i], fra);
|
||||
mpfr_set_d(frb, PMUL, GMP_RNDN);
|
||||
mpfr_pow(am[i], a[i], frb, GMP_RNDN);
|
||||
mpfr_set_d(frb, PADD, GMP_RNDN);
|
||||
mpfr_pow(aa[i], a[i], frb, GMP_RNDN);
|
||||
mpfr_clears(a[i], v[i], am[i], aa[i], NULL);
|
||||
}
|
||||
|
||||
double best = 1e+100, bestsum = 1e+100, bestworstx;
|
||||
|
||||
for(int k=0;k<10000;k++) {
|
||||
double emax = 0, esum = 0, worstx = 0;
|
||||
|
||||
#ifdef FIXCOEF0
|
||||
mpfr_set_d(curcoef[0], FIXCOEF0, GMP_RNDN);
|
||||
#endif
|
||||
|
||||
#ifdef FIXCOEF1
|
||||
mpfr_set_d(curcoef[1], FIXCOEF1, GMP_RNDN);
|
||||
#endif
|
||||
|
||||
#ifdef FIXCOEF2
|
||||
mpfr_set_d(curcoef[2], FIXCOEF2, GMP_RNDN);
|
||||
#endif
|
||||
|
||||
for(i=0;i<Q;i++) {
|
||||
if (mpfr_cmp_d(v[i], 0) == 0) continue;
|
||||
|
||||
mpfr_set_d(frb, 0, GMP_RNDN);
|
||||
for(j=N-1;j>=0;j--) {
|
||||
mpfr_set_d(frc, (double)j*PMUL+PADD, GMP_RNDN);
|
||||
mpfr_pow(frc, a[i], frc, GMP_RNDN);
|
||||
mpfr_mul(frc, frc, curcoef[j], GMP_RNDN);
|
||||
mpfr_add(frb, frb, frc, GMP_RNDN);
|
||||
}
|
||||
|
||||
double e = countULP(frb, v[i]);
|
||||
|
||||
//printf("c = %.20g, t = %.20g, ulp = %g\n", mpfr_get_d(v[i], GMP_RNDN), mpfr_get_d(frb, GMP_RNDN), e);
|
||||
|
||||
if (!isfinite(e)) continue;
|
||||
if (e > emax) { emax = e; worstx = mpfr_get_d(a[i], GMP_RNDN); }
|
||||
esum += e;
|
||||
}
|
||||
mpfr_set_prec(frb, PREC);
|
||||
|
||||
//printf("emax = %g\n", emax);
|
||||
|
||||
if (emax < best || (emax == best && esum < bestsum)) {
|
||||
for(i=0;i<N;i++) {
|
||||
mpfr_set(bestcoef[i], curcoef[i], GMP_RNDN);
|
||||
}
|
||||
if (best == 1e+100 || k > 10) printf("Max error = %g ULP, Sum error = %g (Max error at %g)\n", emax, esum, worstx);
|
||||
if ((best - emax) / best > 0.0001) k = 0;
|
||||
best = emax;
|
||||
bestsum = esum;
|
||||
bestworstx = worstx;
|
||||
}
|
||||
|
||||
for(i=0;i<N;i++) {
|
||||
mpfr_set(curcoef[i], bestcoef[i], GMP_RNDN);
|
||||
}
|
||||
|
||||
for(i=0;i<N;i++) {
|
||||
static int tab[] = {0, 0, 0, 0, 0, 0, 1, -1};
|
||||
//static int tab[] = {0, 0, 0, 0, 2, -2, 1, -1};
|
||||
int r = tab[random() & 7];
|
||||
if (r > 0) {
|
||||
for(int j=0;j<r;j++) mpfr_nextabove(curcoef[i]);
|
||||
} else if (r < 0) {
|
||||
for(int j=0;j>r;j--) mpfr_nextbelow(curcoef[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
for(i=N-1;i>=0;i--) {
|
||||
mpfr_set_prec(fra, i >= L ? PREC_TARGET+4 : PREC2);
|
||||
mpfr_set(fra, bestcoef[i], GMP_RNDN);
|
||||
|
||||
char *s;
|
||||
printf("%s, \n", s = mpfrToStr(fra));
|
||||
free(s);
|
||||
}
|
||||
printf("\nPhase 2 : max error = %g ULP at %g\n", best, bestworstx);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,43 @@
|
||||
|
||||
With this small tool, the coefficients for polynomial approximation
|
||||
used in kernels can be generated.
|
||||
|
||||
Usage
|
||||
|
||||
Edit gencoefdp.c. In the beginning of the file, specifications of the
|
||||
parameters for generating coefficients are listed. Enable one of them
|
||||
by changing #if. Then, run make to compile the source code. Run the
|
||||
gencoef, and it will show the generated coefficients in a few minutes.
|
||||
|
||||
|
||||
How it works
|
||||
|
||||
There are two phases of the program.
|
||||
|
||||
The first phase is the regression for minimizing the maximum relative
|
||||
error. This problem can be reduced to a linear programming problem,
|
||||
and the Simplex method is used in this implementation. This requires
|
||||
multi-precision calculation, and the implementation uses the MPFR
|
||||
library to do this. In this phase, only a small number of values
|
||||
(specified by S macro, usually 40 or so) of the function to
|
||||
approximate are sampled within the argument range. The function to
|
||||
approximate can be given by FRFUNC function. Specifying higher values
|
||||
for S does not always give better results.
|
||||
|
||||
The second phase is to optimize the coefficients so that it gives good
|
||||
accuracy with double precision calculation. In this phase, it checks
|
||||
100000 points (specified by Q macro) within the specified argument
|
||||
range to see if the polynomial gives good error bound. In some cases,
|
||||
the last few terms have to be calculated in higher precision in order
|
||||
to achieve 1 ULP overall accuracy, and this implementation can take
|
||||
care of that. The L parameter specifies the number of high precision
|
||||
coefficients.
|
||||
|
||||
In some cases, it is desirable to fix the last few coefficients to
|
||||
values like 1. This can be specified if you define FIXCOEF0
|
||||
macro. This sometimes does not work, however. In this case, you need
|
||||
to specify the function to approximate as shown in the definition for
|
||||
cos.
|
||||
|
||||
Finding a set of good parameters is not a straightforward process. You
|
||||
usually need many iterations of trial and error.
|
||||
@@ -0,0 +1,178 @@
|
||||
// This is part of SLEEF, written by Naoki
|
||||
// Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// The code in this file is distributed under the Creative Commons
|
||||
// Attribution 4.0 International License.
|
||||
|
||||
#define PREC_TARGET 64
|
||||
|
||||
#if 0
|
||||
#define N 8 // Degree of equation
|
||||
#define S 40 // Number of samples for phase 1
|
||||
#define L 4 // Number of high precision coefficients
|
||||
#define MIN 0.0 // Min argument
|
||||
#define MAX (M_PI/4) // Max argument
|
||||
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 10
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x;
|
||||
mpfr_init(x);
|
||||
mpfr_cos(ret, a, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clear(x);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#define FIXCOEF0 (-0.5)
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 9
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 9
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_cos(ret, x, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4
|
||||
#define N 7
|
||||
#define S 40
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
|
||||
#if 0
|
||||
#define N 17
|
||||
#define S 40
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 9
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 1 //0.75
|
||||
#define MAX 1.5
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t frd, mpfr_t fra) {
|
||||
mpfr_t tmp, one;
|
||||
mpfr_inits(tmp, one, NULL);
|
||||
mpfr_set_d(one, 1, GMP_RNDN);
|
||||
mpfr_add(tmp, fra, one, GMP_RNDN);
|
||||
mpfr_sub(frd, fra, one, GMP_RNDN);
|
||||
mpfr_div(frd, frd, tmp, GMP_RNDN);
|
||||
mpfr_clear(tmp, one, NULL);
|
||||
}
|
||||
#define FIXCOEF0 2.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 12
|
||||
#define S 50
|
||||
#define L 0
|
||||
#define MIN -0.347
|
||||
#define MAX 0.347 // 0.5 log 2
|
||||
#define PMUL 1
|
||||
#define PADD 0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#define FIXCOEF1 1.0
|
||||
#define FIXCOEF2 0.5
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 22
|
||||
#define S 100
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
@@ -0,0 +1,121 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <mpfr.h>
|
||||
|
||||
static int64_t doubleToRawLongBits(double d) {
|
||||
union {
|
||||
double f;
|
||||
int64_t i;
|
||||
} tmp;
|
||||
tmp.f = d;
|
||||
return tmp.i;
|
||||
}
|
||||
|
||||
static double longBitsToDouble(int64_t i) {
|
||||
union {
|
||||
double f;
|
||||
int64_t i;
|
||||
} tmp;
|
||||
tmp.i = i;
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
static double removelsb(double d) {
|
||||
return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffffffffffeLL);
|
||||
}
|
||||
|
||||
static int32_t floatToRawIntBits(float d) {
|
||||
union {
|
||||
float f;
|
||||
int32_t i;
|
||||
} tmp;
|
||||
tmp.f = d;
|
||||
return tmp.i;
|
||||
}
|
||||
|
||||
static float intBitsToFloat(int32_t i) {
|
||||
union {
|
||||
float f;
|
||||
int32_t i;
|
||||
} tmp;
|
||||
tmp.i = i;
|
||||
return tmp.f;
|
||||
}
|
||||
|
||||
static float removelsbf(float x) {
|
||||
return intBitsToFloat(0xfffffffc & floatToRawIntBits(x));
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
mpfr_set_default_prec(2048);
|
||||
mpfr_t pi, rpi, xrpi, x, y, z, r;
|
||||
mpfr_inits(pi, rpi, xrpi, x, y, z, r, NULL);
|
||||
mpfr_const_pi(pi, GMP_RNDN);
|
||||
mpfr_set_d(x, 0.5, GMP_RNDN);
|
||||
mpfr_div(rpi, x, pi, GMP_RNDN);
|
||||
|
||||
printf("NOEXPORT ALIGNED(64) const double rempitabdp[] = {\n");
|
||||
for(int i=55;i<1024;i++) {
|
||||
int M = i > 700 ? -64 : 0;
|
||||
int ex = i - 53;
|
||||
if (ex < -52) ex = -52;
|
||||
mpfr_set_d(x, ldexp(1, ex), GMP_RNDN);
|
||||
mpfr_mul(y, x, rpi, GMP_RNDN);
|
||||
mpfr_frac(xrpi, y, GMP_RNDN);
|
||||
mpfr_div(xrpi, xrpi, x, GMP_RNDN);
|
||||
|
||||
mpfr_set_exp(xrpi, mpfr_get_exp(xrpi) - M);
|
||||
|
||||
mpfr_set(x, xrpi, GMP_RNDN);
|
||||
|
||||
double rpi0 = removelsb(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi0, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
double rpi1 = removelsb(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi1, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
double rpi2 = removelsb(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi2, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
double rpi3 = mpfr_get_d(x, GMP_RNDN);
|
||||
|
||||
printf(" %.20g, %.20g, %.20g, %.20g,\n", rpi0, rpi1, rpi2, rpi3);
|
||||
}
|
||||
printf("};\n\n");
|
||||
|
||||
printf("NOEXPORT ALIGNED(64) const float rempitabsp[] = {\n");
|
||||
for(int i=25;i<128;i++) {
|
||||
int M = i > 90 ? -64 : 0;
|
||||
int ex = i - 23;
|
||||
mpfr_set_d(x, ldexp(1, ex), GMP_RNDN);
|
||||
mpfr_mul(y, x, rpi, GMP_RNDN);
|
||||
mpfr_frac(xrpi, y, GMP_RNDN);
|
||||
mpfr_div(xrpi, xrpi, x, GMP_RNDN);
|
||||
|
||||
mpfr_set_exp(xrpi, mpfr_get_exp(xrpi) - M);
|
||||
|
||||
mpfr_set(x, xrpi, GMP_RNDN);
|
||||
|
||||
float rpi20 = removelsbf(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi20, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
float rpi21 = removelsbf(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi21, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
float rpi22 = removelsbf(mpfr_get_d(x, GMP_RNDN));
|
||||
mpfr_set_d(y, rpi22, GMP_RNDN);
|
||||
mpfr_sub(x, x, y, GMP_RNDN);
|
||||
|
||||
float rpi23 = mpfr_get_d(x, GMP_RNDN);
|
||||
|
||||
printf(" %.10g, %.10g, %.10g, %.10g,\n", rpi20, rpi21, rpi22, rpi23);
|
||||
}
|
||||
printf("};\n");
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <mpfr.h>
|
||||
#include <quadmath.h>
|
||||
|
||||
#define N 8
|
||||
#define B 8
|
||||
#define NCOL (53-B)
|
||||
#define NROW ((16385+(53-B)*N-106)/NCOL+1)
|
||||
|
||||
static double *rempitabqp = NULL;
|
||||
|
||||
void generateRempitabqp() {
|
||||
rempitabqp = calloc(16385-106+(53-B)*(N+1), sizeof(double));
|
||||
|
||||
int orgprec = mpfr_get_default_prec();
|
||||
mpfr_set_default_prec(18000);
|
||||
|
||||
mpfr_t pi, m, n, o;
|
||||
mpfr_inits(pi, m, n, o, NULL);
|
||||
mpfr_const_pi(pi, GMP_RNDN);
|
||||
|
||||
mpfr_d_div(n, 0.5, pi, GMP_RNDN);
|
||||
|
||||
for(int e=106;e<16385+(53-B)*N;e++) {
|
||||
mpfr_set(m, n, GMP_RNDN);
|
||||
|
||||
mpfr_set_ui_2exp(o, 1, -(113 - e), GMP_RNDN);
|
||||
mpfr_mul(m, m, o, GMP_RNDN);
|
||||
|
||||
mpfr_frac(m, m, GMP_RNDN);
|
||||
|
||||
mpfr_set_ui_2exp(o, 1, (53-B), GMP_RNDN);
|
||||
mpfr_mul(m, m, o, GMP_RNDN);
|
||||
|
||||
mpfr_trunc(m, m);
|
||||
|
||||
mpfr_set_ui_2exp(o, 1, 7-(53-B), GMP_RNDN);
|
||||
mpfr_mul(m, m, o, GMP_RNDN);
|
||||
|
||||
int col = (e - 106) % NCOL;
|
||||
int row = (e - 106) / NCOL;
|
||||
rempitabqp[col * NROW + row] = mpfr_get_d(m, GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_clears(pi, m, n, o, NULL);
|
||||
mpfr_set_default_prec(orgprec);
|
||||
}
|
||||
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
generateRempitabqp();
|
||||
|
||||
printf("NOEXPORT const double Sleef_rempitabqp[] = {\n ");
|
||||
for(int i=0;i<16385-106+(53-B)*(N+1);i++) {
|
||||
printf("%.20g, ", rempitabqp[i]);
|
||||
if ((i & 3) == 3) printf("\n ");
|
||||
}
|
||||
printf("\n};\n");
|
||||
}
|
||||
@@ -0,0 +1,161 @@
|
||||
// This is part of SLEEF, written by Naoki
|
||||
// Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// The code in this file is distributed under the Creative Commons
|
||||
// Attribution 4.0 International License.
|
||||
|
||||
#define PREC_TARGET 113
|
||||
|
||||
//
|
||||
|
||||
#if 0
|
||||
#define N 15 // Degree of equation
|
||||
#define S 150 // Number of samples for phase 1
|
||||
#define L 0 // Number of high precision coefficients
|
||||
#define P 0.37
|
||||
#define MIN 0.0 // Min argument
|
||||
#define MAX (M_PI/2) // Max argument
|
||||
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
|
||||
#define PADD 3
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // The function to approximate
|
||||
mpfr_sin(ret, a, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, a, GMP_RNDN); // ret = sin(a) - a
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 15
|
||||
#define S 150
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/2)
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x;
|
||||
mpfr_init(x);
|
||||
mpfr_cos(ret, a, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clear(x);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
//#define FIXCOEF0 (-0.5)
|
||||
#endif
|
||||
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 13
|
||||
#define S 150
|
||||
#define L 2
|
||||
#define P 0.9
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0 // for xsincospi4_u05
|
||||
#define N 13
|
||||
#define S 150
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_cos(ret, x, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#endif
|
||||
|
||||
#if 0 // running
|
||||
#define N 31
|
||||
#define S 100
|
||||
#define P 1.7
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0 // running
|
||||
#define N 20
|
||||
#define S 110
|
||||
#define L 2
|
||||
#define MIN 1 //0.75
|
||||
#define MAX 1.5
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t frd, mpfr_t fra) {
|
||||
mpfr_t tmp, one;
|
||||
mpfr_inits(tmp, one, NULL);
|
||||
mpfr_set_d(one, 1, GMP_RNDN);
|
||||
mpfr_add(tmp, fra, one, GMP_RNDN);
|
||||
mpfr_sub(frd, fra, one, GMP_RNDN);
|
||||
mpfr_div(frd, frd, tmp, GMP_RNDN);
|
||||
mpfr_clears(tmp, one, NULL);
|
||||
}
|
||||
#define FIXCOEF0 2.0
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
#define N 22
|
||||
#define S 140
|
||||
#define L 2
|
||||
#define MIN -0.347
|
||||
#define MAX 0.347 // 0.5 log 2
|
||||
#define PMUL 1
|
||||
#define PADD 0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#define FIXCOEF1 1.0
|
||||
//#define FIXCOEF2 0.5
|
||||
#endif
|
||||
|
||||
#if 0 // running
|
||||
#define N 45
|
||||
#define S 100
|
||||
#define P 1.55
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
@@ -0,0 +1,459 @@
|
||||
// The original code for simplex algorithm is taken from Haruhiko Okumura's book.
|
||||
// https://oku.edu.mie-u.ac.jp/~okumura/algo/
|
||||
// The code is distributed under the Creative Commons Attribution 4.0 International License.
|
||||
// https://creativecommons.org/licenses/by/4.0/
|
||||
|
||||
// The code is modified by Naoki Shibata to process arbitrary precision numbers.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <time.h>
|
||||
#include <mpfr.h>
|
||||
|
||||
#define PREC 4096
|
||||
#define EPS 1e-50
|
||||
|
||||
#define OK 0
|
||||
#define MAXIMIZABLE_TO_INFINITY 1
|
||||
#define NOT_FEASIBLE 2
|
||||
#define ERROR (-1)
|
||||
|
||||
#define NOP (-1)
|
||||
#define EQU (0)
|
||||
#define LEQ 1
|
||||
#define GEQ 2
|
||||
|
||||
static int m, n, n1, n2, n3, jmax;
|
||||
static int *col, *row, *nonzero_row, *inequality;
|
||||
static mpfr_t **a, *c, **q, *pivotcolumn;
|
||||
|
||||
static mpfr_t zero, one, eps, minuseps, large;
|
||||
|
||||
void mpfr_zinit(mpfr_t m) {
|
||||
mpfr_init(m);
|
||||
mpfr_set_d(m, 0, GMP_RNDN);
|
||||
}
|
||||
|
||||
static void init(int n0, int m0) {
|
||||
int i, j;
|
||||
|
||||
m = m0; n = n0;
|
||||
|
||||
mpfr_init(zero); mpfr_set_d(zero, 0, GMP_RNDN);
|
||||
mpfr_init(one); mpfr_set_d(one, 1, GMP_RNDN);
|
||||
|
||||
mpfr_init(eps);
|
||||
mpfr_set_d(eps, EPS, GMP_RNDN);
|
||||
|
||||
mpfr_init(minuseps);
|
||||
mpfr_set_d(minuseps, -EPS, GMP_RNDN);
|
||||
|
||||
mpfr_init(large);
|
||||
mpfr_set_d(large, 1.0 / EPS, GMP_RNDN);
|
||||
|
||||
a = malloc(sizeof(mpfr_t *) * (m + 1));
|
||||
for(i=0;i < m+1;i++) {
|
||||
a[i] = malloc(sizeof(mpfr_t) * (n + 1));
|
||||
for(j=0;j < (n+1);j++) {
|
||||
mpfr_zinit(a[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
q = malloc(sizeof(mpfr_t *) * (m + 1));
|
||||
for(i=0;i < m+1;i++) {
|
||||
q[i] = malloc(sizeof(mpfr_t) * (m + 1));
|
||||
for(j=0;j < m+1;j++) {
|
||||
mpfr_zinit(q[i][j]);
|
||||
}
|
||||
}
|
||||
|
||||
c = malloc(sizeof(mpfr_t) * (n + 1));
|
||||
for(j=0;j < (n+1);j++) {
|
||||
mpfr_zinit(c[j]);
|
||||
}
|
||||
|
||||
pivotcolumn = malloc(sizeof(mpfr_t) * (m + 1));
|
||||
for(j=0;j < (m+1);j++) {
|
||||
mpfr_zinit(pivotcolumn[j]);
|
||||
}
|
||||
|
||||
col = calloc(m+1, sizeof(int));
|
||||
row = calloc(n+2*m+1, sizeof(int));
|
||||
nonzero_row = calloc(n+2*m+1, sizeof(int));
|
||||
inequality = calloc(m+1, sizeof(int));
|
||||
}
|
||||
|
||||
static void dispose() {
|
||||
mpfr_clears(zero, one, eps, minuseps, large, (mpfr_ptr)0);
|
||||
|
||||
int i, j;
|
||||
|
||||
for(i=0;i < m+1;i++) {
|
||||
for(j=0;j < m+1;j++) {
|
||||
mpfr_clear(q[i][j]);
|
||||
}
|
||||
free(q[i]);
|
||||
}
|
||||
free(q);
|
||||
|
||||
for(i=0;i < m+1;i++) {
|
||||
for(j=0;j < n+1;j++) {
|
||||
mpfr_clear(a[i][j]);
|
||||
}
|
||||
free(a[i]);
|
||||
}
|
||||
free(a);
|
||||
|
||||
for(j=0;j < n+1;j++) {
|
||||
mpfr_clear(c[j]);
|
||||
}
|
||||
free(c);
|
||||
|
||||
for(j=0;j < m+1;j++) {
|
||||
mpfr_clear(pivotcolumn[j]);
|
||||
}
|
||||
free(pivotcolumn);
|
||||
|
||||
free(col);
|
||||
free(row);
|
||||
free(nonzero_row);
|
||||
free(inequality);
|
||||
}
|
||||
|
||||
static void prepare() {
|
||||
int i;
|
||||
|
||||
n1 = n;
|
||||
for (i = 1; i <= m; i++)
|
||||
if (inequality[i] == GEQ) {
|
||||
n1++; nonzero_row[n1] = i;
|
||||
}
|
||||
n2 = n1;
|
||||
for (i = 1; i <= m; i++)
|
||||
if (inequality[i] == LEQ) {
|
||||
n2++; col[i] = n2;
|
||||
nonzero_row[n2] = row[n2] = i;
|
||||
}
|
||||
n3 = n2;
|
||||
for (i = 1; i <= m; i++)
|
||||
if (inequality[i] != LEQ) {
|
||||
n3++; col[i] = n3;
|
||||
nonzero_row[n3] = row[n3] = i;
|
||||
}
|
||||
|
||||
for (i = 0; i <= m; i++) {
|
||||
mpfr_set_d(q[i][i], 1, GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
static void tableau(mpfr_t ret, int i, int j) {
|
||||
int k;
|
||||
|
||||
if (col[i] < 0) { mpfr_set_d(ret, 0, GMP_RNDN); return; }
|
||||
|
||||
if (j <= n) {
|
||||
mpfr_t s;
|
||||
mpfr_zinit(s);
|
||||
mpfr_set_d(s, 0, GMP_RNDN);
|
||||
|
||||
mpfr_t *tab = malloc(sizeof(mpfr_t) * (m + 1));
|
||||
mpfr_ptr *ptab = malloc(sizeof(mpfr_ptr) * (m + 1));
|
||||
for (k = 0; k <= m; k++) {
|
||||
mpfr_zinit(tab[k]);
|
||||
ptab[k] = (mpfr_ptr)&tab[k];
|
||||
mpfr_mul(tab[k], q[i][k], a[k][j], GMP_RNDN);
|
||||
}
|
||||
mpfr_sum(s, ptab, m+1, GMP_RNDN);
|
||||
for (k = 0; k <= m; k++) {
|
||||
mpfr_clear(tab[k]);
|
||||
}
|
||||
free(ptab);
|
||||
free(tab);
|
||||
|
||||
mpfr_set(ret, s, GMP_RNDN);
|
||||
mpfr_clear(s);
|
||||
return;
|
||||
}
|
||||
|
||||
mpfr_set(ret, q[i][nonzero_row[j]], GMP_RNDN);
|
||||
|
||||
if (j <= n1) { mpfr_neg(ret, ret, GMP_RNDN); return; }
|
||||
if (j <= n2 || i != 0) return;
|
||||
|
||||
mpfr_add(ret, ret, one, GMP_RNDN);
|
||||
return;
|
||||
}
|
||||
|
||||
static void pivot(int ipivot, int jpivot) {
|
||||
int i, j;
|
||||
mpfr_t u;
|
||||
|
||||
mpfr_zinit(u);
|
||||
|
||||
mpfr_set(u, pivotcolumn[ipivot], GMP_RNDN);
|
||||
|
||||
for (j = 1; j <= m; j++) {
|
||||
mpfr_div(q[ipivot][j], q[ipivot][j], u, GMP_RNDN);
|
||||
}
|
||||
|
||||
for (i = 0; i <= m; i++)
|
||||
if (i != ipivot) {
|
||||
mpfr_set(u, pivotcolumn[i], GMP_RNDN);
|
||||
|
||||
for (j = 1; j <= m; j++) {
|
||||
mpfr_fms(q[i][j], q[ipivot][j], u, q[i][j], GMP_RNDN);
|
||||
mpfr_neg(q[i][j], q[i][j], GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
row[col[ipivot]] = 0;
|
||||
|
||||
col[ipivot] = jpivot; row[jpivot] = ipivot;
|
||||
|
||||
mpfr_clear(u);
|
||||
}
|
||||
|
||||
static int minimize() {
|
||||
int i, ipivot, jpivot;
|
||||
mpfr_t t, u;
|
||||
mpfr_inits(t, u, (mpfr_ptr)0);
|
||||
|
||||
for (;;) {
|
||||
for (jpivot = 1; jpivot <= jmax; jpivot++) {
|
||||
if (row[jpivot] == 0) {
|
||||
tableau(pivotcolumn[0], 0, jpivot);
|
||||
if (mpfr_cmp(pivotcolumn[0], minuseps) < 0) break;
|
||||
}
|
||||
}
|
||||
if (jpivot > jmax) {
|
||||
mpfr_clears(t, u, (mpfr_ptr)0);
|
||||
return 1;
|
||||
}
|
||||
|
||||
mpfr_set(u, large, GMP_RNDN);
|
||||
ipivot = 0;
|
||||
for (i = 1; i <= m; i++) {
|
||||
tableau(pivotcolumn[i], i, jpivot);
|
||||
if (mpfr_cmp(pivotcolumn[i], eps) > 0) {
|
||||
tableau(t, i, 0);
|
||||
mpfr_div(t, t, pivotcolumn[i], GMP_RNDN);
|
||||
if (mpfr_cmp(t, u) < 0) { ipivot = i; mpfr_set(u, t, GMP_RNDN); }
|
||||
}
|
||||
}
|
||||
if (ipivot == 0) {
|
||||
mpfr_clears(t, u, (mpfr_ptr)0);
|
||||
return 0; // the objective function can be minimized to -infinite
|
||||
}
|
||||
pivot(ipivot, jpivot);
|
||||
}
|
||||
}
|
||||
|
||||
static int phase1() {
|
||||
int i, j;
|
||||
mpfr_t u;
|
||||
mpfr_zinit(u);
|
||||
|
||||
jmax = n3;
|
||||
for (i = 0; i <= m; i++) {
|
||||
if (col[i] > n2) mpfr_set_d(q[0][i], -1, GMP_RNDN);
|
||||
}
|
||||
|
||||
minimize();
|
||||
|
||||
tableau(u, 0, 0);
|
||||
if (mpfr_cmp(u, minuseps) < 0) {
|
||||
mpfr_clear(u);
|
||||
return 0;
|
||||
}
|
||||
for (i = 1; i <= m; i++) {
|
||||
if (col[i] > n2) {
|
||||
col[i] = -1;
|
||||
}
|
||||
}
|
||||
mpfr_set_d(q[0][0], 1, GMP_RNDN);
|
||||
for (j = 1; j <= m; j++) mpfr_set_d(q[0][j], 0, GMP_RNDN);
|
||||
for (i = 1; i <= m; i++) {
|
||||
if ((j = col[i]) > 0 && j <= n && mpfr_cmp_d(c[j], 0) != 0) {
|
||||
mpfr_set(u, c[j], GMP_RNDN);
|
||||
for (j = 1; j <= m; j++) {
|
||||
mpfr_fms(q[0][j], q[i][j], u, q[0][j], GMP_RNDN);
|
||||
mpfr_neg(q[0][j], q[0][j], GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
mpfr_clear(u);
|
||||
return 1;
|
||||
}
|
||||
|
||||
static int phase2() {
|
||||
int j;
|
||||
jmax = n2;
|
||||
for (j = 0; j <= n; j++) {
|
||||
mpfr_set(a[0][j], c[j], GMP_RNDN);
|
||||
}
|
||||
|
||||
return minimize();
|
||||
}
|
||||
|
||||
int solve_fr(mpfr_t *result, int n0, int m0, mpfr_t **a0, int *ineq0, mpfr_t *c0) {
|
||||
int i,j;
|
||||
|
||||
m = m0; // number of inequations
|
||||
n = n0+1; // number of variables
|
||||
|
||||
init(n, m);
|
||||
|
||||
mpfr_t csum;
|
||||
mpfr_zinit(csum);
|
||||
|
||||
for(j=0;j<n0+1;j++) {
|
||||
mpfr_set(c[j], c0[j], GMP_RNDN);
|
||||
}
|
||||
|
||||
for(j=1;j<n0+1;j++) {
|
||||
mpfr_add(csum, csum, c0[j], GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_set(c[n], csum, GMP_RNDN);
|
||||
mpfr_neg(c[n], c[n], GMP_RNDN);
|
||||
|
||||
for(i=0;i<m;i++) {
|
||||
mpfr_set_d(csum, 0, GMP_RNDN);
|
||||
|
||||
for(j=0;j<n0+1;j++) mpfr_set(a[i+1][j], a0[i][j], GMP_RNDN);
|
||||
mpfr_neg(a[i+1][0], a[i+1][0], GMP_RNDN);
|
||||
|
||||
for(j=1;j<n0+1;j++) {
|
||||
mpfr_add(csum, csum, a0[i][j], GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_set(a[i+1][n], csum, GMP_RNDN);
|
||||
mpfr_neg(a[i+1][n], a[i+1][n], GMP_RNDN);
|
||||
inequality[i+1] = ineq0[i];
|
||||
|
||||
if (mpfr_cmp_d(a[i+1][0], 0) < 0) {
|
||||
if (inequality[i+1] == GEQ) inequality[i+1] = LEQ;
|
||||
else if (inequality[i+1] == LEQ) inequality[i+1] = GEQ;
|
||||
for (j = 0; j <= n; j++) mpfr_neg(a[i+1][j], a[i+1][j], GMP_RNDN);
|
||||
} else if (mpfr_cmp_d(a[i+1][0], 0) == 0 && inequality[i+1] == GEQ) {
|
||||
inequality[i+1] = LEQ;
|
||||
for (j = 1; j <= n; j++) mpfr_neg(a[i+1][j], a[i+1][j], GMP_RNDN);
|
||||
}
|
||||
}
|
||||
|
||||
int p1r = 1;
|
||||
|
||||
prepare();
|
||||
if (n3 != n2) p1r = phase1();
|
||||
|
||||
if (!p1r) {
|
||||
dispose();
|
||||
return NOT_FEASIBLE;
|
||||
}
|
||||
|
||||
int b = phase2();
|
||||
|
||||
mpfr_t *s = calloc(sizeof(mpfr_t), n);
|
||||
for(j=0;j<n;j++) {
|
||||
mpfr_zinit(s[j]);
|
||||
}
|
||||
|
||||
for (j = 1; j < n; j++) {
|
||||
if ((i = row[j]) != 0) {
|
||||
tableau(s[j], i, 0);
|
||||
}
|
||||
}
|
||||
|
||||
mpfr_t cs;
|
||||
mpfr_zinit(cs);
|
||||
if (row[n] != 0) tableau(cs, row[n], 0);
|
||||
|
||||
for (j = 1; j < n; j++) {
|
||||
mpfr_sub(s[j], s[j], cs, GMP_RNDN);
|
||||
}
|
||||
|
||||
for(j=0;j<n;j++) {
|
||||
mpfr_set(result[j], s[j], GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_clear(cs);
|
||||
|
||||
for(j=0;j<n;j++) mpfr_clear(s[j]);
|
||||
free(s);
|
||||
|
||||
dispose();
|
||||
|
||||
return b ? OK : MAXIMIZABLE_TO_INFINITY;
|
||||
}
|
||||
|
||||
void regressMinRelError_fr(int n, int m, mpfr_t **x, mpfr_t *result) {
|
||||
int m0 = n * 3, n0 = m + 2 * n, i, j;
|
||||
mpfr_t **a0, *c0, *result0;
|
||||
int in0[m0];
|
||||
|
||||
a0 = malloc(sizeof(mpfr_t *) * m0);
|
||||
for(i=0;i<m0;i++) {
|
||||
a0[i] = calloc(n0+1, sizeof(mpfr_t));
|
||||
for(j=0;j<n0+1;j++) mpfr_zinit(a0[i][j]);
|
||||
}
|
||||
|
||||
c0 = calloc(n0+1, sizeof(mpfr_t));
|
||||
result0 = calloc(n0+1, sizeof(mpfr_t));
|
||||
|
||||
for(j=0;j<n0+1;j++) {
|
||||
mpfr_zinit(c0[j]);
|
||||
mpfr_zinit(result0[j]);
|
||||
}
|
||||
|
||||
for(i=0;i<n;i++) {
|
||||
long double ld = mpfr_get_ld(x[m][i], GMP_RNDN);
|
||||
if (ld < DBL_MIN) ld = 1;
|
||||
|
||||
#if 1
|
||||
mpfr_set_ld(c0[m+i +1], 1.0/fabsl(ld), GMP_RNDN);
|
||||
mpfr_set_ld(c0[m+n+i+1], 1.0/fabsl(ld), GMP_RNDN);
|
||||
#else
|
||||
int e;
|
||||
frexpl(ld, &e);
|
||||
ld = 1.0 / ldexpl(1.0, e);
|
||||
mpfr_set_ld(c0[m+i +1], ld, GMP_RNDN);
|
||||
mpfr_set_ld(c0[m+n+i+1], ld, GMP_RNDN);
|
||||
#endif
|
||||
|
||||
mpfr_set_d(a0[i*3+0][m+i+1], 1, GMP_RNDN);
|
||||
in0[i*3+0] = GEQ;
|
||||
|
||||
mpfr_set_d(a0[i*3+1][m+n+i+1], 1, GMP_RNDN);
|
||||
in0[i*3+1] = GEQ;
|
||||
|
||||
for(j=0;j<m;j++) {
|
||||
mpfr_set(a0[i*3+2][j+1], x[j][i], GMP_RNDN);
|
||||
}
|
||||
|
||||
mpfr_set_d(a0[i*3+2][m+i+1], 1, GMP_RNDN);
|
||||
mpfr_set_d(a0[i*3+2][m+n+i+1], -1, GMP_RNDN);
|
||||
in0[i*3+2] = EQU;
|
||||
mpfr_set(a0[i*3+2][0], x[m][i], GMP_RNDN);
|
||||
mpfr_neg(a0[i*3+2][0], a0[i*3+2][0], GMP_RNDN);
|
||||
}
|
||||
|
||||
int status = solve_fr(result0, n0, m0, a0, in0, c0);
|
||||
|
||||
if (status == NOT_FEASIBLE) {
|
||||
printf("not feasible\n");
|
||||
} else {
|
||||
if (status == MAXIMIZABLE_TO_INFINITY) printf("maximizable to inf\n");
|
||||
}
|
||||
|
||||
for(i=0;i<m;i++) {
|
||||
mpfr_set(result[i], result0[i+1], GMP_RNDN);
|
||||
}
|
||||
|
||||
free(result0);
|
||||
free(c0);
|
||||
}
|
||||
@@ -0,0 +1,159 @@
|
||||
// This is part of SLEEF, written by Naoki
|
||||
// Shibata. http://shibatch.sourceforge.net
|
||||
|
||||
// The code in this file is distributed under the Creative Commons
|
||||
// Attribution 4.0 International License.
|
||||
|
||||
#define PREC_TARGET 24
|
||||
|
||||
#if 1
|
||||
#define N 5 // Degree of equation
|
||||
#define S 81 // Number of samples for phase 1
|
||||
#define L 0 // Number of high precision coefficients
|
||||
#define P 0.37
|
||||
#define MIN 0.0 // Min argument
|
||||
#define MAX (M_PI/2) // Max argument
|
||||
#define PMUL 2 // The form of polynomial is y = x^(PADD+PMUL*0) + x^(PADD+PMUL*1) + ...
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_sin(ret, a, GMP_RNDN); } // The function to approximate
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0 // Fix coef 0 to 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 5
|
||||
#define S 40
|
||||
#define L 0
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/2)
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { // cos(x) - 1
|
||||
mpfr_t x;
|
||||
mpfr_init(x);
|
||||
mpfr_cos(ret, a, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clear(x);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#define FIXCOEF0 (-0.5)
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// xsincospi4
|
||||
#define N 5
|
||||
#define S 30
|
||||
#define P 0.69
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_sin(ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
// xsincospi4
|
||||
#define N 5
|
||||
#define S 60
|
||||
#define P 0.7
|
||||
#define L 1
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
void TARGET(mpfr_t ret, mpfr_t a) {
|
||||
mpfr_t x, y;
|
||||
mpfr_inits(x, y, NULL);
|
||||
mpfr_const_pi(x, GMP_RNDN);
|
||||
mpfr_set_d(y, 1.0/4, GMP_RNDN);
|
||||
mpfr_mul(x, x, y, GMP_RNDN);
|
||||
mpfr_mul(x, x, a, GMP_RNDN);
|
||||
mpfr_cos(ret, x, GMP_RNDN);
|
||||
mpfr_set_ld(x, 1, GMP_RNDN);
|
||||
mpfr_sub(ret, ret, x, GMP_RNDN);
|
||||
mpfr_clears(x, y, NULL);
|
||||
}
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define PMUL 2
|
||||
#define PADD 2
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 7
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX (M_PI/4)
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_tan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 5
|
||||
#define S 40
|
||||
#define L 2
|
||||
#define MIN 1 //0.75
|
||||
#define MAX 1.5
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_log(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t frd, mpfr_t fra) {
|
||||
mpfr_t tmp, one;
|
||||
mpfr_inits(tmp, one, NULL);
|
||||
mpfr_set_d(one, 1, GMP_RNDN);
|
||||
mpfr_add(tmp, fra, one, GMP_RNDN);
|
||||
mpfr_sub(frd, fra, one, GMP_RNDN);
|
||||
mpfr_div(frd, frd, tmp, GMP_RNDN);
|
||||
mpfr_clears(tmp, one, NULL);
|
||||
}
|
||||
#define FIXCOEF0 2.0
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 7
|
||||
#define S 50
|
||||
#define L 0
|
||||
#define MIN -0.347
|
||||
#define MAX 0.347 // 0.5 log 2
|
||||
#define PMUL 1
|
||||
#define PADD 0
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_exp(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#define FIXCOEF1 1.0
|
||||
//#define FIXCOEF2 0.5
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#define N 10
|
||||
#define S 100
|
||||
#define L 2
|
||||
#define MIN 0.0
|
||||
#define MAX 1.0
|
||||
#define PMUL 2
|
||||
#define PADD 1
|
||||
|
||||
void TARGET(mpfr_t ret, mpfr_t a) { mpfr_atan(ret, a, GMP_RNDN); }
|
||||
void CFUNC(mpfr_t dst, mpfr_t src) { mpfr_set(dst, src, GMP_RNDN); }
|
||||
#define FIXCOEF0 1.0
|
||||
#endif
|
||||
@@ -0,0 +1,153 @@
|
||||
ICCAVAILABLE := $(shell command -v icc 2> /dev/null)
|
||||
ARCH := $(shell uname -p)
|
||||
|
||||
all :
|
||||
ifndef BUILDDIR
|
||||
@echo
|
||||
@echo Please set the build directory to BUILDDIR environment variable and run make once again.
|
||||
@echo e.g. export BUILDDIR='`pwd`'/../../build
|
||||
@echo
|
||||
else
|
||||
@echo
|
||||
@echo You can start measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo Then, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo
|
||||
@echo You have to install java and gnuplot to do plotting.
|
||||
@echo Stop all tasks on the computer before starting measurement.
|
||||
@echo
|
||||
endif
|
||||
|
||||
benchsvml128_10.o : benchsvml128.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_10.o
|
||||
|
||||
benchsvml128_40.o : benchsvml128.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml128.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml128_40.o
|
||||
|
||||
benchsvml256_10.o : benchsvml256.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_10.o
|
||||
|
||||
benchsvml256_40.o : benchsvml256.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml256.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -march=core-avx2 -O0 -lm -c -o benchsvml256_40.o
|
||||
|
||||
benchsvml512_10.o : benchsvml512.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_10.o
|
||||
|
||||
benchsvml512_40.o : benchsvml512.c bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml512.c -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -xCOMMON-AVX512 -O0 -lm -c -o benchsvml512_40.o
|
||||
|
||||
|
||||
benchsvml_10 : benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_10.o benchsvml256_10.o benchsvml512_10.o -Wall -I.. -DSVMLULP=1 -fimf-max-error=1.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_10
|
||||
|
||||
benchsvml_40 : benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o bench.h
|
||||
-command -v icc >/dev/null 2>&1 && icc benchsvml.c benchsvml128_40.o benchsvml256_40.o benchsvml512_40.o -Wall -I.. -DSVMLULP=4 -fimf-max-error=4.0 -fimf-domain-exclusion=0 -O0 -march=native -lm -o benchsvml_40
|
||||
|
||||
#
|
||||
|
||||
ifeq ($(ARCH),aarch64)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else ifeq ($(ARCH),s390x)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -mzvector -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else ifeq ($(ARCH),ppc64le)
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -mcpu=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
else
|
||||
|
||||
benchsleef : benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o bench.h
|
||||
$(CC) benchsleef.c benchsleef128.o benchsleef256.o benchsleef512.o -Wall -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -lsleef -lm -o benchsleef
|
||||
|
||||
benchsleef128.o : benchsleef128.c bench.h
|
||||
$(CC) benchsleef128.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
benchsleef256.o : benchsleef256.c bench.h
|
||||
$(CC) benchsleef256.c -Wall -march=native -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
benchsleef512.o : benchsleef512.c bench.h
|
||||
$(CC) benchsleef512.c -Wall -mavx512f -O0 -g -I$(BUILDDIR)/include -L$(BUILDDIR)/lib -Wno-attributes -c
|
||||
|
||||
endif
|
||||
|
||||
#
|
||||
|
||||
ProcessData.class : ProcessData.java
|
||||
javac ProcessData.java
|
||||
|
||||
#
|
||||
|
||||
ifndef BUILDDIR
|
||||
measure :
|
||||
@echo
|
||||
@echo Please set the build directory to BUILDDIR environment variable and run make once again.
|
||||
@echo e.g. export BUILDDIR='`pwd`'/../../build
|
||||
@echo
|
||||
else
|
||||
measure : benchsleef
|
||||
chmod +x ./measure.sh
|
||||
LD_LIBRARY_PATH=$(BUILDDIR)/lib ./measure.sh ./benchsleef
|
||||
@echo
|
||||
@echo Now, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo You can do another measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start another measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo You can start over by "'"make restart"'".
|
||||
@echo
|
||||
endif
|
||||
|
||||
measureSVML : all benchsvml_10 benchsvml_40
|
||||
chmod +x ./measure.sh
|
||||
./measure.sh ./benchsvml_10 ./benchsvml_40
|
||||
@echo
|
||||
@echo Now, you can plot the results of measurement by "'"make plot"'".
|
||||
@echo You can do another measurement by "'"make measure"'".
|
||||
ifdef ICCAVAILABLE
|
||||
@echo You can start another measurement with SVML by "'"make measureSVML"'".
|
||||
endif
|
||||
@echo You can start over by "'"make restart"'".
|
||||
@echo
|
||||
|
||||
plot : ProcessData.class counter.txt
|
||||
java ProcessData *dptrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png trigdp.png
|
||||
java ProcessData *dpnontrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png nontrigdp.png
|
||||
java ProcessData *sptrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png trigsp.png
|
||||
java ProcessData *spnontrig*.out
|
||||
gnuplot script.out
|
||||
mv output.png nontrigsp.png
|
||||
@echo
|
||||
@echo Plotted results are in trigdp.png, nontrigdp.png, trigsp.png and nontrigsp.png.
|
||||
@echo
|
||||
|
||||
clean :
|
||||
rm -f *~ a.out *.so *.so.* *.a *.s *.o
|
||||
rm -rf *.dSYM *.dylib
|
||||
rm -f *.obj *.lib *.dll *.exp *.exe *.stackdump
|
||||
rm -f *.class *.png benchsleef benchsvml_10 benchsvml_40 *.out counter.txt
|
||||
|
||||
restart :
|
||||
rm -f *.out counter.txt
|
||||
@@ -0,0 +1,193 @@
|
||||
import java.util.*;
|
||||
import java.io.*;
|
||||
|
||||
public class ProcessData {
|
||||
static final int DP = 64, SP = 32;
|
||||
|
||||
static LinkedHashMap<String, Integer> funcNameOrder = new LinkedHashMap<String, Integer>();
|
||||
|
||||
static class Key {
|
||||
final String funcName;
|
||||
|
||||
final int prec, bits;
|
||||
final ArrayList<Double> range = new ArrayList<Double>();
|
||||
final double ulps;
|
||||
|
||||
Key(String s) {
|
||||
String[] a = s.split(",");
|
||||
|
||||
funcName = a[0].trim();
|
||||
if (funcNameOrder.get(funcName) == null) {
|
||||
funcNameOrder.put(funcName, funcNameOrder.size());
|
||||
}
|
||||
|
||||
prec =
|
||||
a[1].trim().equals("DP") ? DP :
|
||||
a[1].trim().equals("SP") ? SP :
|
||||
0;
|
||||
|
||||
bits = Integer.parseInt(a[2].trim());
|
||||
|
||||
int c;
|
||||
|
||||
for(c = 3;;c++) {
|
||||
if (a[c].trim().endsWith("ulps")) break;
|
||||
range.add(Double.parseDouble(a[c]));
|
||||
}
|
||||
|
||||
ulps = Double.parseDouble(a[c].trim().replace("ulps", ""));
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
int h = funcName.hashCode();
|
||||
h ^= prec ^ bits;
|
||||
return h;
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
Key k = (Key) o;
|
||||
if (funcName.compareTo(k.funcName) != 0) return false;
|
||||
if (prec != k.prec) return false;
|
||||
if (bits != k.bits) return false;
|
||||
if (range.size() != k.range.size()) return false;
|
||||
for(int i=0;i<range.size();i++) {
|
||||
if ((double)range.get(i) != (double)k.range.get(i)) return false;
|
||||
}
|
||||
|
||||
if (ulps != k.ulps) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
String s = funcName + " ";
|
||||
s += prec == DP ? "DP " : "SP ";
|
||||
s += bits + "bit ";
|
||||
s += String.format(" %.0fulp ", ulps);
|
||||
for(int i=0;i<range.size();i+=2) {
|
||||
s += "[" + String.format("%.3g", range.get(i)) + ", " + String.format("%.3g", range.get(i+1)) + "]";
|
||||
if (i + 2 < range.size()) s += " ";
|
||||
}
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
static class KeyComparator implements Comparator<Key> {
|
||||
public int compare(Key d0, Key d1) {
|
||||
if (d0 == d1) return 0;
|
||||
if (d0.prec < d1.prec) return 1;
|
||||
if (d0.prec > d1.prec) return -1;
|
||||
if (d0.ulps > d1.ulps) return 1;
|
||||
if (d0.ulps < d1.ulps) return -1;
|
||||
|
||||
int fc = (int)funcNameOrder.get(d0.funcName) - (int)funcNameOrder.get(d1.funcName);
|
||||
if (fc != 0) return fc;
|
||||
|
||||
if (d0.bits > d1.bits) return 1;
|
||||
if (d0.bits < d1.bits) return -1;
|
||||
|
||||
if (d0.range.size() > d1.range.size()) return 1;
|
||||
if (d0.range.size() < d1.range.size()) return -1;
|
||||
|
||||
for(int i=0;i<d0.range.size();i++) {
|
||||
if (d0.range.get(i) > d1.range.get(i)) return 1;
|
||||
if (d0.range.get(i) < d1.range.get(i)) return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
LinkedHashMap<Key, LinkedHashMap<String, Double>> allData = new LinkedHashMap<Key, LinkedHashMap<String, Double>>();
|
||||
TreeSet<Key> allKeys = new TreeSet<Key>(new KeyComparator());
|
||||
LinkedHashSet<String> allColumnTitles = new LinkedHashSet<String>();
|
||||
double maximum = 0;
|
||||
|
||||
for(int i=0;i<args.length;i++) {
|
||||
LineNumberReader lnr = new LineNumberReader(new FileReader(args[i]));
|
||||
|
||||
String columnTitle = lnr.readLine();
|
||||
allColumnTitles.add(columnTitle);
|
||||
|
||||
for(;;) {
|
||||
String s = lnr.readLine();
|
||||
if (s == null) break;
|
||||
|
||||
Key key = new Key(s);
|
||||
allKeys.add(key);
|
||||
|
||||
LinkedHashMap<String, Double> v = allData.get(key);
|
||||
if (v == null) {
|
||||
v = new LinkedHashMap<String, Double>();
|
||||
allData.put(key, v);
|
||||
}
|
||||
String[] a = s.split(",");
|
||||
|
||||
double time = Double.parseDouble(a[a.length-1]);
|
||||
v.put(columnTitle, time);
|
||||
maximum = Math.max(maximum, time);
|
||||
}
|
||||
|
||||
lnr.close();
|
||||
}
|
||||
|
||||
PrintStream ps = new PrintStream("data.out");
|
||||
|
||||
for(Key k : allKeys) {
|
||||
ps.print("\"" + k + "\" ");
|
||||
|
||||
LinkedHashMap<String, Double> v = allData.get(k);
|
||||
|
||||
for(String s : allColumnTitles) {
|
||||
Double d = v.get(s);
|
||||
if (d != null) ps.print(d);
|
||||
if (d == null) ps.print("0");
|
||||
ps.print("\t");
|
||||
}
|
||||
ps.println();
|
||||
}
|
||||
|
||||
ps.close();
|
||||
|
||||
ps = new PrintStream("script.out");
|
||||
|
||||
ps.println("set terminal pngcairo size 1280, 800 font \",10\"");
|
||||
ps.println("set output \"output.png\"");
|
||||
|
||||
ps.println("color00 = \"#FF5050\";"); // red
|
||||
ps.println("color01 = \"#0066FF\";"); // blue
|
||||
ps.println("color02 = \"#00FF00\";"); // green
|
||||
ps.println("color03 = \"#FF9900\";"); // orange
|
||||
ps.println("color04 = \"#CC00CC\";"); // purple
|
||||
ps.println("color05 = \"#880000\";"); // brown
|
||||
ps.println("color06 = \"#003300\";"); // dark green
|
||||
ps.println("color07 = \"#000066\";"); // dark blue
|
||||
|
||||
ps.println("set style data histogram");
|
||||
ps.println("set style histogram cluster gap 1");
|
||||
ps.println("set style fill solid 1.00");
|
||||
ps.println("set boxwidth 0.9");
|
||||
ps.println("set xtics format \"\"");
|
||||
ps.println("set xtics rotate by -90");
|
||||
ps.println("set grid ytics");
|
||||
|
||||
ps.println("set ylabel \"Execution time in micro sec.\"");
|
||||
ps.println("set yrange [0:*]");
|
||||
ps.println("set bmargin 24");
|
||||
|
||||
ps.println("set title \"Single execution time in micro sec.\"");
|
||||
ps.print("plot");
|
||||
|
||||
int i = 0;
|
||||
for(String s : allColumnTitles) {
|
||||
ps.print("\"data.out\" using " + (i+2) + ":xtic(1) title \"" + s +
|
||||
"\" linecolor rgb color" + String.format("%02d", i));
|
||||
if (i != allColumnTitles.size()-1) ps.print(", ");
|
||||
i++;
|
||||
}
|
||||
ps.println();
|
||||
|
||||
ps.close();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
#define NITER1 100000
|
||||
#define NITER2 10000
|
||||
#define NITER (NITER1 * NITER2)
|
||||
|
||||
#define callFuncSLEEF1_1(funcName, name, xmin, xmax, ulp, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSLEEF1_2(funcName, name, xmin, xmax, ymin, ymax, ulp, arg1, arg2, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)ymin, (double)ymax, ulp, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML1_1(funcName, name, xmin, xmax, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML2_1(funcName, name, xmin, xmax, arg, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p = (type *)(arg), c; \
|
||||
for(int i=0;i<NITER1;i++) funcName(&c, *p++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
|
||||
#define callFuncSVML1_2(funcName, name, xmin, xmax, ymin, ymax, arg1, arg2, type) ({ \
|
||||
printf("%s\n", #funcName); \
|
||||
uint64_t t = Sleef_currentTimeMicros(); \
|
||||
for(int j=0;j<NITER2;j++) { \
|
||||
type *p1 = (type *)(arg1), *p2 = (type *)(arg2); \
|
||||
for(int i=0;i<NITER1;i++) funcName(*p1++, *p2++); \
|
||||
} \
|
||||
fprintf(fp, name ", %.3g, %.3g, %.3g, %.3g, %gulps, %g\n", \
|
||||
(double)xmin, (double)xmax, (double)ymin, (double)ymax, (double)SVMLULP, (double)(Sleef_currentTimeMicros() - t) / NITER); \
|
||||
})
|
||||
@@ -0,0 +1,144 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
int veclen = 16;
|
||||
double *abufdp, *bbufdp;
|
||||
float *abufsp, *bbufsp;
|
||||
FILE *fp;
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
|
||||
uint32_t a, b, c, d;
|
||||
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
|
||||
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 28)) != 0;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX512F() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 7, 0);
|
||||
return (reg[1] & (1 << 16)) != 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
void fillDP(double *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void fillSP(float *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void benchSleef128_DPTrig();
|
||||
void benchSleef256_DPTrig();
|
||||
void benchSleef512_DPTrig();
|
||||
void benchSleef128_DPNontrig();
|
||||
void benchSleef256_DPNontrig();
|
||||
void benchSleef512_DPNontrig();
|
||||
void benchSleef128_SPTrig();
|
||||
void benchSleef256_SPTrig();
|
||||
void benchSleef512_SPTrig();
|
||||
void benchSleef128_SPNontrig();
|
||||
void benchSleef256_SPNontrig();
|
||||
void benchSleef512_SPNontrig();
|
||||
|
||||
//
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *columnTitle = "SLEEF", *fnBase = "sleef";
|
||||
char fn[1024];
|
||||
|
||||
if (argc != 1) columnTitle = argv[1];
|
||||
if (argc >= 3) fnBase = argv[2];
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
int do128bit = 1;
|
||||
int do256bit = cpuSupportsAVX();
|
||||
int do512bit = cpuSupportsAVX512F();
|
||||
#elif defined(__ARM_NEON) || defined(__VSX__) || defined(__VX__)
|
||||
int do128bit = 1;
|
||||
#else
|
||||
#error Unsupported architecture
|
||||
#endif
|
||||
|
||||
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
|
||||
abufsp = (float *)abufdp;
|
||||
bbufsp = (float *)bbufdp;
|
||||
|
||||
sprintf(fn, "%sdptrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_DPTrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_DPTrig();
|
||||
if (do512bit) benchSleef512_DPTrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sdpnontrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_DPNontrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_DPNontrig();
|
||||
if (do512bit) benchSleef512_DPNontrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%ssptrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_SPTrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_SPTrig();
|
||||
if (do512bit) benchSleef512_SPTrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sspnontrig.out", fnBase);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do128bit) benchSleef128_SPNontrig();
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
if (do256bit) benchSleef256_SPNontrig();
|
||||
if (do512bit) benchSleef512_SPNontrig();
|
||||
#endif
|
||||
|
||||
fclose(fp);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,195 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128 vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
typedef float64x2_t vdouble;
|
||||
typedef float32x4_t vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__VSX__)
|
||||
#include <altivec.h>
|
||||
typedef __vector double vdouble;
|
||||
typedef __vector float vfloat;
|
||||
#define ENABLED
|
||||
#elif defined(__VX__)
|
||||
#include <vecintrin.h>
|
||||
typedef __vector double vdouble;
|
||||
typedef __vector float vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef128_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u10 , "sin, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u10 , "cos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u10 , "tan, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u10, "sincos, DP, 128", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind2_u35 , "sin, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd2_u35 , "cos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand2_u35 , "tan, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd2_u35, "sincos, DP, 128", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef128_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd2_u10 , "log, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d2_u10, "log10, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd2_u10, "log1p, DP, 128", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd2_u35 , "log, DP, 128", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd2_u10 , "exp, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d2_u10 , "exp2, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d2_u10, "exp10, DP, 128", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd2_u10, "pow, DP, 128", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind2_u10, "asin, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd2_u10, "acos, DP, 128", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind2_u35, "asin, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd2_u35, "acos, DP, 128", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand2_u10, "atan, DP, 128", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d2_u10, "atan2, DP, 128", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand2_u35, "atan, DP, 128", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d2_u35, "atan2, DP, 128", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef128_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u10 , "sin, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u10 , "cos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u10 , "tan, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u10, "sincos, SP, 128", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf4_u35 , "sin, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf4_u35 , "cos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf4_u35 , "tan, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf4_u35, "sincos, SP, 128", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef128_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf4_u10 , "log, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f4_u10, "log10, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf4_u10, "log1p, SP, 128", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf4_u35 , "log, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f4_u35, "log10, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf4_u35, "log1p, SP, 128", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf4_u10 , "exp, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f4_u10 , "exp2, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f4_u10, "exp10, SP, 128", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf4_u10, "pow, SP, 128", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf4_u10, "asin, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf4_u10, "acos, SP, 128", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf4_u35, "asin, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf4_u35, "acos, SP, 128", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf4_u10, "atan, SP, 128", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f4_u10, "atan2, SP, 128", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf4_u35, "atan, SP, 128", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f4_u35, "atan2, SP, 128", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSleef128_DPTrig() {}
|
||||
void benchSleef128_DPNontrig() {}
|
||||
void benchSleef128_SPTrig() {}
|
||||
void benchSleef128_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,181 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef256_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u10 , "sin, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u10 , "cos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u10 , "tan, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u10, "sincos, DP, 256", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind4_u35 , "sin, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd4_u35 , "cos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand4_u35 , "tan, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd4_u35, "sincos, DP, 256", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef256_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd4_u10 , "log, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d4_u10, "log10, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd4_u10, "log1p, DP, 256", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd4_u35 , "log, DP, 256", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd4_u10 , "exp, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d4_u10 , "exp2, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d4_u10, "exp10, DP, 256", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd4_u10, "pow, DP, 256", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind4_u10, "asin, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd4_u10, "acos, DP, 256", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind4_u35, "asin, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd4_u35, "acos, DP, 256", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand4_u10, "atan, DP, 256", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d4_u10, "atan2, DP, 256", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand4_u35, "atan, DP, 256", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d4_u35, "atan2, DP, 256", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef256_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u10 , "sin, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u10 , "cos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u10 , "tan, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u10, "sincos, SP, 256", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf8_u35 , "sin, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf8_u35 , "cos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf8_u35 , "tan, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf8_u35, "sincos, SP, 256", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef256_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf8_u10 , "log, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f8_u10, "log10, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf8_u10, "log1p, SP, 256", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf8_u35 , "log, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f8_u35, "log10, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf8_u35, "log1p, SP, 256", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf8_u10 , "exp, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f8_u10 , "exp2, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f8_u10, "exp10, SP, 256", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf8_u10, "pow, SP, 256", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf8_u10, "asin, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf8_u10, "acos, SP, 256", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf8_u35, "asin, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf8_u35, "acos, SP, 256", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf8_u10, "atan, SP, 256", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f8_u10, "atan2, SP, 256", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf8_u35, "atan, SP, 256", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f8_u35, "atan2, SP, 256", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void zeroupper256() {}
|
||||
void benchSleef256_DPTrig() {}
|
||||
void benchSleef256_DPNontrig() {}
|
||||
void benchSleef256_SPTrig() {}
|
||||
void benchSleef256_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,180 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <sleef.h>
|
||||
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX512F__
|
||||
#if defined(_MSC_VER)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
typedef __m512d vdouble;
|
||||
typedef __m512 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSleef512_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 6.28, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 6.28, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+6, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+6, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u10 , "sin, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u10 , "cos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u10 , "tan, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u10, "sincos, DP, 512", 0, 1e+100, 1.0, abufdp, vdouble);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sind8_u35 , "sin, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_cosd8_u35 , "cos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_tand8_u35 , "tan, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_sincosd8_u35, "sincos, DP, 512", 0, 1e+100, 4.0, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef512_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logd8_u10 , "log, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log10d8_u10, "log10, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_log1pd8_u10, "log1p, DP, 512", 0, 1e+300, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_logd8_u35 , "log, DP, 512", 0, 1e+300, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expd8_u10 , "exp, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp2d8_u10 , "exp2, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_exp10d8_u10, "exp10, DP, 512", -700, 700, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powd8_u10, "pow, DP, 512", -30, 30, -30, 30, 1.0, abufdp, bbufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asind8_u10, "asin, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd8_u10, "acos, DP, 512", -1.0, 1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_asind8_u35, "asin, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_acosd8_u35, "acos, DP, 512", -1.0, 1.0, 4.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atand8_u10, "atan, DP, 512", -10, 10, 1.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d8_u10, "atan2, DP, 512", -10, 10, -10, 10, 1.0, abufdp, bbufdp, vdouble);
|
||||
callFuncSLEEF1_1(Sleef_atand8_u35, "atan, DP, 512", -10, 10, 4.0, abufdp, vdouble);
|
||||
callFuncSLEEF1_2(Sleef_atan2d8_u35, "atan2, DP, 512", -10, 10, -10, 10, 4.0, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSleef512_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 6.28, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 6.28, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u10 , "sin, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u10 , "cos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u10 , "tan, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u10, "sincos, SP, 512", 0, 1e+20, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_sinf16_u35 , "sin, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_cosf16_u35 , "cos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_tanf16_u35 , "tan, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_sincosf16_u35, "sincos, SP, 512", 0, 1e+20, 4.0, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSleef512_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf16_u10 , "log, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_log10f16_u10, "log10, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf16_u10, "log1p, SP, 512", 0, 1e+38, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_logf16_u35 , "log, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log10f16_u35, "log10, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
//callFuncSLEEF1_1(Sleef_log1pf16_u35, "log1p, SP, 512", 0, 1e+38, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_expf16_u10 , "exp, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp2f16_u10 , "exp2, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_exp10f16_u10, "exp10, SP, 512", -100, 100, 1.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSLEEF1_2(Sleef_powf16_u10, "pow, SP, 512", -30, 30, -30, 30, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf16_u10, "asin, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf16_u10, "acos, SP, 512", -1.0, 1, 1.0, abufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_asinf16_u35, "asin, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_1(Sleef_acosf16_u35, "acos, SP, 512", -1.0, 1.0, 4.0, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf16_u10, "atan, SP, 512", -10, 10, 1.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f16_u10, "atan2, SP, 512", -10, 10, -10, 10, 1.0, abufsp, bbufsp, vfloat);
|
||||
|
||||
callFuncSLEEF1_1(Sleef_atanf16_u35, "atan, SP, 512", -10, 10, 4.0, abufsp, vfloat);
|
||||
callFuncSLEEF1_2(Sleef_atan2f16_u35, "atan2, SP, 512", -10, 10, -10, 10, 4.0, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSleef512_DPTrig() {}
|
||||
void benchSleef512_DPNontrig() {}
|
||||
void benchSleef512_SPTrig() {}
|
||||
void benchSleef512_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,153 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
int veclen = 16;
|
||||
int enableLogExp;
|
||||
double *abufdp, *bbufdp;
|
||||
float *abufsp, *bbufsp;
|
||||
FILE *fp;
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
void x86CpuID(int32_t out[4], uint32_t eax, uint32_t ecx) {
|
||||
uint32_t a, b, c, d;
|
||||
__asm__ __volatile__ ("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d) : "a" (eax), "c"(ecx));
|
||||
out[0] = a; out[1] = b; out[2] = c; out[3] = d;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 1, 0);
|
||||
return (reg[2] & (1 << 28)) != 0;
|
||||
}
|
||||
|
||||
int cpuSupportsAVX512F() {
|
||||
int32_t reg[4];
|
||||
x86CpuID(reg, 7, 0);
|
||||
return (reg[1] & (1 << 16)) != 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t Sleef_currentTimeMicros() {
|
||||
struct timespec tp;
|
||||
clock_gettime(CLOCK_MONOTONIC, &tp);
|
||||
return (uint64_t)tp.tv_sec * 1000000LL + ((uint64_t)tp.tv_nsec/1000);
|
||||
}
|
||||
|
||||
void fillDP(double *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void fillSP(float *buf, double min, double max) {
|
||||
for(int i=0;i<NITER1*veclen;i++) {
|
||||
double r = ((double)random() + RAND_MAX * (double)random()) / (RAND_MAX * (double)RAND_MAX);
|
||||
buf[i] = r * (max - min) + min;
|
||||
}
|
||||
}
|
||||
|
||||
void zeroupper256();
|
||||
void benchSVML128_DPTrig();
|
||||
void benchSVML256_DPTrig();
|
||||
void benchSVML512_DPTrig();
|
||||
void benchSVML128_DPNontrig();
|
||||
void benchSVML256_DPNontrig();
|
||||
void benchSVML512_DPNontrig();
|
||||
void benchSVML128_SPTrig();
|
||||
void benchSVML256_SPTrig();
|
||||
void benchSVML512_SPTrig();
|
||||
void benchSVML128_SPNontrig();
|
||||
void benchSVML256_SPNontrig();
|
||||
void benchSVML512_SPNontrig();
|
||||
|
||||
//
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char *columnTitle = "SVML", *fnBase = "svml";
|
||||
char fn[1024];
|
||||
|
||||
if (argc != 1) columnTitle = argv[1];
|
||||
if (argc >= 3) fnBase = argv[2];
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
#if defined(__i386__) || defined(__x86_64__)
|
||||
int do128bit = 1;
|
||||
int do256bit = cpuSupportsAVX();
|
||||
int do512bit = cpuSupportsAVX512F();
|
||||
#elif defined(__ARM_NEON)
|
||||
int do128bit = 1;
|
||||
int do256bit = 0;
|
||||
int do512bit = 0;
|
||||
#else
|
||||
#error Unsupported architecture
|
||||
#endif
|
||||
|
||||
posix_memalign((void **)&abufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
posix_memalign((void **)&bbufdp, veclen*sizeof(double), NITER1*veclen*sizeof(double));
|
||||
|
||||
abufsp = (float *)abufdp;
|
||||
bbufsp = (float *)bbufdp;
|
||||
|
||||
enableLogExp = SVMLULP < 2;
|
||||
|
||||
sprintf(fn, "%sdptrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_DPTrig();
|
||||
if (do256bit) benchSVML256_DPTrig();
|
||||
if (do512bit) benchSVML512_DPTrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sdpnontrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_DPNontrig();
|
||||
if (do256bit) benchSVML256_DPNontrig();
|
||||
if (do512bit) benchSVML512_DPNontrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%ssptrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_SPTrig();
|
||||
if (do256bit) benchSVML256_SPTrig();
|
||||
if (do512bit) benchSVML512_SPTrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
sprintf(fn, "%sspnontrig%gulp.out", fnBase, (double)SVMLULP);
|
||||
fp = fopen(fn, "w");
|
||||
fprintf(fp, "%s\n", columnTitle);
|
||||
|
||||
if (do256bit) zeroupper256();
|
||||
if (do128bit) benchSVML128_SPNontrig();
|
||||
if (do256bit) benchSVML256_SPNontrig();
|
||||
if (do512bit) benchSVML512_SPNontrig();
|
||||
|
||||
fclose(fp);
|
||||
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,144 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __SSE2__
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSVML128_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_pd , "sin, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_cos_pd , "cos, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_tan_pd , "tan, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm_sincos_pd, "sincos, DP, 128", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML128_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm_log_pd , "log, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm_log10_pd, "log10, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_log1p_pd, "log1p, DP, 128", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm_exp_pd , "exp, DP, 128", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_exp2_pd , "exp2, DP, 128", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_exp10_pd, "exp10, DP, 128", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm_pow_pd, "pow, DP, 128", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm_asin_pd, "asin, DP, 128", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm_acos_pd, "acos, DP, 128", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm_atan_pd, "atan, DP, 128", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm_atan2_pd, "atan2, DP, 128", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML128_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm_sin_ps , "sin, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_cos_ps , "cos, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_tan_ps , "tan, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm_sincos_ps, "sincos, SP, 128", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML128_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm_log_ps , "log, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm_log10_ps, "log10, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm_log1p_ps, "log1p, SP, 128", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm_exp_ps , "exp, SP, 128", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_exp2_ps , "exp2, SP, 128", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_exp10_ps, "exp10, SP, 128", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm_pow_ps, "pow, SP, 128", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm_asin_ps, "asin, SP, 128", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm_acos_ps, "acos, SP, 128", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm_atan_ps, "atan, SP, 128", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm_atan2_ps, "atan2, SP, 128", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSVML128_DPTrig() {}
|
||||
void benchSVML128_DPNontrig() {}
|
||||
void benchSVML128_SPTrig() {}
|
||||
void benchSVML128_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,147 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX__
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void zeroupper256() { _mm256_zeroupper(); }
|
||||
|
||||
void benchSVML256_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_pd , "sin, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_cos_pd , "cos, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_tan_pd , "tan, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm256_sincos_pd, "sincos, DP, 256", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML256_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm256_log_pd , "log, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm256_log10_pd, "log10, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_log1p_pd, "log1p, DP, 256", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm256_exp_pd , "exp, DP, 256", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_exp2_pd , "exp2, DP, 256", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_exp10_pd, "exp10, DP, 256", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm256_pow_pd, "pow, DP, 256", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm256_asin_pd, "asin, DP, 256", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm256_acos_pd, "acos, DP, 256", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm256_atan_pd, "atan, DP, 256", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm256_atan2_pd, "atan2, DP, 256", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML256_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm256_sin_ps , "sin, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_cos_ps , "cos, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_tan_ps , "tan, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm256_sincos_ps, "sincos, SP, 256", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML256_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm256_log_ps , "log, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm256_log10_ps, "log10, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm256_log1p_ps, "log1p, SP, 256", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm256_exp_ps , "exp, SP, 256", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_exp2_ps , "exp2, SP, 256", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_exp10_ps, "exp10, SP, 256", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm256_pow_ps, "pow, SP, 256", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm256_asin_ps, "asin, SP, 256", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm256_acos_ps, "acos, SP, 256", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm256_atan_ps, "atan, SP, 256", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm256_atan2_ps, "atan2, SP, 256", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void zeroupper256() {}
|
||||
void benchSVML256_DPTrig() {}
|
||||
void benchSVML256_DPNontrig() {}
|
||||
void benchSVML256_SPTrig() {}
|
||||
void benchSVML256_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,144 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
uint64_t Sleef_currentTimeMicros();
|
||||
void fillDP(double *buf, double min, double max);
|
||||
void fillSP(float *buf, double min, double max);
|
||||
|
||||
extern char x86BrandString[256], versionString[1024];
|
||||
extern int veclen;
|
||||
extern int enableLogExp;
|
||||
extern double *abufdp, *bbufdp;
|
||||
extern float *abufsp, *bbufsp;
|
||||
extern FILE *fp;
|
||||
|
||||
#include "bench.h"
|
||||
|
||||
#ifdef __AVX512F__
|
||||
typedef __m512d vdouble;
|
||||
typedef __m512 vfloat;
|
||||
#define ENABLED
|
||||
#endif
|
||||
|
||||
#ifdef ENABLED
|
||||
void benchSVML512_DPTrig() {
|
||||
fillDP(abufdp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 6.28, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+6);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+6, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, 0, 1e+100);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_pd , "sin, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_cos_pd , "cos, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_tan_pd , "tan, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
callFuncSVML2_1(_mm512_sincos_pd, "sincos, DP, 512", 0, 1e+100, abufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML512_DPNontrig() {
|
||||
fillDP(abufdp, 0, 1e+300);
|
||||
|
||||
callFuncSVML1_1(_mm512_log_pd , "log, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm512_log10_pd, "log10, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_log1p_pd, "log1p, DP, 512", 0, 1e+300, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -700, 700);
|
||||
|
||||
callFuncSVML1_1(_mm512_exp_pd , "exp, DP, 512", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_exp2_pd , "exp2, DP, 512", -700, 700, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_exp10_pd, "exp10, DP, 512", -700, 700, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -30, 30);
|
||||
fillDP(bbufdp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm512_pow_pd, "pow, DP, 512", -30, 30, -30, 30, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
fillDP(abufdp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm512_asin_pd, "asin, DP, 512", -1.0, 1.0, abufdp, vdouble);
|
||||
callFuncSVML1_1(_mm512_acos_pd, "acos, DP, 512", -1.0, 1.0, abufdp, vdouble);
|
||||
|
||||
fillDP(abufdp, -10, 10);
|
||||
fillDP(bbufdp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm512_atan_pd, "atan, DP, 512", -10, 10, abufdp, vdouble);
|
||||
callFuncSVML1_2(_mm512_atan2_pd, "atan2, DP, 512", -10, 10, -10, 10, abufdp, bbufdp, vdouble);
|
||||
}
|
||||
|
||||
void benchSVML512_SPTrig() {
|
||||
fillSP(abufsp, 0, 6.28);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 6.28, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, 0, 1e+20);
|
||||
|
||||
callFuncSVML1_1(_mm512_sin_ps , "sin, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_cos_ps , "cos, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_tan_ps , "tan, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
callFuncSVML2_1(_mm512_sincos_ps, "sincos, SP, 512", 0, 1e+20, abufsp, vfloat);
|
||||
}
|
||||
|
||||
void benchSVML512_SPNontrig() {
|
||||
fillSP(abufsp, 0, 1e+38);
|
||||
|
||||
callFuncSVML1_1(_mm512_log_ps , "log, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
if (enableLogExp) {
|
||||
callFuncSVML1_1(_mm512_log10_ps, "log10, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
//callFuncSVML1_1(_mm512_log1p_ps, "log1p, SP, 512", 0, 1e+38, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -100, 100);
|
||||
|
||||
callFuncSVML1_1(_mm512_exp_ps , "exp, SP, 512", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_exp2_ps , "exp2, SP, 512", -100, 100, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_exp10_ps, "exp10, SP, 512", -100, 100, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -30, 30);
|
||||
fillSP(bbufsp, -30, 30);
|
||||
|
||||
callFuncSVML1_2(_mm512_pow_ps, "pow, SP, 512", -30, 30, -30, 30, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
|
||||
fillSP(abufsp, -1.0, 1.0);
|
||||
|
||||
callFuncSVML1_1(_mm512_asin_ps, "asin, SP, 512", -1.0, 1, abufsp, vfloat);
|
||||
callFuncSVML1_1(_mm512_acos_ps, "acos, SP, 512", -1.0, 1, abufsp, vfloat);
|
||||
|
||||
fillSP(abufsp, -10, 10);
|
||||
fillSP(bbufsp, -10, 10);
|
||||
|
||||
callFuncSVML1_1(_mm512_atan_ps, "atan, SP, 512", -10, 10, abufsp, vfloat);
|
||||
callFuncSVML1_2(_mm512_atan2_ps, "atan2, SP, 512", -10, 10, -10, 10, abufsp, bbufsp, vfloat);
|
||||
}
|
||||
#else // #ifdef ENABLED
|
||||
void benchSVML512_DPTrig() {}
|
||||
void benchSVML512_DPNontrig() {}
|
||||
void benchSVML512_SPTrig() {}
|
||||
void benchSVML512_SPNontrig() {}
|
||||
#endif // #ifdef ENABLED
|
||||
@@ -0,0 +1,17 @@
|
||||
#!/bin/sh
|
||||
echo
|
||||
read -p "Enter label of measurement(e.g. My desktop PC) : " label
|
||||
|
||||
if [ -f counter.txt ]
|
||||
then
|
||||
counter=`cat counter.txt`
|
||||
else
|
||||
counter=0
|
||||
fi
|
||||
|
||||
echo Measurement in progress. This may take several minutes.
|
||||
for i in $*; do
|
||||
$i "$label" $counter
|
||||
done
|
||||
counter=$((counter+1))
|
||||
echo $counter > counter.txt
|
||||
@@ -0,0 +1,517 @@
|
||||
|
||||
# Settings
|
||||
|
||||
# TESTER3_DEFINITIONS
|
||||
|
||||
set(TESTER3_DEFINITIONS_SSE2 ATR=cinz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=sse2)
|
||||
set(TESTER3_DEFINITIONS_SSE4 ATR=cinz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=sse4)
|
||||
set(TESTER3_DEFINITIONS_AVX2128 ATR=finz_ DPTYPE=__m128d SPTYPE=__m128 DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=avx2128)
|
||||
set(TESTER3_DEFINITIONS_AVX ATR=cinz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=avx)
|
||||
set(TESTER3_DEFINITIONS_FMA4 ATR=finz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=fma4)
|
||||
set(TESTER3_DEFINITIONS_AVX2 ATR=finz_ DPTYPE=__m256d SPTYPE=__m256 DPTYPESPEC=d4 SPTYPESPEC=f8 EXTSPEC=avx2)
|
||||
set(TESTER3_DEFINITIONS_AVX512F ATR=finz_ DPTYPE=__m512d SPTYPE=__m512 DPTYPESPEC=d8 SPTYPESPEC=f16 EXTSPEC=avx512f)
|
||||
set(TESTER3_DEFINITIONS_AVX512FNOFMA ATR=cinz_ DPTYPE=__m512d SPTYPE=__m512 DPTYPESPEC=d8 SPTYPESPEC=f16 EXTSPEC=avx512fnofma)
|
||||
|
||||
set(TESTER3_DEFINITIONS_ADVSIMD ATR=finz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimd)
|
||||
set(TESTER3_DEFINITIONS_ADVSIMDNOFMA ATR=cinz_ DPTYPE=float64x2_t SPTYPE=float32x4_t DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=advsimdnofma)
|
||||
set(TESTER3_DEFINITIONS_SVE ATR=finz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=sve)
|
||||
set(TESTER3_DEFINITIONS_SVENOFMA ATR=cinz_ DPTYPE=svfloat64_t SPTYPE=svfloat32_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=svenofma)
|
||||
|
||||
set(TESTER3_DEFINITIONS_VSX ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx)
|
||||
set(TESTER3_DEFINITIONS_VSXNOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsxnofma)
|
||||
set(TESTER3_DEFINITIONS_VSX3 ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx3)
|
||||
set(TESTER3_DEFINITIONS_VSX3NOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vsx3nofma)
|
||||
|
||||
set(TESTER3_DEFINITIONS_VXE ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe)
|
||||
set(TESTER3_DEFINITIONS_VXENOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxenofma)
|
||||
set(TESTER3_DEFINITIONS_VXE2 ATR=finz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe2)
|
||||
set(TESTER3_DEFINITIONS_VXE2NOFMA ATR=cinz_ DPTYPE=SLEEF_VECTOR_DOUBLE SPTYPE=SLEEF_VECTOR_FLOAT DPTYPESPEC=d2 SPTYPESPEC=f4 EXTSPEC=vxe2nofma)
|
||||
|
||||
set(TESTER3_DEFINITIONS_RVVM1 ATR=finz_ DPTYPE=vfloat64m1_t SPTYPE=vfloat32m1_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm1 ENABLE_RVVM1)
|
||||
set(TESTER3_DEFINITIONS_RVVM1NOFMA ATR=cinz_ DPTYPE=vfloat64m1_t SPTYPE=vfloat32m1_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm1nofma ENABLE_RVVM1)
|
||||
set(TESTER3_DEFINITIONS_RVVM2 ATR=finz_ DPTYPE=vfloat64m2_t SPTYPE=vfloat32m2_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm2 ENABLE_RVVM2)
|
||||
set(TESTER3_DEFINITIONS_RVVM2NOFMA ATR=cinz_ DPTYPE=vfloat64m2_t SPTYPE=vfloat32m2_t DPTYPESPEC=dx SPTYPESPEC=fx EXTSPEC=rvvm2nofma ENABLE_RVVM2)
|
||||
|
||||
set(TESTER3_DEFINITIONS_PUREC_SCALAR ATR=cinz_ DPTYPE=double SPTYPE=float DPTYPESPEC=d1 SPTYPESPEC=f1 EXTSPEC=purec)
|
||||
set(TESTER3_DEFINITIONS_PURECFMA_SCALAR ATR=finz_ DPTYPE=double SPTYPE=float DPTYPESPEC=d1 SPTYPESPEC=f1 EXTSPEC=purecfma)
|
||||
|
||||
#
|
||||
|
||||
if (SLEEF_ARCH_X86)
|
||||
set(TEST3_CINZ purec_scalar sse2 sse4 avx avx512fnofma)
|
||||
set(TEST3_FINZ purecfma_scalar avx2128 avx2 avx512f)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
|
||||
set(TEST3_CINZ purec_scalar advsimdnofma svenofma)
|
||||
set(TEST3_FINZ purecfma_scalar advsimd sve)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm")
|
||||
set(TEST3_CINZ purec_scalar)
|
||||
set(TEST3_FINZ purecfma_scalar)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)64")
|
||||
set(TEST3_CINZ purec_scalar vsxnofma vsx3nofma)
|
||||
set(TEST3_FINZ purecfma_scalar vsx vsx3)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
|
||||
set(TEST3_CINZ purec_scalar vxenofma vxe2nofma)
|
||||
set(TEST3_FINZ purecfma_scalar vxe vxe2)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
|
||||
set(TEST3_CINZ purec_scalar rvvm1nofma rvvm2nofma)
|
||||
set(TEST3_FINZ purecfma_scalar rvvm1 rvvm2)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
link_directories(${sleef_BINARY_DIR}/lib) # libsleef
|
||||
link_directories(${sleef_BINARY_DIR}/src/common) # common.a
|
||||
include_directories(${sleef_BINARY_DIR}/include) # sleef.h
|
||||
include_directories(${sleef_SOURCE_DIR}/src/libm) # rename.h
|
||||
include_directories(${sleef_BINARY_DIR}/src/libm/include) # rename headers
|
||||
|
||||
if(NOT LIB_MPFR)
|
||||
find_program(TESTER_COMMAND tester)
|
||||
endif(NOT LIB_MPFR)
|
||||
|
||||
if (SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
|
||||
message(FATAL_ERROR "SLEEF_ENFORCE_TESTER is specified and tester is not available")
|
||||
endif(SLEEF_ENFORCE_TESTER AND NOT LIB_MPFR AND NOT TESTER_COMMAND)
|
||||
|
||||
find_library(LIBRT rt)
|
||||
if (NOT LIBRT)
|
||||
set(LIBRT "")
|
||||
endif()
|
||||
|
||||
set(CMAKE_C_FLAGS "${ORG_CMAKE_C_FLAGS} ${SLEEF_C_FLAGS} ${FLAGS_NOSTRICTALIASING}")
|
||||
|
||||
set(COMMON_TARGET_PROPERTIES
|
||||
C_STANDARD 99 # -std=gnu99
|
||||
)
|
||||
|
||||
if (SLEEF_ENABLE_LTO)
|
||||
list(APPEND COMMON_TARGET_PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) # -flto
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
function(add_test_iut IUT C)
|
||||
if (LIB_MPFR)
|
||||
set(TESTER ${TARGET_TESTER})
|
||||
elseif(TESTER_COMMAND)
|
||||
set(TESTER ${TESTER_COMMAND})
|
||||
endif()
|
||||
# When we are crosscompiling using the mkrename* tools from a native
|
||||
# build, we use the tester executable from the native build.
|
||||
if (CMAKE_CROSSCOMPILING AND NATIVE_BUILD_DIR)
|
||||
set(TESTER ${NATIVE_BUILD_DIR}/bin/${TARGET_TESTER})
|
||||
endif(CMAKE_CROSSCOMPILING AND NATIVE_BUILD_DIR)
|
||||
if (TESTER)
|
||||
if (NOT EMULATOR)
|
||||
if (SDE_COMMAND)
|
||||
set(FLAGS_SDE "--sde" ${SDE_COMMAND})
|
||||
else()
|
||||
set(FLAGS_SDE)
|
||||
endif()
|
||||
if (ARMIE_COMMAND)
|
||||
set(FLAGS_ARMIE ${ARMIE_COMMAND} -msve-vector-bits=${SVE_VECTOR_BITS})
|
||||
else()
|
||||
set(FLAGS_ARMIE)
|
||||
endif()
|
||||
add_test(NAME ${IUT}
|
||||
COMMAND ${TESTER} ${FLAGS_SDE} ${FLAGS_ARMIE} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${IUT}
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
set_tests_properties(${IUT} PROPERTIES COST ${C})
|
||||
else()
|
||||
add_test(NAME ${IUT}
|
||||
COMMAND ${TESTER} "--qemu" ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${IUT}
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
set_tests_properties(${IUT} PROPERTIES COST ${C})
|
||||
endif()
|
||||
endif()
|
||||
endfunction()
|
||||
|
||||
# Compile executable 'iut'
|
||||
add_executable(${TARGET_IUT} iut.c testerutil.c)
|
||||
target_compile_definitions(${TARGET_IUT} PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_IUT} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
set_target_properties(${TARGET_IUT} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test_iut(${TARGET_IUT} 1.0)
|
||||
set(IUT_LIST ${TARGET_IUT})
|
||||
|
||||
# Compile executable 'iutcuda'
|
||||
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND AND CMAKE_CUDA_COMPILER)
|
||||
add_executable(iutcuda iutcuda.cu)
|
||||
set_target_properties(iutcuda PROPERTIES LINKER_LANGUAGE CUDA)
|
||||
target_compile_options(iutcuda PRIVATE "--fmad=false;-Xcompiler;-ffp-contract=off")
|
||||
add_dependencies(iutcuda ${TARGET_INLINE_HEADERS})
|
||||
add_test_iut(iutcuda 20.0)
|
||||
list(APPEND IUT_LIST iutcuda)
|
||||
endif()
|
||||
|
||||
set(IUT_SRC iutsimd.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
|
||||
|
||||
# Add vector extension `iut`s
|
||||
macro(test_extension SIMD)
|
||||
if(COMPILER_SUPPORTS_${SIMD})
|
||||
string(TOLOWER ${SIMD} LCSIMD)
|
||||
string(CONCAT TARGET_IUT${SIMD} "iut" ${LCSIMD})
|
||||
|
||||
add_executable(${TARGET_IUT${SIMD}} ${IUT_SRC})
|
||||
target_compile_options(${TARGET_IUT${SIMD}}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${TARGET_IUT${SIMD}}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${TARGET_IUT${SIMD}} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
|
||||
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_HEADERS})
|
||||
add_dependencies(${TARGET_IUT${SIMD}} ${TARGET_LIBSLEEF})
|
||||
set_target_properties(${TARGET_IUT${SIMD}} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${TARGET_IUT${SIMD}} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${TARGET_IUT${SIMD}} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${TARGET_IUT${SIMD}})
|
||||
|
||||
# The iut programs whose names begin with "iuty" are the iut for the
|
||||
# deterministic version of functions. By checking the result of
|
||||
# testing with iutysse2, for example, it can be checked that the
|
||||
# corresponding deterministic functions passes the accuracy and
|
||||
# nonnumber tests.
|
||||
|
||||
string(CONCAT IUTYNAME "iuty" ${LCSIMD})
|
||||
add_executable(${IUTYNAME} ${IUT_SRC})
|
||||
target_compile_options(${IUTYNAME}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${IUTYNAME}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
|
||||
target_link_libraries(${IUTYNAME} ${TARGET_LIBSLEEF}
|
||||
${LIBM} ${LIBRT})
|
||||
add_dependencies(${IUTYNAME} ${TARGET_HEADERS})
|
||||
add_dependencies(${IUTYNAME} ${TARGET_LIBSLEEF})
|
||||
set_target_properties(${IUTYNAME} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${IUTYNAME} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${IUTYNAME} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${IUTYNAME})
|
||||
|
||||
# The iut programs whose names begin with "iuti" are the iut for the
|
||||
# inline version of functions.
|
||||
|
||||
if (SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
|
||||
string(CONCAT IUTINAME "iuti" ${LCSIMD})
|
||||
add_executable(${IUTINAME} ${IUT_SRC})
|
||||
target_compile_options(${IUTINAME} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${IUTINAME}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS}
|
||||
USE_INLINE_HEADER="sleefinline_${LCSIMD}.h"
|
||||
MACRO_ONLY_HEADER="macroonly${SIMD}.h"
|
||||
SIMD_SUFFIX=_${LCSIMD}_sleef
|
||||
)
|
||||
target_include_directories(${IUTINAME} PRIVATE ${PROJECT_BINARY_DIR}/include)
|
||||
target_link_libraries(${IUTINAME} ${LIBM} ${LIBRT})
|
||||
add_dependencies(${IUTINAME} ${TARGET_INLINE_HEADERS})
|
||||
set_target_properties(${IUTINAME} PROPERTIES C_STANDARD 99)
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
add_test_iut(${IUTINAME} ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
add_test_iut(${IUTINAME} 1.0)
|
||||
endif()
|
||||
list(APPEND IUT_LIST ${IUTINAME})
|
||||
endif(SLEEF_BUILD_INLINE_HEADERS AND SED_COMMAND)
|
||||
|
||||
if(LIB_MPFR AND NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND NOT MINGW)
|
||||
# Build tester2 SIMD
|
||||
string(TOLOWER ${SIMD} SCSIMD)
|
||||
foreach(P dp sp)
|
||||
set(T "tester2${SCSIMD}${P}")
|
||||
add_executable(${T} tester2simd${P}.c testerutil.c)
|
||||
if(FORCE_AAVPCS)
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
|
||||
# The tester2 programs whose name begins with "tester2y" are the
|
||||
# testing program for the deterministic version of functions.
|
||||
|
||||
set(T "tester2y${SCSIMD}${P}")
|
||||
add_executable(${T} tester2simd${P}.c testerutil.c)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_${SIMD}=1 USEMPFR=1 ${COMMON_TARGET_DEFINITIONS} DETERMINISTIC=1)
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
endforeach()
|
||||
endif()
|
||||
|
||||
if(NOT ${SIMD} STREQUAL NEON32 AND NOT ${SIMD} STREQUAL NEON32VFPV4 AND SLEEF_OPENSSL_FOUND)
|
||||
# Build tester3
|
||||
string(TOLOWER ${SIMD} SCSIMD)
|
||||
set(T "tester3${SCSIMD}")
|
||||
add_executable(${T} tester3.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c testerutil.c)
|
||||
target_compile_options(${T} PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
target_compile_definitions(${T} PRIVATE ${COMMON_TARGET_DEFINITIONS} ${TESTER3_DEFINITIONS_${SIMD}})
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
|
||||
# Enable Vector PCS for Advanced SIMD (if supported)
|
||||
if(FORCE_AAVPCS)
|
||||
host_target_AAVPCS_definitions(${T})
|
||||
endif()
|
||||
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBM} ${SLEEF_OPENSSL_LIBRARIES})
|
||||
target_include_directories(${T} PRIVATE ${SLEEF_OPENSSL_INCLUDE_DIR})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
|
||||
# Add test with tester3
|
||||
list(FIND TEST3_CINZ ${SCSIMD} INDEX_TEST3_CINZ)
|
||||
if (NOT INDEX_TEST3_CINZ EQUAL -1)
|
||||
if (SDE_COMMAND)
|
||||
add_test(NAME tester3${SCSIMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
|
||||
elseif(EMULATOR)
|
||||
add_test(NAME tester3${SCSIMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
|
||||
else()
|
||||
add_test(NAME tester3${SCSIMD} COMMAND tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_cinz.txt)
|
||||
endif()
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
set_tests_properties(tester3${SCSIMD} PROPERTIES COST ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
set_tests_properties(tester3${SCSIMD} PROPERTIES COST 0.5)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
list(FIND TEST3_FINZ ${SCSIMD} INDEX_TEST3_FINZ)
|
||||
if (NOT INDEX_TEST3_FINZ EQUAL -1)
|
||||
if (SDE_COMMAND)
|
||||
add_test(NAME tester3${SCSIMD} COMMAND ${SDE_COMMAND} "--" ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
|
||||
elseif(EMULATOR)
|
||||
add_test(NAME tester3${SCSIMD} COMMAND ${EMULATOR} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
|
||||
else()
|
||||
add_test(NAME tester3${SCSIMD} COMMAND tester3${SCSIMD} ${sleef_SOURCE_DIR}/src/libm-tester/hash_finz.txt)
|
||||
endif()
|
||||
if (DEFINED COSTOVERRIDE_${SIMD})
|
||||
set_tests_properties(tester3${SCSIMD} PROPERTIES COST ${COSTOVERRIDE_${SIMD}})
|
||||
else()
|
||||
set_tests_properties(tester3${SCSIMD} PROPERTIES COST 0.5)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
endif(COMPILER_SUPPORTS_${SIMD})
|
||||
endmacro(test_extension)
|
||||
|
||||
foreach(SIMD ${SLEEF_SUPPORTED_LIBM_EXTENSIONS})
|
||||
test_extension(${SIMD})
|
||||
endforeach()
|
||||
|
||||
function(add_gnuabi_compatibility_test SIMD MASKED)
|
||||
if (MASKED)
|
||||
set(GNUABI_COMPATIBILITY_TEST gnuabi_compatibility_${SIMD}_masked)
|
||||
else(MASKED)
|
||||
set(GNUABI_COMPATIBILITY_TEST gnuabi_compatibility_${SIMD})
|
||||
endif(MASKED)
|
||||
add_executable(${GNUABI_COMPATIBILITY_TEST} gnuabi_compatibility.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(${GNUABI_COMPATIBILITY_TEST} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
target_compile_options(${GNUABI_COMPATIBILITY_TEST}
|
||||
PRIVATE ${FLAGS_ENABLE_${SIMD}})
|
||||
if (MASKED)
|
||||
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS} MASKED_GNUABI=1)
|
||||
else(MASKED)
|
||||
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST}
|
||||
PRIVATE ENABLE_${SIMD}=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
endif(MASKED)
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${GNUABI_COMPATIBILITY_TEST} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
target_link_libraries(${GNUABI_COMPATIBILITY_TEST} ${TARGET_LIBSLEEFGNUABI} ${LIBM})
|
||||
# These are linker tests that don't really need to be executed,
|
||||
# but seeing them in the report of ctest gives an idea of what
|
||||
# has been built for testing.
|
||||
if (EMULATOR)
|
||||
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
|
||||
COMMAND ${EMULATOR} $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
elseif(SDE_COMMAND)
|
||||
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
|
||||
COMMAND ${SDE_COMMAND} "--" $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>)
|
||||
else()
|
||||
add_test(NAME ${GNUABI_COMPATIBILITY_TEST}
|
||||
COMMAND $<TARGET_FILE:${GNUABI_COMPATIBILITY_TEST}>)
|
||||
endif(EMULATOR)
|
||||
endfunction(add_gnuabi_compatibility_test)
|
||||
|
||||
if(ENABLE_GNUABI)
|
||||
foreach(SIMD ${SLEEF_SUPPORTED_GNUABI_EXTENSIONS})
|
||||
if(COMPILER_SUPPORTS_${SIMD})
|
||||
# GNUABI compatibility for the unmasked symbols.
|
||||
add_gnuabi_compatibility_test(${SIMD} OFF)
|
||||
# GNUABI compatibility for the masked symbols.
|
||||
if (MKMASKED_PARAMS_GNUABI_${SIMD}_sp)
|
||||
add_gnuabi_compatibility_test(${SIMD} ON)
|
||||
endif(MKMASKED_PARAMS_GNUABI_${SIMD}_sp)
|
||||
endif (COMPILER_SUPPORTS_${SIMD})
|
||||
endforeach(SIMD ${SLEEF_SUPPORTED_GNUABI_EXTENSIONS})
|
||||
endif(ENABLE_GNUABI)
|
||||
|
||||
#
|
||||
|
||||
if (SLEEF_ARCH_X86)
|
||||
# iutdsp128
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSP128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_SSE2})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
|
||||
# iutdsp256
|
||||
add_executable(iutdsp256 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp256 PRIVATE ENABLE_DSP256=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp256 PRIVATE ${FLAGS_ENABLE_AVX})
|
||||
target_link_libraries(iutdsp256 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp256 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp256 1.0)
|
||||
list(APPEND IUT_LIST iutdsp256)
|
||||
endif(SLEEF_ARCH_X86)
|
||||
|
||||
if (SLEEF_ARCH_PPC64)
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPPOWER_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VSX})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
endif(SLEEF_ARCH_PPC64)
|
||||
|
||||
if (SLEEF_ARCH_S390X)
|
||||
add_executable(iutdsp128 ${IUT_SRC})
|
||||
target_compile_definitions(iutdsp128 PRIVATE ENABLE_DSPS390X_128=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(iutdsp128 PRIVATE ${FLAGS_ENABLE_VXE})
|
||||
target_link_libraries(iutdsp128 ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
add_dependencies(iutdsp128 ${TARGET_HEADERS} ${TARGET_LIBSLEEF})
|
||||
add_test_iut(iutdsp128 1.0)
|
||||
list(APPEND IUT_LIST iutdsp128)
|
||||
endif(SLEEF_ARCH_S390X)
|
||||
|
||||
if(SLEEF_BUILD_SCALAR_LIB)
|
||||
# Compile executable 'iutscalar'
|
||||
add_executable(iutscalar iut.c testerutil.c)
|
||||
target_compile_definitions(iutscalar PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_link_libraries(iutscalar sleefscalar ${TARGET_LIBSLEEF} ${LIBM} ${LIBRT})
|
||||
set_target_properties(iutscalar PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
add_test_iut(iutscalar 1.0)
|
||||
list(APPEND IUT_LIST iutscalar)
|
||||
endif()
|
||||
|
||||
if(LIB_MPFR AND NOT MINGW)
|
||||
# Build tester2 scalar
|
||||
set(PRECISIONS dp sp)
|
||||
if(COMPILER_SUPPORTS_LONG_DOUBLE)
|
||||
list(APPEND PRECISIONS ld)
|
||||
endif()
|
||||
if(COMPILER_SUPPORTS_QUADMATH)
|
||||
list(APPEND PRECISIONS qp)
|
||||
set(LIBQUADMATH "-lquadmath")
|
||||
set(ENABLEFLOAT128 PRIVATE ENABLEFLOAT128=1)
|
||||
endif()
|
||||
foreach(P ${PRECISIONS})
|
||||
set(T "tester2${P}")
|
||||
add_executable(${T} tester2${P}.c testerutil.c)
|
||||
target_compile_definitions(${T} PRIVATE USEMPFR=1 ${ENABLEFLOAT128} ${COMMON_TARGET_DEFINITIONS})
|
||||
set_target_properties(${T} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (FORCE_AAVPCS)
|
||||
target_compile_definitions(${T} PRIVATE ENABLE_AAVPCS=1)
|
||||
endif(FORCE_AAVPCS)
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${T} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
target_link_libraries(${T} ${TARGET_LIBSLEEF} ${LIBQUADMATH} ${LIB_MPFR} ${LIBM} ${LIBGMP})
|
||||
add_dependencies(${T} ${TARGET_HEADERS})
|
||||
add_dependencies(${T} ${TARGET_LIBSLEEF})
|
||||
endforeach()
|
||||
|
||||
# Compile executable 'tester'
|
||||
add_host_executable(${TARGET_TESTER} tester.c testerutil.c)
|
||||
if (NOT CMAKE_CROSSCOMPILING)
|
||||
target_link_libraries(${TARGET_TESTER} ${LIB_MPFR} ${TARGET_LIBSLEEF} ${LIBM} ${LIBGMP})
|
||||
target_compile_definitions(${TARGET_TESTER}
|
||||
PRIVATE USEMPFR=1 ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(${TARGET_TESTER} PRIVATE -Wno-unused-result)
|
||||
set_target_properties(${TARGET_TESTER} PROPERTIES ${COMMON_TARGET_PROPERTIES})
|
||||
if (MPFR_INCLUDE_DIR)
|
||||
target_include_directories(${TARGET_TESTER} PRIVATE ${MPFR_INCLUDE_DIR})
|
||||
endif()
|
||||
endif()
|
||||
endif(LIB_MPFR AND NOT MINGW)
|
||||
|
||||
if(ENABLE_GNUABI AND COMPILER_SUPPORTS_OMP_SIMD AND NOT SLEEF_TARGET_PROCESSOR MATCHES "^i.86$")
|
||||
# Build tester for vectorabi
|
||||
add_executable(testervecabi testervecabi.c)
|
||||
target_compile_definitions(testervecabi PRIVATE ${COMMON_TARGET_DEFINITIONS})
|
||||
target_compile_options(testervecabi PRIVATE ${OpenMP_C_FLAGS})
|
||||
target_link_libraries(testervecabi ${TARGET_LIBSLEEF} ${OpenMP_C_FLAGS})
|
||||
set_target_properties(testervecabi PROPERTIES C_STANDARD 99)
|
||||
add_test(NAME testervecabi COMMAND ${EMULATOR} testervecabi
|
||||
WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
endif()
|
||||
|
||||
# mveclibtest
|
||||
|
||||
if (ENABLE_GNUABI AND SLEEF_ARCH_X86 AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.99)
|
||||
add_executable(mveclibtest-sse2 mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(mveclibtest-sse2 PROPERTIES C_STANDARD 99)
|
||||
target_compile_options(mveclibtest-sse2 PRIVATE ${FLAGS_FASTMATH} "-O3")
|
||||
target_link_libraries(mveclibtest-sse2 ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
|
||||
add_dependencies(mveclibtest-sse2 ${TARGET_HEADERS})
|
||||
add_test(NAME mveclibtest-sse2 COMMAND mveclibtest-sse2)
|
||||
|
||||
add_executable(mveclibtest-avx mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(mveclibtest-avx PROPERTIES C_STANDARD 99)
|
||||
target_compile_options(mveclibtest-avx PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX} "-O3")
|
||||
target_link_libraries(mveclibtest-avx ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
|
||||
add_dependencies(mveclibtest-avx ${TARGET_HEADERS})
|
||||
add_test(NAME mveclibtest-avx COMMAND mveclibtest-avx)
|
||||
|
||||
add_executable(mveclibtest-avx2 mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(mveclibtest-avx2 PROPERTIES C_STANDARD 99)
|
||||
target_compile_options(mveclibtest-avx2 PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX2} "-O3")
|
||||
target_link_libraries(mveclibtest-avx2 ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
|
||||
add_dependencies(mveclibtest-avx2 ${TARGET_HEADERS})
|
||||
add_test(NAME mveclibtest-avx2 COMMAND mveclibtest-avx2)
|
||||
|
||||
add_executable(mveclibtest-avx512f mveclibtest.c ${sleef_SOURCE_DIR}/src/common/main_checkfeature.c)
|
||||
set_target_properties(mveclibtest-avx512f PROPERTIES C_STANDARD 99)
|
||||
target_compile_options(mveclibtest-avx512f PRIVATE ${FLAGS_FASTMATH} ${FLAGS_ENABLE_AVX512F} "-O3")
|
||||
target_link_libraries(mveclibtest-avx512f ${TARGET_LIBSLEEF} ${TARGET_LIBSLEEFGNUABI})
|
||||
add_dependencies(mveclibtest-avx512f ${TARGET_HEADERS})
|
||||
add_test(NAME mveclibtest-avx512f COMMAND mveclibtest-avx512f)
|
||||
endif()
|
||||
|
||||
#
|
||||
|
||||
if (FILECHECK_COMMAND AND COMPILER_SUPPORTS_OPENMP AND SLEEF_ARCH_X86 AND CMAKE_C_COMPILER_ID MATCHES "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 7.99)
|
||||
add_test(NAME autovec-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -check-prefix=CHECK-AVX2")
|
||||
add_test(NAME autovec-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/autovec.c -check-prefix=CHECK-SSE2")
|
||||
add_test(NAME testervecabi-sse2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -msse2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-SSE2")
|
||||
add_test(NAME testervecabi-avx2 COMMAND sh -c "${CMAKE_C_COMPILER} -Wno-attributes -fopenmp -mavx2 -O3 ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -I${sleef_BINARY_DIR}/include -S -o- | ${FILECHECK_COMMAND} ${CMAKE_CURRENT_SOURCE_DIR}/testervecabi.c -check-prefix=CHECK-AVX2")
|
||||
endif()
|
||||
|
||||
# Tests depends on the library
|
||||
add_dependencies(${TARGET_IUT} ${TARGET_HEADERS})
|
||||
@@ -0,0 +1,651 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#define SLEEF_ENABLE_OMP_SIMD
|
||||
#include "sleef.h"
|
||||
|
||||
#define N 1024
|
||||
double a[N], b[N], c[N], d[N];
|
||||
float e[N], f[N], g[N], h[N];
|
||||
|
||||
void testsind1_u10() {
|
||||
// CHECK-SSE2: testsind1_u10
|
||||
// CHECK-AVX2: testsind1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_sind1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_sind1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_sind1_u10
|
||||
}
|
||||
|
||||
void testsind1_u35() {
|
||||
// CHECK-SSE2: testsind1_u35
|
||||
// CHECK-AVX2: testsind1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_sind1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_sind1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_sind1_u35
|
||||
}
|
||||
|
||||
void testsinf1_u10() {
|
||||
// CHECK-SSE2: testsinf1_u10
|
||||
// CHECK-AVX2: testsinf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_sinf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_sinf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_sinf1_u10
|
||||
}
|
||||
|
||||
void testsinf1_u35() {
|
||||
// CHECK-SSE2: testsinf1_u35
|
||||
// CHECK-AVX2: testsinf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_sinf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_sinf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_sinf1_u35
|
||||
}
|
||||
|
||||
void testcosd1_u10() {
|
||||
// CHECK-SSE2: testcosd1_u10
|
||||
// CHECK-AVX2: testcosd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_cosd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_cosd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_cosd1_u10
|
||||
}
|
||||
|
||||
void testcosd1_u35() {
|
||||
// CHECK-SSE2: testcosd1_u35
|
||||
// CHECK-AVX2: testcosd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_cosd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_cosd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_cosd1_u35
|
||||
}
|
||||
|
||||
void testcosf1_u10() {
|
||||
// CHECK-SSE2: testcosf1_u10
|
||||
// CHECK-AVX2: testcosf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_cosf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_cosf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_cosf1_u10
|
||||
}
|
||||
|
||||
void testcosf1_u35() {
|
||||
// CHECK-SSE2: testcosf1_u35
|
||||
// CHECK-AVX2: testcosf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_cosf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_cosf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_cosf1_u35
|
||||
}
|
||||
|
||||
void testtand1_u10() {
|
||||
// CHECK-SSE2: testtand1_u10
|
||||
// CHECK-AVX2: testtand1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_tand1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_tand1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_tand1_u10
|
||||
}
|
||||
|
||||
void testtand1_u35() {
|
||||
// CHECK-SSE2: testtand1_u35
|
||||
// CHECK-AVX2: testtand1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_tand1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_tand1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_tand1_u35
|
||||
}
|
||||
|
||||
void testtanf1_u10() {
|
||||
// CHECK-SSE2: testtanf1_u10
|
||||
// CHECK-AVX2: testtanf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_tanf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_tanf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_tanf1_u10
|
||||
}
|
||||
|
||||
void testtanf1_u35() {
|
||||
// CHECK-SSE2: testtanf1_u35
|
||||
// CHECK-AVX2: testtanf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_tanf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_tanf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_tanf1_u35
|
||||
}
|
||||
|
||||
void testasind1_u10() {
|
||||
// CHECK-SSE2: testasind1_u10
|
||||
// CHECK-AVX2: testasind1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_asind1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_asind1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_asind1_u10
|
||||
}
|
||||
|
||||
void testasind1_u35() {
|
||||
// CHECK-SSE2: testasind1_u35
|
||||
// CHECK-AVX2: testasind1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_asind1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_asind1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_asind1_u35
|
||||
}
|
||||
|
||||
void testasinf1_u10() {
|
||||
// CHECK-SSE2: testasinf1_u10
|
||||
// CHECK-AVX2: testasinf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_asinf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_asinf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_asinf1_u10
|
||||
}
|
||||
|
||||
void testasinf1_u35() {
|
||||
// CHECK-SSE2: testasinf1_u35
|
||||
// CHECK-AVX2: testasinf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_asinf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_asinf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_asinf1_u35
|
||||
}
|
||||
|
||||
void testacosd1_u10() {
|
||||
// CHECK-SSE2: testacosd1_u10
|
||||
// CHECK-AVX2: testacosd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_acosd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_acosd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_acosd1_u10
|
||||
}
|
||||
|
||||
void testacosd1_u35() {
|
||||
// CHECK-SSE2: testacosd1_u35
|
||||
// CHECK-AVX2: testacosd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_acosd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_acosd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_acosd1_u35
|
||||
}
|
||||
|
||||
void testacosf1_u10() {
|
||||
// CHECK-SSE2: testacosf1_u10
|
||||
// CHECK-AVX2: testacosf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_acosf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_acosf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_acosf1_u10
|
||||
}
|
||||
|
||||
void testacosf1_u35() {
|
||||
// CHECK-SSE2: testacosf1_u35
|
||||
// CHECK-AVX2: testacosf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_acosf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_acosf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_acosf1_u35
|
||||
}
|
||||
|
||||
void testatand1_u10() {
|
||||
// CHECK-SSE2: testatand1_u10
|
||||
// CHECK-AVX2: testatand1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atand1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_atand1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_atand1_u10
|
||||
}
|
||||
|
||||
void testatand1_u35() {
|
||||
// CHECK-SSE2: testatand1_u35
|
||||
// CHECK-AVX2: testatand1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atand1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_atand1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_atand1_u35
|
||||
}
|
||||
|
||||
void testatanf1_u10() {
|
||||
// CHECK-SSE2: testatanf1_u10
|
||||
// CHECK-AVX2: testatanf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atanf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_atanf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_atanf1_u10
|
||||
}
|
||||
|
||||
void testatanf1_u35() {
|
||||
// CHECK-SSE2: testatanf1_u35
|
||||
// CHECK-AVX2: testatanf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atanf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_atanf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_atanf1_u35
|
||||
}
|
||||
|
||||
void testatan2d1_u10() {
|
||||
// CHECK-SSE2: testatan2d1_u10
|
||||
// CHECK-AVX2: testatan2d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atan2d1_u10(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_atan2d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_atan2d1_u10
|
||||
}
|
||||
|
||||
void testatan2d1_u35() {
|
||||
// CHECK-SSE2: testatan2d1_u35
|
||||
// CHECK-AVX2: testatan2d1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atan2d1_u35(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_atan2d1_u35
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_atan2d1_u35
|
||||
}
|
||||
|
||||
void testatan2f1_u10() {
|
||||
// CHECK-SSE2: testatan2f1_u10
|
||||
// CHECK-AVX2: testatan2f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atan2f1_u10(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_atan2f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_atan2f1_u10
|
||||
}
|
||||
|
||||
void testatan2f1_u35() {
|
||||
// CHECK-SSE2: testatan2f1_u35
|
||||
// CHECK-AVX2: testatan2f1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atan2f1_u35(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_atan2f1_u35
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_atan2f1_u35
|
||||
}
|
||||
|
||||
void testsinhd1_u10() {
|
||||
// CHECK-SSE2: testsinhd1_u10
|
||||
// CHECK-AVX2: testsinhd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_sinhd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_sinhd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_sinhd1_u10
|
||||
}
|
||||
|
||||
void testsinhd1_u35() {
|
||||
// CHECK-SSE2: testsinhd1_u35
|
||||
// CHECK-AVX2: testsinhd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_sinhd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_sinhd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_sinhd1_u35
|
||||
}
|
||||
|
||||
void testsinhf1_u10() {
|
||||
// CHECK-SSE2: testsinhf1_u10
|
||||
// CHECK-AVX2: testsinhf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_sinhf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_sinhf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_sinhf1_u10
|
||||
}
|
||||
|
||||
void testsinhf1_u35() {
|
||||
// CHECK-SSE2: testsinhf1_u35
|
||||
// CHECK-AVX2: testsinhf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_sinhf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_sinhf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_sinhf1_u35
|
||||
}
|
||||
|
||||
void testcoshd1_u10() {
|
||||
// CHECK-SSE2: testcoshd1_u10
|
||||
// CHECK-AVX2: testcoshd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_coshd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_coshd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_coshd1_u10
|
||||
}
|
||||
|
||||
void testcoshd1_u35() {
|
||||
// CHECK-SSE2: testcoshd1_u35
|
||||
// CHECK-AVX2: testcoshd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_coshd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_coshd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_coshd1_u35
|
||||
}
|
||||
|
||||
void testcoshf1_u10() {
|
||||
// CHECK-SSE2: testcoshf1_u10
|
||||
// CHECK-AVX2: testcoshf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_coshf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_coshf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_coshf1_u10
|
||||
}
|
||||
|
||||
void testcoshf1_u35() {
|
||||
// CHECK-SSE2: testcoshf1_u35
|
||||
// CHECK-AVX2: testcoshf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_coshf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_coshf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_coshf1_u35
|
||||
}
|
||||
|
||||
void testtanhd1_u10() {
|
||||
// CHECK-SSE2: testtanhd1_u10
|
||||
// CHECK-AVX2: testtanhd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_tanhd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_tanhd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_tanhd1_u10
|
||||
}
|
||||
|
||||
void testtanhd1_u35() {
|
||||
// CHECK-SSE2: testtanhd1_u35
|
||||
// CHECK-AVX2: testtanhd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_tanhd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_tanhd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_tanhd1_u35
|
||||
}
|
||||
|
||||
void testtanhf1_u10() {
|
||||
// CHECK-SSE2: testtanhf1_u10
|
||||
// CHECK-AVX2: testtanhf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_tanhf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_tanhf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_tanhf1_u10
|
||||
}
|
||||
|
||||
void testtanhf1_u35() {
|
||||
// CHECK-SSE2: testtanhf1_u35
|
||||
// CHECK-AVX2: testtanhf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_tanhf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_tanhf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_tanhf1_u35
|
||||
}
|
||||
|
||||
void testasinhd1_u10() {
|
||||
// CHECK-SSE2: testasinhd1_u10
|
||||
// CHECK-AVX2: testasinhd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_asinhd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_asinhd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_asinhd1_u10
|
||||
}
|
||||
|
||||
void testasinhf1_u10() {
|
||||
// CHECK-SSE2: testasinhf1_u10
|
||||
// CHECK-AVX2: testasinhf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_asinhf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_asinhf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_asinhf1_u10
|
||||
}
|
||||
|
||||
void testacoshd1_u10() {
|
||||
// CHECK-SSE2: testacoshd1_u10
|
||||
// CHECK-AVX2: testacoshd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_acoshd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_acoshd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_acoshd1_u10
|
||||
}
|
||||
|
||||
void testacoshf1_u10() {
|
||||
// CHECK-SSE2: testacoshf1_u10
|
||||
// CHECK-AVX2: testacoshf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_acoshf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_acoshf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_acoshf1_u10
|
||||
}
|
||||
|
||||
void testatanhd1_u10() {
|
||||
// CHECK-SSE2: testatanhd1_u10
|
||||
// CHECK-AVX2: testatanhd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_atanhd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_atanhd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_atanhd1_u10
|
||||
}
|
||||
|
||||
void testatanhf1_u10() {
|
||||
// CHECK-SSE2: testatanhf1_u10
|
||||
// CHECK-AVX2: testatanhf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_atanhf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_atanhf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_atanhf1_u10
|
||||
}
|
||||
|
||||
void testlogd1_u10() {
|
||||
// CHECK-SSE2: testlogd1_u10
|
||||
// CHECK-AVX2: testlogd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_logd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_logd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_logd1_u10
|
||||
}
|
||||
|
||||
void testlogd1_u35() {
|
||||
// CHECK-SSE2: testlogd1_u35
|
||||
// CHECK-AVX2: testlogd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_logd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_logd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_logd1_u35
|
||||
}
|
||||
|
||||
void testlogf1_u10() {
|
||||
// CHECK-SSE2: testlogf1_u10
|
||||
// CHECK-AVX2: testlogf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_logf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_logf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_logf1_u10
|
||||
}
|
||||
|
||||
void testlogf1_u35() {
|
||||
// CHECK-SSE2: testlogf1_u35
|
||||
// CHECK-AVX2: testlogf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_logf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_logf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_logf1_u35
|
||||
}
|
||||
|
||||
void testlog2d1_u10() {
|
||||
// CHECK-SSE2: testlog2d1_u10
|
||||
// CHECK-AVX2: testlog2d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_log2d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_log2d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_log2d1_u10
|
||||
}
|
||||
|
||||
void testlog2f1_u10() {
|
||||
// CHECK-SSE2: testlog2f1_u10
|
||||
// CHECK-AVX2: testlog2f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_log2f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_log2f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_log2f1_u10
|
||||
}
|
||||
|
||||
void testlog10d1_u10() {
|
||||
// CHECK-SSE2: testlog10d1_u10
|
||||
// CHECK-AVX2: testlog10d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_log10d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_log10d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_log10d1_u10
|
||||
}
|
||||
|
||||
void testlog10f1_u10() {
|
||||
// CHECK-SSE2: testlog10f1_u10
|
||||
// CHECK-AVX2: testlog10f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_log10f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_log10f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_log10f1_u10
|
||||
}
|
||||
|
||||
void testlog1pd1_u10() {
|
||||
// CHECK-SSE2: testlog1pd1_u10
|
||||
// CHECK-AVX2: testlog1pd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_log1pd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_log1pd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_log1pd1_u10
|
||||
}
|
||||
|
||||
void testlog1pf1_u10() {
|
||||
// CHECK-SSE2: testlog1pf1_u10
|
||||
// CHECK-AVX2: testlog1pf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_log1pf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_log1pf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_log1pf1_u10
|
||||
}
|
||||
|
||||
void testexpd1_u10() {
|
||||
// CHECK-SSE2: testexpd1_u10
|
||||
// CHECK-AVX2: testexpd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_expd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_expd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_expd1_u10
|
||||
}
|
||||
|
||||
void testexpf1_u10() {
|
||||
// CHECK-SSE2: testexpf1_u10
|
||||
// CHECK-AVX2: testexpf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_expf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_expf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_expf1_u10
|
||||
}
|
||||
|
||||
void testexp2d1_u10() {
|
||||
// CHECK-SSE2: testexp2d1_u10
|
||||
// CHECK-AVX2: testexp2d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_exp2d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_exp2d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_exp2d1_u10
|
||||
}
|
||||
|
||||
void testexp2f1_u10() {
|
||||
// CHECK-SSE2: testexp2f1_u10
|
||||
// CHECK-AVX2: testexp2f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_exp2f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_exp2f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_exp2f1_u10
|
||||
}
|
||||
|
||||
void testexp10d1_u10() {
|
||||
// CHECK-SSE2: testexp10d1_u10
|
||||
// CHECK-AVX2: testexp10d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_exp10d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_exp10d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_exp10d1_u10
|
||||
}
|
||||
|
||||
void testexp10f1_u10() {
|
||||
// CHECK-SSE2: testexp10f1_u10
|
||||
// CHECK-AVX2: testexp10f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_exp10f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_exp10f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_exp10f1_u10
|
||||
}
|
||||
|
||||
void testexpm1d1_u10() {
|
||||
// CHECK-SSE2: testexpm1d1_u10
|
||||
// CHECK-AVX2: testexpm1d1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_expm1d1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_expm1d1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_expm1d1_u10
|
||||
}
|
||||
|
||||
void testexpm1f1_u10() {
|
||||
// CHECK-SSE2: testexpm1f1_u10
|
||||
// CHECK-AVX2: testexpm1f1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_expm1f1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_expm1f1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_expm1f1_u10
|
||||
}
|
||||
|
||||
void testpowd1_u10() {
|
||||
// CHECK-SSE2: testpowd1_u10
|
||||
// CHECK-AVX2: testpowd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_powd1_u10(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_powd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_powd1_u10
|
||||
}
|
||||
|
||||
void testpowf1_u10() {
|
||||
// CHECK-SSE2: testpowf1_u10
|
||||
// CHECK-AVX2: testpowf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_powf1_u10(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_powf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_powf1_u10
|
||||
}
|
||||
|
||||
void testcbrtd1_u10() {
|
||||
// CHECK-SSE2: testcbrtd1_u10
|
||||
// CHECK-AVX2: testcbrtd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_cbrtd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_cbrtd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_cbrtd1_u10
|
||||
}
|
||||
|
||||
void testcbrtd1_u35() {
|
||||
// CHECK-SSE2: testcbrtd1_u35
|
||||
// CHECK-AVX2: testcbrtd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_cbrtd1_u35(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_cbrtd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_cbrtd1_u35
|
||||
}
|
||||
|
||||
void testcbrtf1_u10() {
|
||||
// CHECK-SSE2: testcbrtf1_u10
|
||||
// CHECK-AVX2: testcbrtf1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_cbrtf1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_cbrtf1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_cbrtf1_u10
|
||||
}
|
||||
|
||||
void testcbrtf1_u35() {
|
||||
// CHECK-SSE2: testcbrtf1_u35
|
||||
// CHECK-AVX2: testcbrtf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_cbrtf1_u35(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_cbrtf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_cbrtf1_u35
|
||||
}
|
||||
|
||||
void testhypotd1_u05() {
|
||||
// CHECK-SSE2: testhypotd1_u05
|
||||
// CHECK-AVX2: testhypotd1_u05
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_hypotd1_u05(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_hypotd1_u05
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_hypotd1_u05
|
||||
}
|
||||
|
||||
void testhypotd1_u35() {
|
||||
// CHECK-SSE2: testhypotd1_u35
|
||||
// CHECK-AVX2: testhypotd1_u35
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_hypotd1_u35(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_hypotd1_u35
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_hypotd1_u35
|
||||
}
|
||||
|
||||
void testhypotf1_u05() {
|
||||
// CHECK-SSE2: testhypotf1_u05
|
||||
// CHECK-AVX2: testhypotf1_u05
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_hypotf1_u05(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_hypotf1_u05
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_hypotf1_u05
|
||||
}
|
||||
|
||||
void testhypotf1_u35() {
|
||||
// CHECK-SSE2: testhypotf1_u35
|
||||
// CHECK-AVX2: testhypotf1_u35
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_hypotf1_u35(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_hypotf1_u35
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_hypotf1_u35
|
||||
}
|
||||
|
||||
void testerfd1_u10() {
|
||||
// CHECK-SSE2: testerfd1_u10
|
||||
// CHECK-AVX2: testerfd1_u10
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_erfd1_u10(b[i]);
|
||||
// CHECK-SSE2: _ZGVbN2v_Sleef_erfd1_u10
|
||||
// CHECK-AVX2: _ZGVdN4v_Sleef_erfd1_u10
|
||||
}
|
||||
|
||||
void testerff1_u10() {
|
||||
// CHECK-SSE2: testerff1_u10
|
||||
// CHECK-AVX2: testerff1_u10
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_erff1_u10(f[i]);
|
||||
// CHECK-SSE2: _ZGVbN4v_Sleef_erff1_u10
|
||||
// CHECK-AVX2: _ZGVdN8v_Sleef_erff1_u10
|
||||
}
|
||||
|
||||
void testfmodd1() {
|
||||
// CHECK-SSE2: testfmodd1
|
||||
// CHECK-AVX2: testfmodd1
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_fmodd1(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_fmodd1
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_fmodd1
|
||||
}
|
||||
|
||||
void testfmodf1() {
|
||||
// CHECK-SSE2: testfmodf1
|
||||
// CHECK-AVX2: testfmodf1
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_fmodf1(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_fmodf1
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_fmodf1
|
||||
}
|
||||
|
||||
void testremainderd1() {
|
||||
// CHECK-SSE2: testremainderd1
|
||||
// CHECK-AVX2: testremainderd1
|
||||
for(int i=0;i<N;i++) a[i] = Sleef_remainderd1(b[i], c[i]);
|
||||
// CHECK-SSE2: _ZGVbN2vv_Sleef_remainderd1
|
||||
// CHECK-AVX2: _ZGVdN4vv_Sleef_remainderd1
|
||||
}
|
||||
|
||||
void testremainderf1() {
|
||||
// CHECK-SSE2: testremainderf1
|
||||
// CHECK-AVX2: testremainderf1
|
||||
for(int i=0;i<N;i++) e[i] = Sleef_remainderf1(f[i], g[i]);
|
||||
// CHECK-SSE2: _ZGVbN4vv_Sleef_remainderf1
|
||||
// CHECK-AVX2: _ZGVdN8vv_Sleef_remainderf1
|
||||
}
|
||||
@@ -0,0 +1,714 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
/// This program makes sure that all the symbols that a
|
||||
/// GNUABI-compatible compiler (clang or gcc) can generate when
|
||||
/// vectorizing functions call from `#include <math.h>` are present in
|
||||
/// `libsleefgnuabi.so`.
|
||||
///
|
||||
/// The header `math.h` is not the same on all systems, and different
|
||||
/// macros can activate different sets of functions. The list provide
|
||||
/// here shoudl cover the union of all possible systems that we want
|
||||
/// to support. In particular, the test is checking that the "finite"
|
||||
/// symmbols from `#include <bits/math-finite.h>` are present for
|
||||
/// those systems supporting them.
|
||||
|
||||
#include <setjmp.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#if defined(ENABLE_SSE4) || defined(ENABLE_SSE2)
|
||||
#include <x86intrin.h>
|
||||
|
||||
#define ISA_TOKEN b
|
||||
#define VLEN_SP 4
|
||||
#define VLEN_DP 2
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef __m128i vopmask;
|
||||
typedef __m128d vdouble;
|
||||
typedef __m128 vfloat;
|
||||
typedef __m128i vint;
|
||||
typedef __m128i vint2;
|
||||
#endif /* defined(ENABLE_SSE4) || defined(ENABLE_SSE2) */
|
||||
|
||||
#ifdef ENABLE_AVX
|
||||
#include <x86intrin.h>
|
||||
|
||||
#define ISA_TOKEN c
|
||||
#define VLEN_SP 8
|
||||
#define VLEN_DP 4
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef __m256i vopmask;
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
typedef __m128i vint;
|
||||
typedef struct { __m128i x, y; } vint2;
|
||||
#endif /* ENABLE_AVX */
|
||||
|
||||
#ifdef ENABLE_AVX2
|
||||
#include <x86intrin.h>
|
||||
|
||||
#define ISA_TOKEN d
|
||||
#define VLEN_SP 8
|
||||
#define VLEN_DP 4
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef __m256i vopmask;
|
||||
typedef __m256d vdouble;
|
||||
typedef __m256 vfloat;
|
||||
typedef __m128i vint;
|
||||
typedef __m256i vint2;
|
||||
#endif /* ENABLE_AVX2 */
|
||||
|
||||
#ifdef ENABLE_AVX512F
|
||||
#include <x86intrin.h>
|
||||
|
||||
#define ISA_TOKEN e
|
||||
#define VLEN_SP 16
|
||||
#define VLEN_DP 8
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef __mmask16 vopmask;
|
||||
typedef __m512d vdouble;
|
||||
typedef __m512 vfloat;
|
||||
typedef __m256i vint;
|
||||
typedef __m512i vint2;
|
||||
#endif /* ENABLE_AVX512F */
|
||||
|
||||
#ifdef ENABLE_ADVSIMD
|
||||
#include <arm_neon.h>
|
||||
#define ISA_TOKEN n
|
||||
#define VLEN_DP 2
|
||||
#define VLEN_SP 4
|
||||
|
||||
#ifdef ENABLE_AAVPCS
|
||||
#define VECTOR_CC __attribute__((aarch64_vector_pcs))
|
||||
#else
|
||||
#define VECTOR_CC
|
||||
#endif
|
||||
|
||||
typedef uint32x4_t vopmask;
|
||||
typedef float64x2_t vdouble;
|
||||
typedef float32x4_t vfloat;
|
||||
typedef int32x2_t vint;
|
||||
typedef int32x4_t vint2;
|
||||
#endif /* ENABLE_ADVSIMDF */
|
||||
|
||||
#ifdef ENABLE_SVE
|
||||
#include <arm_sve.h>
|
||||
#define ISA_TOKEN s
|
||||
#define VLEN_SP (svcntw())
|
||||
#define VLEN_DP (svcntd())
|
||||
#define VLA_TOKEN x
|
||||
#define VECTOR_CC
|
||||
|
||||
typedef svbool_t vopmask;
|
||||
typedef svfloat64_t vdouble;
|
||||
typedef svfloat32_t vfloat;
|
||||
typedef svint32_t vint;
|
||||
typedef svint32_t vint2;
|
||||
#endif /* ENABLE_SVE */
|
||||
|
||||
// GNUABI name mangling macro.
|
||||
#ifndef MASKED_GNUABI
|
||||
|
||||
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##N##vl##p##_##name
|
||||
|
||||
#define __DECLARE_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
|
||||
#define __CALL_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vd(name, t, vl, p) \
|
||||
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble)
|
||||
#define __CALL_vi_vd(name, t, vl, p) \
|
||||
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint)
|
||||
#define __CALL_vd_vd_vi(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble)
|
||||
#define __CALL_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble)
|
||||
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *)
|
||||
#define __CALL_vd_vd_pvd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2); } while(0)
|
||||
|
||||
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *)
|
||||
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
|
||||
#define __CALL_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat)
|
||||
#define __CALL_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat)
|
||||
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *)
|
||||
#define __CALL_vf_vf_pvf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vf(name, t, vl, p) \
|
||||
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat)
|
||||
#define __CALL_vi_vf(name, t, vl, p) \
|
||||
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2)
|
||||
#define __CALL_vf_vf_vi(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22); } while(0)
|
||||
|
||||
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*)
|
||||
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2); } while(0)
|
||||
|
||||
#else /******************** MASKED_GNUABI *****************************/
|
||||
|
||||
#define __MAKE_FN_NAME(name, t, vl, p) _ZGV##t##M##vl##p##_##name
|
||||
|
||||
#define __DECLARE_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
|
||||
#define __CALL_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vd(name, t, vl, p) \
|
||||
extern vint VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vopmask)
|
||||
#define __CALL_vi_vd(name, t, vl, p) \
|
||||
do { vi0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vi(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vint, vopmask)
|
||||
#define __CALL_vd_vd_vi(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vi2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vopmask)
|
||||
#define __CALL_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_vd_vd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble, vdouble, vopmask)
|
||||
#define __CALL_vd_vd_vd_vd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, vd2, vd3, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vd_vd_pvd(name, t, vl, p) \
|
||||
extern vdouble VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vopmask)
|
||||
#define __CALL_vd_vd_pvd(name, t, vl, p) \
|
||||
do { vd0 = __MAKE_FN_NAME(name, t, vl, p)(vd1, &vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
extern void VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vdouble, vdouble *, vdouble *, vopmask)
|
||||
#define __CALL_v_vd_pvd_pvd(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vd0, &vd1, &vd2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
|
||||
#define __CALL_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vopmask)
|
||||
#define __CALL_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vf_vf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat, vfloat, vopmask)
|
||||
#define __CALL_vf_vf_vf_vf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vf2, vf3, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vopmask)
|
||||
#define __CALL_vf_vf_pvf(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, &vf2, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vi_vf(name, t, vl, p) \
|
||||
extern vint2 VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vopmask)
|
||||
#define __CALL_vi_vf(name, t, vl, p) \
|
||||
do { vi20 = __MAKE_FN_NAME(name, t, vl, p)(vf1, mask); } while(0)
|
||||
|
||||
#define __DECLARE_vf_vf_vi(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vint2, vopmask)
|
||||
#define __CALL_vf_vf_vi(name, t, vl, p) \
|
||||
do { vf0 = __MAKE_FN_NAME(name, t, vl, p)(vf1, vi22, mask); } while(0)
|
||||
|
||||
#define __DECLARE_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
extern vfloat VECTOR_CC __MAKE_FN_NAME(name, t, vl, p)(vfloat, vfloat *, vfloat*, vopmask)
|
||||
#define __CALL_v_vf_pvf_pvf(name, t, vl, p) \
|
||||
do { __MAKE_FN_NAME(name, t, vl, p)(vf0, &vf1, &vf2, mask); } while(0)
|
||||
|
||||
#endif /* MASKED_GNUABI */
|
||||
// Level-1 expansion macros for declaration and call. The signature of
|
||||
// each function has three input paramters to avoid segfaults of
|
||||
// sincos-like functions that are effectively loading data from
|
||||
// memory.
|
||||
|
||||
|
||||
// Make sure that the architectural macros are defined for each vector
|
||||
// extension.
|
||||
#ifndef ISA_TOKEN
|
||||
#error "Missing ISA token"
|
||||
#endif
|
||||
|
||||
#ifndef VLEN_DP
|
||||
#error "Missing VLEN_DP"
|
||||
#endif
|
||||
|
||||
#ifndef VLEN_DP
|
||||
#error "Missing VLEN_SP"
|
||||
#endif
|
||||
|
||||
#if defined(ENABLE_SVE) && !defined(VLA_TOKEN)
|
||||
#error "Missing VLA_TOKEN"
|
||||
#endif /* defined(ENABLE_SVE) && !defined(VLA_TOKEN) */
|
||||
|
||||
// Declaration and call, first level expantion to pick up the
|
||||
// ISA_TOKEN and VLEN_* architectural macros.
|
||||
#ifndef ENABLE_SVE
|
||||
|
||||
#define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vd(name, p) __DECLARE_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd_vd(name, p) __CALL_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vd_vd(name, p) __DECLARE_vd_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd_vd_vd(name, p) __CALL_vd_vd_vd_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vd_vd_pvd(name, p) __DECLARE_vd_vd_pvd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd_pvd(name, p) __CALL_vd_vd_pvd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vi_vd(name, p) __DECLARE_vi_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vi_vd(name, p) __CALL_vi_vd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vi(name, p) __DECLARE_vd_vd_vi(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_vd_vd_vi(name, p) __CALL_vd_vd_vi(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_DP_v_vd_pvd_pvd(name, p) __DECLARE_v_vd_pvd_pvd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
#define CALL_DP_v_vd_pvd_pvd(name, p) __CALL_v_vd_pvd_pvd(name, ISA_TOKEN, VLEN_DP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf(name, p) __DECLARE_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf(name, p) __CALL_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vf(name, p) __DECLARE_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf_vf(name, p) __CALL_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vf_vf(name, p) __DECLARE_vf_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf_vf_vf(name, p) __CALL_vf_vf_vf_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf_pvf(name, p) __DECLARE_vf_vf_pvf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf_pvf(name, p) __CALL_vf_vf_pvf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vi_vf(name, p) __DECLARE_vi_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vi_vf(name, p) __CALL_vi_vf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vi(name, p) __DECLARE_vf_vf_vi(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_vf_vf_vi(name, p) __CALL_vf_vf_vi(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
#define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLEN_SP, p)
|
||||
|
||||
#else /* ENABLE_SVE */
|
||||
|
||||
#define DECLARE_DP_vd_vd(name, p) __DECLARE_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd(name, p) __CALL_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vd(name, p) __DECLARE_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd_vd(name, p) __CALL_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vd_vd(name, p) __DECLARE_vd_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd_vd_vd(name, p) __CALL_vd_vd_vd_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
|
||||
|
||||
#define DECLARE_DP_vd_vd_pvd(name, p) __DECLARE_vd_vd_pvd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd_pvd(name, p) __CALL_vd_vd_pvd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd2)
|
||||
|
||||
#define DECLARE_DP_vi_vd(name, p) __DECLARE_vi_vd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vi_vd(name, p) __CALL_vi_vd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_s32(svptrue_b8(), (int *)outbuf, vi0)
|
||||
|
||||
#define DECLARE_DP_vd_vd_vi(name, p) __DECLARE_vd_vd_vi(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_vd_vd_vi(name, p) __CALL_vd_vd_vi(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd0)
|
||||
|
||||
#define DECLARE_DP_v_vd_pvd_pvd(name, p) __DECLARE_v_vd_pvd_pvd(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_DP_v_vd_pvd_pvd(name, p) __CALL_v_vd_pvd_pvd(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f64(svptrue_b8(), (double *)outbuf, vd2)
|
||||
|
||||
#define DECLARE_SP_vf_vf(name, p) __DECLARE_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf(name, p) __CALL_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vf(name, p) __DECLARE_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf_vf(name, p) __CALL_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vf_vf(name, p) __DECLARE_vf_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf_vf_vf(name, p) __CALL_vf_vf_vf_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
|
||||
|
||||
#define DECLARE_SP_vf_vf_pvf(name, p) __DECLARE_vf_vf_pvf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf_pvf(name, p) __CALL_vf_vf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2)
|
||||
|
||||
#define DECLARE_SP_vi_vf(name, p) __DECLARE_vi_vf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vi_vf(name, p) __CALL_vi_vf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_s32(svptrue_b8(), (int *)outbuf, vi20)
|
||||
|
||||
#define DECLARE_SP_vf_vf_vi(name, p) __DECLARE_vf_vf_vi(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_vf_vf_vi(name, p) __CALL_vf_vf_vi(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf0)
|
||||
|
||||
#define DECLARE_SP_v_vf_pvf_pvf(name, p) __DECLARE_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p)
|
||||
#define CALL_SP_v_vf_pvf_pvf(name, p) __CALL_v_vf_pvf_pvf(name, ISA_TOKEN, VLA_TOKEN, p); svst1_f32(svptrue_b8(), (float *)outbuf, vf2)
|
||||
|
||||
#endif /* ENABLE_SVE */
|
||||
|
||||
//
|
||||
|
||||
// Douple precision function declarations.
|
||||
DECLARE_DP_vd_vd(__acos_finite, v);
|
||||
DECLARE_DP_vd_vd(__acosh_finite, v);
|
||||
DECLARE_DP_vd_vd(__asin_finite, v);
|
||||
DECLARE_DP_vd_vd_vd(__atan2_finite, vv);
|
||||
DECLARE_DP_vd_vd(__atanh_finite, v);
|
||||
DECLARE_DP_vd_vd(__cosh_finite, v);
|
||||
DECLARE_DP_vd_vd(__exp10_finite, v);
|
||||
DECLARE_DP_vd_vd(__exp2_finite, v);
|
||||
DECLARE_DP_vd_vd(__exp_finite, v);
|
||||
DECLARE_DP_vd_vd_vd(__fmod_finite, vv);
|
||||
DECLARE_DP_vd_vd_pvd(__modf_finite, vl8);
|
||||
DECLARE_DP_vd_vd_vd(__hypot_finite, vv);
|
||||
DECLARE_DP_vd_vd(__log10_finite, v);
|
||||
// DECLARE_DP_vd_vd(__log2_finite,v);
|
||||
DECLARE_DP_vd_vd(__log_finite, v);
|
||||
DECLARE_DP_vd_vd_vd(__pow_finite, vv);
|
||||
DECLARE_DP_vd_vd(__sinh_finite, v);
|
||||
DECLARE_DP_vd_vd(__sqrt_finite, v);
|
||||
DECLARE_DP_vd_vd(acos, v);
|
||||
DECLARE_DP_vd_vd(acosh, v);
|
||||
DECLARE_DP_vd_vd(asin, v);
|
||||
DECLARE_DP_vd_vd(asinh, v);
|
||||
DECLARE_DP_vd_vd(atan, v);
|
||||
DECLARE_DP_vd_vd_vd(atan2, vv);
|
||||
DECLARE_DP_vd_vd_vd(__atan2_finite, vv);
|
||||
DECLARE_DP_vd_vd(atanh, v);
|
||||
DECLARE_DP_vd_vd(cbrt, v);
|
||||
DECLARE_DP_vd_vd(ceil, v);
|
||||
DECLARE_DP_vd_vd_vd(copysign, vv);
|
||||
DECLARE_DP_vd_vd(cos, v);
|
||||
DECLARE_DP_vd_vd(cosh, v);
|
||||
DECLARE_DP_vd_vd(cospi, v);
|
||||
DECLARE_DP_vd_vd(erf, v);
|
||||
DECLARE_DP_vd_vd(erfc, v);
|
||||
DECLARE_DP_vd_vd(exp, v);
|
||||
DECLARE_DP_vd_vd(exp10, v);
|
||||
DECLARE_DP_vd_vd(exp2, v);
|
||||
DECLARE_DP_vi_vd(expfrexp, v);
|
||||
DECLARE_DP_vd_vd(expm1, v);
|
||||
DECLARE_DP_vd_vd(fabs, v);
|
||||
DECLARE_DP_vd_vd_vd(fdim, vv);
|
||||
DECLARE_DP_vd_vd(floor, v);
|
||||
DECLARE_DP_vd_vd_vd_vd(fma, vvv);
|
||||
DECLARE_DP_vd_vd_vd(fmax, vv);
|
||||
DECLARE_DP_vd_vd_vd(fmin, vv);
|
||||
DECLARE_DP_vd_vd_vd(fmod, vv);
|
||||
DECLARE_DP_vd_vd(frfrexp, v);
|
||||
DECLARE_DP_vd_vd_vd(hypot, vv);
|
||||
DECLARE_DP_vi_vd(ilogb, v);
|
||||
DECLARE_DP_vd_vd_vi(ldexp, vv);
|
||||
DECLARE_DP_vd_vd(lgamma, v);
|
||||
DECLARE_DP_vd_vd(log, v);
|
||||
DECLARE_DP_vd_vd(log10, v);
|
||||
DECLARE_DP_vd_vd(log1p, v);
|
||||
DECLARE_DP_vd_vd(log2, v);
|
||||
DECLARE_DP_vd_vd_pvd(modf, vl8);
|
||||
DECLARE_DP_vd_vd_vd(nextafter, vv);
|
||||
DECLARE_DP_vd_vd_vd(pow, vv);
|
||||
DECLARE_DP_vd_vd(rint, v);
|
||||
DECLARE_DP_vd_vd(round, v);
|
||||
DECLARE_DP_vd_vd(sin, v);
|
||||
DECLARE_DP_v_vd_pvd_pvd(sincos, vl8l8);
|
||||
DECLARE_DP_v_vd_pvd_pvd(sincospi, vl8l8);
|
||||
DECLARE_DP_vd_vd(sinh, v);
|
||||
DECLARE_DP_vd_vd(sinpi, v);
|
||||
DECLARE_DP_vd_vd(sqrt, v);
|
||||
DECLARE_DP_vd_vd(tan, v);
|
||||
DECLARE_DP_vd_vd(tanh, v);
|
||||
DECLARE_DP_vd_vd(tgamma, v);
|
||||
DECLARE_DP_vd_vd(trunc, v);
|
||||
|
||||
// Single precision function declarations.
|
||||
DECLARE_SP_vf_vf(__acosf_finite, v);
|
||||
DECLARE_SP_vf_vf(__acoshf_finite, v);
|
||||
// Single-precision (SP) declarations for the GNUABI compatibility
// tester.  Each DECLARE_* macro presumably emits an extern prototype
// for the vectorized libm entry point named in its first argument;
// the macro-name suffix encodes the signature (vf = vector float,
// vi = vector int, pvf = pointer to vector float, ...) and the second
// argument names the operand pattern consumed by the matching CALL_*
// macro later in the file.
// NOTE(review): the DECLARE_* macro definitions are above this chunk
// and not visible here - confirm the signature encoding against them.

// glibc finite-math (__*_finite) aliases.
DECLARE_SP_vf_vf(__asinf_finite, v);
DECLARE_SP_vf_vf_vf(__atan2f_finite, vv);
DECLARE_SP_vf_vf(__atanhf_finite, v);
DECLARE_SP_vf_vf(__coshf_finite, v);
DECLARE_SP_vf_vf(__exp10f_finite, v);
DECLARE_SP_vf_vf(__exp2f_finite, v);
DECLARE_SP_vf_vf(__expf_finite, v);
DECLARE_SP_vf_vf_vf(__fmodf_finite, vv);
DECLARE_SP_vf_vf_pvf(__modff_finite, vl4);
DECLARE_SP_vf_vf_vf(__hypotf_finite, vv);
DECLARE_SP_vf_vf(__log10f_finite, v);
// DECLARE_SP_vf_vf(__log2f_finite,v);
DECLARE_SP_vf_vf(__logf_finite, v);
DECLARE_SP_vf_vf_vf(__powf_finite, vv);
DECLARE_SP_vf_vf(__sinhf_finite, v);
DECLARE_SP_vf_vf(__sqrtf_finite, v);

// Standard C99 float math entry points.
DECLARE_SP_vf_vf(acosf, v);
DECLARE_SP_vf_vf(acoshf, v);
DECLARE_SP_vf_vf(asinf, v);
DECLARE_SP_vf_vf(asinhf, v);
DECLARE_SP_vf_vf(atanf, v);
DECLARE_SP_vf_vf_vf(atan2f, vv);
DECLARE_SP_vf_vf(atanhf, v);
DECLARE_SP_vf_vf(cbrtf, v);
DECLARE_SP_vf_vf(ceilf, v);
DECLARE_SP_vf_vf_vf(copysignf, vv);
DECLARE_SP_vf_vf(cosf, v);
DECLARE_SP_vf_vf(coshf, v);
DECLARE_SP_vf_vf(cospif, v);
DECLARE_SP_vf_vf(erff, v);
DECLARE_SP_vf_vf(erfcf, v);
DECLARE_SP_vf_vf(expf, v);
DECLARE_SP_vf_vf(exp10f, v);
DECLARE_SP_vf_vf(exp2f, v);
DECLARE_SP_vf_vf(expm1f, v);
DECLARE_SP_vf_vf(fabsf, v);
DECLARE_SP_vf_vf_vf(fdimf, vv);
DECLARE_SP_vf_vf(floorf, v);
DECLARE_SP_vf_vf_vf_vf(fmaf, vvv);
DECLARE_SP_vf_vf_vf(fmaxf, vv);
DECLARE_SP_vf_vf_vf(fminf, vv);
DECLARE_SP_vf_vf_vf(fmodf, vv);
DECLARE_SP_vf_vf(frfrexpf, v);
DECLARE_SP_vf_vf_vf(hypotf, vv);
#ifndef ENABLE_AVX
// These two functions are not checked in some configurations due to
// the issue in https://github.com/shibatch/sleef/issues/221
DECLARE_SP_vi_vf(expfrexpf, v);
DECLARE_SP_vi_vf(ilogbf, v);
#endif
DECLARE_SP_vf_vf_vi(ldexpf, vv);
DECLARE_SP_vf_vf(lgammaf, v);
DECLARE_SP_vf_vf(logf, v);
DECLARE_SP_vf_vf(log10f, v);
DECLARE_SP_vf_vf(log1pf, v);
DECLARE_SP_vf_vf(log2f, v);
DECLARE_SP_vf_vf_pvf(modff, vl4);
DECLARE_SP_vf_vf_vf(nextafterf, vv);
DECLARE_SP_vf_vf_vf(powf, vv);
DECLARE_SP_vf_vf(rintf, v);
DECLARE_SP_vf_vf(roundf, v);
DECLARE_SP_vf_vf(sinf, v);
DECLARE_SP_v_vf_pvf_pvf(sincosf, vl4l4);
DECLARE_SP_v_vf_pvf_pvf(sincospif, vl4l4);
DECLARE_SP_vf_vf(sinhf, v);
DECLARE_SP_vf_vf(sinpif, v);
DECLARE_SP_vf_vf(sqrtf, v);
DECLARE_SP_vf_vf(tanf, v);
DECLARE_SP_vf_vf(tanhf, v);
DECLARE_SP_vf_vf(tgammaf, v);
DECLARE_SP_vf_vf(truncf, v);

#ifndef ENABLE_SVE
// File-scope operands the CALL_* macros read/write by name.
vdouble vd0, vd1, vd2, vd3;
vfloat vf0, vf1, vf2, vf3;
vint vi0, vi1, vi2, vi3;
vint2 vi20, vi21, vi22, vi23;
vopmask mask;
#else
// SVE vector types are sizeless and cannot be file-scope objects, so
// only a plain output buffer lives at file scope in that build.
volatile char outbuf[1024];
#endif
// Invokes one representative double-precision routine
// (__acos_finite) through the CALL_DP macro and returns 1.
// NOTE(review): presumably used as a probe that the required vector
// ISA extension is usable at runtime (the caller interprets survival
// of the call as success) - confirm against the harness that calls it.
int check_feature(double d, float f) {
#ifdef ENABLE_SVE
  // SVE build: the vd0/vd1 operands must be locals (sizeless types
  // cannot be globals); seed them from the scalar arguments.
  vdouble vd0 = svdup_n_f64(d), vd1 = svdup_n_f64(d);
#ifdef MASKED_GNUABI
  vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(f), svdup_n_s32(0));
#endif
#endif

  CALL_DP_vd_vd(__acos_finite, v);
#ifdef ENABLE_SVE
  // Store the result to outbuf so the call above has an observable
  // effect and cannot be optimized away.
  svst1_f64(svptrue_b8(), (double *)outbuf, vd0);
#endif
  return 1;
}
// Exercises every GNUABI entry point declared above exactly once, via
// the CALL_DP_* / CALL_SP_* macros.  The macros presumably expand to a
// real call using the vd*/vf*/vi* operands named by their second
// argument, so linking this translation unit fails if any expected
// symbol is missing from the library under test.  Always returns 0.
// NOTE(review): CALL_* macro definitions are outside this chunk -
// confirm the operand encoding against them.
int main2(int argc, char **argv) {
#ifdef ENABLE_SVE
  // SVE build: operands are locals (sizeless types cannot be globals),
  // seeded from argc so the compiler cannot constant-fold them.
  vdouble vd0 = svdup_n_f64(argc), vd1 = svdup_n_f64(argc), vd2 = svdup_n_f64(argc), vd3 = svdup_n_f64(argc);
  vfloat vf0 = svdup_n_f32(argc), vf1 = svdup_n_f32(argc), vf2 = svdup_n_f32(argc), vf3 = svdup_n_f32(argc);
  vint vi0 = svdup_n_s32(argc), vi2 = svdup_n_s32(argc);
  vint2 vi20 = svdup_n_s32(argc), vi22 = svdup_n_s32(argc);
#ifdef MASKED_GNUABI
  vopmask mask = svcmpne_s32(svptrue_b8(), svdup_n_s32(argc), svdup_n_s32(0));
#endif
#endif

  // Double precision function call.
  CALL_DP_vd_vd(__acos_finite, v);
  CALL_DP_vd_vd(__acosh_finite, v);
  CALL_DP_vd_vd(__asin_finite, v);
  CALL_DP_vd_vd_vd(__atan2_finite, vv);
  CALL_DP_vd_vd(__atanh_finite, v);
  CALL_DP_vd_vd(__cosh_finite, v);
  CALL_DP_vd_vd(__exp10_finite, v);
  CALL_DP_vd_vd(__exp2_finite, v);
  CALL_DP_vd_vd(__exp_finite, v);
  CALL_DP_vd_vd_vd(__fmod_finite, vv);
  CALL_DP_vd_vd_pvd(__modf_finite, vl8);
  CALL_DP_vd_vd_vd(__hypot_finite, vv);
  CALL_DP_vd_vd(__log10_finite, v);
  // CALL_DP_vd_vd(__log2_finite,v);
  CALL_DP_vd_vd(__log_finite, v);
  CALL_DP_vd_vd_vd(__pow_finite, vv);
  CALL_DP_vd_vd(__sinh_finite, v);
  CALL_DP_vd_vd(__sqrt_finite, v);
  CALL_DP_vd_vd(acos, v);
  CALL_DP_vd_vd(acosh, v);
  CALL_DP_vd_vd(asin, v);
  CALL_DP_vd_vd(asinh, v);
  CALL_DP_vd_vd(atan, v);
  CALL_DP_vd_vd_vd(atan2, vv);
  CALL_DP_vd_vd(atanh, v);
  CALL_DP_vd_vd(cbrt, v);
  CALL_DP_vd_vd(ceil, v);
  CALL_DP_vd_vd_vd(copysign, vv);
  CALL_DP_vd_vd(cos, v);
  CALL_DP_vd_vd(cosh, v);
  CALL_DP_vd_vd(cospi, v);
  CALL_DP_vd_vd(erf, v);
  CALL_DP_vd_vd(erfc, v);
  CALL_DP_vd_vd(exp, v);
  CALL_DP_vd_vd(exp10, v);
  CALL_DP_vd_vd(exp2, v);
  CALL_DP_vi_vd(expfrexp, v);
  CALL_DP_vd_vd(expm1, v);
  CALL_DP_vd_vd(fabs, v);
  CALL_DP_vd_vd_vd(fdim, vv);
  CALL_DP_vd_vd(floor, v);
  CALL_DP_vd_vd_vd_vd(fma, vvv);
  CALL_DP_vd_vd_vd(fmax, vv);
  CALL_DP_vd_vd_vd(fmin, vv);
  CALL_DP_vd_vd_vd(fmod, vv);
  CALL_DP_vd_vd(frfrexp, v);
  CALL_DP_vd_vd_vd(hypot, vv);
  CALL_DP_vi_vd(ilogb, v);
  CALL_DP_vd_vd_vi(ldexp, vv);
  CALL_DP_vd_vd(lgamma, v);
  CALL_DP_vd_vd(log, v);
  CALL_DP_vd_vd(log10, v);
  CALL_DP_vd_vd(log1p, v);
  CALL_DP_vd_vd(log2, v);
  CALL_DP_vd_vd_pvd(modf, vl8);
  CALL_DP_vd_vd_vd(nextafter, vv);
  CALL_DP_vd_vd_vd(pow, vv);
  CALL_DP_vd_vd(rint, v);
  CALL_DP_vd_vd(round, v);
  CALL_DP_vd_vd(sin, v);
  CALL_DP_v_vd_pvd_pvd(sincos, vl8l8);
  CALL_DP_v_vd_pvd_pvd(sincospi, vl8l8);
  CALL_DP_vd_vd(sinh, v);
  CALL_DP_vd_vd(sinpi, v);
  CALL_DP_vd_vd(sqrt, v);
  CALL_DP_vd_vd(tan, v);
  CALL_DP_vd_vd(tanh, v);
  CALL_DP_vd_vd(tgamma, v);
  CALL_DP_vd_vd(trunc, v);

  // Single precision function call.
  CALL_SP_vf_vf(__acosf_finite, v);
  CALL_SP_vf_vf(__acoshf_finite, v);
  CALL_SP_vf_vf(__asinf_finite, v);
  CALL_SP_vf_vf_vf(__atan2f_finite, vv);
  CALL_SP_vf_vf(__atanhf_finite, v);
  CALL_SP_vf_vf(__coshf_finite, v);
  CALL_SP_vf_vf(__exp10f_finite, v);
  CALL_SP_vf_vf(__exp2f_finite, v);
  CALL_SP_vf_vf(__expf_finite, v);
  CALL_SP_vf_vf_vf(__fmodf_finite, vv);
  CALL_SP_vf_vf_pvf(__modff_finite, vl4);
  CALL_SP_vf_vf_vf(__hypotf_finite, vv);
  CALL_SP_vf_vf(__log10f_finite, v);
  // CALL_SP_vf_vf(__log2f_finite,v);
  CALL_SP_vf_vf(__logf_finite, v);
  CALL_SP_vf_vf_vf(__powf_finite, vv);
  CALL_SP_vf_vf(__sinhf_finite, v);
  CALL_SP_vf_vf(__sqrtf_finite, v);
  CALL_SP_vf_vf(acosf, v);
  CALL_SP_vf_vf(acoshf, v);
  CALL_SP_vf_vf(asinf, v);
  CALL_SP_vf_vf(asinhf, v);
  CALL_SP_vf_vf(atanf, v);
  CALL_SP_vf_vf_vf(atan2f, vv);
  CALL_SP_vf_vf(atanhf, v);
  CALL_SP_vf_vf(cbrtf, v);
  CALL_SP_vf_vf(ceilf, v);
  CALL_SP_vf_vf_vf(copysignf, vv);
  CALL_SP_vf_vf(cosf, v);
  CALL_SP_vf_vf(coshf, v);
  CALL_SP_vf_vf(cospif, v);
  CALL_SP_vf_vf(erff, v);
  CALL_SP_vf_vf(erfcf, v);
  CALL_SP_vf_vf(expf, v);
  CALL_SP_vf_vf(exp10f, v);
  CALL_SP_vf_vf(exp2f, v);
  CALL_SP_vf_vf(expm1f, v);
  CALL_SP_vf_vf(fabsf, v);
  CALL_SP_vf_vf_vf(fdimf, vv);
  CALL_SP_vf_vf(floorf, v);
  CALL_SP_vf_vf_vf_vf(fmaf, vvv);
  CALL_SP_vf_vf_vf(fmaxf, vv);
  CALL_SP_vf_vf_vf(fminf, vv);
  CALL_SP_vf_vf_vf(fmodf, vv);
  CALL_SP_vf_vf(frfrexpf, v);
  CALL_SP_vf_vf_vf(hypotf, vv);
#ifndef ENABLE_AVX
  // These two functions are not checked in some configurations due to
  // the issue in https://github.com/shibatch/sleef/issues/221
  CALL_SP_vi_vf(expfrexpf, v);
  CALL_SP_vi_vf(ilogbf, v);
#endif
  CALL_SP_vf_vf_vi(ldexpf, vv);
  CALL_SP_vf_vf(lgammaf, v);
  CALL_SP_vf_vf(logf, v);
  CALL_SP_vf_vf(log10f, v);
  CALL_SP_vf_vf(log1pf, v);
  CALL_SP_vf_vf(log2f, v);
  CALL_SP_vf_vf_pvf(modff, vl4);
  CALL_SP_vf_vf_vf(nextafterf, vv);
  CALL_SP_vf_vf_vf(powf, vv);
  CALL_SP_vf_vf(rintf, v);
  CALL_SP_vf_vf(roundf, v);
  CALL_SP_vf_vf(sinf, v);
  CALL_SP_v_vf_pvf_pvf(sincosf, vl4l4);
  CALL_SP_v_vf_pvf_pvf(sincospif, vl4l4);
  CALL_SP_vf_vf(sinhf, v);
  CALL_SP_vf_vf(sinpif, v);
  CALL_SP_vf_vf(sqrtf, v);
  CALL_SP_vf_vf(tanf, v);
  CALL_SP_vf_vf(tanhf, v);
  CALL_SP_vf_vf(tgammaf, v);
  CALL_SP_vf_vf(truncf, v);

  return 0;
}
@@ -0,0 +1,129 @@
|
||||
sin u35 bc50dfbcbd8ef534541d1babe90860c7
|
||||
sin u10 dbc2cf81f292ef50fa0119e222c6c9f9
|
||||
cos u35 506e34a809b80ad3603ed46ba2a574b0
|
||||
cos u10 a0f69df5937152b8f8f0e671f3676289
|
||||
tan u35 970b5cd7f0e05defa22ebb155ab61a40
|
||||
tan u10 5fd08e0552e3ab853439bf5fd2bd344d
|
||||
sincos u10 7c164edcaa45988f6165b653fc76c495
|
||||
sincos u35 38fe7e261e184ed8dbf432ce6bedc5c4
|
||||
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
|
||||
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
|
||||
log u10 4855b27222d900bea47a27cadba71727
|
||||
log u35 c95484de57c167da3d8d6d1baadf9ffa
|
||||
log2 u10 2662df9af919680ca62e1752fb1b7539
|
||||
log2 u35 1cd6d7f194a5e8364191497adc5c5cec
|
||||
log10 u10 36645e8031d873d66fd0ec2c5959f273
|
||||
log1p u10 1383924fb56cf2e7eda27de21320c591
|
||||
exp u10 13692a48edf2cf7a3e047b16ddfb7b81
|
||||
exp2 u10 436146f8d6dcaa4a754837108a9aa3e1
|
||||
exp2 u35 8881d075d9101a1dfa3f6a10b9ee8373
|
||||
exp10 u10 9d704b310f683872a6446cfc97726a4d
|
||||
exp10 u35 bc07745ebc22a7ee97679154c24b23cc
|
||||
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
|
||||
pow u10 a0ea63b27d33262346a35c9439741075
|
||||
cbrt u10 5d8bf28ac74624594fd1be9217817690
|
||||
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
|
||||
cbrt u35 73daa306764e208aab1627ac110b10d7
|
||||
cbrt u35 c29b7bf200215425b4ba948c8cc94c42
|
||||
hypot u05 cc2f18e409e19a02cadf7b91fd869120
|
||||
hypot u35 5194e0a554174a6145511ce3df9c1f46
|
||||
asin u10 86c061caec3fa2e1bc71bda4dad29f4c
|
||||
asin u35 31303b88bdc00206265002d6cc5e89e4
|
||||
acos u10 0a1a403590f2ac8364f132b334920945
|
||||
acos u35 493f960c1cce57931d95a5a22a0587a3
|
||||
atan u10 c97624a24ec034cc0c8985acb61d13cd
|
||||
atan u10 0be0f550406923016cfeb5ef62c25b15
|
||||
atan u35 9d6d83e066b5a4851d44771418c9948c
|
||||
atan u35 f32c1aa4caa08c6945afd1125ba8b113
|
||||
atan2 u10 6b1d9d25fcd96053acc19d1633fab36a
|
||||
atan2 u35 afb07894347062a96dab705b34eb1763
|
||||
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
|
||||
cosh u10 f77eb95f79e274c12b4e92dc0389259b
|
||||
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
|
||||
asinh u10 01136e54e2a434839530dda54f33cfdb
|
||||
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
|
||||
atanh u10 601a77ba8c1d5175f2808b48a41260c1
|
||||
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
|
||||
tgamma u10 6f864c3a1f17fbdf914cac7ffcd82cb7
|
||||
erf u10 f4ae148b59bb7501d8f5746300850376
|
||||
erfc u15 5e116a4316dafa742769f71e18f6f9fe
|
||||
fabs bef2f2ac8a4789357e580b4da4f9b9fe
|
||||
copysign 3219022f267464e3704f90558e8df3bc
|
||||
fmax 4e4f5220ccfef191864c316df0d18fc0
|
||||
fmin c0f8effb6c611e2b3b91b820ad943f62
|
||||
fdim e876d103931f18ceede5bfd7e3df7ab0
|
||||
fmod 618aa751e13012afdb41ec80dd35e6ba
|
||||
remainder 8d692dbb44bbc9be5af0c0657d3008b8
|
||||
modf f03ce73cd4f9ea7f69c017f6e53355d5
|
||||
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
|
||||
trunc 1bc7e909eba121dcef7f0e4046937ae5
|
||||
floor 2cff66b499dc8a30cec9467de659b774
|
||||
ceil b080e632dcb8f8134d8715752be12917
|
||||
round 8907e21687ca9c2a539297536e754950
|
||||
rint e49f837096bc661fe1c742801dd99a30
|
||||
sinf u35 833d845950b9cbb025629fe4c040f8f6
|
||||
sinf u10 9c21afa4d7d6af3fc666309c3cd647fe
|
||||
cosf u35 74d7f871a6553cd0019087895e2052ad
|
||||
cosf u10 35349e94c323c1614f22093959288010
|
||||
tanf u35 bbb7c092d017e96d2454a38a20687735
|
||||
tanf u10 227423bc04f42d76a8f68082ba696126
|
||||
sincosf u10 83ecc4e3d5295056e9d8c52bc196b666
|
||||
sincosf u35 533319caa49a961e4909bd6dcab40721
|
||||
sincospif u05 8b3762b67a661957c1414c351ec49034
|
||||
sincospif u35 cec15ed76a358091632634166fa77b66
|
||||
logf u10 c5a90119943acc4199e1cc7030b5def8
|
||||
logf u35 af2fbe4bfa2caaf59c734e3749dd15be
|
||||
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
|
||||
log2f u35 ba32ebaa8c470899ebd433d190c00f03
|
||||
log10f u10 7e235a82d960e4434575dd39648d8bb7
|
||||
log1pf u10 350fc4f13502b36bb1107e1b1122acb1
|
||||
expf u10 ee4adaabefa3fac6c0f1925b2a948eea
|
||||
exp2f u10 b0d283dbae0f36f1b3c7eed9871f0d0d
|
||||
exp2f u35 522cc30f722f77fceb07015830b351a3
|
||||
exp10f u10 b0564be151965600f5744ff2e4992bc9
|
||||
exp10f u35 d142f1fb40e44f0c9e042718f27ee3e0
|
||||
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
|
||||
powf u10 a7cba3239c87969662e8b41a4dd8b4ab
|
||||
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
|
||||
cbrtf u10 2a245b03f83e9114644d03b40dac707b
|
||||
cbrtf u35 3ce62350fd585f0524a12c974fbe6cf5
|
||||
cbrtf u35 2aca0404626a28f7af7f60105ad6e217
|
||||
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
|
||||
hypotf u35 a6f0f774b346a6bba08889ff9ba3f193
|
||||
asinf u10 7f77f7453b961512c89e87e49c549cfe
|
||||
asinf u35 22ed8760aa328e1f714031eec592a4d8
|
||||
acosf u10 15617dd0429b90e59d2923415934c2a6
|
||||
acosf u35 af0b132d9e263721f9296187dbf9b9bf
|
||||
atanf u10 26b77fb423104b45633cf24500237d6e
|
||||
atanf u10 4313d0bc2708de53f74d804aac6564d4
|
||||
atanf u35 97a1797897955643c722c7d291987331
|
||||
atanf u35 7d3f47169415058e8578f11d899bfd10
|
||||
atan2f u10 098a33f730fe95ce4774a991db4cee14
|
||||
atan2f u35 56fc6bd8349979f0d0b1dcdb57f68363
|
||||
sinhf u10 0780a2f57df3a831718195d1ee5c19ef
|
||||
coshf u10 cfbb6aed408e43a7b7f053474100ff2d
|
||||
tanhf u10 d19f254d41e8726c748df87b95bc9acd
|
||||
asinhf u10 260d129221468a86bbfd609c27bfea6a
|
||||
acoshf u10 24ced7e5631c78b20a5716faeedbaa92
|
||||
atanhf u10 164fd77b8372b8c131baaacab1c9e650
|
||||
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
|
||||
tgammaf u10 f3a8d25c852068622bdfcae4cb813583
|
||||
erff u10 f34af3814153de040b93e573ca7d21d8
|
||||
erfcf u15 915ab9830de89a5a504b3ce7cd2fecda
|
||||
fabsf a3c72220bc0ade68fe22e0a15eb730d4
|
||||
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
|
||||
fmaxf 9833a60a2080e8fd9ae8de32c758966f
|
||||
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
|
||||
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
|
||||
fmodf 77aa84a9703e202a56e5f4609bd2482b
|
||||
remainderf 5a453b1217c173e4dc0b0211066750be
|
||||
modff 5fa4f044f20478216aa085a01b189697
|
||||
nextafterf 517c1c8f072e9024518d3d9ead98b85b
|
||||
truncf 6937050850be63c44d4b7dbd666febe6
|
||||
floorf 9341be69ee345c8554bf3ab4e9316133
|
||||
ceilf c70874771cbe9741f1f05fedd4b629e9
|
||||
roundf 0cf52f6b8015099771e9a7dfa6b090bc
|
||||
rintf bed68e788e2b11543c09c9d52198abf8
|
||||
fastsinf u3500 8eb51f86fb40414dd21284f020f24b6c
|
||||
fastcosf u3500 69cbc3703f1d2c68695b00b1b09287b2
|
||||
fastpowf u3500 e02e6a692cfa22a6b7149168c67ea1d2
|
||||
@@ -0,0 +1,129 @@
|
||||
sin u35 c163e4a7e9ccebb2181dcc8653367d8c
|
||||
sin u10 0d6bf6f2c935db82588222da95659019
|
||||
cos u35 52f902bd939d751b5b544ac70181fcff
|
||||
cos u10 afcdba92a75a76d56b8cf2f22d4bec9e
|
||||
tan u35 906cc42b6755fe514c5e185fcb4d2f55
|
||||
tan u10 c98f29a62067fa63646d9bcc29a310c6
|
||||
sincos u10 3fe37f4eb805505152f2b14a22a9f94e
|
||||
sincos u35 95a7b7f48c71febf10ec6eff796dd391
|
||||
sincospi u05 0c6fc00c7aaf0b6e67d66542d1be833d
|
||||
sincospi u35 c428b0fc3e6c5be4d2c03dcd8bb27a7c
|
||||
log u10 4855b27222d900bea47a27cadba71727
|
||||
log u35 015f8ae899c9b921d48919dd12ef19a9
|
||||
log2 u10 2662df9af919680ca62e1752fb1b7539
|
||||
log2 u35 908b1949db34ea855944f00089b21e23
|
||||
log10 u10 36645e8031d873d66fd0ec2c5959f273
|
||||
log1p u10 1383924fb56cf2e7eda27de21320c591
|
||||
exp u10 084e5be89c2ad03e356078ea4f287bab
|
||||
exp2 u10 6e36db9ae2cf9eca82e3d9157c622351
|
||||
exp2 u35 6e36db9ae2cf9eca82e3d9157c622351
|
||||
exp10 u10 0cc08bc6a3d08d6e61450b5370c6161e
|
||||
exp10 u35 6904d5509ca794747aa249c13886f90f
|
||||
expm1 u10 cd3f0b8e86943d52c278394b60e2d22e
|
||||
pow u10 7e19796027d7c1d1999be948f90e6181
|
||||
cbrt u10 5d8bf28ac74624594fd1be9217817690
|
||||
cbrt u10 3c896e03746bcf1b3f70182dfec3d93b
|
||||
cbrt u35 fc7ee3e3e6c54365d708b752c242a947
|
||||
cbrt u35 2408714a56d74f8c82389ca6772cdbc1
|
||||
hypot u05 cc2f18e409e19a02cadf7b91fd869120
|
||||
hypot u35 be7bbd41dffd746b70261ee773cbd4b2
|
||||
asin u10 8a21b7c28cdaffc9d3e53f415367932e
|
||||
asin u35 9c9e8107782898e9faed6924ad1b3cb1
|
||||
acos u10 28261e4eb8331865660c814676d5c6bc
|
||||
acos u35 310911130bfc45b10dabe3a072939331
|
||||
atan u10 f931de72f2f6a7928f307a8a382ae255
|
||||
atan u10 453f9ef62f58f9829320baf482a1d457
|
||||
atan u35 6161b6189609f105b017d8768d0a41f1
|
||||
atan u35 6face71d8d93c69448d49ed6140e361d
|
||||
atan2 u10 469babaeee9bd30e17af2f473b3ea500
|
||||
atan2 u35 6a3e764125aab2a0a13e7a0d9ec02f7f
|
||||
sinh u10 61d459b1f368087f6f23ebf8e9f0ea01
|
||||
cosh u10 f77eb95f79e274c12b4e92dc0389259b
|
||||
tanh u10 2bb9dd54ed0fa22bb5f3b6d557eb58a3
|
||||
asinh u10 01136e54e2a434839530dda54f33cfdb
|
||||
acosh u10 2f3c28c9ee2eb2b3d5659c6cb2a58e3e
|
||||
atanh u10 601a77ba8c1d5175f2808b48a41260c1
|
||||
lgamma u10 90cdc41063f4198c6ad592c0cdd0f5da
|
||||
tgamma u10 cb9a93844ad1713d2ab92ff5b6398150
|
||||
erf u10 8a0bc2146a5c67b6bebc58f4b0076568
|
||||
erfc u15 3e247a54183eeddedc33e99c50118995
|
||||
fabs bef2f2ac8a4789357e580b4da4f9b9fe
|
||||
copysign 3219022f267464e3704f90558e8df3bc
|
||||
fmax 4e4f5220ccfef191864c316df0d18fc0
|
||||
fmin c0f8effb6c611e2b3b91b820ad943f62
|
||||
fdim e876d103931f18ceede5bfd7e3df7ab0
|
||||
fmod 618aa751e13012afdb41ec80dd35e6ba
|
||||
remainder 8d692dbb44bbc9be5af0c0657d3008b8
|
||||
modf f03ce73cd4f9ea7f69c017f6e53355d5
|
||||
nextafter 9eba4e30d12d74dc4e8003fcff0f1582
|
||||
trunc 1bc7e909eba121dcef7f0e4046937ae5
|
||||
floor 2cff66b499dc8a30cec9467de659b774
|
||||
ceil b080e632dcb8f8134d8715752be12917
|
||||
round 8907e21687ca9c2a539297536e754950
|
||||
rint e49f837096bc661fe1c742801dd99a30
|
||||
sinf u35 f8f804eae1d9443103e81fec96293477
|
||||
sinf u10 3f12a7381f1cbb1830d92b4ec72d21fe
|
||||
cosf u35 f2f3d1c9f090cde9c02439608dc7066e
|
||||
cosf u10 dc35f27fae65f63f0aa6ad241f8b387b
|
||||
tanf u35 68d42ad1fb412e6b8be3853461e61213
|
||||
tanf u10 97df301d4f59e67d5318b5356b703f06
|
||||
sincosf u10 a97124d810ec461c135dc4fb0c059b6f
|
||||
sincosf u35 0cc521e52ae1227d311012c2919c1ff2
|
||||
sincospif u05 8b3762b67a661957c1414c351ec49034
|
||||
sincospif u35 8720757f221c00cc8de24b7dc4949144
|
||||
logf u10 c5a90119943acc4199e1cc7030b5def8
|
||||
logf u35 b6234302d534d6ccd48155dd6b9a4293
|
||||
log2f u10 ba8acae369bbb7b6404cccbc633fe25b
|
||||
log2f u35 74174c90717c86642b71284452a8aef6
|
||||
log10f u10 7e235a82d960e4434575dd39648d8bb7
|
||||
log1pf u10 e53dbfa80bcc1a7bcfd21000e6950475
|
||||
expf u10 9597388315e4b3e89c4c97ce46374dcf
|
||||
exp2f u10 42d66e5e4cb88feb29c5b36c632159a5
|
||||
exp2f u35 42d66e5e4cb88feb29c5b36c632159a5
|
||||
exp10f u10 954f0824b6d949d0da03b49950dc6642
|
||||
exp10f u35 6fb0e9a829e12a06679d379d05b53ede
|
||||
expm1f u10 ebfd6498cb40f61b609882de8a7f3c74
|
||||
powf u10 2ed84af40d03e307a620365f172d010d
|
||||
cbrtf u10 01c5cac23fe21638be1c3eab6e368fd6
|
||||
cbrtf u10 2a245b03f83e9114644d03b40dac707b
|
||||
cbrtf u35 6c22a6dc132c5212250970f22f42256d
|
||||
cbrtf u35 5ab696ae11f9637413d30e6496d5324b
|
||||
hypotf u05 bc5971cbeebee27b4c0d91fbe3f6bf30
|
||||
hypotf u35 2a7cd97768287084b7fffc7e9fb39072
|
||||
asinf u10 e2e571a01984c4ffb3f6e38e0328d90e
|
||||
asinf u35 70df2dfc3a3569868cce60c38e7b1962
|
||||
acosf u10 5180fde4b02a0ca4cd75f0a786a1bfeb
|
||||
acosf u35 72b0e2f9791f90f1c43570b9e9ba893f
|
||||
atanf u10 fa672e387a204055f735b7af98dd8a35
|
||||
atanf u10 d017670c13bc221b68bc9ee5f41c4b5e
|
||||
atanf u35 f592e46eaa5d29583f86d3e336f20b6b
|
||||
atanf u35 e7087fe40de46921826b373d10c40954
|
||||
atan2f u10 275b2fa8ee554c45551bb142db9f8197
|
||||
atan2f u35 44b187851195d24bab2561eb8f4ff5d0
|
||||
sinhf u10 45bc228a14c3e39eeb35e9764394a23e
|
||||
coshf u10 838d441e85d415ef4fb1e5c5ea966a71
|
||||
tanhf u10 d19f254d41e8726c748df87b95bc9acd
|
||||
asinhf u10 927eeb621a3e2d5039f1a07fcf150901
|
||||
acoshf u10 932520013273174fcabe2be4a55f919f
|
||||
atanhf u10 164fd77b8372b8c131baaacab1c9e650
|
||||
lgammaf u10 3bf6d824175c4f4d86f3073064e41e84
|
||||
tgammaf u10 c3059747811d98846f74a63d3747ac3d
|
||||
erff u10 f34af3814153de040b93e573ca7d21d8
|
||||
erfcf u15 687a9c577512d349ddbc0643013d2c56
|
||||
fabsf a3c72220bc0ade68fe22e0a15eb730d4
|
||||
copysignf 6b35517b8e1da78d9c9b52915d9a9b19
|
||||
fmaxf 9833a60a2080e8fd9ae8de32c758966f
|
||||
fminf 2dcfa19e1f1ab4973a7dec9f2cc09fa0
|
||||
fdimf c5c0fe7b095eb8ccbb19fbf934a36b24
|
||||
fmodf 77aa84a9703e202a56e5f4609bd2482b
|
||||
remainderf 5a453b1217c173e4dc0b0211066750be
|
||||
modff 5fa4f044f20478216aa085a01b189697
|
||||
nextafterf 517c1c8f072e9024518d3d9ead98b85b
|
||||
truncf 6937050850be63c44d4b7dbd666febe6
|
||||
floorf 9341be69ee345c8554bf3ab4e9316133
|
||||
ceilf c70874771cbe9741f1f05fedd4b629e9
|
||||
roundf 0cf52f6b8015099771e9a7dfa6b090bc
|
||||
rintf bed68e788e2b11543c09c9d52198abf8
|
||||
fastsinf u3500 5c48081c74cd0316379b580b047dbfc2
|
||||
fastcosf u3500 6f73d116f109283e5632c31f5988f55b
|
||||
fastpowf u3500 6dbb3110412df4fed5a71f50d40def89
|
||||
@@ -0,0 +1,777 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#if defined(__MINGW32__) || defined(__MINGW64__) || defined(_MSC_VER)
|
||||
#define STDIN_FILENO 0
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#include "sleef.h"
|
||||
#include "testerutil.h"
|
||||
|
||||
#define DORENAME
|
||||
#include "rename.h"
|
||||
|
||||
#define BUFSIZE 1024
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
char buf[BUFSIZE];
|
||||
|
||||
printf("3\n");
|
||||
fflush(stdout);
|
||||
|
||||
for(;;) {
|
||||
if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;
|
||||
|
||||
if (startsWith(buf, "sin ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sin %" PRIx64, &u);
|
||||
u = d2u(xsin(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sin_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sin_u1 %" PRIx64, &u);
|
||||
u = d2u(xsin_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cos ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cos %" PRIx64, &u);
|
||||
u = d2u(xcos(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cos_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cos_u1 %" PRIx64, &u);
|
||||
u = d2u(xcos_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sincos ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sincos %" PRIx64, &u);
|
||||
Sleef_double2 x = xsincos(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "sincos_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sincos_u1 %" PRIx64, &u);
|
||||
Sleef_double2 x = xsincos_u1(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "sincospi_u05 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sincospi_u05 %" PRIx64, &u);
|
||||
Sleef_double2 x = xsincospi_u05(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "sincospi_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sincospi_u35 %" PRIx64, &u);
|
||||
Sleef_double2 x = xsincospi_u35(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "sinpi_u05 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sinpi_u05 %" PRIx64, &u);
|
||||
u = d2u(xsinpi_u05(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cospi_u05 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cospi_u05 %" PRIx64, &u);
|
||||
u = d2u(xcospi_u05(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "tan ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tan %" PRIx64, &u);
|
||||
u = d2u(xtan(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "tan_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tan_u1 %" PRIx64, &u);
|
||||
u = d2u(xtan_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "asin ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "asin %" PRIx64, &u);
|
||||
u = d2u(xasin(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "acos ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "acos %" PRIx64, &u);
|
||||
u = d2u(xacos(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atan ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "atan %" PRIx64, &u);
|
||||
u = d2u(xatan(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log %" PRIx64, &u);
|
||||
u = d2u(xlog(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp %" PRIx64, &u);
|
||||
u = d2u(xexp(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atan2 ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "atan2 %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xatan2(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "asin_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "asin_u1 %" PRIx64, &u);
|
||||
u = d2u(xasin_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "acos_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "acos_u1 %" PRIx64, &u);
|
||||
u = d2u(xacos_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atan_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "atan_u1 %" PRIx64, &u);
|
||||
u = d2u(xatan_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atan2_u1 ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "atan2_u1 %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xatan2_u1(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log_u1 %" PRIx64, &u);
|
||||
u = d2u(xlog_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "pow ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "pow %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xpow(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sinh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sinh %" PRIx64, &u);
|
||||
u = d2u(xsinh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cosh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cosh %" PRIx64, &u);
|
||||
u = d2u(xcosh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "tanh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tanh %" PRIx64, &u);
|
||||
u = d2u(xtanh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sinh_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sinh_u35 %" PRIx64, &u);
|
||||
u = d2u(xsinh_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cosh_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cosh_u35 %" PRIx64, &u);
|
||||
u = d2u(xcosh_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "tanh_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tanh_u35 %" PRIx64, &u);
|
||||
u = d2u(xtanh_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "asinh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "asinh %" PRIx64, &u);
|
||||
u = d2u(xasinh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "acosh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "acosh %" PRIx64, &u);
|
||||
u = d2u(xacosh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "atanh ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "atanh %" PRIx64, &u);
|
||||
u = d2u(xatanh(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fma ")) {
|
||||
uint64_t u, v, w;
|
||||
sscanf(buf, "fma %" PRIx64 " %" PRIx64 " %" PRIx64, &u, &v, &w);
|
||||
u = d2u(xfma(u2d(u), u2d(v), u2d(w)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sqrt ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sqrt %" PRIx64, &u);
|
||||
u = d2u(xsqrt(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sqrt_u05 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sqrt_u05 %" PRIx64, &u);
|
||||
u = d2u(xsqrt_u05(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "sqrt_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "sqrt_u35 %" PRIx64, &u);
|
||||
u = d2u(xsqrt_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cbrt ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cbrt %" PRIx64, &u);
|
||||
u = d2u(xcbrt(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "cbrt_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "cbrt_u1 %" PRIx64, &u);
|
||||
u = d2u(xcbrt_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp2 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp2 %" PRIx64, &u);
|
||||
u = d2u(xexp2(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp2_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp2_u35 %" PRIx64, &u);
|
||||
u = d2u(xexp2_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp10 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp10 %" PRIx64, &u);
|
||||
u = d2u(xexp10(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "exp10_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "exp10_u35 %" PRIx64, &u);
|
||||
u = d2u(xexp10_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "expm1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "expm1 %" PRIx64, &u);
|
||||
u = d2u(xexpm1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log10 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log10 %" PRIx64, &u);
|
||||
u = d2u(xlog10(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log2 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log2 %" PRIx64, &u);
|
||||
u = d2u(xlog2(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log2_u35 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log2_u35 %" PRIx64, &u);
|
||||
u = d2u(xlog2_u35(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "log1p ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "log1p %" PRIx64, &u);
|
||||
u = d2u(xlog1p(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "ldexp ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "ldexp %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xldexp(u2d(u), (int)u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
}
|
||||
|
||||
else if (startsWith(buf, "hypot_u05 ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "hypot_u05 %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xhypot_u05(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "hypot_u35 ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "hypot_u35 %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xhypot_u35(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "copysign ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "copysign %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xcopysign(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fmax ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "fmax %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xfmax(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fmin ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "fmin %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xfmin(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fdim ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "fdim %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xfdim(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "nextafter ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "nextafter %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xnextafter(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fmod ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "fmod %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xfmod(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "remainder ")) {
|
||||
uint64_t u, v;
|
||||
sscanf(buf, "remainder %" PRIx64 " %" PRIx64, &u, &v);
|
||||
u = d2u(xremainder(u2d(u), u2d(v)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "fabs ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "fabs %" PRIx64, &u);
|
||||
u = d2u(xfabs(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "trunc ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "trunc %" PRIx64, &u);
|
||||
u = d2u(xtrunc(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "floor ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "floor %" PRIx64, &u);
|
||||
u = d2u(xfloor(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "ceil ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "ceil %" PRIx64, &u);
|
||||
u = d2u(xceil(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "round ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "round %" PRIx64, &u);
|
||||
u = d2u(xround(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "rint ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "rint %" PRIx64, &u);
|
||||
u = d2u(xrint(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "frfrexp ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "frfrexp %" PRIx64, &u);
|
||||
u = d2u(xfrfrexp(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "modf ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "modf %" PRIx64, &u);
|
||||
Sleef_double2 x = xmodf(u2d(u));
|
||||
printf("%" PRIx64 " %" PRIx64 "\n", d2u(x.x), d2u(x.y));
|
||||
} else if (startsWith(buf, "tgamma_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "tgamma_u1 %" PRIx64, &u);
|
||||
u = d2u(xtgamma_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "lgamma_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "lgamma_u1 %" PRIx64, &u);
|
||||
u = d2u(xlgamma_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "erf_u1 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "erf_u1 %" PRIx64, &u);
|
||||
u = d2u(xerf_u1(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
} else if (startsWith(buf, "erfc_u15 ")) {
|
||||
uint64_t u;
|
||||
sscanf(buf, "erfc_u15 %" PRIx64, &u);
|
||||
u = d2u(xerfc_u15(u2d(u)));
|
||||
printf("%" PRIx64 "\n", u);
|
||||
}
|
||||
|
||||
else if (startsWith(buf, "sinf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinf %x", &u);
|
||||
u = f2u(xsinf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cosf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cosf %x", &u);
|
||||
u = f2u(xcosf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sincosf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sincosf %x", &u);
|
||||
Sleef_float2 x = xsincosf(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "tanf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tanf %x", &u);
|
||||
u = f2u(xtanf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "asinf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "asinf %x", &u);
|
||||
u = f2u(xasinf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "acosf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "acosf %x", &u);
|
||||
u = f2u(xacosf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atanf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "atanf %x", &u);
|
||||
u = f2u(xatanf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atan2f ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "atan2f %x %x", &u, &v);
|
||||
u = f2u(xatan2f(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "logf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "logf %x", &u);
|
||||
u = f2u(xlogf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "expf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "expf %x", &u);
|
||||
u = f2u(xexpf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cbrtf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cbrtf %x", &u);
|
||||
u = f2u(xcbrtf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sqrtf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sqrtf %x", &u);
|
||||
u = f2u(xsqrtf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sqrtf_u05 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sqrtf_u05 %x", &u);
|
||||
u = f2u(xsqrtf_u05(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sqrtf_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sqrtf_u35 %x", &u);
|
||||
u = f2u(xsqrtf_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "ldexpf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "ldexpf %x %x", &u, &v);
|
||||
u = f2u(xldexpf(u2f(u), (int)u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "powf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "powf %x %x", &u, &v);
|
||||
u = f2u(xpowf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fastpowf_u3500 ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fastpowf_u3500 %x %x", &u, &v);
|
||||
u = f2u(xfastpowf_u3500(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sinhf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinhf %x", &u);
|
||||
u = f2u(xsinhf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "coshf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "coshf %x", &u);
|
||||
u = f2u(xcoshf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "tanhf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tanhf %x", &u);
|
||||
u = f2u(xtanhf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sinhf_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinhf_u35 %x", &u);
|
||||
u = f2u(xsinhf_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "coshf_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "coshf_u35 %x", &u);
|
||||
u = f2u(xcoshf_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "tanhf_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tanhf_u35 %x", &u);
|
||||
u = f2u(xtanhf_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "asinhf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "asinhf %x", &u);
|
||||
u = f2u(xasinhf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "acoshf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "acoshf %x", &u);
|
||||
u = f2u(xacoshf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atanhf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "atanhf %x", &u);
|
||||
u = f2u(xatanhf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "exp2f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "exp2f %x", &u);
|
||||
u = f2u(xexp2f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "exp10f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "exp10f %x", &u);
|
||||
u = f2u(xexp10f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "exp2f_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "exp2f_u35 %x", &u);
|
||||
u = f2u(xexp2f_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "exp10f_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "exp10f_u35 %x", &u);
|
||||
u = f2u(xexp10f_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "expm1f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "expm1f %x", &u);
|
||||
u = f2u(xexpm1f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "log10f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "log10f %x", &u);
|
||||
u = f2u(xlog10f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "log2f ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "log2f %x", &u);
|
||||
u = f2u(xlog2f(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "log2f_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "log2f_u35 %x", &u);
|
||||
u = f2u(xlog2f_u35(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "log1pf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "log1pf %x", &u);
|
||||
u = f2u(xlog1pf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sinf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinf_u1 %x", &u);
|
||||
u = f2u(xsinf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cosf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cosf_u1 %x", &u);
|
||||
u = f2u(xcosf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "sincosf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sincosf_u1 %x", &u);
|
||||
Sleef_float2 x = xsincosf_u1(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "sincospif_u05 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sincospif_u05 %x", &u);
|
||||
Sleef_float2 x = xsincospif_u05(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "sincospif_u35 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sincospif_u35 %x", &u);
|
||||
Sleef_float2 x = xsincospif_u35(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "sinpif_u05 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "sinpif_u05 %x", &u);
|
||||
u = f2u(xsinpif_u05(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cospif_u05 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cospif_u05 %x", &u);
|
||||
u = f2u(xcospif_u05(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fastsinf_u3500 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "fastsinf_u3500 %x", &u);
|
||||
u = f2u(xfastsinf_u3500(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fastcosf_u3500 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "fastcosf_u3500 %x", &u);
|
||||
u = f2u(xfastcosf_u3500(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "tanf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tanf_u1 %x", &u);
|
||||
u = f2u(xtanf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "asinf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "asinf_u1 %x", &u);
|
||||
u = f2u(xasinf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "acosf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "acosf_u1 %x", &u);
|
||||
u = f2u(xacosf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atanf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "atanf_u1 %x", &u);
|
||||
u = f2u(xatanf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "atan2f_u1 ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "atan2f_u1 %x %x", &u, &v);
|
||||
u = f2u(xatan2f_u1(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "logf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "logf_u1 %x", &u);
|
||||
u = f2u(xlogf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "cbrtf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "cbrtf_u1 %x", &u);
|
||||
u = f2u(xcbrtf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "ilogb ")) {
|
||||
uint64_t u;
|
||||
int i;
|
||||
sscanf(buf, "ilogb %" PRIx64, &u);
|
||||
i = xilogb(u2d(u));
|
||||
printf("%d\n", i);
|
||||
} else if (startsWith(buf, "ilogbf ")) {
|
||||
uint32_t u;
|
||||
int i;
|
||||
sscanf(buf, "ilogbf %x", &u);
|
||||
i = xilogbf(u2f(u));
|
||||
printf("%d\n", i);
|
||||
}
|
||||
|
||||
else if (startsWith(buf, "hypotf_u05 ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "hypotf_u05 %x %x", &u, &v);
|
||||
u = f2u(xhypotf_u05(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "hypotf_u35 ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "hypotf_u35 %x %x", &u, &v);
|
||||
u = f2u(xhypotf_u35(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "copysignf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "copysignf %x %x", &u, &v);
|
||||
u = f2u(xcopysignf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fmaxf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fmaxf %x %x", &u, &v);
|
||||
u = f2u(xfmaxf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fminf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fminf %x %x", &u, &v);
|
||||
u = f2u(xfminf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fdimf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fdimf %x %x", &u, &v);
|
||||
u = f2u(xfdimf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "nextafterf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "nextafterf %x %x", &u, &v);
|
||||
u = f2u(xnextafterf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fmodf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "fmodf %x %x", &u, &v);
|
||||
u = f2u(xfmodf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "remainderf ")) {
|
||||
uint32_t u, v;
|
||||
sscanf(buf, "remainderf %x %x", &u, &v);
|
||||
u = f2u(xremainderf(u2f(u), u2f(v)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "fabsf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "fabsf %x", &u);
|
||||
u = f2u(xfabsf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "truncf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "truncf %x", &u);
|
||||
u = f2u(xtruncf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "floorf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "floorf %x", &u);
|
||||
u = f2u(xfloorf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "ceilf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "ceilf %x", &u);
|
||||
u = f2u(xceilf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "roundf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "roundf %x", &u);
|
||||
u = f2u(xroundf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "rintf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "rintf %x", &u);
|
||||
u = f2u(xrintf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "frfrexpf ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "frfrexpf %x", &u);
|
||||
u = f2u(xfrfrexpf(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "modff ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "modff %x", &u);
|
||||
Sleef_float2 x = xmodff(u2f(u));
|
||||
printf("%x %x\n", f2u(x.x), f2u(x.y));
|
||||
} else if (startsWith(buf, "tgammaf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "tgammaf_u1 %x", &u);
|
||||
u = f2u(xtgammaf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "lgammaf_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "lgammaf_u1 %x", &u);
|
||||
u = f2u(xlgammaf_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "erff_u1 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "erff_u1 %x", &u);
|
||||
u = f2u(xerff_u1(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
} else if (startsWith(buf, "erfcf_u15 ")) {
|
||||
uint32_t u;
|
||||
sscanf(buf, "erfcf_u15 %x", &u);
|
||||
u = f2u(xerfcf_u15(u2f(u)));
|
||||
printf("%x\n", u);
|
||||
}
|
||||
|
||||
else {
|
||||
break;
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,546 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <float.h>
|
||||
#include <stdint.h>
|
||||
#include <cuda.h>
|
||||
|
||||
#include "sleefinline_purec_scalar.h"
|
||||
#include "sleefinline_cuda.h"
|
||||
|
||||
#define STDIN_FILENO 0
|
||||
|
||||
#define SIMD_SUFFIX _cuda_sleef
|
||||
#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix
|
||||
#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix)
|
||||
|
||||
#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX)
|
||||
#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX)
|
||||
|
||||
//
|
||||
|
||||
// Returns nonzero iff 'str' begins with 'prefix' (an empty prefix matches
// any string). Used to dispatch on the command word at the start of each
// line of the test protocol.
static int startsWith(const char *str, const char *prefix) {
  size_t plen = strlen(prefix);
  return strncmp(str, prefix, plen) == 0;
}
|
||||
|
||||
// Reinterpret the 64-bit pattern 'u' as an IEEE-754 double, bit for bit
// (no numeric conversion). memcpy is the standard-conforming way to
// type-pun; compilers lower it to a plain register move.
static double u2d(uint64_t u) {
  double d;
  memcpy(&d, &u, sizeof d);
  return d;
}
|
||||
|
||||
// Reinterpret the double 'd' as its raw 64-bit IEEE-754 encoding, bit for
// bit. Inverse of u2d; results are printed as hex in the test protocol.
static uint64_t d2u(double d) {
  uint64_t u;
  memcpy(&u, &d, sizeof u);
  return u;
}
|
||||
|
||||
// Reinterpret the 32-bit pattern 'u' as an IEEE-754 float, bit for bit
// (no numeric conversion).
static float u2f(uint32_t u) {
  float f;
  memcpy(&f, &u, sizeof f);
  return f;
}
|
||||
|
||||
// Reinterpret the float 'd' as its raw 32-bit IEEE-754 encoding, bit for
// bit. Inverse of u2f.
static uint32_t f2u(float d) {
  uint32_t u;
  memcpy(&u, &d, sizeof u);
  return u;
}
|
||||
|
||||
//
|
||||
|
||||
// Double-precision test kernels. Each is a trivial single-thread CUDA
// kernel wrapping one SLEEF scalar CUDA function (Sleef_<name>d1_<acc>cuda),
// reading its argument(s) and writing its result through device pointers.
// The suffix encodes the accuracy bound: _u10 = 1.0 ULP, _u05 = 0.5 ULP,
// _u35 = 3.5 ULP, _u15 = 1.5 ULP; no suffix = correctly rounded where
// the SLEEF API defines it so.
__global__ void xsin(double *r, double *a0) { *r = Sleef_sind1_u35cuda(*a0); }
__global__ void xcos(double *r, double *a0) { *r = Sleef_cosd1_u35cuda(*a0); }
__global__ void xsincos(vdouble2 *r, double *a0) { *r = Sleef_sincosd1_u35cuda(*a0); }
__global__ void xtan(double *r, double *a0) { *r = Sleef_tand1_u35cuda(*a0); }
__global__ void xasin(double *r, double *a0) { *r = Sleef_asind1_u35cuda(*a0); }
__global__ void xacos(double *r, double *a0) { *r = Sleef_acosd1_u35cuda(*a0); }
__global__ void xatan(double *r, double *a0) { *r = Sleef_atand1_u35cuda(*a0); }
__global__ void xatan2(double *r, double *a0, double *a1) { *r = Sleef_atan2d1_u35cuda(*a0, *a1); }
__global__ void xlog(double *r, double *a0) { *r = Sleef_logd1_u35cuda(*a0); }
__global__ void xcbrt(double *r, double *a0) { *r = Sleef_cbrtd1_u35cuda(*a0); }
__global__ void xsin_u1(double *r, double *a0) { *r = Sleef_sind1_u10cuda(*a0); }
__global__ void xcos_u1(double *r, double *a0) { *r = Sleef_cosd1_u10cuda(*a0); }
__global__ void xsincos_u1(vdouble2 *r, double *a0) { *r = Sleef_sincosd1_u10cuda(*a0); }
__global__ void xtan_u1(double *r, double *a0) { *r = Sleef_tand1_u10cuda(*a0); }
__global__ void xasin_u1(double *r, double *a0) { *r = Sleef_asind1_u10cuda(*a0); }
__global__ void xacos_u1(double *r, double *a0) { *r = Sleef_acosd1_u10cuda(*a0); }
__global__ void xatan_u1(double *r, double *a0) { *r = Sleef_atand1_u10cuda(*a0); }
__global__ void xatan2_u1(double *r, double *a0, double *a1) { *r = Sleef_atan2d1_u10cuda(*a0, *a1); }
__global__ void xlog_u1(double *r, double *a0) { *r = Sleef_logd1_u10cuda(*a0); }
__global__ void xcbrt_u1(double *r, double *a0) { *r = Sleef_cbrtd1_u10cuda(*a0); }
__global__ void xexp(double *r, double *a0) { *r = Sleef_expd1_u10cuda(*a0); }
__global__ void xpow(double *r, double *a0, double *a1) { *r = Sleef_powd1_u10cuda(*a0, *a1); }
__global__ void xsinh(double *r, double *a0) { *r = Sleef_sinhd1_u10cuda(*a0); }
__global__ void xcosh(double *r, double *a0) { *r = Sleef_coshd1_u10cuda(*a0); }
__global__ void xtanh(double *r, double *a0) { *r = Sleef_tanhd1_u10cuda(*a0); }
__global__ void xsinh_u35(double *r, double *a0) { *r = Sleef_sinhd1_u35cuda(*a0); }
__global__ void xcosh_u35(double *r, double *a0) { *r = Sleef_coshd1_u35cuda(*a0); }
__global__ void xtanh_u35(double *r, double *a0) { *r = Sleef_tanhd1_u35cuda(*a0); }
__global__ void xasinh(double *r, double *a0) { *r = Sleef_asinhd1_u10cuda(*a0); }
__global__ void xacosh(double *r, double *a0) { *r = Sleef_acoshd1_u10cuda(*a0); }
__global__ void xatanh(double *r, double *a0) { *r = Sleef_atanhd1_u10cuda(*a0); }
__global__ void xexp2(double *r, double *a0) { *r = Sleef_exp2d1_u10cuda(*a0); }
__global__ void xexp2_u35(double *r, double *a0) { *r = Sleef_exp2d1_u35cuda(*a0); }
__global__ void xexp10(double *r, double *a0) { *r = Sleef_exp10d1_u10cuda(*a0); }
__global__ void xexp10_u35(double *r, double *a0) { *r = Sleef_exp10d1_u35cuda(*a0); }
__global__ void xexpm1(double *r, double *a0) { *r = Sleef_expm1d1_u10cuda(*a0); }
__global__ void xlog10(double *r, double *a0) { *r = Sleef_log10d1_u10cuda(*a0); }
__global__ void xlog2(double *r, double *a0) { *r = Sleef_log2d1_u10cuda(*a0); }
__global__ void xlog2_u35(double *r, double *a0) { *r = Sleef_log2d1_u35cuda(*a0); }
__global__ void xlog1p(double *r, double *a0) { *r = Sleef_log1pd1_u10cuda(*a0); }
__global__ void xsincospi_u05(vdouble2 *r, double *a0) { *r = Sleef_sincospid1_u05cuda(*a0); }
__global__ void xsincospi_u35(vdouble2 *r, double *a0) { *r = Sleef_sincospid1_u35cuda(*a0); }
__global__ void xsinpi_u05(double *r, double *a0) { *r = Sleef_sinpid1_u05cuda(*a0); }
__global__ void xcospi_u05(double *r, double *a0) { *r = Sleef_cospid1_u05cuda(*a0); }
__global__ void xldexp(double *r, double *a0, int *a1) { *r = Sleef_ldexpd1_cuda(*a0, *a1); }
__global__ void xilogb(int *r, double *a0) { *r = Sleef_ilogbd1_cuda(*a0); }
__global__ void xfma(double *r, double *a0, double *a1, double *a2) { *r = Sleef_fmad1_cuda(*a0, *a1, *a2); }
__global__ void xsqrt(double *r, double *a0) { *r = Sleef_sqrtd1_cuda(*a0); }
__global__ void xsqrt_u05(double *r, double *a0) { *r = Sleef_sqrtd1_u05cuda(*a0); }
__global__ void xsqrt_u35(double *r, double *a0) { *r = Sleef_sqrtd1_u35cuda(*a0); }
__global__ void xhypot_u05(double *r, double *a0, double *a1) { *r = Sleef_hypotd1_u05cuda(*a0, *a1); }
__global__ void xhypot_u35(double *r, double *a0, double *a1) { *r = Sleef_hypotd1_u35cuda(*a0, *a1); }
__global__ void xfabs(double *r, double *a0) { *r = Sleef_fabsd1_cuda(*a0); }
__global__ void xcopysign(double *r, double *a0, double *a1) { *r = Sleef_copysignd1_cuda(*a0, *a1); }
__global__ void xfmax(double *r, double *a0, double *a1) { *r = Sleef_fmaxd1_cuda(*a0, *a1); }
__global__ void xfmin(double *r, double *a0, double *a1) { *r = Sleef_fmind1_cuda(*a0, *a1); }
__global__ void xfdim(double *r, double *a0, double *a1) { *r = Sleef_fdimd1_cuda(*a0, *a1); }
__global__ void xtrunc(double *r, double *a0) { *r = Sleef_truncd1_cuda(*a0); }
__global__ void xfloor(double *r, double *a0) { *r = Sleef_floord1_cuda(*a0); }
__global__ void xceil(double *r, double *a0) { *r = Sleef_ceild1_cuda(*a0); }
__global__ void xround(double *r, double *a0) { *r = Sleef_roundd1_cuda(*a0); }
__global__ void xrint(double *r, double *a0) { *r = Sleef_rintd1_cuda(*a0); }
__global__ void xnextafter(double *r, double *a0, double *a1) { *r = Sleef_nextafterd1_cuda(*a0, *a1); }
__global__ void xfrfrexp(double *r, double *a0) { *r = Sleef_frfrexpd1_cuda(*a0); }
__global__ void xexpfrexp(int *r, double *a0) { *r = Sleef_expfrexpd1_cuda(*a0); }
__global__ void xfmod(double *r, double *a0, double *a1) { *r = Sleef_fmodd1_cuda(*a0, *a1); }
__global__ void xremainder(double *r, double *a0, double *a1) { *r = Sleef_remainderd1_cuda(*a0, *a1); }
__global__ void xmodf(vdouble2 *r, double *a0) { *r = Sleef_modfd1_cuda(*a0); }
__global__ void xlgamma_u1(double *r, double *a0) { *r = Sleef_lgammad1_u10cuda(*a0); }
__global__ void xtgamma_u1(double *r, double *a0) { *r = Sleef_tgammad1_u10cuda(*a0); }
__global__ void xerf_u1(double *r, double *a0) { *r = Sleef_erfd1_u10cuda(*a0); }
__global__ void xerfc_u15(double *r, double *a0) { *r = Sleef_erfcd1_u15cuda(*a0); }
|
||||
|
||||
// Single-precision test kernels, mirroring the double-precision set above
// in structure: one single-thread kernel per SLEEF scalar CUDA function
// (Sleef_<name>f1_<acc>cuda). The _u3500 "fast" variants trade accuracy
// for speed per the SLEEF API.
__global__ void xsinf(float *r, float *a0) { *r = Sleef_sinf1_u35cuda(*a0); }
__global__ void xcosf(float *r, float *a0) { *r = Sleef_cosf1_u35cuda(*a0); }
__global__ void xsincosf(vfloat2 *r, float *a0) { *r = Sleef_sincosf1_u35cuda(*a0); }
__global__ void xtanf(float *r, float *a0) { *r = Sleef_tanf1_u35cuda(*a0); }
__global__ void xasinf(float *r, float *a0) { *r = Sleef_asinf1_u35cuda(*a0); }
__global__ void xacosf(float *r, float *a0) { *r = Sleef_acosf1_u35cuda(*a0); }
__global__ void xatanf(float *r, float *a0) { *r = Sleef_atanf1_u35cuda(*a0); }
__global__ void xatan2f(float *r, float *a0, float *a1) { *r = Sleef_atan2f1_u35cuda(*a0, *a1); }
__global__ void xlogf(float *r, float *a0) { *r = Sleef_logf1_u35cuda(*a0); }
__global__ void xcbrtf(float *r, float *a0) { *r = Sleef_cbrtf1_u35cuda(*a0); }
__global__ void xsinf_u1(float *r, float *a0) { *r = Sleef_sinf1_u10cuda(*a0); }
__global__ void xcosf_u1(float *r, float *a0) { *r = Sleef_cosf1_u10cuda(*a0); }
__global__ void xsincosf_u1(vfloat2 *r, float *a0) { *r = Sleef_sincosf1_u10cuda(*a0); }
__global__ void xtanf_u1(float *r, float *a0) { *r = Sleef_tanf1_u10cuda(*a0); }
__global__ void xasinf_u1(float *r, float *a0) { *r = Sleef_asinf1_u10cuda(*a0); }
__global__ void xacosf_u1(float *r, float *a0) { *r = Sleef_acosf1_u10cuda(*a0); }
__global__ void xatanf_u1(float *r, float *a0) { *r = Sleef_atanf1_u10cuda(*a0); }
__global__ void xatan2f_u1(float *r, float *a0, float *a1) { *r = Sleef_atan2f1_u10cuda(*a0, *a1); }
__global__ void xlogf_u1(float *r, float *a0) { *r = Sleef_logf1_u10cuda(*a0); }
__global__ void xcbrtf_u1(float *r, float *a0) { *r = Sleef_cbrtf1_u10cuda(*a0); }
__global__ void xexpf(float *r, float *a0) { *r = Sleef_expf1_u10cuda(*a0); }
__global__ void xpowf(float *r, float *a0, float *a1) { *r = Sleef_powf1_u10cuda(*a0, *a1); }
__global__ void xsinhf(float *r, float *a0) { *r = Sleef_sinhf1_u10cuda(*a0); }
__global__ void xcoshf(float *r, float *a0) { *r = Sleef_coshf1_u10cuda(*a0); }
__global__ void xtanhf(float *r, float *a0) { *r = Sleef_tanhf1_u10cuda(*a0); }
__global__ void xsinhf_u35(float *r, float *a0) { *r = Sleef_sinhf1_u35cuda(*a0); }
__global__ void xcoshf_u35(float *r, float *a0) { *r = Sleef_coshf1_u35cuda(*a0); }
__global__ void xtanhf_u35(float *r, float *a0) { *r = Sleef_tanhf1_u35cuda(*a0); }
__global__ void xfastsinf_u3500(float *r, float *a0) { *r = Sleef_fastsinf1_u3500cuda(*a0); }
__global__ void xfastcosf_u3500(float *r, float *a0) { *r = Sleef_fastcosf1_u3500cuda(*a0); }
__global__ void xfastpowf_u3500(float *r, float *a0, float *a1) { *r = Sleef_fastpowf1_u3500cuda(*a0, *a1); }
__global__ void xasinhf(float *r, float *a0) { *r = Sleef_asinhf1_u10cuda(*a0); }
__global__ void xacoshf(float *r, float *a0) { *r = Sleef_acoshf1_u10cuda(*a0); }
__global__ void xatanhf(float *r, float *a0) { *r = Sleef_atanhf1_u10cuda(*a0); }
__global__ void xexp2f(float *r, float *a0) { *r = Sleef_exp2f1_u10cuda(*a0); }
__global__ void xexp2f_u35(float *r, float *a0) { *r = Sleef_exp2f1_u35cuda(*a0); }
__global__ void xexp10f(float *r, float *a0) { *r = Sleef_exp10f1_u10cuda(*a0); }
__global__ void xexp10f_u35(float *r, float *a0) { *r = Sleef_exp10f1_u35cuda(*a0); }
__global__ void xexpm1f(float *r, float *a0) { *r = Sleef_expm1f1_u10cuda(*a0); }
__global__ void xlog10f(float *r, float *a0) { *r = Sleef_log10f1_u10cuda(*a0); }
__global__ void xlog2f(float *r, float *a0) { *r = Sleef_log2f1_u10cuda(*a0); }
__global__ void xlog2f_u35(float *r, float *a0) { *r = Sleef_log2f1_u35cuda(*a0); }
__global__ void xlog1pf(float *r, float *a0) { *r = Sleef_log1pf1_u10cuda(*a0); }
__global__ void xsincospif_u05(vfloat2 *r, float *a0) { *r = Sleef_sincospif1_u05cuda(*a0); }
__global__ void xsincospif_u35(vfloat2 *r, float *a0) { *r = Sleef_sincospif1_u35cuda(*a0); }
__global__ void xsinpif_u05(float *r, float *a0) { *r = Sleef_sinpif1_u05cuda(*a0); }
__global__ void xcospif_u05(float *r, float *a0) { *r = Sleef_cospif1_u05cuda(*a0); }
__global__ void xldexpf(float *r, float *a0, int *a1) { *r = Sleef_ldexpf1_cuda(*a0, *a1); }
__global__ void xilogbf(int *r, float *a0) { *r = Sleef_ilogbf1_cuda(*a0); }
__global__ void xfmaf(float *r, float *a0, float *a1, float *a2) { *r = Sleef_fmaf1_cuda(*a0, *a1, *a2); }
__global__ void xsqrtf(float *r, float *a0) { *r = Sleef_sqrtf1_cuda(*a0); }
__global__ void xsqrtf_u05(float *r, float *a0) { *r = Sleef_sqrtf1_u05cuda(*a0); }
__global__ void xsqrtf_u35(float *r, float *a0) { *r = Sleef_sqrtf1_u35cuda(*a0); }
__global__ void xhypotf_u05(float *r, float *a0, float *a1) { *r = Sleef_hypotf1_u05cuda(*a0, *a1); }
__global__ void xhypotf_u35(float *r, float *a0, float *a1) { *r = Sleef_hypotf1_u35cuda(*a0, *a1); }
__global__ void xfabsf(float *r, float *a0) { *r = Sleef_fabsf1_cuda(*a0); }
__global__ void xcopysignf(float *r, float *a0, float *a1) { *r = Sleef_copysignf1_cuda(*a0, *a1); }
__global__ void xfmaxf(float *r, float *a0, float *a1) { *r = Sleef_fmaxf1_cuda(*a0, *a1); }
__global__ void xfminf(float *r, float *a0, float *a1) { *r = Sleef_fminf1_cuda(*a0, *a1); }
__global__ void xfdimf(float *r, float *a0, float *a1) { *r = Sleef_fdimf1_cuda(*a0, *a1); }
__global__ void xtruncf(float *r, float *a0) { *r = Sleef_truncf1_cuda(*a0); }
__global__ void xfloorf(float *r, float *a0) { *r = Sleef_floorf1_cuda(*a0); }
__global__ void xceilf(float *r, float *a0) { *r = Sleef_ceilf1_cuda(*a0); }
__global__ void xroundf(float *r, float *a0) { *r = Sleef_roundf1_cuda(*a0); }
__global__ void xrintf(float *r, float *a0) { *r = Sleef_rintf1_cuda(*a0); }
__global__ void xnextafterf(float *r, float *a0, float *a1) { *r = Sleef_nextafterf1_cuda(*a0, *a1); }
__global__ void xfrfrexpf(float *r, float *a0) { *r = Sleef_frfrexpf1_cuda(*a0); }
// NOTE(review): xexpfrexpf stores into a float* while the double version
// xexpfrexp uses int* — looks inconsistent; confirm against the
// Sleef_expfrexpf1_cuda return type before relying on it.
__global__ void xexpfrexpf(float *r, float *a0) { *r = Sleef_expfrexpf1_cuda(*a0); }
__global__ void xfmodf(float *r, float *a0, float *a1) { *r = Sleef_fmodf1_cuda(*a0, *a1); }
__global__ void xremainderf(float *r, float *a0, float *a1) { *r = Sleef_remainderf1_cuda(*a0, *a1); }
__global__ void xmodff(vfloat2 *r, float *a0) { *r = Sleef_modff1_cuda(*a0); }
__global__ void xlgammaf_u1(float *r, float *a0) { *r = Sleef_lgammaf1_u10cuda(*a0); }
__global__ void xtgammaf_u1(float *r, float *a0) { *r = Sleef_tgammaf1_u10cuda(*a0); }
__global__ void xerff_u1(float *r, float *a0) { *r = Sleef_erff1_u10cuda(*a0); }
__global__ void xerfcf_u15(float *r, float *a0) { *r = Sleef_erfcf1_u15cuda(*a0); }
|
||||
|
||||
//
|
||||
|
||||
// Protocol loop for a unary double function: while the current input line in
// caller-scope 'buf' starts with funcStr, parse one hex-encoded double, run
// kernel funcName on device buffers r/a0, print the result bits in hex, and
// read the next line from stdin (stopping on EOF). Relies on caller-scope
// buf, a0, r and BUFSIZE.
#define func_d_d(funcStr, funcName) { \
    while (startsWith(buf, funcStr " ")) { \
      uint64_t u; \
      sscanf(buf, funcStr " %" PRIx64, &u); \
      *a0 = u2d(u); \
      funcName<<<1, 1>>>(r, a0); \
      cudaDeviceSynchronize(); \
      printf("%" PRIx64 "\n", d2u(*r)); \
      fflush(stdout); \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
    } \
  }
|
||||
|
||||
// Protocol loop for a unary function returning a double pair (vdouble2,
// e.g. sincos/modf): same structure as func_d_d but the kernel writes into
// caller-scope 'r2' and both components are printed as hex.
#define func_d2_d(funcStr, funcName) { \
    while (startsWith(buf, funcStr " ")) { \
      uint64_t u; \
      sscanf(buf, funcStr " %" PRIx64, &u); \
      *a0 = u2d(u); \
      funcName<<<1, 1>>>(r2, a0); \
      cudaDeviceSynchronize(); \
      printf("%" PRIx64 " %" PRIx64 "\n", d2u(r2->x), d2u(r2->y)); \
      fflush(stdout); \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
    } \
  }
|
||||
|
||||
// Protocol loop for a binary double function (e.g. atan2/pow/hypot): parses
// two hex-encoded doubles into caller-scope a0/a1, launches funcName, prints
// the double result bits in hex.
#define func_d_d_d(funcStr, funcName) { \
    while (startsWith(buf, funcStr " ")) { \
      uint64_t u, v; \
      sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v); \
      *a0 = u2d(u); \
      *a1 = u2d(v); \
      funcName<<<1, 1>>>(r, a0, a1); \
      cudaDeviceSynchronize(); \
      printf("%" PRIx64 "\n", d2u(*r)); \
      fflush(stdout); \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break; \
    } \
  }
|
||||
|
||||
// Protocol handler: (double, int) -> double. The second argument arrives as
// a 64-bit hex double bit pattern and is truncated to int before the call
// (matches the wire format used by the SLEEF tester harness).
#define func_d_d_i(funcStr, funcName) {                                 \
    while (startsWith(buf, funcStr " ")) {                              \
      uint64_t u, v;                                                    \
      sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v);             \
      *a0 = u2d(u);                                                     \
      *i0 = (int)u2d(v);                                                \
      funcName<<<1, 1>>>(r, a0, i0);                                    \
      cudaDeviceSynchronize();                                          \
      printf("%" PRIx64 "\n", d2u(*r));                                 \
      fflush(stdout);                                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;                  \
    }                                                                   \
  }
|
||||
|
||||
// Protocol handler: double -> int. The kernel writes its integer result into
// the managed slot i0, which is printed in decimal.
#define func_i_d(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint64_t u;                                       \
      sscanf(buf, funcStr " %" PRIx64, &u);             \
      *a0 = u2d(u);                                     \
      funcName<<<1, 1>>>(i0, a0);                       \
      cudaDeviceSynchronize();                          \
      printf("%d\n", *i0);                              \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
//
|
||||
|
||||
// Protocol handler: float -> float. Same scheme as func_d_d but with 32-bit
// hex bit patterns ("%x") and the float scratch slots s/b0.
#define func_f_f(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u;                                       \
      sscanf(buf, funcStr " %x", &u);                   \
      *b0 = u2f(u);                                     \
      funcName<<<1, 1>>>(s, b0);                        \
      cudaDeviceSynchronize();                          \
      printf("%x\n", f2u(*s));                          \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Protocol handler: float -> (float, float). The kernel writes a vfloat2
// pair (s2); both members are printed as 32-bit hex bit patterns.
#define func_f2_f(funcStr, funcName) {                  \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u;                                       \
      sscanf(buf, funcStr " %x", &u);                   \
      *b0 = u2f(u);                                     \
      funcName<<<1, 1>>>(s2, b0);                       \
      cudaDeviceSynchronize();                          \
      printf("%x %x\n", f2u(s2->x), f2u(s2->y));        \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Protocol handler: (float, float) -> float. Two 32-bit hex bit patterns in,
// one out, via the managed slots b0/b1/s.
#define func_f_f_f(funcStr, funcName) {                 \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u, v;                                    \
      sscanf(buf, funcStr " %x %x", &u, &v);            \
      *b0 = u2f(u);                                     \
      *b1 = u2f(v);                                     \
      funcName<<<1, 1>>>(s, b0, b1);                    \
      cudaDeviceSynchronize();                          \
      printf("%x\n", f2u(*s));                          \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
//
|
||||
|
||||
#define BUFSIZE 1024

// Host driver for the scalar CUDA tester. Speaks the SLEEF tester protocol:
// prints a capability word, then for each "<func> <hex-args>" request on
// stdin dispatches to the matching kernel wrapper via the func_* macros and
// answers with the hex bit pattern(s) of the device result.
int main(int argc, char **argv) {
#if 0
  // Optional device enumeration via the CUDA driver API (intentionally
  // disabled upstream; kept for debugging).
  cuInit(0);

  int ndevice;
  cuDeviceGetCount(&ndevice);
  if (ndevice == 0) {
    fprintf(stderr, "No cuda device available\n");
    exit(0);
  }

  CUdevice device;
  char deviceName[1024];
  cuDeviceGet(&device, 0);
  cuDeviceGetName(deviceName, 1000, device);
  fprintf(stderr, "Device : %s\n", deviceName);
#endif

  // Spin-wait on cudaDeviceSynchronize() to minimize per-request latency.
  cudaSetDeviceFlags(cudaDeviceScheduleSpin);

  // Managed-memory scratch slots shared by host and device; one element each.
  vdouble2 *r2;
  vfloat2 *s2;
  double *r, *a0, *a1, *a2;
  float *s, *b0, *b1, *b2;
  int *i0;
  cudaMallocManaged(&r , 1*sizeof(double));
  cudaMallocManaged(&r2, 1*sizeof(vdouble2));
  cudaMallocManaged(&a0, 1*sizeof(double));
  cudaMallocManaged(&a1, 1*sizeof(double));
  cudaMallocManaged(&a2, 1*sizeof(double));
  cudaMallocManaged(&s , 1*sizeof(float));
  cudaMallocManaged(&s2, 1*sizeof(vfloat2));
  cudaMallocManaged(&b0, 1*sizeof(float));
  cudaMallocManaged(&b1, 1*sizeof(float));
  cudaMallocManaged(&b2, 1*sizeof(float));
  cudaMallocManaged(&i0, 1*sizeof(int));

  // Capability word: 1 (double precision) + 2 (single precision).
  printf("3\n");
  fflush(stdout);

  char buf[BUFSIZE];
  if (fgets(buf, BUFSIZE-1, stdin)) {}

  while(!feof(stdin)) {
    func_d_d("sin", xsin);
    func_d_d("cos", xcos);
    func_d_d("tan", xtan);
    func_d_d("asin", xasin);
    func_d_d("acos", xacos);
    func_d_d("atan", xatan);
    func_d_d("log", xlog);
    func_d_d("exp", xexp);

    func_d_d("sqrt", xsqrt);
    func_d_d("sqrt_u05", xsqrt_u05);
    func_d_d("sqrt_u35", xsqrt_u35);
    func_d_d("cbrt", xcbrt);
    func_d_d("cbrt_u1", xcbrt_u1);

    func_d_d("sinh", xsinh);
    func_d_d("cosh", xcosh);
    func_d_d("tanh", xtanh);
    func_d_d("sinh_u35", xsinh_u35);
    func_d_d("cosh_u35", xcosh_u35);
    func_d_d("tanh_u35", xtanh_u35);
    func_d_d("asinh", xasinh);
    func_d_d("acosh", xacosh);
    func_d_d("atanh", xatanh);

    func_d_d("sin_u1", xsin_u1);
    func_d_d("cos_u1", xcos_u1);
    func_d_d("tan_u1", xtan_u1);
    func_d_d("sinpi_u05", xsinpi_u05);
    func_d_d("cospi_u05", xcospi_u05);
    func_d_d("asin_u1", xasin_u1);
    func_d_d("acos_u1", xacos_u1);
    func_d_d("atan_u1", xatan_u1);
    func_d_d("log_u1", xlog_u1);

    func_d_d("exp2", xexp2);
    func_d_d("exp10", xexp10);
    func_d_d("exp2_u35", xexp2_u35);
    func_d_d("exp10_u35", xexp10_u35);
    func_d_d("expm1", xexpm1);
    func_d_d("log10", xlog10);
    func_d_d("log2", xlog2);
    func_d_d("log2_u35", xlog2_u35);
    func_d_d("log1p", xlog1p);
    func_d_d("fabs", xfabs);
    func_d_d("trunc", xtrunc);
    func_d_d("floor", xfloor);
    func_d_d("ceil", xceil);
    func_d_d("round", xround);
    func_d_d("rint", xrint);
    func_d_d("frfrexp", xfrfrexp);
    func_d_d("tgamma_u1", xtgamma_u1);
    func_d_d("lgamma_u1", xlgamma_u1);
    func_d_d("erf_u1", xerf_u1);
    func_d_d("erfc_u15", xerfc_u15);

    func_d2_d("sincos", xsincos);
    func_d2_d("sincos_u1", xsincos_u1);
    func_d2_d("sincospi_u35", xsincospi_u35);
    func_d2_d("sincospi_u05", xsincospi_u05);
    func_d2_d("modf", xmodf);

    func_d_d_d("pow", xpow);
    func_d_d_d("atan2", xatan2);
    func_d_d_d("atan2_u1", xatan2_u1);
    func_d_d_d("hypot_u05", xhypot_u05);
    func_d_d_d("hypot_u35", xhypot_u35);
    func_d_d_d("copysign", xcopysign);
    func_d_d_d("fmax", xfmax);
    func_d_d_d("fmin", xfmin);
    func_d_d_d("fdim", xfdim);
    func_d_d_d("nextafter", xnextafter);
    func_d_d_d("fmod", xfmod);
    func_d_d_d("remainder", xremainder);

    func_d_d_i("ldexp", xldexp);
    func_i_d("ilogb", xilogb);
    func_i_d("expfrexp", xexpfrexp);

    //

    func_f_f("sinf", xsinf);
    func_f_f("cosf", xcosf);
    func_f_f("tanf", xtanf);
    func_f_f("asinf", xasinf);
    func_f_f("acosf", xacosf);
    func_f_f("atanf", xatanf);
    func_f_f("logf", xlogf);
    func_f_f("expf", xexpf);

    func_f_f("sqrtf", xsqrtf);
    func_f_f("sqrtf_u05", xsqrtf_u05);
    func_f_f("sqrtf_u35", xsqrtf_u35);
    func_f_f("cbrtf", xcbrtf);
    func_f_f("cbrtf_u1", xcbrtf_u1);

    func_f_f("sinhf", xsinhf);
    func_f_f("coshf", xcoshf);
    func_f_f("tanhf", xtanhf);
    func_f_f("sinhf_u35", xsinhf_u35);
    func_f_f("coshf_u35", xcoshf_u35);
    func_f_f("tanhf_u35", xtanhf_u35);
    func_f_f("asinhf", xasinhf);
    func_f_f("acoshf", xacoshf);
    func_f_f("atanhf", xatanhf);

    func_f_f("sinf_u1", xsinf_u1);
    func_f_f("cosf_u1", xcosf_u1);
    func_f_f("tanf_u1", xtanf_u1);
    func_f_f("sinpif_u05", xsinpif_u05);
    func_f_f("cospif_u05", xcospif_u05);
    func_f_f("asinf_u1", xasinf_u1);
    func_f_f("acosf_u1", xacosf_u1);
    func_f_f("atanf_u1", xatanf_u1);
    func_f_f("logf_u1", xlogf_u1);

    func_f_f("exp2f", xexp2f);
    func_f_f("exp10f", xexp10f);
    func_f_f("exp2f_u35", xexp2f_u35);
    func_f_f("exp10f_u35", xexp10f_u35);
    func_f_f("expm1f", xexpm1f);
    func_f_f("log10f", xlog10f);
    func_f_f("log2f", xlog2f);
    func_f_f("log2f_u35", xlog2f_u35);
    func_f_f("log1pf", xlog1pf);

    func_f2_f("sincosf", xsincosf);
    func_f2_f("sincosf_u1", xsincosf_u1);
    func_f2_f("sincospif_u35", xsincospif_u35);
    func_f2_f("sincospif_u05", xsincospif_u05);

    func_f_f_f("powf", xpowf);
    func_f_f_f("atan2f", xatan2f);
    func_f_f_f("atan2f_u1", xatan2f_u1);

    func_f_f("fabsf", xfabsf);
    func_f_f("truncf", xtruncf);
    func_f_f("floorf", xfloorf);
    func_f_f("ceilf", xceilf);
    func_f_f("roundf", xroundf);
    func_f_f("rintf", xrintf);
    func_f_f("frfrexpf", xfrfrexpf);

    func_f_f_f("hypotf_u05", xhypotf_u05);
    func_f_f_f("hypotf_u35", xhypotf_u35);
    func_f_f_f("copysignf", xcopysignf);
    func_f_f_f("fmaxf", xfmaxf);
    func_f_f_f("fminf", xfminf);
    func_f_f_f("fdimf", xfdimf);
    func_f_f_f("nextafterf", xnextafterf);
    func_f_f_f("fmodf", xfmodf);
    func_f_f_f("remainderf", xremainderf);

    func_f2_f("modff", xmodff);

    func_f_f("tgammaf_u1", xtgammaf_u1);
    func_f_f("lgammaf_u1", xlgammaf_u1);
    func_f_f("erff_u1", xerff_u1);
    func_f_f("erfcf_u15", xerfcf_u15);

    func_f_f("fastsinf_u3500", xfastsinf_u3500);
    func_f_f("fastcosf_u3500", xfastcosf_u3500);
    func_f_f_f("fastpowf_u3500", xfastpowf_u3500);
  }

  return 0;
}
|
||||
@@ -0,0 +1,859 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2023.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <inttypes.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define STDIN_FILENO 0
|
||||
#else
|
||||
#include <unistd.h>
|
||||
#include <sys/types.h>
|
||||
#endif
|
||||
|
||||
#include "quaddef.h"
|
||||
#include "misc.h"
|
||||
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#include "sleef.h"
|
||||
#else // #if !defined(USE_INLINE_HEADER)
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
|
||||
#if defined(__AVX2__) || defined(__aarch64__) || defined(__arm__) || defined(__powerpc64__)
|
||||
#ifndef FP_FAST_FMA
|
||||
#define FP_FAST_FMA
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER) && !defined(__STDC__)
|
||||
#define __STDC__ 1
|
||||
#endif
|
||||
|
||||
#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__))
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
#if (defined(_MSC_VER))
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_FEATURE_SVE)
|
||||
#include <arm_sve.h>
|
||||
#endif
|
||||
|
||||
#if defined(__riscv) && defined(__riscv_v)
|
||||
#include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
#if defined(__VSX__)
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
||||
#if defined(__VX__)
|
||||
#include <vecintrin.h>
|
||||
#endif
|
||||
|
||||
#define SLEEF_ALWAYS_INLINE inline
|
||||
#define SLEEF_INLINE
|
||||
#define SLEEF_CONST
|
||||
#include USE_INLINE_HEADER
|
||||
#include MACRO_ONLY_HEADER
|
||||
|
||||
#ifndef ENABLE_PUREC_SCALAR
|
||||
#include "sleefinline_purec_scalar.h"
|
||||
#endif
|
||||
|
||||
#endif // #if !defined(USE_INLINE_HEADER)
|
||||
|
||||
#include "testerutil.h"
|
||||
|
||||
#define DORENAME
|
||||
|
||||
#ifdef ENABLE_SSE2
|
||||
#include "renamesse2.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helpersse2.h"
|
||||
typedef Sleef___m128d_2 vdouble2;
|
||||
typedef Sleef___m128_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SSE4
|
||||
#include "renamesse4.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 4
|
||||
#include "helpersse2.h"
|
||||
typedef Sleef___m128d_2 vdouble2;
|
||||
typedef Sleef___m128_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX
|
||||
#include "renameavx.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperavx.h"
|
||||
typedef Sleef___m256d_2 vdouble2;
|
||||
typedef Sleef___m256_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_FMA4
|
||||
#include "renamefma4.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 4
|
||||
#include "helperavx.h"
|
||||
typedef Sleef___m256d_2 vdouble2;
|
||||
typedef Sleef___m256_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX2
|
||||
#include "renameavx2.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperavx2.h"
|
||||
typedef Sleef___m256d_2 vdouble2;
|
||||
typedef Sleef___m256_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX2128
|
||||
#include "renameavx2128.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperavx2_128.h"
|
||||
typedef Sleef___m128d_2 vdouble2;
|
||||
typedef Sleef___m128_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX512F
|
||||
#include "renameavx512f.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperavx512f.h"
|
||||
typedef Sleef___m512d_2 vdouble2;
|
||||
typedef Sleef___m512_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_AVX512FNOFMA
|
||||
#include "renameavx512fnofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperavx512f.h"
|
||||
typedef Sleef___m512d_2 vdouble2;
|
||||
typedef Sleef___m512_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VECEXT
|
||||
#define CONFIG 1
|
||||
#include "helpervecext.h"
|
||||
#include "norename.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PUREC
|
||||
#define CONFIG 1
|
||||
#include "helperpurec.h"
|
||||
#include "norename.h"
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_NEON32
|
||||
#include "renameneon32.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperneon32.h"
|
||||
typedef Sleef_float32x4_t_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_NEON32VFPV4
|
||||
#include "renameneon32vfpv4.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 4
|
||||
#include "helperneon32.h"
|
||||
typedef Sleef_float32x4_t_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_ADVSIMD
|
||||
#include "renameadvsimd.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperadvsimd.h"
|
||||
typedef Sleef_float64x2_t_2 vdouble2;
|
||||
typedef Sleef_float32x4_t_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_ADVSIMDNOFMA
|
||||
#include "renameadvsimdnofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperadvsimd.h"
|
||||
typedef Sleef_float64x2_t_2 vdouble2;
|
||||
typedef Sleef_float32x4_t_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSP128
|
||||
#define CONFIG 2
|
||||
#include "helpersse2.h"
|
||||
#include "renamedsp128.h"
|
||||
typedef Sleef___m128d_2 vdouble2;
|
||||
typedef Sleef___m128_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SVE
|
||||
#include "renamesve.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helpersve.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SVENOFMA
|
||||
#include "renamesvenofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helpersve.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSP256
|
||||
#define CONFIG 1
|
||||
#include "helperavx.h"
|
||||
#include "renamedsp256.h"
|
||||
typedef Sleef___m256d_2 vdouble2;
|
||||
typedef Sleef___m256_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX
|
||||
#include "renamevsx.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperpower_128.h"
|
||||
#include "renamevsx.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSXNOFMA
|
||||
#include "renamevsxnofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperpower_128.h"
|
||||
#include "renamevsxnofma.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX3
|
||||
#include "renamevsx3.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 3
|
||||
#include "helperpower_128.h"
|
||||
#include "renamevsx3.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VSX3NOFMA
|
||||
#include "renamevsx3nofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 4
|
||||
#include "helperpower_128.h"
|
||||
#include "renamevsx3nofma.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE
|
||||
#include "renamevxe.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 140
|
||||
#include "helpers390x_128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXENOFMA
|
||||
#include "renamevxenofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 141
|
||||
#include "helpers390x_128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE2
|
||||
#include "renamevxe2.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 150
|
||||
#include "helpers390x_128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_VXE2NOFMA
|
||||
#include "renamevxe2nofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 151
|
||||
#include "helpers390x_128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSPPOWER_128
|
||||
#define CONFIG 1
|
||||
#include "helperpower_128.h"
|
||||
#include "renamedsp128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSPS390X_128
|
||||
#define CONFIG 140
|
||||
#include "helpers390x_128.h"
|
||||
#include "renamedsp128.h"
|
||||
typedef Sleef_SLEEF_VECTOR_DOUBLE_2 vdouble2;
|
||||
typedef Sleef_SLEEF_VECTOR_FLOAT_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_RVVM1
|
||||
#include "renamervvm1.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_RVVM1NOFMA
|
||||
#include "renamervvm1nofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_RVVM2
|
||||
#include "renamervvm2.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_RVVM2NOFMA
|
||||
#include "renamervvm2nofma.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperrvv.h"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PUREC_SCALAR
|
||||
#include "renamepurec_scalar.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 1
|
||||
#include "helperpurec_scalar.h"
|
||||
typedef Sleef_double_2 vdouble2;
|
||||
typedef Sleef_float_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_PURECFMA_SCALAR
|
||||
#include "renamepurecfma_scalar.h"
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
#define CONFIG 2
|
||||
#include "helperpurec_scalar.h"
|
||||
typedef Sleef_double_2 vdouble2;
|
||||
typedef Sleef_float_2 vfloat2;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_DSP_SCALAR
|
||||
#include "renamedspscalar.h"
|
||||
#define CONFIG 1
|
||||
#include "helperpurec_scalar.h"
|
||||
typedef Sleef_double_2 vdouble2;
|
||||
typedef Sleef_float_2 vfloat2;
|
||||
#endif
|
||||
|
||||
#ifdef USE_INLINE_HEADER
|
||||
#define CONCAT_SIMD_SUFFIX_(keyword, suffix) keyword ## suffix
|
||||
#define CONCAT_SIMD_SUFFIX(keyword, suffix) CONCAT_SIMD_SUFFIX_(keyword, suffix)
|
||||
#define vmask CONCAT_SIMD_SUFFIX(vmask, SIMD_SUFFIX)
|
||||
#define vopmask CONCAT_SIMD_SUFFIX(vopmask, SIMD_SUFFIX)
|
||||
#define vdouble CONCAT_SIMD_SUFFIX(vdouble, SIMD_SUFFIX)
|
||||
#define vint CONCAT_SIMD_SUFFIX(vint, SIMD_SUFFIX)
|
||||
#define vfloat CONCAT_SIMD_SUFFIX(vfloat, SIMD_SUFFIX)
|
||||
#define vint2 CONCAT_SIMD_SUFFIX(vint2, SIMD_SUFFIX)
|
||||
#define vdouble2 CONCAT_SIMD_SUFFIX(vdouble2, SIMD_SUFFIX)
|
||||
#define vfloat2 CONCAT_SIMD_SUFFIX(vfloat2, SIMD_SUFFIX)
|
||||
#define vd2getx_vd_vd2 CONCAT_SIMD_SUFFIX(vd2getx_vd_vd2, SIMD_SUFFIX)
|
||||
#define vd2gety_vd_vd2 CONCAT_SIMD_SUFFIX(vd2gety_vd_vd2, SIMD_SUFFIX)
|
||||
#define vf2getx_vf_vf2 CONCAT_SIMD_SUFFIX(vf2getx_vf_vf2, SIMD_SUFFIX)
|
||||
#define vf2gety_vf_vf2 CONCAT_SIMD_SUFFIX(vf2gety_vf_vf2, SIMD_SUFFIX)
|
||||
#define vloadu_vd_p CONCAT_SIMD_SUFFIX(vloadu_vd_p, SIMD_SUFFIX)
|
||||
#define vstoreu_v_p_vd CONCAT_SIMD_SUFFIX(vstoreu_v_p_vd, SIMD_SUFFIX)
|
||||
#define vloadu_vf_p CONCAT_SIMD_SUFFIX(vloadu_vf_p, SIMD_SUFFIX)
|
||||
#define vstoreu_v_p_vf CONCAT_SIMD_SUFFIX(vstoreu_v_p_vf, SIMD_SUFFIX)
|
||||
#define vloadu_vi_p CONCAT_SIMD_SUFFIX(vloadu_vi_p, SIMD_SUFFIX)
|
||||
#define vstoreu_v_p_vi CONCAT_SIMD_SUFFIX(vstoreu_v_p_vi, SIMD_SUFFIX)
|
||||
#endif
|
||||
|
||||
//
|
||||
|
||||
// Smoke-test that the compiled-in SIMD extension actually executes on this
// host: broadcast the argument across one vector, run xpow/xpowf, and return
// 1 as soon as a lane comes back non-NaN. Returns 0 if neither precision is
// enabled or both produced NaN.
int check_feature(double d, float f) {
#ifdef ENABLE_DP
  {
    double s[VECTLENDP];
    int i;
    for(i=0;i<VECTLENDP;i++) {
      s[i] = d;  // NOTE(review): broadcasts d, not f — matches upstream; confirm intent
    }
    vdouble a = vloadu_vd_p(s);
    a = xpow(a, a);
    vstoreu_v_p_vd(s, a);
    if (s[0] == s[0]) return 1;  // s[0]==s[0] is false only for NaN
  }
#endif
#ifdef ENABLE_SP
  {
    float s[VECTLENSP];
    int i;
    for(i=0;i<VECTLENSP;i++) {
      s[i] = d;  // NOTE(review): also uses d (double) here, f is unused — confirm upstream intent
    }
    vfloat a = vloadu_vf_p(s);
    a = xpowf(a, a);
    vstoreu_v_p_vf(s, a);
    if (s[0] == s[0]) return 1;
  }
#endif
  return 0;
}
|
||||
|
||||
// Accessors for the two halves of a vdouble2 pair. Only defined for targets
// whose pair type is a plain struct with .x/.y members; SVE/RVV and the
// inline-header build provide their own versions.
#if defined(ENABLE_DP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER))
static vdouble vd2getx_vd_vd2(vdouble2 v) { return v.x; }
static vdouble vd2gety_vd_vd2(vdouble2 v) { return v.y; }
#endif
|
||||
|
||||
// Single-precision counterparts of the vdouble2 accessors above, under the
// same target restrictions.
#if defined(ENABLE_SP) && !(defined(ENABLE_SVE) || defined(ENABLE_SVENOFMA) || defined(ENABLE_RVVM1) || defined(ENABLE_RVVM1NOFMA) || defined(ENABLE_RVVM2) || defined(ENABLE_RVVM2NOFMA) || defined(USE_INLINE_HEADER))
static vfloat vf2getx_vf_vf2(vfloat2 v) { return v.x; }
static vfloat vf2gety_vf_vf2(vfloat2 v) { return v.y; }
#endif
|
||||
|
||||
//
|
||||
|
||||
// Vector-tester protocol handler: double -> double. The probe value is
// planted in one random lane of an otherwise random vector, the SIMD
// function is applied, and only that lane's bit pattern is echoed back —
// this also exercises the other lanes with random data.
#define func_d_d(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint64_t u;                                       \
      sscanf(buf, funcStr " %" PRIx64, &u);             \
      double s[VECTLENDP];                              \
      memrand(s, sizeof(s));                            \
      int idx = xrand() & (VECTLENDP-1);                \
      s[idx] = u2d(u);                                  \
      vdouble a = vloadu_vd_p(s);                       \
      a = funcName(a);                                  \
      vstoreu_v_p_vd(s, a);                             \
      u = d2u(s[idx]);                                  \
      printf("%" PRIx64 "\n", u);                       \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: double -> (double, double). The pair
// result is unpacked via vd2getx/vd2gety into two lane arrays and the
// probed lane of each half is printed.
#define func_d2_d(funcStr, funcName) {                                  \
    while (startsWith(buf, funcStr " ")) {                              \
      uint64_t u;                                                       \
      sscanf(buf, funcStr " %" PRIx64, &u);                             \
      double s[VECTLENDP], t[VECTLENDP];                                \
      memrand(s, sizeof(s));                                            \
      memrand(t, sizeof(t));                                            \
      int idx = xrand() & (VECTLENDP-1);                                \
      s[idx] = u2d(u);                                                  \
      vdouble2 v;                                                       \
      vdouble a = vloadu_vd_p(s);                                       \
      v = funcName(a);                                                  \
      vstoreu_v_p_vd(s, vd2getx_vd_vd2(v));                             \
      vstoreu_v_p_vd(t, vd2gety_vd_vd2(v));                             \
      Sleef_double2 d2;                                                 \
      d2.x = s[idx];                                                    \
      d2.y = t[idx];                                                    \
      printf("%" PRIx64 " %" PRIx64 "\n", d2u(d2.x), d2u(d2.y));        \
      fflush(stdout);                                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;                  \
    }                                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: (double, double) -> double. Both probe
// values go into the same lane index of two random vectors.
#define func_d_d_d(funcStr, funcName) {                         \
    while (startsWith(buf, funcStr " ")) {                      \
      uint64_t u, v;                                            \
      sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v);     \
      double s[VECTLENDP], t[VECTLENDP];                        \
      memrand(s, sizeof(s));                                    \
      memrand(t, sizeof(t));                                    \
      int idx = xrand() & (VECTLENDP-1);                        \
      s[idx] = u2d(u);                                          \
      t[idx] = u2d(v);                                          \
      vdouble a, b;                                             \
      a = vloadu_vd_p(s);                                       \
      b = vloadu_vd_p(t);                                       \
      a = funcName(a, b);                                       \
      vstoreu_v_p_vd(s, a);                                     \
      u = d2u(s[idx]);                                          \
      printf("%" PRIx64 "\n", u);                               \
      fflush(stdout);                                           \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;          \
    }                                                           \
  }
|
||||
|
||||
// Vector-tester protocol handler: (double, int-vector) -> double. The int
// lane buffer is sized VECTLENDP*2 (matches upstream; the integer vector
// type may be wider than the double vector on some targets).
#define func_d_d_i(funcStr, funcName) {                                 \
    while (startsWith(buf, funcStr " ")) {                              \
      uint64_t u, v;                                                    \
      sscanf(buf, funcStr " %" PRIx64 " %" PRIx64, &u, &v);             \
      double s[VECTLENDP];                                              \
      int t[VECTLENDP*2];                                               \
      memrand(s, sizeof(s));                                            \
      memrand(t, sizeof(t));                                            \
      int idx = xrand() & (VECTLENDP-1);                                \
      s[idx] = u2d(u);                                                  \
      t[idx] = (int)u2d(v);                                             \
      vstoreu_v_p_vd(s, funcName(vloadu_vd_p(s), vloadu_vi_p(t)));      \
      u = d2u(s[idx]);                                                  \
      printf("%" PRIx64 "\n", u);                                       \
      fflush(stdout);                                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;                  \
    }                                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: double -> int. The probed lane of the
// resulting integer vector is printed in decimal.
#define func_i_d(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint64_t u;                                       \
      int i;                                            \
      sscanf(buf, funcStr " %" PRIx64, &u);             \
      double s[VECTLENDP];                              \
      int t[VECTLENDP*2];                               \
      memrand(s, sizeof(s));                            \
      memrand(t, sizeof(t));                            \
      int idx = xrand() & (VECTLENDP-1);                \
      s[idx] = u2d(u);                                  \
      vdouble a = vloadu_vd_p(s);                       \
      vint vi = funcName(a);                            \
      vstoreu_v_p_vi(t, vi);                            \
      i = t[idx];                                       \
      printf("%d\n", i);                                \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
//
|
||||
|
||||
// Vector-tester protocol handler: float -> float. Single-precision analogue
// of func_d_d, using 32-bit hex bit patterns and VECTLENSP lanes.
#define func_f_f(funcStr, funcName) {                   \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u;                                       \
      sscanf(buf, funcStr " %x", &u);                   \
      float s[VECTLENSP];                               \
      memrand(s, sizeof(s));                            \
      int idx = xrand() & (VECTLENSP-1);                \
      s[idx] = u2f(u);                                  \
      vfloat a = vloadu_vf_p(s);                        \
      a = funcName(a);                                  \
      vstoreu_v_p_vf(s, a);                             \
      u = f2u(s[idx]);                                  \
      printf("%x\n", u);                                \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: float -> (float, float). Unpacks the pair
// via vf2getx/vf2gety and prints the probed lane of each half.
#define func_f2_f(funcStr, funcName) {                  \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u;                                       \
      sscanf(buf, funcStr " %x", &u);                   \
      float s[VECTLENSP], t[VECTLENSP];                 \
      memrand(s, sizeof(s));                            \
      memrand(t, sizeof(t));                            \
      int idx = xrand() & (VECTLENSP-1);                \
      s[idx] = u2f(u);                                  \
      vfloat2 v;                                        \
      vfloat a = vloadu_vf_p(s);                        \
      v = funcName(a);                                  \
      vstoreu_v_p_vf(s, vf2getx_vf_vf2(v));             \
      vstoreu_v_p_vf(t, vf2gety_vf_vf2(v));             \
      Sleef_float2 d2;                                  \
      d2.x = s[idx];                                    \
      d2.y = t[idx];                                    \
      printf("%x %x\n", f2u(d2.x), f2u(d2.y));          \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
// Vector-tester protocol handler: (float, float) -> float. Both probe values
// share the same lane index of two random vectors.
#define func_f_f_f(funcStr, funcName) {                 \
    while (startsWith(buf, funcStr " ")) {              \
      uint32_t u, v;                                    \
      sscanf(buf, funcStr " %x %x", &u, &v);            \
      float s[VECTLENSP], t[VECTLENSP];                 \
      memrand(s, sizeof(s));                            \
      memrand(t, sizeof(t));                            \
      int idx = xrand() & (VECTLENSP-1);                \
      s[idx] = u2f(u);                                  \
      t[idx] = u2f(v);                                  \
      vfloat a, b;                                      \
      a = vloadu_vf_p(s);                               \
      b = vloadu_vf_p(t);                               \
      a = funcName(a, b);                               \
      vstoreu_v_p_vf(s, a);                             \
      u = f2u(s[idx]);                                  \
      printf("%x\n", u);                                \
      fflush(stdout);                                   \
      if (fgets(buf, BUFSIZE-1, stdin) == NULL) break;  \
    }                                                   \
  }
|
||||
|
||||
//
|
||||
|
||||
#define BUFSIZE 1024
|
||||
|
||||
int main2(int argc, char **argv) {
|
||||
xsrand(time(NULL));
|
||||
|
||||
{
|
||||
int k = 0;
|
||||
#ifdef ENABLE_DP
|
||||
k += 1;
|
||||
#endif
|
||||
#ifdef ENABLE_SP
|
||||
k += 2;
|
||||
#endif
|
||||
#if defined(ENABLE_NEON32) || defined(ENABLE_NEON32VFPV4)
|
||||
k += 4; // flush to zero
|
||||
#elif defined(ENABLE_VECEXT)
|
||||
if (vcast_f_vf(xpowf(vcast_vf_f(0.5f), vcast_vf_f(140))) == 0) k += 4;
|
||||
#endif
|
||||
#if defined(DETERMINISTIC)
|
||||
k += 8;
|
||||
#endif
|
||||
|
||||
printf("%d\n", k);
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
#if !defined(USE_INLINE_HEADER)
|
||||
fprintf(stderr, "IUT : %s\n", (const char *)xgetPtrf(0));
|
||||
#endif
|
||||
fflush(stderr);
|
||||
|
||||
char buf[BUFSIZE];
|
||||
fgets(buf, BUFSIZE-1, stdin);
|
||||
|
||||
while(!feof(stdin)) {
|
||||
#ifdef ENABLE_DP
|
||||
func_d_d("sin", xsin);
|
||||
func_d_d("cos", xcos);
|
||||
func_d_d("tan", xtan);
|
||||
func_d_d("asin", xasin);
|
||||
func_d_d("acos", xacos);
|
||||
func_d_d("atan", xatan);
|
||||
func_d_d("log", xlog);
|
||||
func_d_d("exp", xexp);
|
||||
|
||||
#ifndef DETERMINISTIC
|
||||
func_d_d("sqrt", xsqrt);
|
||||
func_d_d("sqrt_u05", xsqrt_u05);
|
||||
func_d_d("sqrt_u35", xsqrt_u35);
|
||||
#endif
|
||||
func_d_d("cbrt", xcbrt);
|
||||
func_d_d("cbrt_u1", xcbrt_u1);
|
||||
|
||||
func_d_d("sinh", xsinh);
|
||||
func_d_d("cosh", xcosh);
|
||||
func_d_d("tanh", xtanh);
|
||||
func_d_d("sinh_u35", xsinh_u35);
|
||||
func_d_d("cosh_u35", xcosh_u35);
|
||||
func_d_d("tanh_u35", xtanh_u35);
|
||||
func_d_d("asinh", xasinh);
|
||||
func_d_d("acosh", xacosh);
|
||||
func_d_d("atanh", xatanh);
|
||||
|
||||
func_d_d("sin_u1", xsin_u1);
|
||||
func_d_d("cos_u1", xcos_u1);
|
||||
func_d_d("tan_u1", xtan_u1);
|
||||
func_d_d("sinpi_u05", xsinpi_u05);
|
||||
func_d_d("cospi_u05", xcospi_u05);
|
||||
func_d_d("asin_u1", xasin_u1);
|
||||
func_d_d("acos_u1", xacos_u1);
|
||||
func_d_d("atan_u1", xatan_u1);
|
||||
func_d_d("log_u1", xlog_u1);
|
||||
|
||||
func_d_d("exp2", xexp2);
|
||||
func_d_d("exp10", xexp10);
|
||||
func_d_d("exp2_u35", xexp2_u35);
|
||||
func_d_d("exp10_u35", xexp10_u35);
|
||||
func_d_d("expm1", xexpm1);
|
||||
func_d_d("log10", xlog10);
|
||||
func_d_d("log2", xlog2);
|
||||
func_d_d("log2_u35", xlog2_u35);
|
||||
func_d_d("log1p", xlog1p);
|
||||
|
||||
func_d2_d("sincos", xsincos);
|
||||
func_d2_d("sincos_u1", xsincos_u1);
|
||||
func_d2_d("sincospi_u35", xsincospi_u35);
|
||||
func_d2_d("sincospi_u05", xsincospi_u05);
|
||||
|
||||
func_d_d_d("pow", xpow);
|
||||
func_d_d_d("atan2", xatan2);
|
||||
func_d_d_d("atan2_u1", xatan2_u1);
|
||||
|
||||
func_d_d_i("ldexp", xldexp);
|
||||
|
||||
func_i_d("ilogb", xilogb);
|
||||
|
||||
func_d_d("fabs", xfabs);
|
||||
func_d_d("trunc", xtrunc);
|
||||
func_d_d("floor", xfloor);
|
||||
func_d_d("ceil", xceil);
|
||||
func_d_d("round", xround);
|
||||
func_d_d("rint", xrint);
|
||||
func_d_d("frfrexp", xfrfrexp);
|
||||
func_i_d("expfrexp", xexpfrexp);
|
||||
|
||||
func_d_d_d("hypot_u05", xhypot_u05);
|
||||
func_d_d_d("hypot_u35", xhypot_u35);
|
||||
func_d_d_d("copysign", xcopysign);
|
||||
func_d_d_d("fmax", xfmax);
|
||||
func_d_d_d("fmin", xfmin);
|
||||
func_d_d_d("fdim", xfdim);
|
||||
func_d_d_d("nextafter", xnextafter);
|
||||
func_d_d_d("fmod", xfmod);
|
||||
func_d_d_d("remainder", xremainder);
|
||||
|
||||
func_d2_d("modf", xmodf);
|
||||
|
||||
func_d_d("tgamma_u1", xtgamma_u1);
|
||||
func_d_d("lgamma_u1", xlgamma_u1);
|
||||
func_d_d("erf_u1", xerf_u1);
|
||||
func_d_d("erfc_u15", xerfc_u15);
|
||||
#endif
|
||||
|
||||
#ifdef ENABLE_SP
|
||||
func_f_f("sinf", xsinf);
|
||||
func_f_f("cosf", xcosf);
|
||||
func_f_f("tanf", xtanf);
|
||||
func_f_f("asinf", xasinf);
|
||||
func_f_f("acosf", xacosf);
|
||||
func_f_f("atanf", xatanf);
|
||||
func_f_f("logf", xlogf);
|
||||
func_f_f("expf", xexpf);
|
||||
|
||||
#ifndef DETERMINISTIC
|
||||
func_f_f("sqrtf", xsqrtf);
|
||||
func_f_f("sqrtf_u05", xsqrtf_u05);
|
||||
func_f_f("sqrtf_u35", xsqrtf_u35);
|
||||
#endif
|
||||
func_f_f("cbrtf", xcbrtf);
|
||||
func_f_f("cbrtf_u1", xcbrtf_u1);
|
||||
|
||||
func_f_f("sinhf", xsinhf);
|
||||
func_f_f("coshf", xcoshf);
|
||||
func_f_f("tanhf", xtanhf);
|
||||
func_f_f("sinhf_u35", xsinhf_u35);
|
||||
func_f_f("coshf_u35", xcoshf_u35);
|
||||
func_f_f("tanhf_u35", xtanhf_u35);
|
||||
func_f_f("asinhf", xasinhf);
|
||||
func_f_f("acoshf", xacoshf);
|
||||
func_f_f("atanhf", xatanhf);
|
||||
|
||||
func_f_f("sinf_u1", xsinf_u1);
|
||||
func_f_f("cosf_u1", xcosf_u1);
|
||||
func_f_f("tanf_u1", xtanf_u1);
|
||||
func_f_f("sinpif_u05", xsinpif_u05);
|
||||
func_f_f("cospif_u05", xcospif_u05);
|
||||
func_f_f("asinf_u1", xasinf_u1);
|
||||
func_f_f("acosf_u1", xacosf_u1);
|
||||
func_f_f("atanf_u1", xatanf_u1);
|
||||
func_f_f("logf_u1", xlogf_u1);
|
||||
|
||||
func_f_f("exp2f", xexp2f);
|
||||
func_f_f("exp10f", xexp10f);
|
||||
func_f_f("exp2f_u35", xexp2f_u35);
|
||||
func_f_f("exp10f_u35", xexp10f_u35);
|
||||
func_f_f("expm1f", xexpm1f);
|
||||
func_f_f("log10f", xlog10f);
|
||||
func_f_f("log2f", xlog2f);
|
||||
func_f_f("log2f_u35", xlog2f_u35);
|
||||
func_f_f("log1pf", xlog1pf);
|
||||
|
||||
func_f2_f("sincosf", xsincosf);
|
||||
func_f2_f("sincosf_u1", xsincosf_u1);
|
||||
func_f2_f("sincospif_u35", xsincospif_u35);
|
||||
func_f2_f("sincospif_u05", xsincospif_u05);
|
||||
|
||||
func_f_f_f("powf", xpowf);
|
||||
func_f_f_f("atan2f", xatan2f);
|
||||
func_f_f_f("atan2f_u1", xatan2f_u1);
|
||||
|
||||
func_f_f("fabsf", xfabsf);
|
||||
func_f_f("truncf", xtruncf);
|
||||
func_f_f("floorf", xfloorf);
|
||||
func_f_f("ceilf", xceilf);
|
||||
func_f_f("roundf", xroundf);
|
||||
func_f_f("rintf", xrintf);
|
||||
func_f_f("frfrexpf", xfrfrexpf);
|
||||
|
||||
func_f_f_f("hypotf_u05", xhypotf_u05);
|
||||
func_f_f_f("hypotf_u35", xhypotf_u35);
|
||||
func_f_f_f("copysignf", xcopysignf);
|
||||
func_f_f_f("fmaxf", xfmaxf);
|
||||
func_f_f_f("fminf", xfminf);
|
||||
func_f_f_f("fdimf", xfdimf);
|
||||
func_f_f_f("nextafterf", xnextafterf);
|
||||
func_f_f_f("fmodf", xfmodf);
|
||||
func_f_f_f("remainderf", xremainderf);
|
||||
|
||||
func_f2_f("modff", xmodff);
|
||||
|
||||
func_f_f("tgammaf_u1", xtgammaf_u1);
|
||||
func_f_f("lgammaf_u1", xlgammaf_u1);
|
||||
func_f_f("erff_u1", xerff_u1);
|
||||
func_f_f("erfcf_u15", xerfcf_u15);
|
||||
|
||||
func_f_f("fastsinf_u3500", xfastsinf_u3500);
|
||||
func_f_f("fastcosf_u3500", xfastcosf_u3500);
|
||||
func_f_f_f("fastpowf_u3500", xfastpowf_u3500);
|
||||
#endif
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,92 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <x86intrin.h>
|
||||
|
||||
#include <sleef.h>
|
||||
|
||||
#define N 64   // number of lanes in the shared work buffers
#define M 256  // number of randomized trials run by main2()

// Shared buffers: r0 receives results, a0 holds the inputs.
// NOTE(review): a1 and a2 are declared but unused in the code visible here.
double r0[N], a0[N], a1[N], a2[N];
|
||||
|
||||
// Reference pass: compute sin() with scalar libm for every lane of a0,
// storing the results into r0.
void do_libm() {
  for (int k = 0; k < N; k++) {
    r0[k] = sin(a0[k]);
  }
}
|
||||
|
||||
#if defined(__SSE2__)
|
||||
void do_sleef_sse2() { _mm_storeu_pd(r0, Sleef_sind2_u10sse2(_mm_loadu_pd(a0))); }
|
||||
#endif
|
||||
|
||||
#if defined(__AVX__)
// 4-lane double sin via the SLEEF AVX kernel (u10 = 1.0-ULP variant);
// reads the first four lanes of a0, writes the first four lanes of r0.
void do_sleef_avx() {
  __m256d v = _mm256_loadu_pd(a0);
  _mm256_storeu_pd(r0, Sleef_sind4_u10avx(v));
}
#endif
|
||||
|
||||
#if defined(__AVX2__)
// 4-lane double sin via the SLEEF AVX2 kernel (u10 = 1.0-ULP variant);
// reads the first four lanes of a0, writes the first four lanes of r0.
void do_sleef_avx2() {
  __m256d v = _mm256_loadu_pd(a0);
  _mm256_storeu_pd(r0, Sleef_sind4_u10avx2(v));
}
#endif
|
||||
|
||||
#if defined(__AVX512F__)
// 8-lane double sin via the SLEEF AVX-512F kernel (u10 = 1.0-ULP variant);
// reads the first eight lanes of a0, writes the first eight lanes of r0.
void do_sleef_avx512f() {
  __m512d v = _mm512_loadu_pd(a0);
  _mm512_storeu_pd(r0, Sleef_sind8_u10avx512f(v));
}
#endif
|
||||
|
||||
// Evaluates sin(d) once with libm and once with each SLEEF vector backend
// that was compiled in, comparing results lane 0 against lane 0.
// Returns 1 (success) as soon as one backend reproduces the libm result
// bit-exactly; returns 0 if no compiled-in backend matches (or none is
// compiled in at all).
// NOTE(review): only r0[0] is compared, and a0 is refilled with the same
// constant d before every call, so each backend is effectively checked on a
// single input value; the bit-exact `==` compare presumably relies on the
// SLEEF u10 kernels agreeing with libm for typical arguments — confirm.
int do_test_once(double d) {
  // Reference result from scalar libm.
  for(int i=0;i<N;i++) a0[i] = d;
  do_libm();
  double rm = r0[0];

#if defined(__SSE2__)
  // a0 is re-seeded before each backend call (the kernels only read a0,
  // but this keeps each probe self-contained).
  for(int i=0;i<N;i++) a0[i] = d;
  do_sleef_sse2();
  if (rm == r0[0]) return 1;
#endif

#if defined(__AVX__)
  for(int i=0;i<N;i++) a0[i] = d;
  do_sleef_avx();
  if (rm == r0[0]) return 1;
#endif

#if defined(__AVX2__)
  for(int i=0;i<N;i++) a0[i] = d;
  do_sleef_avx2();
  if (rm == r0[0]) return 1;
#endif

#if defined(__AVX512F__)
  for(int i=0;i<N;i++) a0[i] = d;
  do_sleef_avx512f();
  if (rm == r0[0]) return 1;
#endif

  // No backend matched the libm reference.
  return 0;
}
|
||||
|
||||
// Executes one SLEEF kernel per vector extension compiled in.
// The arguments d and f are unused; the point of the calls is to run the
// vector instructions themselves — presumably so that a build-time probe
// can detect (via crash/illegal-instruction) whether the host CPU supports
// the extensions, with reaching the return meaning "all usable". Confirm
// against the caller/build script.
int check_feature(double d, float f) {
#if defined(__SSE2__)
  do_sleef_sse2();
#endif

#if defined(__AVX__)
  do_sleef_avx();
#endif

#if defined(__AVX2__)
  do_sleef_avx2();
#endif

#if defined(__AVX512F__)
  do_sleef_avx512f();
#endif

  return 1;
}
|
||||
|
||||
int main2(int argc, char **argv) {
|
||||
for(int i=0;i<M;i++) {
|
||||
if (!do_test_once(10.0 * ((2.0 * rand() / RAND_MAX) - 1))) {
|
||||
printf("fail\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
printf("pass\n");
|
||||
exit(0);
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,991 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2021.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <mpfr.h>
|
||||
#include <time.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
#define _GNU_SOURCE
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/random.h>
|
||||
#endif
|
||||
|
||||
#include "sleef.h"
|
||||
#include "testerutil.h"
|
||||
|
||||
#define DORENAME
|
||||
#include "rename.h"
|
||||
|
||||
#define DENORMAL_DBL_MIN (4.9406564584124654418e-324)  // smallest positive subnormal double
#define POSITIVE_INFINITY INFINITY
#define NEGATIVE_INFINITY (-INFINITY)
|
||||
|
||||
// Type-pun between a double and its raw 64-bit IEEE-754 representation.
// Used by the random-input generators below to manufacture and perturb
// test values at the bit/ULP level.
typedef union {
  double d;
  uint64_t u64;
  int64_t i64;
} conv_t;
|
||||
|
||||
// Step the raw IEEE-754 bit pattern of x by n units (subtracts n from the
// 64-bit representation). With n > 0 this moves x toward zero by n ULPs;
// a negative n moves it away from zero.
double nexttoward0(double x, int n) {
  union {
    double f;
    uint64_t u;
  } bits;
  bits.f = x;
  bits.u -= n;
  return bits.f;
}
|
||||
|
||||
// Returns a random double spanning the full bit space (NaNs, infinities
// and subnormals included). With probability 4/64 it instead returns a
// value a random number of ULPs from a boundary: cases 0/1 land just
// above/below +-0 (tiny subnormals), cases 2/3 just below +-infinity.
// NOTE(review): the exact sequence of random() calls is load-bearing for
// reproducing streams — do not reorder.
double rnd() {
  conv_t c;
  switch(random() & 63) {
  case 0: return nexttoward0( 0.0, -(random() & ((1 << (random() & 31)) - 1)));
  case 1: return nexttoward0(-0.0, -(random() & ((1 << (random() & 31)) - 1)));
  case 2: return nexttoward0( INFINITY, (random() & ((1 << (random() & 31)) - 1)));
  case 3: return nexttoward0(-INFINITY, (random() & ((1 << (random() & 31)) - 1)));
  }
#ifdef ENABLE_SYS_getrandom
  // Kernel-quality randomness for all 64 bits when available.
  syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
  // random() yields 31 bits; three calls cover the full 64-bit word.
  c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
  return c.d;
}
|
||||
|
||||
// Returns a random double with fully random bits, rejecting values for
// which isnumber() is false (isnumber() comes from testerutil.h —
// presumably a finite-value check; confirm there).
double rnd_fr() {
  conv_t c;
  do {
#ifdef ENABLE_SYS_getrandom
    syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
    // random() yields 31 bits; three calls cover the full 64-bit word.
    c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
  } while(!isnumber(c.d));
  return c.d;
}
|
||||
|
||||
// Returns a random double restricted to [-1, 1] ("zo" = zero..one range,
// both signs), rejecting non-numbers (per isnumber() from testerutil.h)
// and anything outside that interval.
double rnd_zo() {
  conv_t c;
  do {
#ifdef ENABLE_SYS_getrandom
    syscall(SYS_getrandom, &c.u64, sizeof(c.u64), 0);
#else
    // random() yields 31 bits; three calls cover the full 64-bit word.
    c.u64 = random() | ((uint64_t)random() << 31) | ((uint64_t)random() << 62);
#endif
  } while(!isnumber(c.d) || c.d < -1 || 1 < c.d);
  return c.d;
}
|
||||
|
||||
int main(int argc,char **argv)
|
||||
{
|
||||
mpfr_t frw, frx, fry, frz;
|
||||
|
||||
mpfr_set_default_prec(1280);
|
||||
mpfr_inits(frw, frx, fry, frz, NULL);
|
||||
|
||||
conv_t cd;
|
||||
double d, t;
|
||||
double d2, d3, zo;
|
||||
|
||||
int cnt, ecnt = 0;
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
for(cnt = 0;ecnt < 1000;cnt++) {
|
||||
switch(cnt & 7) {
|
||||
case 0:
|
||||
d = rnd();
|
||||
d2 = rnd();
|
||||
d3 = rnd();
|
||||
zo = rnd();
|
||||
break;
|
||||
case 1:
|
||||
cd.d = rint(rnd_zo() * 1e+10) * M_PI_4;
|
||||
cd.i64 += (random() & 0xff) - 0x7f;
|
||||
d = cd.d;
|
||||
d2 = rnd();
|
||||
d3 = rnd();
|
||||
zo = rnd();
|
||||
break;
|
||||
case 2:
|
||||
cd.d = rnd_fr() * M_PI_4;
|
||||
cd.i64 += (random() & 0xf) - 0x7;
|
||||
d = cd.d;
|
||||
d2 = rnd();
|
||||
d3 = rnd();
|
||||
zo = rnd();
|
||||
break;
|
||||
default:
|
||||
d = rnd_fr();
|
||||
d2 = rnd_fr();
|
||||
d3 = rnd_fr();
|
||||
zo = rnd_zo();
|
||||
break;
|
||||
}
|
||||
|
||||
Sleef_double2 sc = xsincospi_u05(d);
|
||||
Sleef_double2 sc2 = xsincospi_u35(d);
|
||||
|
||||
{
|
||||
const double rangemax2 = 1e+9/4;
|
||||
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sinpi(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = sc.x, frx);
|
||||
|
||||
if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincospi_u05 sin arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2dp(t = sc2.x, frx);
|
||||
|
||||
if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincospi_u35 sin arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u2 = countULP2dp(t = xsinpi_u05(d), frx);
|
||||
|
||||
if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sinpi_u05 arg=%.20g ulp=%.20g\n", d, u2);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
const double rangemax2 = 1e+9/4;
|
||||
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cospi(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = sc.y, frx);
|
||||
|
||||
if (u0 != 0 && ((fabs(d) <= rangemax2 && u0 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincospi_u05 cos arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2dp(t = sc.y, frx);
|
||||
|
||||
if (u1 != 0 && ((fabs(d) <= rangemax2 && u1 > 1.5) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincospi_u35 cos arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u2 = countULP2dp(t = xcospi_u05(d), frx);
|
||||
|
||||
if (u2 != 0 && ((fabs(d) <= rangemax2 && u2 > 0.506) || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C cospi_u05 arg=%.20g ulp=%.20g\n", d, u2);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
sc = xsincos(d);
|
||||
sc2 = xsincos_u1(d);
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sin(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xsin(d), frx);
|
||||
|
||||
if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sin arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(sc.x, frx);
|
||||
|
||||
if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincos sin arg=%.20g ulp=%.20g\n", d, u1);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u2 = countULPdp(t = xsin_u1(d), frx);
|
||||
|
||||
if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sin_u1 arg=%.20g ulp=%.20g\n", d, u2);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u3 = countULPdp(t = sc2.x, frx);
|
||||
|
||||
if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincos_u1 sin arg=%.20g ulp=%.20g\n", d, u3);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cos(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcos(d), frx);
|
||||
|
||||
if (u0 != 0 && (u0 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C cos arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = sc.y, frx);
|
||||
|
||||
if (u1 != 0 && (u1 > 3.5 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincos cos arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u2 = countULPdp(t = xcos_u1(d), frx);
|
||||
|
||||
if (u2 != 0 && (u2 > 1 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C cos_u1 arg=%.20g ulp=%.20g\n", d, u2);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u3 = countULPdp(t = sc2.y, frx);
|
||||
|
||||
if (u3 != 0 && (u3 > 1 || fabs(t) > 1 || !isnumber(t))) {
|
||||
printf("Pure C sincos_u1 cos arg=%.20g ulp=%.20g\n", d, u3);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_tan(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xtan(d), frx);
|
||||
|
||||
if (u0 != 0 && (u0 > 3.5 || isnan(t))) {
|
||||
printf("Pure C tan arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xtan_u1(d), frx);
|
||||
|
||||
if (u1 != 0 && (u1 > 1 || isnan(t))) {
|
||||
printf("Pure C tan_u1 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, fabs(d), GMP_RNDN);
|
||||
mpfr_log(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlog(fabs(d)), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C log arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xlog_u1(fabs(d)), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C log_u1 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, fabs(d), GMP_RNDN);
|
||||
mpfr_log10(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlog10(fabs(d)), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C log10 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, fabs(d), GMP_RNDN);
|
||||
mpfr_log2(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlog2(fabs(d)), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C log2 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xlog2_u35(fabs(d)), frx);
|
||||
|
||||
if (u1 > 3.5) {
|
||||
printf("Pure C log2_u35 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_log1p(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlog1p(d), frx);
|
||||
|
||||
if ((-1 <= d && d <= 1e+307 && u0 > 1) ||
|
||||
(d < -1 && !isnan(t)) ||
|
||||
(d > 1e+307 && !(u0 <= 1 || isinf(t)))) {
|
||||
printf("Pure C log1p arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_exp(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xexp(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C exp arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_exp2(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xexp2(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C exp2 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xexp2_u35(d), frx);
|
||||
|
||||
if (u1 > 3.5) {
|
||||
printf("Pure C exp2_u35 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_exp10(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xexp10(d), frx);
|
||||
|
||||
if (u0 > 1.09) {
|
||||
printf("Pure C exp10 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xexp10_u35(d), frx);
|
||||
|
||||
if (u1 > 3.5) {
|
||||
printf("Pure C exp10_u35 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_expm1(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xexpm1(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C expm1 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_pow(frx, fry, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xpow(d2, d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C pow arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
|
||||
printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cbrt(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcbrt(d), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C cbrt arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xcbrt_u1(d), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C cbrt_u1 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, zo, GMP_RNDN);
|
||||
mpfr_asin(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xasin(zo), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C asin arg=%.20g ulp=%.20g\n", zo, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xasin_u1(zo), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C asin_u1 arg=%.20g ulp=%.20g\n", zo, u1);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, zo, GMP_RNDN);
|
||||
mpfr_acos(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xacos(zo), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C acos arg=%.20g ulp=%.20g\n", zo, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xacos_u1(zo), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C acos_u1 arg=%.20g ulp=%.20g\n", zo, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_atan(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xatan(d), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C atan arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULPdp(t = xatan_u1(d), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C atan_u1 arg=%.20g ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_atan2(frx, fry, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xatan2(d2, d), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C atan2 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2dp(t = xatan2_u1(d2, d), frx);
|
||||
|
||||
if (u1 > 1) {
|
||||
printf("Pure C atan2_u1 arg=%.20g, %.20g ulp=%.20g\n", d2, d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sinh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xsinh(d), frx);
|
||||
|
||||
if ((fabs(d) <= 709 && u0 > 1) ||
|
||||
(d > 709 && !(u0 <= 1 || (isinf(t) && t > 0))) ||
|
||||
(d < -709 && !(u0 <= 1 || (isinf(t) && t < 0)))) {
|
||||
printf("Pure C sinh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cosh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcosh(d), frx);
|
||||
|
||||
if ((fabs(d) <= 709 && u0 > 1) || !(u0 <= 1 || (isinf(t) && t > 0))) {
|
||||
printf("Pure C cosh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_tanh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xtanh(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C tanh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sinh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xsinh_u35(d), frx);
|
||||
|
||||
if ((fabs(d) <= 709 && u0 > 3.5) ||
|
||||
(d > 709 && !(u0 <= 3.5 || (isinf(t) && t > 0))) ||
|
||||
(d < -709 && !(u0 <= 3.5 || (isinf(t) && t < 0)))) {
|
||||
printf("Pure C sinh_u35 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_cosh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcosh_u35(d), frx);
|
||||
|
||||
if ((fabs(d) <= 709 && u0 > 3.5) || !(u0 <= 3.5 || (isinf(t) && t > 0))) {
|
||||
printf("Pure C cosh_u35 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_tanh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xtanh_u35(d), frx);
|
||||
|
||||
if (u0 > 3.5) {
|
||||
printf("Pure C tanh_u35 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_asinh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xasinh(d), frx);
|
||||
|
||||
if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
|
||||
(d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
|
||||
(d <= -sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t < 0)))) {
|
||||
printf("Pure C asinh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_acosh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xacosh(d), frx);
|
||||
|
||||
if ((fabs(d) < sqrt(DBL_MAX) && u0 > 1) ||
|
||||
(d >= sqrt(DBL_MAX) && !(u0 <= 1 || (isinf(t) && t > 0))) ||
|
||||
(d <= -sqrt(DBL_MAX) && !isnan(t))) {
|
||||
printf("Pure C acosh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("%.20g\n", t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_atanh(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xatanh(d), frx);
|
||||
|
||||
if (u0 > 1) {
|
||||
printf("Pure C atanh arg=%.20g ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_abs(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfabs(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C fabs arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_copysign(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xcopysign(d, d2), frx);
|
||||
|
||||
if (u0 != 0 && !isnan(d2)) {
|
||||
printf("Pure C copysign arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %g, test = %g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_max(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfmax(d, d2), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C fmax arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_min(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfmin(d, d2), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C fmin arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_dim(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfdim(d, d2), frx);
|
||||
|
||||
if (u0 > 0.5) {
|
||||
printf("Pure C fdim arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_trunc(frx, frx);
|
||||
|
||||
double u0 = countULPdp(t = xtrunc(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C trunc arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_floor(frx, frx);
|
||||
|
||||
double u0 = countULPdp(t = xfloor(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C floor arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_ceil(frx, frx);
|
||||
|
||||
double u0 = countULPdp(t = xceil(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C ceil arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_round(frx, frx);
|
||||
|
||||
double u0 = countULPdp(t = xround(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C round arg=%.24g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_rint(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xrint(d), frx);
|
||||
|
||||
if (u0 != 0) {
|
||||
printf("Pure C rint arg=%.24g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_set_d(frz, d3, GMP_RNDN);
|
||||
mpfr_fma(frx, frx, fry, frz, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xfma(d, d2, d3), frx);
|
||||
double c = mpfr_get_d(frx, GMP_RNDN);
|
||||
|
||||
if ((-1e+303 < c && c < 1e+303 && u0 > 0.5) ||
|
||||
!(u0 <= 0.5 || isinf(t))) {
|
||||
printf("Pure C fma arg=%.20g, %.20g, %.20g ulp=%.20g\n", d, d2, d3, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_sqrt(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xsqrt_u05(d), frx);
|
||||
|
||||
if (u0 > 0.50001) {
|
||||
printf("Pure C sqrt_u05 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_hypot(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xhypot_u05(d, d2), frx);
|
||||
|
||||
if (u0 > 0.5) {
|
||||
printf("Pure C hypot arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_hypot(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xhypot_u35(d, d2), frx);
|
||||
double c = mpfr_get_d(frx, GMP_RNDN);
|
||||
|
||||
if ((-1e+308 < c && c < 1e+308 && u0 > 3.5) ||
|
||||
!(u0 <= 3.5 || isinf(t))) {
|
||||
printf("Pure C hypot arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
t = xnextafter(d, d2);
|
||||
double c = nextafter(d, d2);
|
||||
|
||||
if (!(isnan(t) && isnan(c)) && t != c) {
|
||||
printf("Pure C nextafter arg=%.20g, %.20g\n", d, d2);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_exp(frx, 0);
|
||||
|
||||
double u0 = countULPdp(t = xfrfrexp(d), frx);
|
||||
|
||||
if (d != 0 && isnumber(d) && u0 != 0) {
|
||||
printf("Pure C frfrexp arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
int cexp = mpfr_get_exp(frx);
|
||||
|
||||
int texp = xexpfrexp(d);
|
||||
|
||||
if (d != 0 && isnumber(d) && cexp != texp) {
|
||||
printf("Pure C expfrexp arg=%.20g\n", d);
|
||||
printf("correct = %d, test = %d\n", cexp, texp);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_fmod(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xfmod(d, d2), frx);
|
||||
|
||||
if (fabsl((long double)d / d2) < 1e+300 && u0 > 0.5) {
|
||||
printf("Pure C fmod arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_d(fry, d2, GMP_RNDN);
|
||||
mpfr_remainder(frx, frx, fry, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xremainder(d, d2), frx);
|
||||
|
||||
if (fabsl((long double)d / d2) < 1e+300 && u0 > 0.5) {
|
||||
printf("Pure C remainder arg=%.20g, %.20g ulp=%.20g\n", d, d2, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
int exp = (random() & 8191) - 4096;
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_set_exp(frx, mpfr_get_exp(frx) + exp);
|
||||
|
||||
double u0 = countULPdp(t = xldexp(d, exp), frx);
|
||||
|
||||
if (u0 > 0.5) {
|
||||
printf("Pure C ldexp arg=%.20g %d ulp=%.20g\n", d, exp, u0);
|
||||
printf("correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_modf(fry, frz, frx, GMP_RNDN);
|
||||
|
||||
Sleef_double2 t2 = xmodf(d);
|
||||
double u0 = countULPdp(t2.x, frz);
|
||||
double u1 = countULPdp(t2.y, fry);
|
||||
|
||||
if (u0 != 0 || u1 != 0) {
|
||||
printf("Pure C modf arg=%.20g ulp=%.20g %.20g\n", d, u0, u1);
|
||||
printf("correct = %.20g, %.20g\n", mpfr_get_d(frz, GMP_RNDN), mpfr_get_d(fry, GMP_RNDN));
|
||||
printf("test = %.20g, %.20g\n", t2.x, t2.y);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
int s;
|
||||
mpfr_lgamma(frx, &s, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULPdp(t = xlgamma_u1(d), frx);
|
||||
|
||||
if (((d < 0 && fabsl(t - mpfr_get_ld(frx, GMP_RNDN)) > 1e-15 && u0 > 1) || (0 <= d && d < 2e+305 && u0 > 1) || (2e+305 <= d && !(u0 <= 1 || isinf(t))))) {
|
||||
printf("Pure C xlgamma_u1 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
|
||||
printf("Diff = %.20Lg\n", fabsl(t - mpfr_get_ld(frx, GMP_RNDN)));
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_gamma(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xtgamma_u1(d), frx);
|
||||
|
||||
if (u0 > 1.0) {
|
||||
printf("Pure C xtgamma_u1 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("Correct = %.20g, test = %.20g\n", mpfr_get_d(frx, GMP_RNDN), t);
|
||||
printf("Diff = %.20Lg\n", fabsl(t - mpfr_get_ld(frx, GMP_RNDN)));
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_erfc(frx, frx, GMP_RNDN);
|
||||
|
||||
static double ebz = 9.8813129168249308835e-324; // nextafter(nextafter(0, 1), 1);
|
||||
|
||||
double u0 = countULP2dp(t = xerfc_u15(d), frx);
|
||||
|
||||
if ((d > 26.2 && u0 > 2.5 && !(mpfr_get_d(frx, GMP_RNDN) == 0 && t <= ebz)) || (d <= 26.2 && u0 > 1.5)) {
|
||||
printf("Pure C xerfc_u15 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
mpfr_set_d(frx, d, GMP_RNDN);
|
||||
mpfr_erf(frx, frx, GMP_RNDN);
|
||||
|
||||
double u0 = countULP2dp(t = xerf_u1(d), frx);
|
||||
|
||||
if (u0 > 0.75) {
|
||||
printf("Pure C xerf_u1 arg=%.20g ulp=%.20g\n", d, u0);
|
||||
printf("Correct = %.20Lg, test = %.20g\n", mpfr_get_ld(frx, GMP_RNDN), t);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
mpfr_clears(frw, frx, fry, frz, NULL);
|
||||
exit(0);
|
||||
}
|
||||
@@ -0,0 +1,241 @@
|
||||
// Copyright Naoki Shibata and contributors 2010 - 2024.
|
||||
// Distributed under the Boost Software License, Version 1.0.
|
||||
// (See accompanying file LICENSE.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt)
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdint.h>
|
||||
#include <mpfr.h>
|
||||
#include <time.h>
|
||||
#include <float.h>
|
||||
#include <limits.h>
|
||||
|
||||
#include <math.h>
|
||||
|
||||
#include "misc.h"
|
||||
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
#define _GNU_SOURCE
|
||||
#include <unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <linux/random.h>
|
||||
#endif
|
||||
|
||||
#include "sleef.h"
|
||||
#include "testerutil.h"
|
||||
|
||||
#define DORENAME
|
||||
#include "rename.h"
|
||||
|
||||
#define DENORMAL_LDBL_MIN (3.6451995318824746025284059336194e-4951L)
|
||||
#define XLDBL_MIN (3.3621031431120935062626778173218e-4932L)
|
||||
|
||||
#ifndef M_PIl
|
||||
#define M_PIl 3.141592653589793238462643383279502884L
|
||||
#endif
|
||||
|
||||
#ifndef M_PI_4l
|
||||
#define M_PI_4l .785398163397448309615660845819875721049292L
|
||||
#endif
|
||||
|
||||
#define POSITIVE_INFINITY INFINITY
|
||||
#define NEGATIVE_INFINITY (-INFINITY)
|
||||
|
||||
int isnumberl(long double x) { return x != SLEEF_INFINITYl && x != -SLEEF_INFINITYl && x == x; }
|
||||
int isPlusZerol(long double x) { return x == 0 && copysignl(1, x) == 1; }
|
||||
int isMinusZerol(long double x) { return x == 0 && copysignl(1, x) == -1; }
|
||||
|
||||
mpfr_t fra, frb, frd;
|
||||
|
||||
double countULP(long double d, mpfr_t c) {
|
||||
long double c2 = mpfr_get_ld(c, GMP_RNDN);
|
||||
if (c2 == 0 && d != 0) return 10000;
|
||||
//if (isPlusZerol(c2) && !isPlusZerol(d)) return 10003;
|
||||
//if (isMinusZerol(c2) && !isMinusZerol(d)) return 10004;
|
||||
if (isnanl(c2) && isnanl(d)) return 0;
|
||||
if (isnanl(c2) || isnanl(d)) return 10001;
|
||||
if (c2 == POSITIVE_INFINITY && d == POSITIVE_INFINITY) return 0;
|
||||
if (c2 == NEGATIVE_INFINITY && d == NEGATIVE_INFINITY) return 0;
|
||||
if (!isnumberl(c2) && !isnumberl(d)) return 0;
|
||||
|
||||
int e;
|
||||
frexpl(mpfr_get_ld(c, GMP_RNDN), &e);
|
||||
mpfr_set_ld(frb, fmaxl(ldexpl(1.0, e-64), DENORMAL_LDBL_MIN), GMP_RNDN);
|
||||
|
||||
mpfr_set_ld(frd, d, GMP_RNDN);
|
||||
mpfr_sub(fra, frd, c, GMP_RNDN);
|
||||
mpfr_div(fra, fra, frb, GMP_RNDN);
|
||||
double u = fabs(mpfr_get_d(fra, GMP_RNDN));
|
||||
|
||||
return u;
|
||||
}
|
||||
|
||||
double countULP2(long double d, mpfr_t c) {
|
||||
long double c2 = mpfr_get_ld(c, GMP_RNDN);
|
||||
if (c2 == 0 && d != 0) return 10000;
|
||||
//if (isPlusZerol(c2) && !isPlusZerol(d)) return 10003;
|
||||
//if (isMinusZerol(c2) && !isMinusZerol(d)) return 10004;
|
||||
if (isnanl(c2) && isnanl(d)) return 0;
|
||||
if (isnanl(c2) || isnanl(d)) return 10001;
|
||||
if (c2 == POSITIVE_INFINITY && d == POSITIVE_INFINITY) return 0;
|
||||
if (c2 == NEGATIVE_INFINITY && d == NEGATIVE_INFINITY) return 0;
|
||||
if (!isnumberl(c2) && !isnumberl(d)) return 0;
|
||||
|
||||
int e;
|
||||
frexpl(mpfr_get_ld(c, GMP_RNDN), &e);
|
||||
mpfr_set_ld(frb, fmaxl(ldexpl(1.0, e-64), LDBL_MIN), GMP_RNDN);
|
||||
|
||||
mpfr_set_ld(frd, d, GMP_RNDN);
|
||||
mpfr_sub(fra, frd, c, GMP_RNDN);
|
||||
mpfr_div(fra, fra, frb, GMP_RNDN);
|
||||
double u = fabs(mpfr_get_d(fra, GMP_RNDN));
|
||||
|
||||
return u;
|
||||
}
|
||||
|
||||
typedef union {
|
||||
long double d;
|
||||
__int128 u128;
|
||||
} conv_t;
|
||||
|
||||
long double rnd() {
|
||||
conv_t c;
|
||||
switch(random() & 15) {
|
||||
case 0: return INFINITY;
|
||||
case 1: return -INFINITY;
|
||||
}
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
|
||||
#else
|
||||
c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
|
||||
#endif
|
||||
return c.d;
|
||||
}
|
||||
|
||||
long double rnd_fr() {
|
||||
conv_t c;
|
||||
do {
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
|
||||
#else
|
||||
c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
|
||||
#endif
|
||||
} while(!isnumberl(c.d));
|
||||
return c.d;
|
||||
}
|
||||
|
||||
long double rnd_zo() {
|
||||
conv_t c;
|
||||
do {
|
||||
#ifdef ENABLE_SYS_getrandom
|
||||
syscall(SYS_getrandom, &c.u128, sizeof(c.u128), 0);
|
||||
#else
|
||||
c.u128 = random() | ((__int128)random() << 31) | ((__int128)random() << (31*2)) | ((__int128)random() << (31*3)) | ((__int128)random() << (31*4));
|
||||
#endif
|
||||
} while(!isnumberl(c.d) || c.d < -1 || 1 < c.d);
|
||||
return c.d;
|
||||
}
|
||||
|
||||
void sinpifr(mpfr_t ret, long double d) {
|
||||
mpfr_t frpi, frd;
|
||||
mpfr_inits(frpi, frd, NULL);
|
||||
|
||||
mpfr_const_pi(frpi, GMP_RNDN);
|
||||
mpfr_set_d(frd, 1.0, GMP_RNDN);
|
||||
mpfr_mul(frpi, frpi, frd, GMP_RNDN);
|
||||
mpfr_set_ld(frd, d, GMP_RNDN);
|
||||
mpfr_mul(frd, frpi, frd, GMP_RNDN);
|
||||
mpfr_sin(ret, frd, GMP_RNDN);
|
||||
|
||||
mpfr_clears(frpi, frd, NULL);
|
||||
}
|
||||
|
||||
void cospifr(mpfr_t ret, long double d) {
|
||||
mpfr_t frpi, frd;
|
||||
mpfr_inits(frpi, frd, NULL);
|
||||
|
||||
mpfr_const_pi(frpi, GMP_RNDN);
|
||||
mpfr_set_d(frd, 1.0, GMP_RNDN);
|
||||
mpfr_mul(frpi, frpi, frd, GMP_RNDN);
|
||||
mpfr_set_ld(frd, d, GMP_RNDN);
|
||||
mpfr_mul(frd, frpi, frd, GMP_RNDN);
|
||||
mpfr_cos(ret, frd, GMP_RNDN);
|
||||
|
||||
mpfr_clears(frpi, frd, NULL);
|
||||
}
|
||||
|
||||
int main(int argc,char **argv)
|
||||
{
|
||||
mpfr_t frx;
|
||||
|
||||
mpfr_set_default_prec(256);
|
||||
mpfr_inits(fra, frb, frd, frx, NULL);
|
||||
|
||||
conv_t cd;
|
||||
long double d, t;
|
||||
|
||||
int cnt, ecnt = 0;
|
||||
|
||||
srandom(time(NULL));
|
||||
|
||||
for(cnt = 0;ecnt < 1000;cnt++) {
|
||||
switch(cnt & 7) {
|
||||
case 0:
|
||||
d = rnd();
|
||||
break;
|
||||
case 1:
|
||||
cd.d = rint((2 * (double)random() / RAND_MAX - 1) * 1e+10) * M_PI_4;
|
||||
cd.u128 += (random() & 0xff) - 0x7f;
|
||||
d = cd.d;
|
||||
break;
|
||||
default:
|
||||
d = rnd_fr();
|
||||
break;
|
||||
}
|
||||
|
||||
Sleef_longdouble2 sc = xsincospil_u05(d);
|
||||
Sleef_longdouble2 sc2 = xsincospil_u35(d);
|
||||
|
||||
{
|
||||
const double rangemax2 = 1e+9;
|
||||
|
||||
sinpifr(frx, d);
|
||||
|
||||
double u0 = countULP2(t = sc.x, frx);
|
||||
|
||||
if (u0 != 0 && ((fabsl(d) <= rangemax2 && u0 > 0.505) || fabsl(t) > 1 || !isnumberl(t))) {
|
||||
printf("Pure C sincospil_u05 sin arg=%.30Lg ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2(t = sc2.x, frx);
|
||||
|
||||
if (u1 != 0 && ((fabsl(d) <= rangemax2 && u1 > 1.5) || fabsl(t) > 1 || !isnumberl(t))) {
|
||||
printf("Pure C sincospil_u35 sin arg=%.30Lg ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
const double rangemax2 = 1e+9;
|
||||
|
||||
cospifr(frx, d);
|
||||
|
||||
double u0 = countULP2(t = sc.y, frx);
|
||||
|
||||
if (u0 != 0 && ((fabsl(d) <= rangemax2 && u0 > 0.505) || fabsl(t) > 1 || !isnumberl(t))) {
|
||||
printf("Pure C sincospil_u05 cos arg=%.30Lg ulp=%.20g\n", d, u0);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
|
||||
double u1 = countULP2(t = sc.y, frx);
|
||||
|
||||
if (u1 != 0 && ((fabsl(d) <= rangemax2 && u1 > 1.5) || fabsl(t) > 1 || !isnumberl(t))) {
|
||||
printf("Pure C sincospil_u35 cos arg=%.30Lg ulp=%.20g\n", d, u1);
|
||||
fflush(stdout); ecnt++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user