8345146: [PPC64] Make intrinsic conversions between bit representations of half precision values and floats

Reviewed-by: rrich, lucy
This commit is contained in:
Martin Doerr
2024-12-05 12:03:53 +00:00
parent 7ee84d8f70
commit b42d79eb6a
15 changed files with 180 additions and 3 deletions

View File

@@ -506,6 +506,9 @@ class Assembler : public AbstractAssembler {
// Floating-point load/store opcodes. Each value is a field placed at
// OPCODE_SHIFT OR-ed with a second field shifted left by 1.
LFSU_OPCODE = (49u << OPCODE_SHIFT | 00u << 1),
LFSX_OPCODE = (31u << OPCODE_SHIFT | 535u << 1),
// lfiwax / lfiwzx: load a 32-bit integer word into an FPR
// (algebraic, i.e. sign-extending, vs. zero-extending form).
LFIWAX_OPCODE = (31u << OPCODE_SHIFT | 855u << 1),
LFIWZX_OPCODE = (31u << OPCODE_SHIFT | 887u << 1),
STFD_OPCODE = (54u << OPCODE_SHIFT | 00u << 1),
STFDU_OPCODE = (55u << OPCODE_SHIFT | 00u << 1),
STFDX_OPCODE = (31u << OPCODE_SHIFT | 727u << 1),
@@ -513,6 +516,8 @@ class Assembler : public AbstractAssembler {
STFSU_OPCODE = (53u << OPCODE_SHIFT | 00u << 1),
STFSX_OPCODE = (31u << OPCODE_SHIFT | 663u << 1),
// stfiwx: Store Floating-Point as Integer Word Indexed.
STFIWX_OPCODE = (31u << OPCODE_SHIFT | 983u << 1),
FSQRT_OPCODE = (63u << OPCODE_SHIFT | 22u << 1), // A-FORM
FSQRTS_OPCODE = (59u << OPCODE_SHIFT | 22u << 1), // A-FORM
@@ -555,6 +560,10 @@ class Assembler : public AbstractAssembler {
XVDIVSP_OPCODE = (60u << OPCODE_SHIFT | 88u << 3),
XXBRD_OPCODE = (60u << OPCODE_SHIFT | 475u << 2 | 23u << 16), // XX2-FORM
XXBRW_OPCODE = (60u << OPCODE_SHIFT | 475u << 2 | 15u << 16), // XX2-FORM
// Half-precision conversions. The vector forms share the 475 field with
// xxbrd/xxbrw and the scalar forms share 347; within each pair only the
// value shifted by 16 distinguishes the instruction.
// xvcvhpsp: vector half -> single, xvcvsphp: vector single -> half (rounding),
// xscvhpdp: scalar half -> double, xscvdphp: scalar double -> half (rounding).
XVCVHPSP_OPCODE= (60u << OPCODE_SHIFT | 475u << 2 | 24u << 16), // XX2-FORM
XVCVSPHP_OPCODE= (60u << OPCODE_SHIFT | 475u << 2 | 25u << 16), // XX2-FORM
XSCVHPDP_OPCODE= (60u << OPCODE_SHIFT | 347u << 2 | 16u << 16), // XX2-FORM
XSCVDPHP_OPCODE= (60u << OPCODE_SHIFT | 347u << 2 | 17u << 16), // XX2-FORM
XXPERM_OPCODE = (60u << OPCODE_SHIFT | 26u << 3),
XXSEL_OPCODE = (60u << OPCODE_SHIFT | 3u << 4),
XXSPLTIB_OPCODE= (60u << OPCODE_SHIFT | 360u << 1),
@@ -2076,6 +2085,9 @@ class Assembler : public AbstractAssembler {
inline void lfdu( FloatRegister d, int si16, Register a);
inline void lfdx( FloatRegister d, Register a, Register b);
// Indexed (a + b) loads of an integer word into FloatRegister d:
// lfiwax is the algebraic (sign-extending) form, lfiwzx the zero-extending one.
inline void lfiwax(FloatRegister d, Register a, Register b);
inline void lfiwzx(FloatRegister d, Register a, Register b);
// PPC 1, section 4.6.3 Floating-Point Store Instructions
inline void stfs( FloatRegister s, int si16, Register a);
inline void stfsu( FloatRegister s, int si16, Register a);
@@ -2084,6 +2096,8 @@ class Assembler : public AbstractAssembler {
inline void stfdu( FloatRegister s, int si16, Register a);
inline void stfdx( FloatRegister s, Register a, Register b);
// Indexed (a + b) store of FloatRegister s as an integer word.
inline void stfiwx(FloatRegister s, Register a, Register b);
// PPC 1, section 4.6.4 Floating-Point Move Instructions
inline void fmr( FloatRegister d, FloatRegister b);
inline void fmr_( FloatRegister d, FloatRegister b);
@@ -2348,6 +2362,10 @@ class Assembler : public AbstractAssembler {
inline void xxleqv( VectorSRegister d, VectorSRegister a, VectorSRegister b);
inline void xxbrd( VectorSRegister d, VectorSRegister b);
inline void xxbrw( VectorSRegister d, VectorSRegister b);
// Half-precision conversions, d = convert(b):
//   xvcvhpsp  vector half   -> single
//   xvcvsphp  vector single -> half
//   xscvhpdp  scalar half   -> double
//   xscvdphp  scalar double -> half
inline void xvcvhpsp( VectorSRegister d, VectorSRegister b);
inline void xvcvsphp( VectorSRegister d, VectorSRegister b);
inline void xscvhpdp( VectorSRegister d, VectorSRegister b);
inline void xscvdphp( VectorSRegister d, VectorSRegister b);
inline void xxland( VectorSRegister d, VectorSRegister a, VectorSRegister b);
inline void xxsel( VectorSRegister d, VectorSRegister a, VectorSRegister b, VectorSRegister c);
inline void xxspltib( VectorSRegister d, int ui8);
@@ -2474,10 +2492,13 @@ class Assembler : public AbstractAssembler {
// Overloads that take no base register a: the address operand is only
// an index register b (or an immediate si16).
inline void lfsx( FloatRegister d, Register b);
inline void lfd( FloatRegister d, int si16);
inline void lfdx( FloatRegister d, Register b);
inline void lfiwax(FloatRegister d, Register b);
inline void lfiwzx(FloatRegister d, Register b);
inline void stfs( FloatRegister s, int si16);
inline void stfsx( FloatRegister s, Register b);
inline void stfd( FloatRegister s, int si16);
inline void stfdx( FloatRegister s, Register b);
inline void stfiwx(FloatRegister s, Register b);
inline void lvebx( VectorRegister d, Register s2);
inline void lvehx( VectorRegister d, Register s2);
inline void lvewx( VectorRegister d, Register s2);

View File

@@ -741,6 +741,9 @@ inline void Assembler::lfd( FloatRegister d, int si16, Register a) { emit_int3
inline void Assembler::lfdu(FloatRegister d, int si16, Register a) { emit_int32( LFDU_OPCODE | frt(d) | ra(a) | simm(si16,16)); }
inline void Assembler::lfdx(FloatRegister d, Register a, Register b) { emit_int32( LFDX_OPCODE | frt(d) | ra0mem(a) | rb(b)); }
// lfiwax / lfiwzx use the same operand encoding (frt, ra0mem, rb); only the opcode constant differs.
inline void Assembler::lfiwax(FloatRegister d, Register a, Register b) { emit_int32( LFIWAX_OPCODE | frt(d) | ra0mem(a) |rb(b)); }
inline void Assembler::lfiwzx(FloatRegister d, Register a, Register b) { emit_int32( LFIWZX_OPCODE | frt(d) | ra0mem(a) |rb(b)); }
// PPC 1, section 4.6.3 Floating-Point Store Instructions
// Use ra0mem instead of ra in some instructions below.
inline void Assembler::stfs( FloatRegister s, int si16, Register a) { emit_int32( STFS_OPCODE | frs(s) | ra0mem(a) | simm(si16,16)); }
@@ -750,6 +753,8 @@ inline void Assembler::stfd( FloatRegister s, int si16, Register a) { emit_int3
inline void Assembler::stfdu(FloatRegister s, int si16, Register a) { emit_int32( STFDU_OPCODE | frs(s) | ra(a) | simm(si16,16)); }
inline void Assembler::stfdx(FloatRegister s, Register a, Register b){ emit_int32( STFDX_OPCODE | frs(s) | ra0mem(a) | rb(b)); }
// Store counterpart of lfiwax/lfiwzx: source FPR goes into the frs field.
inline void Assembler::stfiwx(FloatRegister s, Register a, Register b) { emit_int32( STFIWX_OPCODE | frs(s) | ra0mem(a) |rb(b)); }
// PPC 1, section 4.6.4 Floating-Point Move Instructions
inline void Assembler::fmr( FloatRegister d, FloatRegister b) { emit_int32( FMR_OPCODE | frt(d) | frb(b) | rc(0)); }
inline void Assembler::fmr_(FloatRegister d, FloatRegister b) { emit_int32( FMR_OPCODE | frt(d) | frb(b) | rc(1)); }
@@ -871,6 +876,10 @@ inline void Assembler::xxlxor( VectorSRegister d, VectorSRegister a, VectorSReg
inline void Assembler::xxleqv( VectorSRegister d, VectorSRegister a, VectorSRegister b) { emit_int32( XXLEQV_OPCODE | vsrt(d) | vsra(a) | vsrb(b)); }
inline void Assembler::xxbrd( VectorSRegister d, VectorSRegister b) { emit_int32( XXBRD_OPCODE | vsrt(d) | vsrb(b) ); }
inline void Assembler::xxbrw( VectorSRegister d, VectorSRegister b) { emit_int32( XXBRW_OPCODE | vsrt(d) | vsrb(b) ); }
// Half-precision conversions: two-operand encodings (target vsrt, source vsrb),
// same operand layout as xxbrd/xxbrw above.
inline void Assembler::xvcvhpsp(VectorSRegister d, VectorSRegister b) { emit_int32( XVCVHPSP_OPCODE | vsrt(d) | vsrb(b) ); }
inline void Assembler::xvcvsphp(VectorSRegister d, VectorSRegister b) { emit_int32( XVCVSPHP_OPCODE | vsrt(d) | vsrb(b) ); }
inline void Assembler::xscvhpdp(VectorSRegister d, VectorSRegister b) { emit_int32( XSCVHPDP_OPCODE | vsrt(d) | vsrb(b) ); }
inline void Assembler::xscvdphp(VectorSRegister d, VectorSRegister b) { emit_int32( XSCVDPHP_OPCODE | vsrt(d) | vsrb(b) ); }
inline void Assembler::xvdivsp( VectorSRegister d, VectorSRegister a, VectorSRegister b) { emit_int32( XVDIVSP_OPCODE | vsrt(d) | vsra(a) | vsrb(b)); }
inline void Assembler::xvdivdp( VectorSRegister d, VectorSRegister a, VectorSRegister b) { emit_int32( XVDIVDP_OPCODE | vsrt(d) | vsra(a) | vsrb(b)); }
inline void Assembler::xvabssp( VectorSRegister d, VectorSRegister b) { emit_int32( XVABSSP_OPCODE | vsrt(d) | vsrb(b)); }
@@ -1150,12 +1159,17 @@ inline void Assembler::lfsx(FloatRegister d, Register b) { emit_int32( LFSX_OPCO
// Forms without a base register: no ra field is OR-ed into the encoding.
inline void Assembler::lfd( FloatRegister d, int si16) { emit_int32( LFD_OPCODE | frt(d) | simm(si16,16)); }
inline void Assembler::lfdx(FloatRegister d, Register b) { emit_int32( LFDX_OPCODE | frt(d) | rb(b)); }
inline void Assembler::lfiwax(FloatRegister d, Register b) { emit_int32( LFIWAX_OPCODE | frt(d) | rb(b)); }
inline void Assembler::lfiwzx(FloatRegister d, Register b) { emit_int32( LFIWZX_OPCODE | frt(d) | rb(b)); }
// ra0 version
inline void Assembler::stfs( FloatRegister s, int si16) { emit_int32( STFS_OPCODE | frs(s) | simm(si16, 16)); }
inline void Assembler::stfsx(FloatRegister s, Register b) { emit_int32( STFSX_OPCODE | frs(s) | rb(b)); }
inline void Assembler::stfd( FloatRegister s, int si16) { emit_int32( STFD_OPCODE | frs(s) | simm(si16, 16)); }
inline void Assembler::stfdx(FloatRegister s, Register b) { emit_int32( STFDX_OPCODE | frs(s) | rb(b)); }
inline void Assembler::stfiwx(FloatRegister s, Register b) { emit_int32( STFIWX_OPCODE | frs(s) |rb(b)); }
// ra0 version
inline void Assembler::lvebx( VectorRegister d, Register s2) { emit_int32( LVEBX_OPCODE | vrt(d) | rb(s2)); }
inline void Assembler::lvehx( VectorRegister d, Register s2) { emit_int32( LVEHX_OPCODE | vrt(d) | rb(s2)); }

View File

@@ -1713,7 +1713,7 @@ void LIR_Assembler::arith_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr
}
void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr thread, LIR_Opr dest, LIR_Op* op) {
void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr tmp, LIR_Opr dest, LIR_Op* op) {
switch (code) {
case lir_sqrt: {
__ fsqrt(dest->as_double_reg(), value->as_double_reg());
@@ -1723,6 +1723,14 @@ void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr thread, L
__ fabs(dest->as_double_reg(), value->as_double_reg());
break;
}
case lir_f2hf: {
  // float in an FPR -> half-precision bit pattern in a GPR.
  // tmp supplies the floating-point scratch register that f2hf needs.
  // Operands are accessed with '->' like every other case of this switch.
  __ f2hf(dest->as_register(), value->as_float_reg(), tmp->as_float_reg());
  break;
}
case lir_hf2f: {
  // half-precision bit pattern in a GPR -> float in an FPR; no scratch register needed.
  __ hf2f(dest->as_float_reg(), value->as_register());
  break;
}
default: {
ShouldNotReachHere();
break;

View File

@@ -690,6 +690,25 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
__ abs(value.result(), dst, LIR_OprFact::illegalOpr);
break;
}
case vmIntrinsics::_floatToFloat16: {
  assert(x->number_of_arguments() == 1, "wrong type");

  // Bring the single float argument into a register.
  LIRItem src(x->argument_at(0), this);
  src.load_item();

  LIR_Opr result  = rlock_result(x);
  LIR_Opr scratch = new_register(T_FLOAT);

  // The f2hf LIR op treats its temp operand as live-in, so LinearScan
  // expects it to be defined beforehand. Give it an arbitrary value.
  __ move(LIR_OprFact::floatConst(-0.0), scratch); // just to satisfy LinearScan
  __ f2hf(src.result(), result, scratch);
  break;
}
case vmIntrinsics::_float16ToFloat: {
  assert(x->number_of_arguments() == 1, "wrong type");

  // Bring the single argument (the half-precision bits) into a register.
  LIRItem src(x->argument_at(0), this);
  src.load_item();

  LIR_Opr result = rlock_result(x);

  // Unlike f2hf, this direction takes no temp operand.
  __ hf2f(src.result(), result, LIR_OprFact::illegalOpr);
  break;
}
case vmIntrinsics::_dsqrt:
case vmIntrinsics::_dsqrt_strict: {
if (VM_Version::has_fsqrt()) {

View File

@@ -186,6 +186,9 @@ class MacroAssembler: public Assembler {
void inline set_cmpu3(Register dst, bool treat_unordered_like_less);
// Branch-free implementation to convert !=0 to 1.
void inline normalize_bool(Register dst, Register temp = R0, bool is_64bit = false);
// Convert between a half precision float encoded into a short (held in a
// general purpose Register) and a float held in a FloatRegister.
//   f2hf: FloatRegister src -> Register dst; tmp is a FloatRegister used as scratch.
//   hf2f: Register src -> FloatRegister dst.
void inline f2hf(Register dst, FloatRegister src, FloatRegister tmp);
void inline hf2f(FloatRegister dst, Register src);
inline void pd_patch_instruction(address branch, address target, const char* file, int line);
NOT_PRODUCT(static void pd_print_patched_instruction(address branch);)

View File

@@ -297,6 +297,20 @@ inline void MacroAssembler::normalize_bool(Register dst, Register temp, bool is_
}
}
// Converts the float in src to its half-precision bit pattern and leaves it,
// sign-extended from 16 bits, in dst. tmp is overwritten.
inline void MacroAssembler::f2hf(Register dst, FloatRegister src, FloatRegister tmp) {
  const VectorSRegister src_vsr = src->to_vsr();
  const VectorSRegister tmp_vsr = tmp->to_vsr();

  // On PPC64 a single-precision value lives in a FloatRegister in
  // double-precision format, hence the double -> half conversion.
  xscvdphp(tmp_vsr, src_vsr);

  // Move the raw bits over to the general purpose register ...
  mffprd(dst, tmp);
  // ... and sign-extend the low halfword so the result is a proper short.
  extsh(dst, dst);
}
// Converts the half-precision bit pattern in src to a float in dst.
inline void MacroAssembler::hf2f(FloatRegister dst, Register src) {
  const VectorSRegister dst_vsr = dst->to_vsr();

  // Move the raw bits into the floating-point register first.
  mtfprd(dst, src);

  // On PPC64 a single-precision value lives in a FloatRegister in
  // double-precision format, hence the half -> double conversion (in place).
  xscvhpdp(dst_vsr, dst_vsr);
}
// Convenience bc_far versions
// Each wrapper forwards to bc_far with bcondCRbiIs1 and the named condition
// bit of crx (less / greater), passing L and optimize through unchanged.
inline void MacroAssembler::blt_far(ConditionRegister crx, Label& L, int optimize) { MacroAssembler::bc_far(bcondCRbiIs1, bi0(crx, less), L, optimize); }
inline void MacroAssembler::bgt_far(ConditionRegister crx, Label& L, int optimize) { MacroAssembler::bc_far(bcondCRbiIs1, bi0(crx, greater), L, optimize); }

View File

@@ -2077,6 +2077,9 @@ bool Matcher::match_rule_supported(int opcode) {
case Op_PopCountI:
case Op_PopCountL:
return (UsePopCountInstruction && VM_Version::has_popcntw());
// Both half-precision conversion nodes are gated on the same predicate,
// so either both are matched or neither is.
case Op_ConvF2HF:
case Op_ConvHF2F:
return VM_Version::supports_float16();
case Op_AddVB:
case Op_AddVS:
@@ -11245,6 +11248,34 @@ instruct convF2D_reg(regD dst, regF src) %{
ins_pipe(pipe_class_default);
%}
// ConvF2HF: float in $src -> half-precision bits in integer register $dst.
// $tmp is a floating-point scratch register (declared TEMP below).
// The encoding delegates to MacroAssembler::f2hf; the format string lists the
// three instructions it is expected to emit, which is what size(12) accounts for.
instruct convF2HF_reg_reg(iRegIdst dst, regF src, regF tmp) %{
match(Set dst (ConvF2HF src));
effect(TEMP tmp);
ins_cost(3 * DEFAULT_COST);
size(12);
format %{ "xscvdphp $tmp, $src\t# convert to half precision\n\t"
"mffprd $dst, $tmp\t# move result from $tmp to $dst\n\t"
"extsh $dst, $dst\t# make it a proper short"
%}
ins_encode %{
__ f2hf($dst$$Register, $src$$FloatRegister, $tmp$$FloatRegister);
%}
ins_pipe(pipe_class_default);
%}
// ConvHF2F: half-precision bits in integer register $src -> float in $dst.
// The encoding delegates to MacroAssembler::hf2f; the format string lists the
// two instructions it is expected to emit, which is what size(8) accounts for.
instruct convHF2F_reg_reg(regF dst, iRegIsrc src) %{
match(Set dst (ConvHF2F src));
ins_cost(2 * DEFAULT_COST);
size(8);
format %{ "mtfprd $dst, $src\t# move source from $src to $dst\n\t"
"xscvhpdp $dst, $dst\t# convert from half precision"
%}
ins_encode %{
__ hf2f($dst$$FloatRegister, $src$$Register);
%}
ins_pipe(pipe_class_default);
%}
//----------Control Flow Instructions------------------------------------------
// Compare Instructions

View File

@@ -3451,6 +3451,24 @@ class StubGenerator: public StubCodeGenerator {
return start;
}
// Generates the "floatToFloat16" stub: converts F1_ARG1 to half-precision
// bits in R3_RET and returns. Returns the stub's entry address.
address generate_floatToFloat16() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "floatToFloat16");
  const address stub_entry = __ function_entry();

  // F0 is passed as the scratch FloatRegister for the conversion.
  __ f2hf(R3_RET, F1_ARG1, F0);
  __ blr();

  return stub_entry;
}
// Generates the "float16ToFloat" stub: converts the half-precision bits in
// R3_ARG1 to a float in F1_RET and returns. Returns the stub's entry address.
address generate_float16ToFloat() {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "float16ToFloat");
  const address stub_entry = __ function_entry();

  __ hf2f(F1_RET, R3_ARG1);
  __ blr();

  return stub_entry;
}
address generate_method_entry_barrier() {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier");
@@ -4678,6 +4696,12 @@ address generate_lookup_secondary_supers_table_stub(u1 super_klass_index) {
StubRoutines::_crc32c_table_addr = StubRoutines::ppc::generate_crc_constants(REVERSE_CRC32C_POLY);
StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(true);
}
// Install the half-precision conversion stubs only when the CPU supports them.
if (VM_Version::supports_float16()) {
// For consistent results, both intrinsics should be enabled together
// (never just one direction).
StubRoutines::_hf2f = generate_float16ToFloat();
StubRoutines::_f2hf = generate_floatToFloat16();
}
}
void generate_continuation_stubs() {

View File

@@ -1155,6 +1155,44 @@ address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::M
return entry;
}
// Interpreter entry for Float.floatToFloat16.
// Returns nullptr when VM_Version::supports_float16() is false; otherwise
// returns the address of the generated entry code.
address TemplateInterpreterGenerator::generate_Float_floatToFloat16_entry() {
  if (!VM_Version::supports_float16()) {
    return nullptr;
  }

  const address entry_point = __ pc();

  // Load the float argument from the expression stack and convert it;
  // F0 serves as the scratch FloatRegister.
  __ lfs(F1, Interpreter::stackElementSize, R15_esp);
  __ f2hf(R3_RET, F1, F0);

  // Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
  __ resize_frame_absolute(R21_sender_SP, R11_scratch1, R0);
  __ blr();

  __ flush();
  return entry_point;
}
// Interpreter entry for Float.float16ToFloat.
// Returns nullptr when VM_Version::supports_float16() is false; otherwise
// returns the address of the generated entry code.
address TemplateInterpreterGenerator::generate_Float_float16ToFloat_entry() {
  if (!VM_Version::supports_float16()) {
    return nullptr;
  }

  const address entry_point = __ pc();

  // Alternative that loads straight into the FPR (not used here):
  //__ li(R3, Interpreter::stackElementSize);
  //__ lfiwax(F1_RET, R15_esp, R3); // short stored as 32 bit integer
  //__ xscvhpdp(F1_RET->to_vsr(), F1_RET->to_vsr());

  // Load the argument from the expression stack into R3, then convert.
  __ lwa(R3, Interpreter::stackElementSize, R15_esp);
  __ hf2f(F1_RET, R3);

  // Restore caller sp for c2i case (from compiled) and for resized sender frame (from interpreted).
  __ resize_frame_absolute(R21_sender_SP, R11_scratch1, R0);
  __ blr();

  __ flush();
  return entry_point;
}
void TemplateInterpreterGenerator::bang_stack_shadow_pages(bool native_call) {
// Quick & dirty stack overflow checking: bang the stack & handle trap.
// Note that we do the banging after the frame is setup, since the exception
@@ -1965,8 +2003,6 @@ address TemplateInterpreterGenerator::generate_Float_intBitsToFloat_entry() { re
// No code is generated for these entries here; each generator just returns nullptr.
address TemplateInterpreterGenerator::generate_Float_floatToRawIntBits_entry() { return nullptr; }
address TemplateInterpreterGenerator::generate_Double_longBitsToDouble_entry() { return nullptr; }
address TemplateInterpreterGenerator::generate_Double_doubleToRawLongBits_entry() { return nullptr; }
address TemplateInterpreterGenerator::generate_Float_float16ToFloat_entry() { return nullptr; }
address TemplateInterpreterGenerator::generate_Float_floatToFloat16_entry() { return nullptr; }
// =============================================================================
// Exceptions

View File

@@ -97,6 +97,8 @@ public:
constexpr static bool supports_recursive_lightweight_locking() { return true; }
constexpr static bool supports_secondary_supers_table() { return true; }
// Half-precision (float16) conversion support is tied to the selected
// architecture level: true when PowerArchitecturePPC64 is 9 or higher.
// Not constexpr like its neighbours because it reads PowerArchitecturePPC64.
static bool supports_float16() { return PowerArchitecturePPC64 >= 9; }
static bool is_determine_features_test_running() { return _is_determine_features_test_running; }
// CPU instruction support
static bool has_fsqrt() { return (_features & fsqrt_m) != 0; }

View File

@@ -27,6 +27,7 @@
* @summary Verify conversion between float and the binary16 format
* @requires (vm.cpu.features ~= ".*avx512vl.*" | vm.cpu.features ~= ".*f16c.*") | os.arch=="aarch64"
* | (os.arch == "riscv64" & vm.cpu.features ~= ".*zfh.*")
* | ((os.arch == "ppc64" | os.arch == "ppc64le") & vm.cpu.features ~= ".*darn.*")
* @requires vm.compiler1.enabled & vm.compiler2.enabled
* @requires vm.compMode != "Xcomp"
* @comment default run

View File

@@ -27,6 +27,7 @@
* @summary Verify NaN sign and significand bits are preserved across conversions
* @requires (vm.cpu.features ~= ".*avx512vl.*" | vm.cpu.features ~= ".*f16c.*") | os.arch=="aarch64"
* | (os.arch == "riscv64" & vm.cpu.features ~= ".*zfh.*")
* | ((os.arch == "ppc64" | os.arch == "ppc64le") & vm.cpu.features ~= ".*darn.*")
* @requires vm.compiler1.enabled & vm.compiler2.enabled
* @requires vm.compMode != "Xcomp"
* @library /test/lib /

View File

@@ -27,6 +27,7 @@
* @summary Verify conversion between float and the binary16 format
* @requires (vm.cpu.features ~= ".*avx512vl.*" | vm.cpu.features ~= ".*f16c.*") | os.arch == "aarch64"
* | (os.arch == "riscv64" & vm.cpu.features ~= ".*zfh.*")
* | ((os.arch == "ppc64" | os.arch == "ppc64le") & vm.cpu.features ~= ".*darn.*")
* @requires vm.compiler1.enabled & vm.compiler2.enabled
* @requires vm.compMode != "Xcomp"
* @comment default run:

View File

@@ -27,6 +27,7 @@
* @summary Verify conversion cons between float and the binary16 format
* @requires (vm.cpu.features ~= ".*avx512vl.*" | vm.cpu.features ~= ".*f16c.*") | os.arch=="aarch64"
* | (os.arch == "riscv64" & vm.cpu.features ~= ".*zfh.*")
* | ((os.arch == "ppc64" | os.arch == "ppc64le") & vm.cpu.features ~= ".*darn.*")
* @requires vm.compiler1.enabled & vm.compiler2.enabled
* @requires vm.compMode != "Xcomp"
* @comment default run:

View File

@@ -26,6 +26,7 @@
* @summary Test Float16 vector conversion chain.
* @requires (vm.cpu.features ~= ".*avx512vl.*" | vm.cpu.features ~= ".*f16c.*") | os.arch == "aarch64"
* | (os.arch == "riscv64" & vm.cpu.features ~= ".*zvfh.*")
* | ((os.arch == "ppc64" | os.arch == "ppc64le") & vm.cpu.features ~= ".*darn.*")
* @library /test/lib /
* @run driver compiler.vectorization.TestFloat16VectorConvChain
*/