8306302: C2 Superword fix: use VectorMaskCmp and VectorBlend instead of CMoveVF/D

Reviewed-by: fgao, jbhateja
2025-12-06 09:29:38 +01:00 · 2023-05-24 07:00:27 +00:00
parent 2836c34b64
commit beb75e651f
13 changed files with 820 additions and 583 deletions
--- a/src/hotspot/cpu/aarch64/aarch64_vector.ad
+++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad
@@ -5992,49 +5992,6 @@ instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{
  ins_pipe(pipe_slow);
 %}

-// ------------------------- Vector conditional move --------------------------
-
-instruct vcmove_neon(vReg dst, vReg src1, vReg src2, immI cond, cmpOp copnd) %{
-  predicate(UseSVE == 0 ||
-            (VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)) &&
-             n->in(1)->in(2)->get_int() != BoolTest::ne));
-  match(Set dst (CMoveVF (Binary copnd cond) (Binary src1 src2)));
-  match(Set dst (CMoveVD (Binary copnd cond) (Binary src1 src2)));
-  effect(TEMP_DEF dst);
-  format %{ "vcmove_neon.$copnd $dst, $src1, $src2\t# vector conditional move fp" %}
-  ins_encode %{
-    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
-    BasicType bt = Matcher::vector_element_basic_type(this);
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
-    __ neon_compare($dst$$FloatRegister, bt, $src1$$FloatRegister,
-                    $src2$$FloatRegister, condition, /* isQ */ length_in_bytes == 16);
-    __ bsl($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-           $src2$$FloatRegister, $src1$$FloatRegister);
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct vcmove_sve(vReg dst, vReg src1, vReg src2, immI cond, cmpOp copnd, pRegGov pgtmp) %{
-  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)) ||
-            (UseSVE > 0 && n->in(1)->in(2)->get_int() == BoolTest::ne));
-  match(Set dst (CMoveVF (Binary copnd cond) (Binary src1 src2)));
-  match(Set dst (CMoveVD (Binary copnd cond) (Binary src1 src2)));
-  effect(TEMP pgtmp);
-  format %{ "vcmove_sve.$copnd $dst, $src1, $src2\t# vector conditional move fp. KILL $pgtmp" %}
-  ins_encode %{
-    assert(UseSVE > 0, "must be sve");
-    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
-    BasicType bt = Matcher::vector_element_basic_type(this);
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-    __ sve_compare($pgtmp$$PRegister, bt, ptrue, $src1$$FloatRegister,
-                   $src2$$FloatRegister, condition);
-    __ sve_sel($dst$$FloatRegister, __ elemType_to_regVariant(bt),
-               $pgtmp$$PRegister, $src2$$FloatRegister, $src1$$FloatRegister);
-  %}
-  ins_pipe(pipe_slow);
-%}
-
 // ------------------------------ Vector round ---------------------------------

 // vector Math.round
--- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
+++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
@@ -4258,49 +4258,6 @@ instruct vblend_sve(vReg dst, vReg src1, vReg src2, pReg pg) %{
  ins_pipe(pipe_slow);
 %}

-// ------------------------- Vector conditional move --------------------------
-
-instruct vcmove_neon(vReg dst, vReg src1, vReg src2, immI cond, cmpOp copnd) %{
-  predicate(UseSVE == 0 ||
-            (VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)) &&
-             n->in(1)->in(2)->get_int() != BoolTest::ne));
-  match(Set dst (CMoveVF (Binary copnd cond) (Binary src1 src2)));
-  match(Set dst (CMoveVD (Binary copnd cond) (Binary src1 src2)));
-  effect(TEMP_DEF dst);
-  format %{ "vcmove_neon.$copnd $dst, $src1, $src2\t# vector conditional move fp" %}
-  ins_encode %{
-    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
-    BasicType bt = Matcher::vector_element_basic_type(this);
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-    assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
-    __ neon_compare($dst$$FloatRegister, bt, $src1$$FloatRegister,
-                    $src2$$FloatRegister, condition, /* isQ */ length_in_bytes == 16);
-    __ bsl($dst$$FloatRegister, length_in_bytes == 16 ? __ T16B : __ T8B,
-           $src2$$FloatRegister, $src1$$FloatRegister);
-  %}
-  ins_pipe(pipe_slow);
-%}
-
-instruct vcmove_sve(vReg dst, vReg src1, vReg src2, immI cond, cmpOp copnd, pRegGov pgtmp) %{
-  predicate(!VM_Version::use_neon_for_vector(Matcher::vector_length_in_bytes(n)) ||
-            (UseSVE > 0 && n->in(1)->in(2)->get_int() == BoolTest::ne));
-  match(Set dst (CMoveVF (Binary copnd cond) (Binary src1 src2)));
-  match(Set dst (CMoveVD (Binary copnd cond) (Binary src1 src2)));
-  effect(TEMP pgtmp);
-  format %{ "vcmove_sve.$copnd $dst, $src1, $src2\t# vector conditional move fp. KILL $pgtmp" %}
-  ins_encode %{
-    assert(UseSVE > 0, "must be sve");
-    Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
-    BasicType bt = Matcher::vector_element_basic_type(this);
-    uint length_in_bytes = Matcher::vector_length_in_bytes(this);
-    __ sve_compare($pgtmp$$PRegister, bt, ptrue, $src1$$FloatRegister,
-                   $src2$$FloatRegister, condition);
-    __ sve_sel($dst$$FloatRegister, __ elemType_to_regVariant(bt),
-               $pgtmp$$PRegister, $src2$$FloatRegister, $src1$$FloatRegister);
-  %}
-  ins_pipe(pipe_slow);
-%}
-
 // ------------------------------ Vector round ---------------------------------

 // vector Math.round
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@@ -1504,12 +1504,6 @@ const bool Matcher::match_rule_supported(int opcode) {
        return false;
      }
      break;
-    case Op_CMoveVF:
-    case Op_CMoveVD:
-      if (UseAVX < 1) { // enabled for AVX only
-        return false;
-      }
-      break;
    case Op_StrIndexOf:
      if (!UseSSE42Intrinsics) {
        return false;
@@ -1740,11 +1734,6 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
        return false; // 512bit vpmullq, vandpd and vxorpd are not available
      }
      break;
-    case Op_CMoveVF:
-      if (vlen != 8) {
-        return false; // implementation limitation (only vcmov8F_reg is present)
-      }
-      break;
    case Op_RotateRightV:
    case Op_RotateLeftV:
      if (bt != T_INT && bt != T_LONG) {
@@ -1772,11 +1761,6 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
        return false;
      }
      break;
-    case Op_CMoveVD:
-      if (vlen != 4) {
-        return false; // implementation limitation (only vcmov4D_reg is present)
-      }
-      break;
    case Op_MaxV:
    case Op_MinV:
      if (UseSSE < 4 && is_integral_type(bt)) {
@@ -2947,29 +2931,6 @@ operand legVecZ() %{
  interface(REG_INTER);
 %}

-// Comparison Code for FP conditional move
-operand cmpOp_vcmppd() %{
-  match(Bool);
-
-  predicate(n->as_Bool()->_test._test != BoolTest::overflow &&
-            n->as_Bool()->_test._test != BoolTest::no_overflow);
-  format %{ "" %}
-  interface(COND_INTER) %{
-    equal        (0x0, "eq");
-    less         (0x1, "lt");
-    less_equal   (0x2, "le");
-    not_equal    (0xC, "ne");
-    greater_equal(0xD, "ge");
-    greater      (0xE, "gt");
-    //TODO cannot compile (adlc breaks) without two next lines with error:
-    // x86_64.ad(13987) Syntax Error: :In operand cmpOp_vcmppd: Do not support this encode constant: ' %{
-    // equal' for overflow.
-    overflow     (0x20, "o");  // not really supported by the instruction
-    no_overflow  (0x21, "no"); // not really supported by the instruction
-  %}
-%}
-
-
 // INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)

 // ============================================================================
@@ -5983,42 +5944,6 @@ instruct vmulD_mem(vec dst, vec src, memory mem) %{
  ins_pipe( pipe_slow );
 %}

-instruct vcmov8F_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
-  predicate(Matcher::vector_length(n) == 8);
-  match(Set dst (CMoveVF (Binary copnd cop) (Binary src1 src2)));
-  effect(TEMP dst, USE src1, USE src2);
-  format %{ "cmpps.$copnd  $dst, $src1, $src2  ! vcmovevf, cond=$cop\n\t"
-            "blendvps $dst,$src1,$src2,$dst ! vcmovevf\n\t"
-         %}
-  ins_encode %{
-    assert(UseAVX > 0, "required");
-
-    int vlen_enc = Assembler::AVX_256bit;
-    int cond = (Assembler::Condition)($copnd$$cmpcode);
-    __ vcmpps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
-    __ vblendvps($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
-instruct vcmov4D_reg(legVec dst, legVec src1, legVec src2, immI8 cop, cmpOp_vcmppd copnd) %{
-  predicate(Matcher::vector_length(n) == 4);
-  match(Set dst (CMoveVD (Binary copnd cop) (Binary src1 src2)));
-  effect(TEMP dst, USE src1, USE src2);
-  format %{ "cmppd.$copnd  $dst, $src1, $src2  ! vcmovevd, cond=$cop\n\t"
-            "vblendvpd $dst,$src1,$src2,$dst ! vcmovevd\n\t"
-         %}
-  ins_encode %{
-    assert(UseAVX > 0, "required");
-
-    int vlen_enc = Assembler::AVX_256bit;
-    int cond = (Assembler::Condition)($copnd$$cmpcode);
-    __ vcmppd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, cond, vlen_enc);
-    __ vblendvpd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
-  %}
-  ins_pipe( pipe_slow );
-%}
-
 // --------------------------------- DIV --------------------------------------

 // Floats vector div
--- a/src/hotspot/share/adlc/formssel.cpp
+++ b/src/hotspot/share/adlc/formssel.cpp
@@ -4203,7 +4203,6 @@ bool MatchRule::is_vector() const {
    "AddVB","AddVS","AddVI","AddVL","AddVF","AddVD",
    "SubVB","SubVS","SubVI","SubVL","SubVF","SubVD",
    "MulVB","MulVS","MulVI","MulVL","MulVF","MulVD",
-    "CMoveVD", "CMoveVF",
    "DivVF","DivVD",
    "AbsVB","AbsVS","AbsVI","AbsVL","AbsVF","AbsVD",
    "NegVF","NegVD","NegVI","NegVL",
--- a/src/hotspot/share/opto/classes.hpp
+++ b/src/hotspot/share/opto/classes.hpp
@@ -81,9 +81,7 @@ macro(CompressBitsV)
 macro(ExpandBitsV)
 macro(ConstraintCast)
 macro(CMoveD)
-macro(CMoveVD)
 macro(CMoveF)
-macro(CMoveVF)
 macro(CMoveI)
 macro(CMoveL)
 macro(CMoveP)
--- a/src/hotspot/share/opto/matcher.cpp
+++ b/src/hotspot/share/opto/matcher.cpp
@@ -2385,20 +2385,6 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
      n->del_req(3);
      break;
    }
-    case Op_CMoveVF:
-    case Op_CMoveVD: {
-      // Restructure into a binary tree for Matching:
-      // CMoveVF (Binary bool mask) (Binary src1 src2)
-      Node* in_cc = n->in(1);
-      assert(in_cc->is_Con(), "The condition input of cmove vector node must be a constant.");
-      Node* bol = new BoolNode(in_cc, (BoolTest::mask)in_cc->get_int());
-      Node* pair1 = new BinaryNode(bol, in_cc);
-      n->set_req(1, pair1);
-      Node* pair2 = new BinaryNode(n->in(2), n->in(3));
-      n->set_req(2, pair2);
-      n->del_req(3);
-      break;
-    }
    case Op_MacroLogicV: {
      Node* pair1 = new BinaryNode(n->in(1), n->in(2));
      Node* pair2 = new BinaryNode(n->in(3), n->in(4));
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -60,7 +60,6 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
  _mem_slice_tail(arena(), 8,  0, nullptr),                 // memory slice tails
  _node_info(arena(), 8,  0, SWNodeInfo::initial),          // info needed per node
  _clone_map(phase->C->clone_map()),                        // map of nodes created in cloning
-  _cmovev_kit(_arena, this),                                // map to facilitate CMoveV creation
  _align_to_ref(nullptr),                                   // memory reference to align vectors to
  _disjoint_ptrs(arena(), 8,  0, OrderedPair::initial),     // runtime disambiguated pointer pairs
  _dg(_arena),                                              // dependence graph
@@ -619,9 +618,6 @@ bool SuperWord::SLP_extract() {
    combine_packs();

    construct_my_pack_map();
-    if (UseVectorCmov) {
-      merge_packs_to_cmove();
-    }

    filter_packs();

@@ -1580,18 +1576,6 @@ void SuperWord::set_alignment(Node* s1, Node* s2, int align) {

 //------------------------------data_size---------------------------
 int SuperWord::data_size(Node* s) {
-  Node* use = nullptr; //test if the node is a candidate for CMoveV optimization, then return the size of CMov
-  if (UseVectorCmov) {
-    use = _cmovev_kit.is_Bool_candidate(s);
-    if (use != nullptr) {
-      return data_size(use);
-    }
-    use = _cmovev_kit.is_Cmp_candidate(s);
-    if (use != nullptr) {
-      return data_size(use);
-    }
-  }
-
  int bsize = type2aelembytes(velt_basic_type(s));
  assert(bsize != 0, "valid size");
  return bsize;
@@ -2052,213 +2036,6 @@ void SuperWord::filter_packs() {
 #endif
 }

-//------------------------------merge_packs_to_cmove---------------------------
-// Merge qualified CMove into new vector-nodes
-// We want to catch this pattern and subsume Cmp and Bool into CMove
-//
-//                   Sub              Con
-//                  /  |               /
-//                 /   |           /   /
-//                /    |       /      /
-//               /     |   /         /
-//              /      /            /
-//             /    /  |           /
-//            v /      |          /
-//         Cmp         |         /
-//          |          |        /
-//          v          |       /
-//         Bool        |      /
-//           \         |     /
-//             \       |    /
-//               \     |   /
-//                 \   |  /
-//                   \ v /
-//                   CMove
-//
-
-void SuperWord::merge_packs_to_cmove() {
-  for (int i = _packset.length() - 1; i >= 0; i--) {
-    Node_List* pk = _packset.at(i);
-    if (_cmovev_kit.can_merge_cmove_pack(pk)) {
-      _cmovev_kit.make_cmove_pack(pk);
-    }
-  }
-
-  #ifndef PRODUCT
-    if (TraceSuperWord) {
-      tty->print_cr("\nSuperWord::merge_packs_to_cmove(): After merge");
-      print_packset();
-      tty->cr();
-    }
-  #endif
-}
-
-Node* CMoveKit::is_Bool_candidate(Node* def) const {
-  Node* use = nullptr;
-  if (!def->is_Bool() || def->in(0) != nullptr || def->outcnt() != 1) {
-    return nullptr;
-  }
-  for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
-    use = def->fast_out(j);
-    if (!_sw->same_generation(def, use) || !use->is_CMove()) {
-      return nullptr;
-    }
-  }
-  return use;
-}
-
-Node* CMoveKit::is_Cmp_candidate(Node* def) const {
-  Node* use = nullptr;
-  if (!def->is_Cmp() || def->in(0) != nullptr || def->outcnt() != 1) {
-    return nullptr;
-  }
-  for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
-    use = def->fast_out(j);
-    if (!_sw->same_generation(def, use) || (use = is_Bool_candidate(use)) == nullptr || !_sw->same_generation(def, use)) {
-      return nullptr;
-    }
-  }
-  return use;
-}
-
-// Determine if the current pack is an ideal cmove pack, and if its related packs,
-// i.e. bool node pack and cmp node pack, can be successfully merged for vectorization.
-bool CMoveKit::can_merge_cmove_pack(Node_List* cmove_pk) {
-  Node* cmove = cmove_pk->at(0);
-
-  if (!SuperWord::is_cmove_fp_opcode(cmove->Opcode()) ||
-      pack(cmove) != nullptr /* already in the cmove pack */) {
-    return false;
-  }
-
-  if (cmove->in(0) != nullptr) {
-    NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::can_merge_cmove_pack: CMove %d has control flow, escaping...", cmove->_idx); cmove->dump();})
-    return false;
-  }
-
-  Node* bol = cmove->as_CMove()->in(CMoveNode::Condition);
-  if (!bol->is_Bool() ||
-      bol->outcnt() != 1 ||
-      !_sw->same_generation(bol, cmove) ||
-      bol->in(0) != nullptr || // Bool node has control flow!!
-      _sw->my_pack(bol) == nullptr) {
-      NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::can_merge_cmove_pack: Bool %d does not fit CMove %d for building vector, escaping...", bol->_idx, cmove->_idx); bol->dump();})
-    return false;
-  }
-  Node_List* bool_pk = _sw->my_pack(bol);
-  if (bool_pk->size() != cmove_pk->size() ) {
-    return false;
-  }
-
-  Node* cmp = bol->in(1);
-  if (!cmp->is_Cmp() ||
-      cmp->outcnt() != 1 ||
-      !_sw->same_generation(cmp, cmove) ||
-      cmp->in(0) != nullptr || // Cmp node has control flow!!
-      _sw->my_pack(cmp) == nullptr) {
-      NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::can_merge_cmove_pack: Cmp %d does not fit CMove %d for building vector, escaping...", cmp->_idx, cmove->_idx); cmp->dump();})
-    return false;
-  }
-  Node_List* cmp_pk = _sw->my_pack(cmp);
-  if (cmp_pk->size() != cmove_pk->size() ) {
-    return false;
-  }
-
-  if (!test_cmp_pack(cmp_pk, cmove_pk)) {
-    NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print("CMoveKit::can_merge_cmove_pack: cmp pack for Cmp %d failed vectorization test", cmp->_idx); cmp->dump();})
-    return false;
-  }
-
-  return true;
-}
-
-// Create a new cmove pack to substitute the old one, map all info to the
-// new pack and delete the old cmove pack and related packs from the packset.
-void CMoveKit::make_cmove_pack(Node_List* cmove_pk) {
-  Node* cmove = cmove_pk->at(0);
-  Node* bol = cmove->as_CMove()->in(CMoveNode::Condition);
-  Node_List* bool_pk = _sw->my_pack(bol);
-  Node* cmp = bol->in(1);
-  Node_List* cmp_pk = _sw->my_pack(cmp);
-
-  Node_List* new_cmove_pk = new Node_List();
-  uint sz = cmove_pk->size() - 1;
-  for (uint i = 0; i <= sz; ++i) {
-    Node* cmov = cmove_pk->at(i);
-    Node* bol  = bool_pk->at(i);
-    Node* cmp  = cmp_pk->at(i);
-
-    new_cmove_pk->insert(i, cmov);
-
-    map(cmov, new_cmove_pk);
-    map(bol, new_cmove_pk);
-    map(cmp, new_cmove_pk);
-
-    _sw->set_my_pack(cmov, new_cmove_pk); // and keep old packs for cmp and bool
-  }
-  _sw->_packset.remove(cmove_pk);
-  _sw->_packset.remove(bool_pk);
-  _sw->_packset.remove(cmp_pk);
-  _sw->_packset.append(new_cmove_pk);
-  NOT_PRODUCT(if(_sw->is_trace_cmov()) {tty->print_cr("CMoveKit::make_cmove_pack: added syntactic CMove pack"); _sw->print_pack(new_cmove_pk);})
-}
-
-bool CMoveKit::test_cmp_pack(Node_List* cmp_pk, Node_List* cmove_pk) {
-  Node* cmp0 = cmp_pk->at(0);
-  assert(cmp0->is_Cmp(), "CMoveKit::test_cmp_pack: should be Cmp Node");
-  assert(cmove_pk->at(0)->is_CMove(), "CMoveKit::test_cmp_pack: should be CMove");
-  assert(cmp_pk->size() == cmove_pk->size(), "CMoveKit::test_cmp_pack: should be same size");
-  Node* in1 = cmp0->in(1);
-  Node* in2 = cmp0->in(2);
-  Node_List* in1_pk = _sw->my_pack(in1);
-  Node_List* in2_pk = _sw->my_pack(in2);
-
-  if (  (in1_pk != nullptr && in1_pk->size() != cmp_pk->size())
-     || (in2_pk != nullptr && in2_pk->size() != cmp_pk->size()) ) {
-    return false;
-  }
-
-  // test if "all" in1 are in the same pack or the same node
-  if (in1_pk == nullptr) {
-    for (uint j = 1; j < cmp_pk->size(); j++) {
-      if (cmp_pk->at(j)->in(1) != in1) {
-        return false;
-      }
-    }//for: in1_pk is not pack but all Cmp nodes in the pack have the same in(1)
-  }
-  // test if "all" in2 are in the same pack or the same node
-  if (in2_pk == nullptr) {
-    for (uint j = 1; j < cmp_pk->size(); j++) {
-      if (cmp_pk->at(j)->in(2) != in2) {
-        return false;
-      }
-    }//for: in2_pk is not pack but all Cmp nodes in the pack have the same in(2)
-  }
-  //now check if cmp_pk may be subsumed in vector built for cmove_pk
-  int cmove_ind1, cmove_ind2;
-  if (cmp_pk->at(0)->in(1) == cmove_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
-   && cmp_pk->at(0)->in(2) == cmove_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
-      cmove_ind1 = CMoveNode::IfFalse;
-      cmove_ind2 = CMoveNode::IfTrue;
-  } else if (cmp_pk->at(0)->in(2) == cmove_pk->at(0)->as_CMove()->in(CMoveNode::IfFalse)
-          && cmp_pk->at(0)->in(1) == cmove_pk->at(0)->as_CMove()->in(CMoveNode::IfTrue)) {
-      cmove_ind2 = CMoveNode::IfFalse;
-      cmove_ind1 = CMoveNode::IfTrue;
-  }
-  else {
-    return false;
-  }
-
-  for (uint j = 1; j < cmp_pk->size(); j++) {
-    if (cmp_pk->at(j)->in(1) != cmove_pk->at(j)->as_CMove()->in(cmove_ind1)
-        || cmp_pk->at(j)->in(2) != cmove_pk->at(j)->as_CMove()->in(cmove_ind2)) {
-        return false;
-    }//if
-  }
-  NOT_PRODUCT(if(_sw->is_trace_cmov()) { tty->print("CMoveKit::test_cmp_pack: cmp pack for 1st Cmp %d is OK for vectorization: ", cmp0->_idx); cmp0->dump(); })
-  return true;
-}
-
 //------------------------------implemented---------------------------
 // Can code be generated for pack p?
 bool SuperWord::implemented(Node_List* p) {
@@ -2283,9 +2060,9 @@ bool SuperWord::implemented(Node_List* p) {
      // integer subword types with superword vectorization.
      // See JDK-8294816 for miscompilation issues with shorts.
      return false;
-    } else if (is_cmove_fp_opcode(opc)) {
-      retValue = is_cmov_pack(p) && VectorNode::implemented(opc, size, velt_basic_type(p0));
-      NOT_PRODUCT(if(retValue && is_trace_cmov()) {tty->print_cr("SWPointer::implemented: found cmove pack"); print_pack(p);})
+    } else if (p0->is_Cmp()) {
+      // Cmp -> Bool -> CMove
+      retValue = UseVectorCmov;
    } else if (requires_long_to_int_conversion(opc)) {
      // Java API for Long.bitCount/numberOfLeadingZeros/numberOfTrailingZeros
      // returns int type, but Vector API for them returns long type. To unify
@@ -2308,10 +2085,6 @@ bool SuperWord::implemented(Node_List* p) {
  return retValue;
 }

-bool SuperWord::is_cmov_pack(Node_List* p) {
-  return _cmovev_kit.pack(p->at(0)) != nullptr;
-}
-
 bool SuperWord::requires_long_to_int_conversion(int opc) {
  switch(opc) {
    case Op_PopCountL:
@@ -2385,9 +2158,6 @@ bool SuperWord::profitable(Node_List* p) {
    // just the ones outside the block.)
    for (uint i = 0; i < p->size(); i++) {
      Node* def = p->at(i);
-      if (is_cmov_pack_internal_node(p, def)) {
-        continue;
-      }
      for (DUIterator_Fast jmax, j = def->fast_outs(jmax); j < jmax; j++) {
        Node* use = def->fast_out(j);
        for (uint k = 0; k < use->req(); k++) {
@@ -2408,11 +2178,30 @@ bool SuperWord::profitable(Node_List* p) {
      }
    }
  }
+  if (p0->is_Cmp()) {
+    // Verify that Cmp pack only has Bool pack uses
+    for (DUIterator_Fast jmax, j = p0->fast_outs(jmax); j < jmax; j++) {
+      Node* bol = p0->fast_out(j);
+      if (!bol->is_Bool() || bol->in(0) != nullptr || !is_vector_use(bol, 1)) {
+        return false;
+      }
+    }
+  }
+  if (p0->is_Bool()) {
+    // Verify that Bool pack only has CMove pack uses
+    for (DUIterator_Fast jmax, j = p0->fast_outs(jmax); j < jmax; j++) {
+      Node* cmove = p0->fast_out(j);
+      if (!cmove->is_CMove() || cmove->in(0) != nullptr || !is_vector_use(cmove, 1)) {
+        return false;
+      }
+    }
+  }
  return true;
 }

 #ifdef ASSERT
 void SuperWord::verify_packs() {
+  // Verify independence at pack level.
  for (int i = 0; i < _packset.length(); i++) {
    Node_List* p = _packset.at(i);
    Node* dependence = find_dependence(p);
@@ -2431,6 +2220,27 @@ void SuperWord::verify_packs() {
    }
    assert(dependence == nullptr, "all nodes in pack must be mutually independent");
  }
+
+  // Verify all nodes in packset have my_pack set correctly.
+  Unique_Node_List processed;
+  for (int i = 0; i < _packset.length(); i++) {
+    Node_List* p = _packset.at(i);
+    for (uint k = 0; k < p->size(); k++) {
+      Node* n = p->at(k);
+      assert(in_bb(n), "only nodes in bb can be in packset");
+      assert(!processed.member(n), "node should only occur once in packset");
+      assert(my_pack(n) == p, "n has consistent packset info");
+      processed.push(n);
+    }
+  }
+
+  // Check that no other node has my_pack set.
+  for (int i = 0; i < _block.length(); i++) {
+    Node* n = _block.at(i);
+    if (!processed.member(n)) {
+      assert(my_pack(n) == nullptr, "should not have pack if not in packset");
+    }
+  }
 }
 #endif

@@ -2535,7 +2345,7 @@ public:
      if (pid == 0) {
        pid = new_pid();
        set_pid(n, pid);
-        assert(_slp->my_pack(n) == nullptr || UseVectorCmov, "no packset");
+        assert(_slp->my_pack(n) == nullptr, "no packset");
      }
    }

@@ -2953,7 +2763,89 @@ bool SuperWord::output() {
        Node* one = vector_opd(p, 3);
        vn = VectorNode::make(opc, in, zero, one, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
-      } else if (n->req() == 3 && !is_cmov_pack(p)) {
+      } else if (n->is_Cmp()) {
+        // Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
+        continue;
+      } else if (n->is_Bool()) {
+        // Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
+        continue;
+      } else if (n->is_CMove()) {
+        // Bool + Cmp + CMove -> VectorMaskCmp + VectorBlend
+
+        BoolNode* bol = n->in(1)->as_Bool();
+        assert(bol != nullptr, "must have Bool above CMove");
+        BoolTest::mask bol_test = bol->_test._test;
+        assert(bol_test == BoolTest::eq ||
+               bol_test == BoolTest::ne ||
+               bol_test == BoolTest::ge ||
+               bol_test == BoolTest::gt ||
+               bol_test == BoolTest::lt ||
+               bol_test == BoolTest::le,
+               "CMove bool should be one of: eq,ne,ge,gt,lt,le");
+        Node_List* p_bol = my_pack(bol);
+        assert(p_bol != nullptr, "CMove must have matching Bool pack");
+
+        CmpNode* cmp = bol->in(1)->as_Cmp();
+        assert(cmp != nullptr, "must have cmp above CMove");
+        Node_List* p_cmp = my_pack(cmp);
+        assert(p_cmp != nullptr, "Bool must have matching Cmp pack");
+
+        Node* cmp_in1 = vector_opd(p_cmp, 1);
+        Node* cmp_in2 = vector_opd(p_cmp, 2);
+
+        Node* blend_in1 = vector_opd(p, 2);
+        Node* blend_in2 = vector_opd(p, 3);
+
+        if (cmp->Opcode() == Op_CmpF || cmp->Opcode() == Op_CmpD) {
+          // If we have a Float or Double comparison, we must be careful with
+          // handling NaNs correctly. CmpF and CmpD have a return code, as
+          // they are based on the Java bytecodes fcmpl/dcmpl:
+          // -1: cmp_in1 <  cmp_in2, or at least one of the two is a NaN
+          //  0: cmp_in1 == cmp_in2  (no NaN)
+          //  1: cmp_in1 >  cmp_in2  (no NaN)
+          //
+          // The "bol_test" selects which of the [-1, 0, 1] cases lead to "true".
+          //
+          // Note: ordered   (O) comparison returns "false" if either input is NaN.
+          //       unordered (U) comparison returns "true"  if either input is NaN.
+          //
+          // The VectorMaskCmpNode does a comparison directly on cmp_in1 and cmp_in2, in the Java
+          // standard way (all comparisons are ordered, except NEQ is unordered).
+          //
+          // In the following, "bol_test" already matches the cmp code for VectorMaskCmpNode:
+          //   BoolTest::eq:  Case 0     -> EQ_O
+          //   BoolTest::ne:  Case -1, 1 -> NEQ_U
+          //   BoolTest::ge:  Case 0, 1  -> GE_O
+          //   BoolTest::gt:  Case 1     -> GT_O
+          //
+          // But the lt and le comparisons must be converted from unordered to ordered:
+          //   BoolTest::lt:  Case -1    -> LT_U -> VectorMaskCmp would interpret lt as LT_O
+          //   BoolTest::le:  Case -1, 0 -> LE_U -> VectorMaskCmp would interpret le as LE_O
+          //
+          if (bol_test == BoolTest::lt || bol_test == BoolTest::le) {
+            // Negating the bol_test and swapping the blend-inputs leaves the result unchanged for
+            // all non-NaN inputs, but converts the unordered (U) to an ordered (O) comparison.
+            //      VectorBlend(VectorMaskCmp(LT_U, cmp_in1, cmp_in2), blend_in1, blend_in2)
+            // <==> VectorBlend(VectorMaskCmp(GE_O, cmp_in1, cmp_in2), blend_in2, blend_in1)
+            //      VectorBlend(VectorMaskCmp(LE_U, cmp_in1, cmp_in2), blend_in1, blend_in2)
+            // <==> VectorBlend(VectorMaskCmp(GT_O, cmp_in1, cmp_in2), blend_in2, blend_in1)
+            bol_test = bol->_test.negate();
+            swap(blend_in1, blend_in2);
+          }
+        }
+
+        // VectorMaskCmp
+        ConINode* bol_test_node  = _igvn.intcon((int)bol_test);
+        BasicType bt = velt_basic_type(cmp);
+        const TypeVect* vt = TypeVect::make(bt, vlen);
+        VectorNode* mask = new VectorMaskCmpNode(bol_test, cmp_in1, cmp_in2, bol_test_node, vt);
+        _igvn.register_new_node_with_optimizer(mask);
+        _phase->set_ctrl(mask, _phase->get_ctrl(p->at(0)));
+        _igvn._worklist.push(mask);
+
+        // VectorBlend
+        vn = new VectorBlendNode(blend_in1, blend_in2, mask);
+      } else if (n->req() == 3) {
        // Promote operands to vector
        Node* in1 = nullptr;
        bool node_isa_reduction = is_marked_reduction(n);
@@ -3037,85 +2929,6 @@ bool SuperWord::output() {
        int vopc = VectorCastNode::opcode(opc, in->bottom_type()->is_vect()->element_basic_type());
        vn = VectorCastNode::make(vopc, in, bt, vlen);
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
-      } else if (is_cmov_pack(p)) {
-        if (cl->is_rce_post_loop()) {
-          // do not refactor of flow in post loop context
-          return false;
-        }
-        if (!n->is_CMove()) {
-          continue;
-        }
-        // place here CMoveVDNode
-        NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: print before CMove vectorization"); print_loop(false);})
-        Node* bol = n->in(CMoveNode::Condition);
-        if (!bol->is_Bool() && bol->Opcode() == Op_ExtractI && bol->req() > 1 ) {
-          NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d is not Bool node, trying its in(1) node %d", bol->_idx, bol->in(1)->_idx); bol->dump(); bol->in(1)->dump();})
-          bol = bol->in(1); //may be ExtractNode
-        }
-
-        assert(bol->is_Bool(), "should be BoolNode - too late to bail out!");
-        if (!bol->is_Bool()) {
-          if (do_reserve_copy()) {
-            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: expected %d bool node, exiting SuperWord", bol->_idx); bol->dump();})
-            return false; //and reverse to backup IG
-          }
-          ShouldNotReachHere();
-        }
-
-        BoolTest boltest = bol->as_Bool()->_test;
-        BoolTest::mask cond = boltest._test;
-        Node* cmp = bol->in(1);
-        // When the src order of cmp node and cmove node are the same:
-        //   cmp: CmpD src1 src2
-        //   bool: Bool cmp mask
-        //   cmove: CMoveD bool scr1 src2
-        // =====> vectorized, equivalent to
-        //   cmovev: CMoveVD mask src_vector1 src_vector2
-        //
-        // When the src order of cmp node and cmove node are different:
-        //   cmp: CmpD src2 src1
-        //   bool: Bool cmp mask
-        //   cmove: CMoveD bool scr1 src2
-        // =====> equivalent to
-        //   cmp: CmpD src1 src2
-        //   bool: Bool cmp negate(mask)
-        //   cmove: CMoveD bool scr1 src2
-        // (Note: when mask is ne or eq, we don't need to negate it even after swapping.)
-        // =====> vectorized, equivalent to
-        //   cmovev: CMoveVD negate(mask) src_vector1 src_vector2
-        if (cmp->in(2) == n->in(CMoveNode::IfFalse) && cond != BoolTest::ne && cond != BoolTest::eq) {
-          assert(cmp->in(1) == n->in(CMoveNode::IfTrue), "cmpnode and cmovenode don't share the same inputs.");
-          cond = boltest.negate();
-        }
-        Node* cc  = _igvn.intcon((int)cond);
-        NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created intcon in_cc node %d", cc->_idx); cc->dump();})
-
-        Node* src1 = vector_opd(p, 2); //2=CMoveNode::IfFalse
-        if (src1 == nullptr) {
-          if (do_reserve_copy()) {
-            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src1 should not be null, exiting SuperWord");})
-            return false; //and reverse to backup IG
-          }
-          ShouldNotReachHere();
-        }
-        Node* src2 = vector_opd(p, 3); //3=CMoveNode::IfTrue
-        if (src2 == nullptr) {
-          if (do_reserve_copy()) {
-            NOT_PRODUCT(if(is_trace_loop_reverse() || TraceLoopOpts) {tty->print_cr("SWPointer::output: src2 should not be null, exiting SuperWord");})
-            return false; //and reverse to backup IG
-          }
-          ShouldNotReachHere();
-        }
-        BasicType bt = velt_basic_type(n);
-        const TypeVect* vt = TypeVect::make(bt, vlen);
-        assert(bt == T_FLOAT || bt == T_DOUBLE, "Only vectorization for FP cmovs is supported");
-        if (bt == T_FLOAT) {
-          vn = new CMoveVFNode(cc, src1, src2, vt);
-        } else {
-          assert(bt == T_DOUBLE, "Expected double");
-          vn = new CMoveVDNode(cc, src1, src2, vt);
-        }
-        NOT_PRODUCT(if(is_trace_cmov()) {tty->print("SWPointer::output: created new CMove node %d: ", vn->_idx); vn->dump();})
      } else if (opc == Op_FmaD || opc == Op_FmaF) {
        // Promote operands to vector
        Node* in1 = vector_opd(p, 1);
@@ -3455,7 +3268,7 @@ void SuperWord::insert_extracts(Node_List* p) {
        Node* n = use->in(k);
        if (def == n) {
          Node_List* u_pk = my_pack(use);
-          if ((u_pk == nullptr || !is_cmov_pack(u_pk) || use->is_CMove()) && !is_vector_use(use, k)) {
+          if ((u_pk == nullptr || use->is_CMove()) && !is_vector_use(use, k)) {
              _n_idx_list.push(use, k);
          }
        }
@@ -3886,6 +3699,18 @@ void SuperWord::compute_vector_element_type() {
      }
    }
  }
+  for (int i = 0; i < _block.length(); i++) { // Give Bool/Cmp nodes the vector element type of the Cmp's first input.
+    Node* n = _block.at(i);
+    Node* nn = n;
+    if (nn->is_Bool() && nn->in(0) == nullptr) { // Bool without control input: look through to the Cmp above it.
+      nn = nn->in(1);
+      assert(nn->is_Cmp(), "always have Cmp above Bool");
+    }
+    if (nn->is_Cmp() && nn->in(0) == nullptr) { // Cmp without control input, reached directly or via the Bool above.
+      nn = nn->in(1); // First input of the Cmp.
+      set_velt_type(n, velt_type(nn)); // The type is recorded on n (the Bool or the Cmp itself), not on nn.
+    }
+  }
 #ifndef PRODUCT
  if (TraceSuperWord && Verbose) {
    for (int i = 0; i < _block.length(); i++) {
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@@ -203,24 +203,6 @@ class SWNodeInfo {
 };

 class SuperWord;
-class CMoveKit {
- friend class SuperWord;
- private:
-  SuperWord* _sw;
-  Dict* _dict;
-  CMoveKit(Arena* a, SuperWord* sw) : _sw(sw)  {_dict = new Dict(cmpkey, hashkey, a);}
-  void*     _2p(Node* key)        const  { return (void*)(intptr_t)key; } // 2 conversion functions to make gcc happy
-  Dict*     dict()                const  { return _dict; }
-  void map(Node* key, Node_List* val)    { assert(_dict->operator[](_2p(key)) == nullptr, "key existed"); _dict->Insert(_2p(key), (void*)val); }
-  void unmap(Node* key)                  { _dict->Delete(_2p(key)); }
-  Node_List* pack(Node* key)      const  { return (Node_List*)_dict->operator[](_2p(key)); }
-  Node* is_Bool_candidate(Node* nd) const; // if it is the right candidate return corresponding CMove* ,
-  Node* is_Cmp_candidate(Node* nd) const; // otherwise return null
-  // Determine if the current pack is a cmove candidate that can be vectorized.
-  bool can_merge_cmove_pack(Node_List* cmove_pk);
-  void make_cmove_pack(Node_List* cmove_pk);
-  bool test_cmp_pack(Node_List* cmp_pk, Node_List* cmove_pk);
-};//class CMoveKit

 // JVMCI: OrderedPair is moved up to deal with compilation issues on Windows
 //------------------------------OrderedPair---------------------------
@@ -309,7 +291,6 @@ class SuperWord : public ResourceObj {
  GrowableArray<Node*> _mem_slice_tail;  // Memory slice tail nodes
  GrowableArray<SWNodeInfo> _node_info;  // Info needed per node
  CloneMap&            _clone_map;       // map of nodes created in cloning
-  CMoveKit             _cmovev_kit;      // support for vectorization of CMov
  MemNode* _align_to_ref;                // Memory reference that pre-loop will align to

  GrowableArray<OrderedPair> _disjoint_ptrs; // runtime disambiguated pointer pairs
@@ -458,9 +439,6 @@ class SuperWord : public ResourceObj {
 private:
  void set_my_pack(Node* n, Node_List* p)     { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_my_pack = p; }
  // is pack good for converting into one vector node replacing bunches of Cmp, Bool, CMov nodes.
-  bool is_cmov_pack(Node_List* p);
-  bool is_cmov_pack_internal_node(Node_List* p, Node* nd) { return is_cmov_pack(p) && !nd->is_CMove(); }
-  static bool is_cmove_fp_opcode(int opc) { return (opc == Op_CMoveF || opc == Op_CMoveD); }
  static bool requires_long_to_int_conversion(int opc);
  // For pack p, are all idx operands the same?
  bool same_inputs(Node_List* p, int idx);
@@ -595,9 +573,8 @@ private:
  void construct_my_pack_map();
  // Remove packs that are not implemented or not profitable.
  void filter_packs();
-  // Merge CMove into new vector-nodes
-  void merge_packs_to_cmove();
-  // Verify that for every pack, all nodes are mutually independent
+  // Verify that for every pack, all nodes are mutually independent.
+  // Also verify that packset and my_pack are consistent.
  DEBUG_ONLY(void verify_packs();)
  // Adjust the memory graph for the packed operations
  void schedule();
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@@ -82,9 +82,11 @@ int VectorNode::opcode(int sopc, BasicType bt) {
  case Op_FmaF:
    return (bt == T_FLOAT ? Op_FmaVF : 0);
  case Op_CMoveF:
-    return (bt == T_FLOAT ? Op_CMoveVF : 0);
+    return (bt == T_FLOAT ? Op_VectorBlend : 0);
  case Op_CMoveD:
-    return (bt == T_DOUBLE ? Op_CMoveVD : 0);
+    return (bt == T_DOUBLE ? Op_VectorBlend : 0);
+  case Op_Bool:
+    return Op_VectorMaskCmp;
  case Op_DivF:
    return (bt == T_FLOAT ? Op_DivVF : 0);
  case Op_DivD:
@@ -683,10 +685,6 @@ void VectorNode::vector_operands(Node* n, uint* start, uint* end) {
    *start = 1;
    *end   = 3; // 2 vector operands
    break;
-  case Op_CMoveI:  case Op_CMoveL:  case Op_CMoveF:  case Op_CMoveD:
-    *start = 2;
-    *end   = n->req();
-    break;
  case Op_FmaD:
  case Op_FmaF:
    *start = 1;
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@@ -392,22 +392,6 @@ public:
  virtual int Opcode() const;
 };

-//------------------------------CMoveVFNode--------------------------------------
-// Vector float conditional move
-class CMoveVFNode : public VectorNode {
-public:
-  CMoveVFNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {}
-  virtual int Opcode() const;
-};
-
-//------------------------------CMoveVDNode--------------------------------------
-// Vector double conditional move
-class CMoveVDNode : public VectorNode {
-public:
-  CMoveVDNode(Node* in1, Node* in2, Node* in3, const TypeVect* vt) : VectorNode(in1, in2, in3, vt) {}
-  virtual int Opcode() const;
-};
-
 //------------------------------MulReductionVINode--------------------------------------
 // Vector multiply byte, short and int as a reduction
 class MulReductionVINode : public UnorderedReductionNode {
--- a/src/hotspot/share/runtime/vmStructs.cpp
+++ b/src/hotspot/share/runtime/vmStructs.cpp
@@ -1768,8 +1768,6 @@
  declare_c2_type(NegVDNode, NegVNode)                                    \
  declare_c2_type(FmaVDNode, VectorNode)                                  \
  declare_c2_type(FmaVFNode, VectorNode)                                  \
-  declare_c2_type(CMoveVFNode, VectorNode)                                \
-  declare_c2_type(CMoveVDNode, VectorNode)                                \
  declare_c2_type(CompressVNode, VectorNode)                              \
  declare_c2_type(CompressMNode, VectorNode)                              \
  declare_c2_type(ExpandVNode, VectorNode)                                \
--- a/test/hotspot/jtreg/compiler/c2/irTests/TestVectorConditionalMove.java
+++ b/test/hotspot/jtreg/compiler/c2/irTests/TestVectorConditionalMove.java
@@ -30,30 +30,25 @@ import jdk.test.lib.Utils;

 /*
 * @test
- * @bug 8289422
+ * @bug 8289422 8306088
 * @key randomness
- * @summary Auto-vectorization enhancement to support vector conditional move on AArch64
- * @requires os.arch=="aarch64"
+ * @summary Auto-vectorization enhancement to support vector conditional move.
 * @library /test/lib /
 * @run driver compiler.c2.irTests.TestVectorConditionalMove
 */

 public class TestVectorConditionalMove {
-    final private static int SIZE = 3000;
+    final private static int SIZE = 1024;
    private static final Random RANDOM = Utils.getRandomInstance();

-    private static float[] floata = new float[SIZE];
-    private static float[] floatb = new float[SIZE];
-    private static float[] floatc = new float[SIZE];
-    private static double[] doublea = new double[SIZE];
-    private static double[] doubleb = new double[SIZE];
-    private static double[] doublec = new double[SIZE];
-
    public static void main(String[] args) {
-        TestFramework.runWithFlags("-Xcomp", "-XX:-TieredCompilation", "-XX:+UseCMoveUnconditionally",
-                                   "-XX:+UseVectorCmov", "-XX:CompileCommand=exclude,*.cmove*");
+        TestFramework.runWithFlags("-XX:-TieredCompilation",
+                                   "-XX:+UseCMoveUnconditionally",
+                                   "-XX:+UseVectorCmov",
+                                   "-XX:CompileCommand=compileonly,*.TestVectorConditionalMove.test*");
    }

+    // Compare 2 values, and pick one of them
    private float cmoveFloatGT(float a, float b) {
        return (a > b) ? a : b;
    }
@@ -94,8 +89,124 @@ public class TestVectorConditionalMove {
        return (a != b) ? a : b;
    }

+    // Extensions: compare 2 values, and pick from 2 consts
+    private float cmoveFGTforFConst(float a, float b) {
+        return (a > b) ? 0.1f : -0.1f;
+    }
+
+    private float cmoveFGEforFConst(float a, float b) {
+        return (a >= b) ? 0.1f : -0.1f;
+    }
+
+    private float cmoveFLTforFConst(float a, float b) {
+        return (a < b) ? 0.1f : -0.1f;
+    }
+
+    private float cmoveFLEforFConst(float a, float b) {
+        return (a <= b) ? 0.1f : -0.1f;
+    }
+
+    private float cmoveFEQforFConst(float a, float b) {
+        return (a == b) ? 0.1f : -0.1f;
+    }
+
+    private float cmoveFNEQforFConst(float a, float b) {
+        return (a != b) ? 0.1f : -0.1f;
+    }
+
+    private double cmoveDGTforDConst(double a, double b) {
+        return (a > b) ? 0.1 : -0.1;
+    }
+
+    private double cmoveDGEforDConst(double a, double b) {
+        return (a >= b) ? 0.1 : -0.1;
+    }
+
+    private double cmoveDLTforDConst(double a, double b) {
+        return (a < b) ? 0.1 : -0.1;
+    }
+
+    private double cmoveDLEforDConst(double a, double b) {
+        return (a <= b) ? 0.1 : -0.1;
+    }
+
+    private double cmoveDEQforDConst(double a, double b) {
+        return (a == b) ? 0.1 : -0.1;
+    }
+
+    private double cmoveDNEQforDConst(double a, double b) {
+        return (a != b) ? 0.1 : -0.1;
+    }
+
+    // Extension: Compare 2 ILFD values, and pick from 2 ILFD values
+    private int cmoveIGTforI(int a, int b, int c, int d) {
+        return (a > b) ? c : d;
+    }
+
+    private long cmoveIGTforL(int a, int b, long c, long d) {
+        return (a > b) ? c : d;
+    }
+
+    private float cmoveIGTforF(int a, int b, float c, float d) {
+        return (a > b) ? c : d;
+    }
+
+    private double cmoveIGTforD(int a, int b, double c, double d) {
+        return (a > b) ? c : d;
+    }
+
+    private int cmoveLGTforI(long a, long b, int c, int d) {
+        return (a > b) ? c : d;
+    }
+
+    private long cmoveLGTforL(long a, long b, long c, long d) {
+        return (a > b) ? c : d;
+    }
+
+    private float cmoveLGTforF(long a, long b, float c, float d) {
+        return (a > b) ? c : d;
+    }
+
+    private double cmoveLGTforD(long a, long b, double c, double d) {
+        return (a > b) ? c : d;
+    }
+
+    private int cmoveFGTforI(float a, float b, int c, int d) {
+        return (a > b) ? c : d;
+    }
+
+    private long cmoveFGTforL(float a, float b, long c, long d) {
+        return (a > b) ? c : d;
+    }
+
+    private float cmoveFGTforF(float a, float b, float c, float d) {
+        return (a > b) ? c : d;
+    }
+
+    private double cmoveFGTforD(float a, float b, double c, double d) {
+        return (a > b) ? c : d;
+    }
+
+    private int cmoveDGTforI(double a, double b, int c, int d) {
+        return (a > b) ? c : d;
+    }
+
+    private long cmoveDGTforL(double a, double b, long c, long d) {
+        return (a > b) ? c : d;
+    }
+
+    private float cmoveDGTforF(double a, double b, float c, float d) {
+        return (a > b) ? c : d;
+    }
+
+    private double cmoveDGTforD(double a, double b, double c, double d) {
+        return (a > b) ? c : d;
+    }
+
+    // Compare 2 values, and pick one of them
    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VF, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVFGT(float[] a, float[] b, float[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (a[i] > b[i]) ? a[i] : b[i];
@@ -103,7 +214,8 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VF, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVFGTSwap(float[] a, float[] b, float[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (b[i] > a[i]) ? a[i] : b[i];
@@ -111,7 +223,8 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VF, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVFLT(float[] a, float[] b, float[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (a[i] < b[i]) ? a[i] : b[i];
@@ -119,7 +232,8 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VF, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVFLTSwap(float[] a, float[] b, float[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (b[i] < a[i]) ? a[i] : b[i];
@@ -127,7 +241,8 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VF, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVFEQ(float[] a, float[] b, float[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (a[i] == b[i]) ? a[i] : b[i];
@@ -135,7 +250,8 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VD, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVDLE(double[] a, double[] b, double[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (a[i] <= b[i]) ? a[i] : b[i];
@@ -143,7 +259,8 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VD, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVDLESwap(double[] a, double[] b, double[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (b[i] <= a[i]) ? a[i] : b[i];
@@ -151,7 +268,8 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VD, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVDGE(double[] a, double[] b, double[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (a[i] >= b[i]) ? a[i] : b[i];
@@ -159,7 +277,8 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VD, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVDGESwap(double[] a, double[] b, double[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (b[i] >= a[i]) ? a[i] : b[i];
@@ -167,31 +286,339 @@ public class TestVectorConditionalMove {
    }

    @Test
-    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.CMOVE_VD, ">0", IRNode.STORE_VECTOR, ">0"})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
    private static void testCMoveVDNE(double[] a, double[] b, double[] c) {
        for (int i = 0; i < a.length; i++) {
            c[i] = (a[i] != b[i]) ? a[i] : b[i];
        }
    }

+    // Extensions: compare 2 values, and pick from 2 consts
    @Test
-    @IR(failOn = {IRNode.CMOVE_VD})
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveFGTforFConst(float[] a, float[] b, float[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] > b[i]) ? 0.1f : -0.1f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveFGEforFConst(float[] a, float[] b, float[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] >= b[i]) ? 0.1f : -0.1f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveFLTforFConst(float[] a, float[] b, float[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] < b[i]) ? 0.1f : -0.1f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveFLEforFConst(float[] a, float[] b, float[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] <= b[i]) ? 0.1f : -0.1f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveFEQforFConst(float[] a, float[] b, float[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] == b[i]) ? 0.1f : -0.1f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveFNEQforFConst(float[] a, float[] b, float[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] != b[i]) ? 0.1f : -0.1f;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveDGTforDConst(double[] a, double[] b, double[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] > b[i]) ? 0.1 : -0.1;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveDGEforDConst(double[] a, double[] b, double[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] >= b[i]) ? 0.1 : -0.1;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveDLTforDConst(double[] a, double[] b, double[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] < b[i]) ? 0.1 : -0.1;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveDLEforDConst(double[] a, double[] b, double[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] <= b[i]) ? 0.1 : -0.1;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveDEQforDConst(double[] a, double[] b, double[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] == b[i]) ? 0.1 : -0.1;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveDNEQforDConst(double[] a, double[] b, double[] c) {
+        for (int i = 0; i < a.length; i++) {
+            c[i] = (a[i] != b[i]) ? 0.1 : -0.1;
+        }
+    }
+
+    // Extension: Compare 2 ILFD values, and pick from 2 ILFD values
+    // Note:
+    //   To guarantee that a CMove is introduced, the loads must be performed before the branch. To ensure they
+    //   do not float down into the branches, a value is computed from them and stored to r2 (same as r, except
+    //   that the compiler does not know that).
+    //   So far, vectorization only works for CMoveF/D, with same data-width comparison (F/I for F, D/L for D).
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveIGTforI(int[] a, int[] b, int[] c, int[] d, int[] r, int[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            int cc = c[i];
+            int dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveIGTforL(int[] a, int[] b, long[] c, long[] d, long[] r, long[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            long cc = c[i];
+            long dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveIGTforF(int[] a, int[] b, float[] c, float[] d, float[] r, float[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            float cc = c[i];
+            float dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveIGTforD(int[] a, int[] b, double[] c, double[] d, double[] r, double[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            double cc = c[i];
+            double dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveLGTforI(long[] a, long[] b, int[] c, int[] d, int[] r, int[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            int cc = c[i];
+            int dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveLGTforL(long[] a, long[] b, long[] c, long[] d, long[] r, long[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            long cc = c[i];
+            long dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveLGTforF(long[] a, long[] b, float[] c, float[] d, float[] r, float[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            float cc = c[i];
+            float dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"})
+    // Requires avx2: otherwise long vectors are restricted to 16 bytes (2 elements) while double vectors have 32 bytes (4 elements), so the vector element counts mismatch.
+    private static void testCMoveLGTforD(long[] a, long[] b, double[] c, double[] d, double[] r, double[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            double cc = c[i];
+            double dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveFGTforI(float[] a, float[] b, int[] c, int[] d, int[] r, int[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            int cc = c[i];
+            int dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveFGTforL(float[] a, float[] b, long[] c, long[] d, long[] r, long[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            long cc = c[i];
+            long dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveFGTforF(float[] a, float[] b, float[] c, float[] d, float[] r, float[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            float cc = c[i];
+            float dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveFGTforD(float[] a, float[] b, double[] c, double[] d, double[] r, double[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            double cc = c[i];
+            double dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveDGTforI(double[] a, double[] b, int[] c, int[] d, int[] r, int[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            int cc = c[i];
+            int dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveDGTforL(double[] a, double[] b, long[] c, long[] d, long[] r, long[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            long cc = c[i];
+            long dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
+    private static void testCMoveDGTforF(double[] a, double[] b, float[] c, float[] d, float[] r, float[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            float cc = c[i];
+            float dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR, ">0", IRNode.VECTOR_MASK_CMP, ">0", IRNode.VECTOR_BLEND, ">0", IRNode.STORE_VECTOR, ">0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"})
+    private static void testCMoveDGTforD(double[] a, double[] b, double[] c, double[] d, double[] r, double[] r2) {
+        for (int i = 0; i < a.length; i++) {
+            double cc = c[i];
+            double dd = d[i];
+            r2[i] = cc + dd;
+            r[i] = (a[i] > b[i]) ? cc : dd;
+        }
+    }
+
+    @Test
+    @IR(failOn = {IRNode.VECTOR_MASK_CMP, IRNode.VECTOR_BLEND})
    private static void testCMoveVDUnsupported() {
+        double[] doublec = new double[SIZE];
        int seed = 1001;
        for (int i = 0; i < doublec.length; i++) {
            doublec[i] = (i % 2 == 0) ? seed + i : seed - i;
        }
    }

+    @Warmup(0)
    @Run(test = {"testCMoveVFGT", "testCMoveVFLT","testCMoveVDLE", "testCMoveVDGE", "testCMoveVFEQ", "testCMoveVDNE",
-                 "testCMoveVFGTSwap", "testCMoveVFLTSwap","testCMoveVDLESwap", "testCMoveVDGESwap"})
+                 "testCMoveVFGTSwap", "testCMoveVFLTSwap","testCMoveVDLESwap", "testCMoveVDGESwap",
+                 "testCMoveFGTforFConst", "testCMoveFGEforFConst", "testCMoveFLTforFConst",
+                 "testCMoveFLEforFConst", "testCMoveFEQforFConst", "testCMoveFNEQforFConst",
+                 "testCMoveDGTforDConst", "testCMoveDGEforDConst", "testCMoveDLTforDConst",
+                 "testCMoveDLEforDConst", "testCMoveDEQforDConst", "testCMoveDNEQforDConst"})
    private void testCMove_runner() {
-        for (int i = 0; i < SIZE; i++) {
-            floata[i] = RANDOM.nextFloat();
-            floatb[i] = RANDOM.nextFloat();
-            doublea[i] = RANDOM.nextDouble();
-            doubleb[i] = RANDOM.nextDouble();
-        }
+        float[] floata = new float[SIZE];
+        float[] floatb = new float[SIZE];
+        float[] floatc = new float[SIZE];
+        double[] doublea = new double[SIZE];
+        double[] doubleb = new double[SIZE];
+        double[] doublec = new double[SIZE];
+
+        init(floata);
+        init(floatb);
+        init(doublea);
+        init(doubleb);

        testCMoveVFGT(floata, floatb, floatc);
        testCMoveVDLE(doublea, doubleb, doublec);
@@ -207,6 +634,7 @@ public class TestVectorConditionalMove {
            Asserts.assertEquals(doublec[i], cmoveDoubleGE(doublea[i], doubleb[i]));
        }

+        // Overwrite every third b element with the matching a element, so the comparisons frequently see equal operands
        for (int i = 0; i < SIZE; i++) {
            if (i % 3 == 0) {
                floatb[i] = floata[i];
@@ -234,5 +662,215 @@ public class TestVectorConditionalMove {
            Asserts.assertEquals(floatc[i], cmoveFloatLTSwap(floata[i], floatb[i]));
            Asserts.assertEquals(doublec[i], cmoveDoubleGESwap(doublea[i], doubleb[i]));
        }
+
+        // Extensions: compare 2 values, and pick from 2 consts
+        testCMoveFGTforFConst(floata, floatb, floatc);
+        testCMoveDGTforDConst(doublea, doubleb, doublec);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(floatc[i], cmoveFGTforFConst(floata[i], floatb[i]));
+            Asserts.assertEquals(doublec[i], cmoveDGTforDConst(doublea[i], doubleb[i]));
+        }
+
+        testCMoveFGEforFConst(floata, floatb, floatc);
+        testCMoveDGEforDConst(doublea, doubleb, doublec);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(floatc[i], cmoveFGEforFConst(floata[i], floatb[i]));
+            Asserts.assertEquals(doublec[i], cmoveDGEforDConst(doublea[i], doubleb[i]));
+        }
+
+        testCMoveFLTforFConst(floata, floatb, floatc);
+        testCMoveDLTforDConst(doublea, doubleb, doublec);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(floatc[i], cmoveFLTforFConst(floata[i], floatb[i]));
+            Asserts.assertEquals(doublec[i], cmoveDLTforDConst(doublea[i], doubleb[i]));
+        }
+
+        testCMoveFLEforFConst(floata, floatb, floatc);
+        testCMoveDLEforDConst(doublea, doubleb, doublec);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(floatc[i], cmoveFLEforFConst(floata[i], floatb[i]));
+            Asserts.assertEquals(doublec[i], cmoveDLEforDConst(doublea[i], doubleb[i]));
+        }
+
+        testCMoveFEQforFConst(floata, floatb, floatc);
+        testCMoveDEQforDConst(doublea, doubleb, doublec);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(floatc[i], cmoveFEQforFConst(floata[i], floatb[i]));
+            Asserts.assertEquals(doublec[i], cmoveDEQforDConst(doublea[i], doubleb[i]));
+        }
+
+        testCMoveFNEQforFConst(floata, floatb, floatc);
+        testCMoveDNEQforDConst(doublea, doubleb, doublec);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(floatc[i], cmoveFNEQforFConst(floata[i], floatb[i]));
+            Asserts.assertEquals(doublec[i], cmoveDNEQforDConst(doublea[i], doubleb[i]));
+        }
+    }
+
+    @Warmup(0) // NOTE(review): presumably requests zero warm-up runs of this runner - confirm against the IR framework docs
+    @Run(test = {"testCMoveIGTforI",
+                 "testCMoveIGTforL",
+                 "testCMoveIGTforF",
+                 "testCMoveIGTforD",
+                 "testCMoveLGTforI",
+                 "testCMoveLGTforL",
+                 "testCMoveLGTforF",
+                 "testCMoveLGTforD",
+                 "testCMoveFGTforI",
+                 "testCMoveFGTforL",
+                 "testCMoveFGTforF",
+                 "testCMoveFGTforD",
+                 "testCMoveDGTforI",
+                 "testCMoveDGTforL",
+                 "testCMoveDGTforF",
+                 "testCMoveDGTforD"})
+    private void testCMove_runner_two() { // drives the 16 tests above: a/b element type {I,L,F,D} crossed with c/d/r element type {I,L,F,D}
+        int[] aI = new int[SIZE]; // naming: a*/b*/c*/d* are randomized inputs, r* are the arrays checked by the asserts
+        int[] bI = new int[SIZE];
+        int[] cI = new int[SIZE];
+        int[] dI = new int[SIZE];
+        int[] rI = new int[SIZE];
+        long[] aL = new long[SIZE];
+        long[] bL = new long[SIZE];
+        long[] cL = new long[SIZE];
+        long[] dL = new long[SIZE];
+        long[] rL = new long[SIZE];
+        float[] aF = new float[SIZE];
+        float[] bF = new float[SIZE];
+        float[] cF = new float[SIZE];
+        float[] dF = new float[SIZE];
+        float[] rF = new float[SIZE];
+        double[] aD = new double[SIZE];
+        double[] bD = new double[SIZE];
+        double[] cD = new double[SIZE];
+        double[] dD = new double[SIZE];
+        double[] rD = new double[SIZE];
+
+        init(aI); // only the a/b/c/d arrays are randomized; the r* arrays are still zero-filled at this point
+        init(bI);
+        init(cI);
+        init(dI);
+        init(aL);
+        init(bL);
+        init(cL);
+        init(dL);
+        init(aF);
+        init(bF);
+        init(cF);
+        init(dF);
+        init(aD);
+        init(bD);
+        init(cD);
+        init(dD);
+
+        testCMoveIGTforI(aI, bI, cI, dI, rI, rI); // the same r* array is passed for both trailing parameters in every call below; NOTE(review): the callee's use of the second one is not visible here
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rI[i], cmoveIGTforI(aI[i], bI[i], cI[i], dI[i])); // each element is checked against the per-element cmove* helper of the same name
+        }
+
+        testCMoveIGTforL(aI, bI, cL, dL, rL, rL);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rL[i], cmoveIGTforL(aI[i], bI[i], cL[i], dL[i]));
+        }
+
+        testCMoveIGTforF(aI, bI, cF, dF, rF, rF);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rF[i], cmoveIGTforF(aI[i], bI[i], cF[i], dF[i]));
+        }
+
+        testCMoveIGTforD(aI, bI, cD, dD, rD, rD);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rD[i], cmoveIGTforD(aI[i], bI[i], cD[i], dD[i]));
+        }
+
+        testCMoveLGTforI(aL, bL, cI, dI, rI, rI); // rI is reused from the I-compare group above
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rI[i], cmoveLGTforI(aL[i], bL[i], cI[i], dI[i]));
+        }
+
+        testCMoveLGTforL(aL, bL, cL, dL, rL, rL);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rL[i], cmoveLGTforL(aL[i], bL[i], cL[i], dL[i]));
+        }
+
+        testCMoveLGTforF(aL, bL, cF, dF, rF, rF);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rF[i], cmoveLGTforF(aL[i], bL[i], cF[i], dF[i]));
+        }
+
+        testCMoveLGTforD(aL, bL, cD, dD, rD, rD);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rD[i], cmoveLGTforD(aL[i], bL[i], cD[i], dD[i]));
+        }
+
+        testCMoveFGTforI(aF, bF, cI, dI, rI, rI); // float/double a and b inputs may contain NaN (planted by init)
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rI[i], cmoveFGTforI(aF[i], bF[i], cI[i], dI[i]));
+        }
+
+        testCMoveFGTforL(aF, bF, cL, dL, rL, rL);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rL[i], cmoveFGTforL(aF[i], bF[i], cL[i], dL[i]));
+        }
+
+        testCMoveFGTforF(aF, bF, cF, dF, rF, rF);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rF[i], cmoveFGTforF(aF[i], bF[i], cF[i], dF[i]));
+        }
+
+        testCMoveFGTforD(aF, bF, cD, dD, rD, rD);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rD[i], cmoveFGTforD(aF[i], bF[i], cD[i], dD[i]));
+        }
+
+        testCMoveDGTforI(aD, bD, cI, dI, rI, rI);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rI[i], cmoveDGTforI(aD[i], bD[i], cI[i], dI[i]));
+        }
+
+        testCMoveDGTforL(aD, bD, cL, dL, rL, rL);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rL[i], cmoveDGTforL(aD[i], bD[i], cL[i], dL[i]));
+        }
+
+        testCMoveDGTforF(aD, bD, cF, dF, rF, rF);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rF[i], cmoveDGTforF(aD[i], bD[i], cF[i], dF[i]));
+        }
+
+        testCMoveDGTforD(aD, bD, cD, dD, rD, rD);
+        for (int i = 0; i < SIZE; i++) {
+            Asserts.assertEquals(rD[i], cmoveDGTforD(aD[i], bD[i], cD[i], dD[i]));
+        }
+    }
+
+    private static void init(int[] a) { // overwrites a[0..SIZE-1] with values from RANDOM.nextInt()
+        for (int i = 0; i < SIZE; i++) { // bound is SIZE, not a.length, so a must hold at least SIZE elements
+            a[i] = RANDOM.nextInt();
+        }
+    }
+
+    private static void init(long[] a) { // overwrites a[0..SIZE-1] with values from RANDOM.nextLong()
+        for (int i = 0; i < SIZE; i++) { // bound is SIZE, not a.length, so a must hold at least SIZE elements
+            a[i] = RANDOM.nextLong();
+        }
+    }
+
+    private static void init(float[] a) { // overwrites a[0..SIZE-1] with RANDOM.nextFloat() values, replacing some of them with NaN
+        for (int i = 0; i < SIZE; i++) { // bound is SIZE, not a.length, so a must hold at least SIZE elements
+            a[i] = RANDOM.nextFloat();
+            if (RANDOM.nextInt() % 20 == 0) { // a second, independent draw decides the NaN; presumably about 1 in 20 if nextInt() is uniform
+                a[i] = Float.NaN;
+            }
+        }
+    }
+
+    private static void init(double[] a) { // overwrites a[0..SIZE-1] with RANDOM.nextDouble() values, replacing some of them with NaN
+        for (int i = 0; i < SIZE; i++) { // bound is SIZE, not a.length, so a must hold at least SIZE elements
+            a[i] = RANDOM.nextDouble();
+            if (RANDOM.nextInt() % 20 == 0) { // a second, independent draw decides the NaN; presumably about 1 in 20 if nextInt() is uniform
+                a[i] = Double.NaN;
+            }
+        }
    }
 }
--- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
+++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
@@ -304,16 +304,6 @@ public class IRNode {
        beforeMatchingNameRegex(CMOVE_I, "CMoveI");
    }

-    public static final String CMOVE_VD = PREFIX + "CMOVE_VD" + POSTFIX;
-    static {
-        superWordNodes(CMOVE_VD, "CMoveVD");
-    }
-
-    public static final String CMOVE_VF = PREFIX + "CMOVE_VF" + POSTFIX;
-    static {
-        superWordNodes(CMOVE_VF, "CMoveVF");
-    }
-
    public static final String CMP_I = PREFIX + "CMP_I" + POSTFIX;
    static {
        beforeMatchingNameRegex(CMP_I, "CmpI");
@@ -1278,6 +1268,11 @@ public class IRNode {
        beforeMatchingNameRegex(VECTOR_BLEND, "VectorBlend");
    }

+    public static final String VECTOR_MASK_CMP = PREFIX + "VECTOR_MASK_CMP" + POSTFIX; // placeholder string built the same way as the neighbouring VECTOR_* constants
+    static {
+        beforeMatchingNameRegex(VECTOR_MASK_CMP, "VectorMaskCmp"); // ties the placeholder to the node name "VectorMaskCmp"; NOTE(review): helper semantics are not shown in this hunk
+    }
+
    public static final String VECTOR_CAST_B2X = PREFIX + "VECTOR_CAST_B2X" + POSTFIX;
    static {
        beforeMatchingNameRegex(VECTOR_CAST_B2X, "VectorCastB2X");