diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index a7a448930a78..c781cc4651f5 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -895,7 +895,7 @@ class PhaseIdealLoop : public PhaseTransform { public: // Set/get control node out. Set lower bit to distinguish from IdealLoopTree // Returns true if "n" is a data node, false if it's a control node. - bool has_ctrl( Node *n ) const { return ((intptr_t)_nodes[n->_idx]) & 1; } + bool has_ctrl(const Node* n) const { return ((intptr_t)_nodes[n->_idx]) & 1; } private: // clear out dead code after build_loop_late @@ -972,7 +972,7 @@ public: PhaseIterGVN &igvn() const { return _igvn; } - bool has_node( Node* n ) const { + bool has_node(const Node* n) const { guarantee(n != nullptr, "No Node."); return _nodes[n->_idx] != nullptr; } @@ -1003,8 +1003,7 @@ public: // location of all Nodes in the subsumed block, we lazily do it. As we // pull such a subsumed block out of the array, we write back the final // correct block. - Node *get_ctrl( Node *i ) { - + Node* get_ctrl(const Node* i) { assert(has_node(i), ""); Node *n = get_ctrl_no_update(i); _nodes.map( i->_idx, (Node*)((intptr_t)n + 1) ); @@ -1024,12 +1023,12 @@ public: } } - Node *get_ctrl_no_update_helper(Node *i) const { + Node* get_ctrl_no_update_helper(const Node* i) const { assert(has_ctrl(i), "should be control, not loop"); return (Node*)(((intptr_t)_nodes[i->_idx]) & ~1); } - Node *get_ctrl_no_update(Node *i) const { + Node* get_ctrl_no_update(const Node* i) const { assert( has_ctrl(i), "" ); Node *n = get_ctrl_no_update_helper(i); if (!n->in(0)) { diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 271c3c4dd4b0..b503810b4eff 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -531,6 +531,8 @@ bool SuperWord::SLP_extract() { DEBUG_ONLY(verify_packs();) + remove_cycles(); + schedule(); // Record eventual count of vector packs for checks in post loop vectorization @@ -2336,6 +2338,230 @@ void SuperWord::verify_packs() { } #endif +// The PacksetGraph combines the DepPreds graph with the packset. In the PackSet +// graph, we have two kinds of nodes: +// (1) pack-node: Represents all nodes of some pack p in a single node, which +// shall later become a vector node. +// (2) scalar-node: Represents a node that is not in any pack. +// For any edge (n1, n2) in DepPreds, we add an edge to the PacksetGraph for the +// PacksetGraph nodes corresponding to n1 and n2. +// We work from the DepPreds graph, because it gives us all the data-dependencies, +// as well as more refined memory-dependencies than the C2 graph. DepPreds does +// not have cycles. But packing nodes can introduce cyclic dependencies. Example: +// +// +--------+ +// A -> X | v +// Pack [A,B] and [X,Y] [A,B] [X,Y] +// Y -> B ^ | +// +--------+ +// +class PacksetGraph { +private: + // pid: packset graph node id. + GrowableArray<int> _pid; // bb_idx(n) -> pid + GrowableArray<GrowableArray<int>> _out; // out-edges + GrowableArray<int> _incnt; // number of (implicit) in-edges + int _max_pid = 0; + + SuperWord* _slp; +public: + PacksetGraph(SuperWord* slp) + : _pid(8, 0, /* default */ 0), _slp(slp) { + } + // Get pid, if there is a packset node that n belongs to. Else return 0.
+ int get_pid_or_zero(const Node* n) const { + if (!_slp->in_bb(n)) { + return 0; + } + int idx = _slp->bb_idx(n); + if (idx >= _pid.length()) { + return 0; + } else { + return _pid.at(idx); + } + } + int get_pid(const Node* n) { + int poz = get_pid_or_zero(n); + assert(poz != 0, "pid should not be zero"); + return poz; + } + void set_pid(const Node* n, int pid) { + assert(n != nullptr && pid > 0, "sane inputs"); + assert(_slp->in_bb(n), "must be"); + int idx = _slp->bb_idx(n); + _pid.at_put_grow(idx, pid); + } + int new_pid() { + _incnt.push(0); + _out.push(GrowableArray<int>()); + return ++_max_pid; + } + int incnt(int pid) { return _incnt.at(pid - 1); } + void incnt_set(int pid, int cnt) { return _incnt.at_put(pid - 1, cnt); } + GrowableArray<int>& out(int pid) { return _out.at(pid - 1); } + + // Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds. + void build() { + const GrowableArray<Node_List*> &packset = _slp->packset(); + const GrowableArray<Node*> &block = _slp->block(); + const DepGraph &dg = _slp->dg(); + // Map nodes in packsets + for (int i = 0; i < packset.length(); i++) { + Node_List* p = packset.at(i); + int pid = new_pid(); + for (uint k = 0; k < p->size(); k++) { + Node* n = p->at(k); + set_pid(n, pid); + } + } + + int max_pid_packset = _max_pid; + + // Map nodes not in packset + for (int i = 0; i < block.length(); i++) { + Node* n = block.at(i); + if (n->is_Phi() || n->is_CFG()) { + continue; // ignore control flow + } + int pid = get_pid_or_zero(n); + if (pid == 0) { + pid = new_pid(); + set_pid(n, pid); + } + } + + // Map edges for packset nodes + VectorSet set; + for (int i = 0; i < packset.length(); i++) { + Node_List* p = packset.at(i); + set.clear(); + int pid = get_pid(p->at(0)); + for (uint k = 0; k < p->size(); k++) { + Node* n = p->at(k); + assert(pid == get_pid(n), "all nodes in pack have same pid"); + for (DepPreds preds(n, dg); !preds.done(); preds.next()) { + Node* pred = preds.current(); + int pred_pid = get_pid_or_zero(pred); + if (pred_pid == pid && n->is_reduction()) { + continue; // reduction -> self-cycle is not a cyclic dependency + } + // Only add edges once, and only for mapped nodes (in block) + if (pred_pid > 0 && !set.test_set(pred_pid)) { + incnt_set(pid, incnt(pid) + 1); // increment + out(pred_pid).push(pid); + } + } + } + } + + // Map edges for nodes not in packset + for (int i = 0; i < block.length(); i++) { + Node* n = block.at(i); + int pid = get_pid_or_zero(n); // zero for Phi or CFG + if (pid <= max_pid_packset) { + continue; // Only scalar-nodes + } + for (DepPreds preds(n, dg); !preds.done(); preds.next()) { + Node* pred = preds.current(); + int pred_pid = get_pid_or_zero(pred); + // Only add edges for mapped nodes (in block) + if (pred_pid > 0) { + incnt_set(pid, incnt(pid) + 1); // increment + out(pred_pid).push(pid); + } + } + } + } + // Schedule the graph to worklist. Returns true iff all nodes were scheduled. + // This implies that we return true iff the PacksetGraph is acyclic. + // We schedule with topological sort: schedule any node that has zero incnt. + // Then remove that node, which decrements the incnt of all its uses (outputs).
+ bool schedule() { + GrowableArray<int> worklist; + // Directly schedule all nodes without precedence + for (int pid = 1; pid <= _max_pid; pid++) { + if (incnt(pid) == 0) { + worklist.push(pid); + } + } + // Continue scheduling via topological sort + for (int i = 0; i < worklist.length(); i++) { + int pid = worklist.at(i); + for (int j = 0; j < out(pid).length(); j++) { + int pid_use = out(pid).at(j); + int incnt_use = incnt(pid_use) - 1; + incnt_set(pid_use, incnt_use); + // Did use lose its last input? + if (incnt_use == 0) { + worklist.push(pid_use); + } + } + } + // Was every pid scheduled? + return worklist.length() == _max_pid; + } + // Print the PacksetGraph. + // print_nodes = true: print all C2 nodes belonging to PacksetGraph node. + // print_zero_incnt = false: do not print nodes that have no in-edges (any more). + void print(bool print_nodes, bool print_zero_incnt) { + const GrowableArray<Node*> &block = _slp->block(); + tty->print_cr("PacksetGraph"); + for (int pid = 1; pid <= _max_pid; pid++) { + if (incnt(pid) == 0 && !print_zero_incnt) { + continue; + } + tty->print("Node %d. incnt %d [", pid, incnt(pid)); + for (int j = 0; j < out(pid).length(); j++) { + tty->print("%d ", out(pid).at(j)); + } + tty->print_cr("]"); +#ifndef PRODUCT + if (print_nodes) { + for (int i = 0; i < block.length(); i++) { + Node* n = block.at(i); + if (get_pid_or_zero(n) == pid) { + tty->print(" "); + n->dump(); + } + } + } +#endif + } + } +}; + +//------------------------------remove_cycles--------------------------- +// We now know that we only have independent packs, see verify_packs. +// This is a necessary but not a sufficient condition for an acyclic +// graph (DAG) after scheduling. Thus, we must check if the packs have +// introduced a cycle. The SuperWord paper mentions the need for this +// in "3.7 Scheduling". +// Approach: given all nodes from the _block, we create a new graph. +// The nodes that are not in a pack are their own nodes (scalar-node) +// in that new graph. Every pack is also a node (pack-node). We then +// add the edges according to DepPreds: a scalar-node has all edges +// to its node's DepPreds. A pack-node has all edges from every pack +// member to all their DepPreds. +void SuperWord::remove_cycles() { + if (_packset.length() == 0) { + return; // empty packset + } + ResourceMark rm; + + PacksetGraph graph(this); + + graph.build(); + + if (!graph.schedule()) { + if (TraceSuperWord) { + tty->print_cr("remove_cycles found cycle in PacksetGraph:"); + graph.print(true, false); + tty->print_cr("removing all packs from packset."); + } + _packset.clear(); + } +} + //------------------------------schedule--------------------------- // Adjust the memory graph for the packed operations void SuperWord::schedule() { @@ -4920,7 +5146,7 @@ void DepEdge::print() { // Iterator over predecessor edges in the dependence graph.
//------------------------------DepPreds--------------------------- -DepPreds::DepPreds(Node* n, DepGraph& dg) { +DepPreds::DepPreds(Node* n, const DepGraph& dg) { _n = n; _done = false; if (_n->is_Store() || _n->is_Load()) { diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index f811d08aba9c..f07971a5330d 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -131,7 +131,7 @@ class DepGraph { DepMem* tail() { return _tail; } // Return dependence node corresponding to an ideal node - DepMem* dep(Node* node) { return _map.at(node->_idx); } + DepMem* dep(Node* node) const { return _map.at(node->_idx); } // Make a new dependence graph node for an ideal node. DepMem* make_node(Node* node); @@ -161,7 +161,7 @@ private: bool _done; public: - DepPreds(Node* n, DepGraph& dg); + DepPreds(Node* n, const DepGraph& dg); Node* current() { return _current; } bool done() { return _done; } void next(); @@ -349,6 +349,10 @@ class SuperWord : public ResourceObj { #endif bool do_vector_loop() { return _do_vector_loop; } bool do_reserve_copy() { return _do_reserve_copy; } + + const GrowableArray<Node_List*>& packset() const { return _packset; } + const GrowableArray<Node*>& block() const { return _block; } + const DepGraph& dg() const { return _dg; } private: IdealLoopTree* _lpt; // Current loop tree node CountedLoopNode* _lp; // Current CountedLoopNode @@ -412,12 +416,14 @@ class SuperWord : public ResourceObj { MemNode* align_to_ref() { return _align_to_ref; } void set_align_to_ref(MemNode* m) { _align_to_ref = m; } - Node* ctrl(Node* n) const { return _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n; } + const Node* ctrl(const Node* n) const { return _phase->has_ctrl(n) ? _phase->get_ctrl(n) : n; } // block accessors - bool in_bb(Node* n) { return n != nullptr && n->outcnt() > 0 && ctrl(n) == _bb; } - int bb_idx(Node* n) { assert(in_bb(n), "must be"); return _bb_idx.at(n->_idx); } - void set_bb_idx(Node* n, int i) { _bb_idx.at_put_grow(n->_idx, i); } + public: + bool in_bb(const Node* n) const { return n != nullptr && n->outcnt() > 0 && ctrl(n) == _bb; } + int bb_idx(const Node* n) const { assert(in_bb(n), "must be"); return _bb_idx.at(n->_idx); } + private: + void set_bb_idx(Node* n, int i) { _bb_idx.at_put_grow(n->_idx, i); } // visited set accessors void visited_clear() { _visited.clear(); } @@ -554,6 +560,8 @@ class SuperWord : public ResourceObj { void merge_packs_to_cmove(); // Verify that for every pack, all nodes are mutually independent DEBUG_ONLY(void verify_packs();) + // Remove cycles in packset.
+ void remove_cycles(); // Adjust the memory graph for the packed operations void schedule(); // Remove "current" from its current position in the memory graph and insert diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 6eb5ae996b15..bdbdf0067791 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -724,6 +724,16 @@ public class IRNode { beforeMatchingNameRegex(MUL_VI, "MulVI"); } + public static final String MUL_VF = PREFIX + "MUL_VF" + POSTFIX; + static { + beforeMatchingNameRegex(MUL_VF, "MulVF"); + } + + public static final String MUL_VD = PREFIX + "MUL_VD" + POSTFIX; + static { + beforeMatchingNameRegex(MUL_VD, "MulVD"); + } + public static final String MUL_REDUCTION_VD = PREFIX + "MUL_REDUCTION_VD" + POSTFIX; static { superWordNodes(MUL_REDUCTION_VD, "MulReductionVD"); diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java new file mode 100644 index 000000000000..870c4be3931e --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency.java @@ -0,0 +1,482 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/* + * @test + * @bug 8304042 + * @summary Test some examples with independent packs with cyclic dependency + * between the packs. 
+ * @requires vm.compiler2.enabled + * @requires vm.bits == 64 + * @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*" + * @modules java.base/jdk.internal.misc + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency + */ + +package compiler.loopopts.superword; + +import jdk.internal.misc.Unsafe; +import jdk.test.lib.Asserts; +import compiler.lib.ir_framework.*; + +public class TestIndependentPacksWithCyclicDependency { + static final int RANGE = 1024; + static final int ITER = 10_000; + static Unsafe unsafe = Unsafe.getUnsafe(); + + int[] goldI0 = new int[RANGE]; + float[] goldF0 = new float[RANGE]; + int[] goldI1 = new int[RANGE]; + float[] goldF1 = new float[RANGE]; + int[] goldI2 = new int[RANGE]; + float[] goldF2 = new float[RANGE]; + int[] goldI3 = new int[RANGE]; + float[] goldF3 = new float[RANGE]; + int[] goldI4 = new int[RANGE]; + float[] goldF4 = new float[RANGE]; + int[] goldI5 = new int[RANGE]; + float[] goldF5 = new float[RANGE]; + int[] goldI6 = new int[RANGE]; + float[] goldF6 = new float[RANGE]; + long[] goldL6 = new long[RANGE]; + int[] goldI7 = new int[RANGE]; + float[] goldF7 = new float[RANGE]; + long[] goldL7 = new long[RANGE]; + int[] goldI8 = new int[RANGE]; + float[] goldF8 = new float[RANGE]; + long[] goldL8 = new long[RANGE]; + int[] goldI9 = new int[RANGE]; + float[] goldF9 = new float[RANGE]; + long[] goldL9 = new long[RANGE]; + int[] goldI10 = new int[RANGE]; + float[] goldF10 = new float[RANGE]; + long[] goldL10 = new long[RANGE]; + + public static void main(String args[]) { + TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency::test*", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency::verify", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency::init", + "-XX:LoopUnrollLimit=1000"); + } + + TestIndependentPacksWithCyclicDependency() { + // compute the gold standard in interpreter mode + init(goldI0, goldF0); + test0(goldI0, goldI0, goldF0, goldF0); + init(goldI1, goldF1); + test1(goldI1, goldI1, goldF1, goldF1); + init(goldI2, goldF2); + test2(goldI2, goldI2, goldF2, goldF2); + init(goldI3, goldF3); + test3(goldI3, goldI3, goldF3, goldF3); + init(goldI4, goldF4); + test4(goldI4, goldI4, goldF4, goldF4); +// init(goldI5, goldF5); +// test5(goldI5, goldI5, goldF5, goldF5); + init(goldI6, goldF6, goldL6); + test6(goldI6, goldI6, goldF6, goldF6, goldL6, goldL6); + init(goldI7, goldF7, goldL7); + test7(goldI7, goldI7, goldF7, goldF7, goldL7, goldL7); + init(goldI8, goldF8, goldL8); + test8(goldI8, goldI8, goldF8, goldF8, goldL8, goldL8); + init(goldI9, goldF9, goldL9); + test9(goldI9, goldI9, goldF9, goldF9, goldL9, goldL9); + init(goldI10, goldF10, goldL10); + test10(goldI10, goldI10, goldF10, goldF10, goldL10, goldL10); + } + + @Run(test = "test0") + @Warmup(100) + public void runTest0() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test0(dataI, dataI, dataF, dataF); + verify("test0", dataI, goldI0); + verify("test0", dataF, goldF0); + } + + @Test + @IR(counts = {IRNode.ADD_VI, "> 0", IRNode.MUL_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void test0(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < 
RANGE; i+=2) { + // Hand-unrolled 2x. Int and Float slice are completely separate. + dataIb[i+0] = dataIa[i+0] + 3; + dataIb[i+1] = dataIa[i+1] + 3; + dataFb[i+0] = dataFa[i+0] * 1.3f; + dataFb[i+1] = dataFa[i+1] * 1.3f; + } + } + + @Run(test = "test1") + @Warmup(100) + public void runTest1() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test1(dataI, dataI, dataF, dataF); + verify("test1", dataI, goldI1); + verify("test1", dataF, goldF1); + } + + @Test + @IR(counts = {IRNode.ADD_VI, "> 0", IRNode.MUL_VF, "> 0", IRNode.VECTOR_CAST_F2X, "> 0", IRNode.VECTOR_CAST_I2X, "> 0"}, + applyIfCPUFeatureOr = {"avx2", "true", "asimd", "true"}) + static void test1(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // Hand-unrolled 2x. Convert to and from. StoreF -> LoadF dependency. + dataFa[i+0] = dataIa[i+0] + 3; + dataFa[i+1] = dataIa[i+1] + 3; + dataIb[i+0] = (int)(dataFb[i+0] * 1.3f); + dataIb[i+1] = (int)(dataFb[i+1] * 1.3f); + } + } + + @Run(test = "test2") + public void runTest2() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test2(dataI, dataI, dataF, dataF); + verify("test2", dataI, goldI2); + verify("test2", dataF, goldF2); + } + + @Test + @IR(counts = {IRNode.ADD_VI, "> 0", IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void test2(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // int and float arrays are two slices. But we pretend both are of type int. + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); + dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); + dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); + } + } + + @Run(test = "test3") + @Warmup(100) + public void runTest3() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test3(dataI, dataI, dataF, dataF); + verify("test3", dataI, goldI3); + verify("test3", dataF, goldF3); + } + + @Test + @IR(counts = {IRNode.ADD_VI, "> 0", IRNode.MUL_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void test3(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // Inversion of orders. But because we operate on separate slices, this should + // safely vectorize. It should detect that each line is independent, so it can + // reorder them. + dataIb[i+0] = dataIa[i+0] + 3; + dataFb[i+1] = dataFa[i+1] * 1.3f; + dataFb[i+0] = dataFa[i+0] * 1.3f; + dataIb[i+1] = dataIa[i+1] + 3; + } + } + + @Run(test = "test4") + @Warmup(100) + public void runTest4() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + init(dataI, dataF); + test4(dataI, dataI, dataF, dataF); + verify("test4", dataI, goldI4); + verify("test4", dataF, goldF4); + } + + @Test + static void test4(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { + for (int i = 0; i < RANGE; i+=2) { + // same as test1, except that reordering leads to different semantics + // [A,B] and [X,Y] are both packs that are internally independent + // But we have dependencies A -> X (StoreF -> LoadF) + // and Y -> B (StoreI -> LoadI) + // Hence the two packs have a cyclic dependency, we cannot schedule + // one before the other.
+ dataFa[i+0] = dataIa[i+0] + 3; // A + dataIb[i+0] = (int)(dataFb[i+0] * 1.3f); // X + dataIb[i+1] = (int)(dataFb[i+1] * 1.3f); // Y + dataFa[i+1] = dataIa[i+1] + 3; // B + } + } + +// TODO uncomment after fixing JDK-8304720 +// +// @Run(test = "test5") +// public void runTest5() { +// int[] dataI = new int[RANGE]; +// float[] dataF = new float[RANGE]; +// init(dataI, dataF); +// test5(dataI, dataI, dataF, dataF); +// verify("test5", dataI, goldI5); +// verify("test5", dataF, goldF5); +// } +// +// @Test +// static void test5(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb) { +// for (int i = 0; i < RANGE; i+=2) { +// // same as test2, except that reordering leads to different semantics +// // explanation analogue to test4 +// unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, dataIa[i+0] + 1); // A +// dataIb[i+0] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0); // X +// dataIb[i+1] = 11 * unsafe.getInt(dataFb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4); // Y +// unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, dataIa[i+1] + 1); // B +// } +// } + + @Run(test = "test6") + public void runTest6() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + long[] dataL = new long[RANGE]; + init(dataI, dataF, dataL); + test6(dataI, dataI, dataF, dataF, dataL, dataL); + verify("test6", dataI, goldI6); + verify("test6", dataF, goldF6); + verify("test6", dataL, goldL6); + } + + @Test + @IR(counts = {IRNode.ADD_VI, "> 0", IRNode.MUL_VI, "> 0", IRNode.ADD_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static void test6(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb, + long[] dataLa, long[] dataLb) { + for (int i = 0; i < RANGE; i+=2) { + // Chain of parallelizable op and conversion + int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; + int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00); + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01); + int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45; + int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45; + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10); + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11); + float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f; + float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f; + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20); + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21); + } + } + + @Run(test = "test7") + public void runTest7() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + long[] dataL = new long[RANGE]; + init(dataI, dataF, dataL); + test7(dataI, dataI, dataF, dataF, dataL, dataL); + verify("test7", dataI, goldI7); + verify("test7", dataF, goldF7); + verify("test7", dataL, goldL7); + } + + @Test + static void test7(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb, + long[] dataLa, long[] dataLb) { + for (int i = 0; i < RANGE; i+=2) { + // Cycle involving 3 memory slices + int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00); + int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 
45; + int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45; + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10); + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11); + float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f; + float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f; + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20); + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21); + int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; // moved down + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01); + } + } + + + @Run(test = "test8") + public void runTest8() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + long[] dataL = new long[RANGE]; + init(dataI, dataF, dataL); + test8(dataI, dataI, dataF, dataF, dataL, dataL); + verify("test8", dataI, goldI8); + verify("test8", dataF, goldF8); + verify("test8", dataL, goldL8); + } + + @Test + static void test8(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb, + long[] dataLa, long[] dataLb) { + for (int i = 0; i < RANGE; i+=2) { + // 2-cycle, with more ops after + int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00); + int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45; + int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45; + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10); + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11); + int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01); + // more stuff after + float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f; + float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f; + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20); + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21); + } + } + + @Run(test = "test9") + public void runTest9() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + long[] dataL = new long[RANGE]; + init(dataI, dataF, dataL); + test9(dataI, dataI, dataF, dataF, dataL, dataL); + verify("test9", dataI, goldI9); + verify("test9", dataF, goldF9); + verify("test9", dataL, goldL9); + } + + @Test + static void test9(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb, + long[] dataLa, long[] dataLb) { + for (int i = 0; i < RANGE; i+=2) { + // 2-cycle, with more stuff before + float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f; + float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f; + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20); + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21); + // 2-cycle + int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00); + int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45; + int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) * 45; + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10); + 
unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11); + int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01); + } + } + + @Run(test = "test10") + public void runTest10() { + int[] dataI = new int[RANGE]; + float[] dataF = new float[RANGE]; + long[] dataL = new long[RANGE]; + init(dataI, dataF, dataL); + test10(dataI, dataI, dataF, dataF, dataL, dataL); + verify("test10", dataI, goldI10); + verify("test10", dataF, goldF10); + verify("test10", dataL, goldL10); + } + + @Test + static void test10(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb, + long[] dataLa, long[] dataLb) { + for (int i = 0; i < RANGE; i+=2) { + // This creates the following graph before SuperWord: + // + // A -> R -> U + // S -> V -> B + // + // SuperWord analyzes the graph, and sees that [A,B] and [U,V] + // are adjacent, isomorphic and independent packs. However, + // [R,S] are not isomorphic (R mul, S add). + // So it vectorizes [A,B] and [U,V] this gives us this graph: + // + // -> R + // [A,B] -> [U,V] -+ + // ^ -> S | + // | | + // +------------------+ + // + // The cycle thus does not only go via packs, but also scalar ops. + // + int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; // A + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00); + int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45; // R: constant mismatch + int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) + 43; // S + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10); + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11); + float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f; // U + float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f; // V + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20); + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21); + int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; // B: moved down + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01); + } + } + + static void init(int[] dataI, float[] dataF) { + for (int i = 0; i < RANGE; i++) { + dataI[i] = i + 1; + dataF[i] = i + 0.1f; + } + } + + static void init(int[] dataI, float[] dataF, long[] dataL) { + for (int i = 0; i < RANGE; i++) { + dataI[i] = i + 1; + dataF[i] = i + 0.1f; + dataL[i] = i + 1; + } + } + + static void verify(String name, int[] data, int[] gold) { + for (int i = 0; i < RANGE; i++) { + if (data[i] != gold[i]) { + throw new RuntimeException(" Invalid " + name + " result: dataI[" + i + "]: " + data[i] + " != " + gold[i]); + } + } + } + + static void verify(String name, float[] data, float[] gold) { + for (int i = 0; i < RANGE; i++) { + int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i); + int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i); + if (datav != goldv) { + throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv); + } + } + } + + static void verify(String name, long[] data, long[] gold) { + for (int i = 0; i < RANGE; i++) { + if (data[i] != gold[i]) { + throw new RuntimeException(" Invalid " + name + " result: dataL[" + i + "]: " + data[i] + " != " + gold[i]); + } + } + } +} + diff --git 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java new file mode 100644 index 000000000000..328719daf582 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestIndependentPacksWithCyclicDependency2.java @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +/* + * @test + * @bug 8304042 + * @summary Test some examples with independent packs with cyclic dependency + * between the packs. + * Before fix, this hit: "assert(!is_visited) failed: visit only once" + * @requires vm.compiler2.enabled + * @requires vm.bits == 64 + * @requires vm.cpu.features ~= ".*avx2.*" | vm.cpu.features ~= ".*asimd.*" + * @modules java.base/jdk.internal.misc + * @library /test/lib / + * @run main/othervm -XX:LoopUnrollLimit=250 + * -XX:CompileCommand=compileonly,compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency2::test + * compiler.loopopts.superword.TestIndependentPacksWithCyclicDependency2 + */ + +package compiler.loopopts.superword; + +import jdk.test.lib.Asserts; +import jdk.internal.misc.Unsafe; + +public class TestIndependentPacksWithCyclicDependency2 { + static final int RANGE = 1024; + static final int ITER = 10_000; + + static Unsafe unsafe = Unsafe.getUnsafe(); + + static void init(int[] dataI, float[] dataF, long[] dataL) { + for (int i = 0; i < RANGE; i++) { + dataI[i] = i + 1; + dataF[i] = i + 0.1f; + dataL[i] = (long)(i + 1); + } + } + + static void test(int[] dataIa, int[] dataIb, float[] dataFa, float[] dataFb, + long[] dataLa, long[] dataLb) { + for (int i = 0; i < RANGE; i+=2) { + // For explanation, see test 10 in TestIndependentPacksWithCyclicDependency.java + int v00 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0) + 3; + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0, v00); + int v10 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 0) * 45; + int v11 = unsafe.getInt(dataFb, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4) + 43; + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0, v10); + unsafe.putInt(dataLa, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4, v11); + float v20 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 0) + 0.55f; + float v21 = unsafe.getFloat(dataLb, unsafe.ARRAY_LONG_BASE_OFFSET + 4 * i + 4) + 0.55f; + unsafe.putFloat(dataIb, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 0, v20); + unsafe.putFloat(dataIb, 
unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4, v21); + int v01 = unsafe.getInt(dataIa, unsafe.ARRAY_INT_BASE_OFFSET + 4 * i + 4) + 3; // moved down + unsafe.putInt(dataFa, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i + 4, v01); + } + } + + static void verify(String name, int[] data, int[] gold) { + for (int i = 0; i < RANGE; i++) { + if (data[i] != gold[i]) { + throw new RuntimeException(" Invalid " + name + " result: data[" + i + "]: " + data[i] + " != " + gold[i]); + } + } + } + + static void verify(String name, float[] data, float[] gold) { + for (int i = 0; i < RANGE; i++) { + int datav = unsafe.getInt(data, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i); + int goldv = unsafe.getInt(gold, unsafe.ARRAY_FLOAT_BASE_OFFSET + 4 * i); + if (datav != goldv) { + throw new RuntimeException(" Invalid " + name + " result: dataF[" + i + "]: " + datav + " != " + goldv); + } + } + } + + static void verify(String name, long[] data, long[] gold) { + for (int i = 0; i < RANGE; i++) { + if (data[i] != gold[i]) { + throw new RuntimeException(" Invalid " + name + " result: data[" + i + "]: " + data[i] + " != " + gold[i]); + } + } + } + + public static void main(String[] args) { + int[] dataI = new int[RANGE]; + int[] goldI = new int[RANGE]; + float[] dataF = new float[RANGE]; + float[] goldF = new float[RANGE]; + long[] dataL = new long[RANGE]; + long[] goldL = new long[RANGE]; + init(goldI, goldF, goldL); + test(goldI, goldI, goldF, goldF, goldL, goldL); + for (int i = 0; i < ITER; i++) { + init(dataI, dataF, dataL); + test(dataI, dataI, dataF, dataF, dataL, dataL); + } + verify("test", dataI, goldI); + verify("test", dataF, goldF); + verify("test", dataL, goldL); + } +} +
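The cycle check added in superword.cpp above (PacksetGraph::build() followed by PacksetGraph::schedule()) is a topological sort over pack-nodes and scalar-nodes: count the in-edges of every node, repeatedly schedule nodes whose in-count has dropped to zero, and report a cycle iff some node is never scheduled. A minimal standalone sketch of the same idea, assuming plain C++ with std::vector in place of HotSpot's GrowableArray and a hypothetical helper name is_acyclic (illustration only, not part of this patch):

#include <cstddef>
#include <vector>

// Kahn's topological sort: the graph is acyclic iff every node can be
// scheduled once its in-edge count reaches zero.
bool is_acyclic(const std::vector<std::vector<int>>& out) {
  const int n = static_cast<int>(out.size());
  std::vector<int> incnt(n, 0);            // in-edge count per node
  for (int i = 0; i < n; i++) {
    for (int use : out[i]) {
      incnt[use]++;
    }
  }
  std::vector<int> worklist;
  for (int i = 0; i < n; i++) {
    if (incnt[i] == 0) {                   // no predecessors: schedule directly
      worklist.push_back(i);
    }
  }
  for (std::size_t k = 0; k < worklist.size(); k++) {
    for (int use : out[worklist[k]]) {
      if (--incnt[use] == 0) {             // use lost its last input
        worklist.push_back(use);
      }
    }
  }
  return worklist.size() == out.size();    // all scheduled <=> no cycle
}

In the patch itself, a failed schedule() makes SuperWord::remove_cycles() clear the packset, so the loop is left scalar rather than vectorized with an impossible schedule.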