Commit 71df28cb authored by Ng Zhi An, committed by V8 LUCI CQ

[x64] Optimize F64x2PromoteLowF32x4 with S128Load64Zero

When the input to F64x2PromoteLowF32x4 is a S128Load64Zero, we can skip
the separate load + promote and instead promote directly with a memory
operand. The tricky bit is that on systems that rely on OOB trap
handling, the load cannot be eliminated, so the S128Load64Zero node is
normally visited even though, after instruction-selector pattern
matching, it is unused. We mark it as defined — which skips the visit —
only if we actually matched it.

Bug: v8:12189
Change-Id: I0a805a3fce65c56ec52082b3625e1712ea1ee7cf
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3154347
Reviewed-by: Georg Neis <neis@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76917}
parent 9ab05302
......@@ -2706,7 +2706,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64F64x2PromoteLowF32x4: {
__ Cvtps2pd(i.OutputSimd128Register(), i.InputSimd128Register(0));
if (HasAddressingMode(instr)) {
EmitOOLTrapIfNeeded(zone(), this, opcode, instr, __ pc_offset());
__ Cvtps2pd(i.OutputSimd128Register(), i.MemoryOperand());
} else {
__ Cvtps2pd(i.OutputSimd128Register(), i.InputSimd128Register(0));
}
break;
}
case kX64F32x4DemoteF64x2Zero: {
......
......@@ -16,6 +16,7 @@
#include "src/compiler/machine-operator.h"
#include "src/compiler/node-matchers.h"
#include "src/compiler/node-properties.h"
#include "src/compiler/opcodes.h"
#include "src/roots/roots-inl.h"
#if V8_ENABLE_WEBASSEMBLY
......@@ -3040,7 +3041,6 @@ VISIT_ATOMIC_BINOP(Xor)
#define SIMD_UNOP_LIST(V) \
V(F64x2Sqrt) \
V(F64x2ConvertLowI32x4S) \
V(F64x2PromoteLowF32x4) \
V(F32x4SConvertI32x4) \
V(F32x4Abs) \
V(F32x4Neg) \
......@@ -3842,6 +3842,26 @@ void InstructionSelector::VisitI64x2Abs(Node* node) {
}
}
// Selects the instruction for F64x2PromoteLowF32x4. When the node's input is
// a S128Load64Zero that only this node consumes, the load is folded into the
// promote by emitting kX64F64x2PromoteLowF32x4 with a memory operand;
// otherwise a plain register-register form is emitted.
//
// Fix: the local `X64OperandGenerator g(this);` was never used (operands are
// produced inside VisitLoad/VisitRR), so it has been removed.
void InstructionSelector::VisitF64x2PromoteLowF32x4(Node* node) {
  InstructionCode code = kX64F64x2PromoteLowF32x4;
  Node* input = node->InputAt(0);
  LoadTransformMatcher m(input);

  if (m.Is(LoadTransformation::kS128Load64Zero) && CanCover(node, input)) {
    if (m.ResolvedValue().kind == MemoryAccessKind::kProtected) {
      // Keep the protected-access marker so the folded load still gets an
      // OOB trap (see the EmitOOLTrapIfNeeded path in the code generator).
      code |= AccessModeField::encode(kMemoryAccessProtected);
    }
    // LoadTransforms cannot be eliminated, so they are visited even if
    // unused. Mark it as defined so that we don't visit it.
    MarkAsDefined(input);
    VisitLoad(node, input, code);
    return;
  }

  VisitRR(this, node, code);
}
void InstructionSelector::AddOutputToSelectContinuation(OperandGenerator* g,
int first_input_index,
Node* node) {
......
......@@ -119,6 +119,10 @@ V8_EXPORT_PRIVATE std::ostream& operator<<(std::ostream&,
V8_EXPORT_PRIVATE LoadTransformParameters const& LoadTransformParametersOf(
Operator const*) V8_WARN_UNUSED_RESULT;
V8_EXPORT_PRIVATE bool operator==(LoadTransformParameters,
LoadTransformParameters);
bool operator!=(LoadTransformParameters, LoadTransformParameters);
struct LoadLaneParameters {
MemoryAccessKind kind;
LoadRepresentation rep;
......
......@@ -16,6 +16,7 @@
#include "src/compiler/common-operator.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node.h"
#include "src/compiler/opcodes.h"
#include "src/compiler/operator.h"
#include "src/objects/heap-object.h"
......@@ -816,6 +817,14 @@ struct V8_EXPORT_PRIVATE DiamondMatcher
Node* if_false_;
};
// Matcher for IrOpcode::kLoadTransform nodes; resolves to the node's
// LoadTransformParameters.
struct LoadTransformMatcher
    : ValueMatcher<LoadTransformParameters, IrOpcode::kLoadTransform> {
  explicit LoadTransformMatcher(Node* node) : ValueMatcher(node) {}

  // Returns true when the matched node performs load-transformation |t|.
  bool Is(LoadTransformation t) {
    if (!HasResolvedValue()) return false;
    return ResolvedValue().transformation == t;
  }
};
} // namespace compiler
} // namespace internal
} // namespace v8
......
......@@ -883,6 +883,51 @@ WASM_SIMD_TEST(F64x2PromoteLowF32x4) {
}
}
// Test F64x2PromoteLowF32x4 with S128Load64Zero optimization (only on some
// architectures). These 2 opcodes should be fused into a single instruction
// with memory operands, which is tested in instruction-selector tests. This
// test checks that we get correct results.
WASM_SIMD_TEST(F64x2PromoteLowF32x4WithS128Load64Zero) {
{
// In-bounds case: load 64 bits from memory, then promote the two f32 lanes.
WasmRunner<int32_t> r(execution_tier);
// The global holds the f64x2 result so lanes can be checked after the call.
double* g = r.builder().AddGlobal<double>(kWasmS128);
float* memory =
r.builder().AddMemoryElems<float>(kWasmPageSize / sizeof(float));
r.builder().RandomizeMemory();
r.builder().WriteMemory(&memory[0], 1.0f);
r.builder().WriteMemory(&memory[1], 3.0f);
r.builder().WriteMemory(&memory[2], 5.0f);
r.builder().WriteMemory(&memory[3], 8.0f);
// Load at 4 (index) + 4 (offset) bytes, which is 2 floats.
// The 64-bit load thus covers memory[2] and memory[3] (5.0f, 8.0f).
BUILD(r,
WASM_GLOBAL_SET(
0, WASM_SIMD_UNOP(kExprF64x2PromoteLowF32x4,
WASM_SIMD_LOAD_OP_OFFSET(kExprS128Load64Zero,
WASM_I32V(4), 4))),
WASM_ONE);
r.Call();
// The two loaded floats, promoted to f64, are exactly representable, so
// exact equality checks are safe here.
CHECK_EQ(5.0f, LANE(g, 0));
CHECK_EQ(8.0f, LANE(g, 1));
}
{
// OOB tests.
// Loading 8 bytes starting at kWasmPageSize is fully out of bounds; the
// fused load+promote must still trap (the load is not eliminated).
WasmRunner<int32_t> r(execution_tier);
r.builder().AddGlobal<double>(kWasmS128);
r.builder().AddMemoryElems<float>(kWasmPageSize / sizeof(float));
BUILD(r,
WASM_GLOBAL_SET(
0, WASM_SIMD_UNOP(kExprF64x2PromoteLowF32x4,
WASM_SIMD_LOAD_OP(kExprS128Load64Zero,
WASM_I32V(kWasmPageSize)))),
WASM_ONE);
CHECK_TRAP(r.Call());
}
}
// Runs the shared f64x2 binop test driver for f64x2.add; `Add` is presumably
// the scalar reference operation used to compute expected lane values —
// matches the other RunF64x2BinOpTest callers.
WASM_SIMD_TEST(F64x2Add) {
RunF64x2BinOpTest(execution_tier, kExprF64x2Add, Add);
}
......
......@@ -5,6 +5,7 @@
#include <limits>
#include "src/common/globals.h"
#include "src/compiler/machine-operator.h"
#include "src/compiler/node-matchers.h"
#include "src/objects/objects-inl.h"
#include "test/unittests/compiler/backend/instruction-selector-unittest.h"
......@@ -2269,6 +2270,22 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDSwizzleConstantTest,
::testing::ValuesIn(kSwizzleConstants));
// Verifies that a S128Load64Zero feeding F64x2PromoteLowF32x4 is selected as
// a single kX64F64x2PromoteLowF32x4 instruction with a memory operand, rather
// than a separate load followed by a register-register promote.
TEST_F(InstructionSelectorTest, F64x2PromoteLowF32x4WithS128Load64Zero) {
StreamBuilder m(this, MachineType::Simd128(), MachineType::Int32());
// Protected (trap-handler) S128Load64Zero of [Parameter(0) + 2].
Node* const load =
m.AddNode(m.machine()->LoadTransform(MemoryAccessKind::kProtected,
LoadTransformation::kS128Load64Zero),
m.Int32Constant(2), m.Parameter(0));
Node* const promote = m.AddNode(m.machine()->F64x2PromoteLowF32x4(), load);
m.Return(promote);
Stream s = m.Build();
// Exactly one instruction: the load was fused into the promote.
ASSERT_EQ(1U, s.size());
ASSERT_EQ(kX64F64x2PromoteLowF32x4, s[0]->arch_opcode());
// Memory-operand form: register base plus immediate displacement.
ASSERT_EQ(kMode_MRI, s[0]->addressing_mode());
// Two inputs (the memory operand's components) and one SIMD output.
EXPECT_EQ(2U, s[0]->InputCount());
EXPECT_EQ(1U, s[0]->OutputCount());
}
} // namespace compiler
} // namespace internal
} // namespace v8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment