Commit 19be4913 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Optimize codegen when shift is constant

Define a macro in code-generator-x64 to help identify cases where the
shift value is an immediate/constant. In those cases we can emit the
shifts directly without any masking, since the instruction selector
will already have reduced the shift value modulo the lane width. We
also don't need any temporaries in this case.
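
As a reference for the masking argument above (this snippet is not part
of the patch), Wasm SIMD takes the shift count modulo the lane width; a
minimal scalar sketch of a single i32x4.shl lane, with a made-up
function name:

  // Illustrative only: scalar model of one i32x4.shl lane.
  // The shift count is defined modulo the lane width (32 here), so a
  // constant shift that was already reduced during instruction
  // selection needs no extra masking at code-generation time.
  #include <cstdint>

  uint32_t I32ShlLaneModel(uint32_t lane, uint32_t shift) {
    return lane << (shift & 31);  // mask only matters for non-constant shifts
  }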

This only covers x64 codegen; optimizations for other architectures
will come in future patches (and will probably look very similar to
this).

The current test cases pass the shift as an immediate, so we add a new
path that loads the shift value from memory, thereby exercising the
slower non-immediate shift path.

Bug: v8:10115
Change-Id: Iaf13d81595714882a8f5418734e031b8bc654af3
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2026067
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66074}
parent 880b28e4
@@ -601,6 +601,25 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
__ cmovq(zero, dst, tmp1); \
} while (false)
// This macro will directly emit the opcode if the shift is an immediate - the
// shift value will be taken modulo 2^width. Otherwise, it will emit code to
// perform the modulus operation.
#define ASSEMBLE_SIMD_SHIFT(opcode, width) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
DCHECK_EQ(dst, i.InputSimd128Register(0)); \
if (HasImmediateInput(instr, 1)) { \
__ opcode(dst, static_cast<byte>(i.InputInt##width(1))); \
} else { \
XMMRegister tmp = i.TempSimd128Register(0); \
Register shift = i.InputRegister(1); \
constexpr int mask = (1 << width) - 1; \
__ andq(shift, Immediate(mask)); \
__ Movq(tmp, shift); \
__ opcode(dst, tmp); \
} \
} while (false)
void CodeGenerator::AssembleDeconstructFrame() {
unwinding_info_writer_.MarkFrameDeconstructed(__ pc_offset());
__ movq(rsp, rbp);
@@ -2634,12 +2653,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I64x2Shl: {
XMMRegister tmp = i.TempSimd128Register(0);
Register shift = i.InputRegister(1);
// Take shift value modulo 8.
__ andq(shift, Immediate(63));
__ movq(tmp, shift);
__ psllq(i.OutputSimd128Register(), tmp);
// Take shift value modulo 2^6.
ASSEMBLE_SIMD_SHIFT(psllq, 6);
break;
}
case kX64I64x2ShrS: {
@@ -2788,12 +2803,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I64x2ShrU: {
XMMRegister tmp = i.TempSimd128Register(0);
Register shift = i.InputRegister(1);
// Take shift value modulo 64.
__ andq(shift, Immediate(63));
__ movq(tmp, shift);
__ psrlq(i.OutputSimd128Register(), tmp);
// Take shift value modulo 2^6.
ASSEMBLE_SIMD_SHIFT(psrlq, 6);
break;
}
case kX64I64x2MinU: {
@@ -2943,21 +2954,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I32x4Shl: {
XMMRegister tmp = i.TempSimd128Register(0);
Register shift = i.InputRegister(1);
// Take shift value modulo 32.
__ andq(shift, Immediate(31));
__ Movq(tmp, shift);
__ Pslld(i.OutputSimd128Register(), tmp);
// Take shift value modulo 2^5.
ASSEMBLE_SIMD_SHIFT(Pslld, 5);
break;
}
case kX64I32x4ShrS: {
XMMRegister tmp = i.TempSimd128Register(0);
Register shift = i.InputRegister(1);
// Take shift value modulo 32.
__ andq(shift, Immediate(31));
__ Movq(tmp, shift);
__ Psrad(i.OutputSimd128Register(), tmp);
// Take shift value modulo 2^5.
ASSEMBLE_SIMD_SHIFT(Psrad, 5);
break;
}
case kX64I32x4Add: {
@@ -3051,12 +3054,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I32x4ShrU: {
XMMRegister tmp = i.TempSimd128Register(0);
Register shift = i.InputRegister(1);
// Take shift value modulo 32.
__ andq(shift, Immediate(31));
__ Movq(tmp, shift);
__ Psrld(i.OutputSimd128Register(), tmp);
// Take shift value modulo 2^5.
ASSEMBLE_SIMD_SHIFT(Psrld, 5);
break;
}
case kX64I32x4MinU: {
@@ -3151,21 +3150,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8Shl: {
XMMRegister tmp = i.TempSimd128Register(0);
Register shift = i.InputRegister(1);
// Take shift value modulo 16.
__ andq(shift, Immediate(15));
__ movq(tmp, shift);
__ psllw(i.OutputSimd128Register(), tmp);
// Take shift value modulo 2^4.
ASSEMBLE_SIMD_SHIFT(psllw, 4);
break;
}
case kX64I16x8ShrS: {
XMMRegister tmp = i.TempSimd128Register(0);
Register shift = i.InputRegister(1);
// Take shift value modulo 16.
__ andq(shift, Immediate(15));
__ movq(tmp, shift);
__ psraw(i.OutputSimd128Register(), tmp);
// Take shift value modulo 2^4.
ASSEMBLE_SIMD_SHIFT(psraw, 4);
break;
}
case kX64I16x8SConvertI32x4: {
@@ -3244,12 +3235,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I16x8ShrU: {
XMMRegister tmp = i.TempSimd128Register(0);
Register shift = i.InputRegister(1);
// Take shift value modulo 16.
__ andq(shift, Immediate(15));
__ movq(tmp, shift);
__ psrlw(i.OutputSimd128Register(), tmp);
// Take shift value modulo 2^4.
ASSEMBLE_SIMD_SHIFT(psrlw, 4);
break;
}
case kX64I16x8UConvertI32x4: {
@@ -2806,13 +2806,18 @@ SIMD_VISIT_EXTRACT_LANE(I8x16, S)
SIMD_TYPES(VISIT_SIMD_REPLACE_LANE)
#undef VISIT_SIMD_REPLACE_LANE
#define VISIT_SIMD_SHIFT(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
X64OperandGenerator g(this); \
InstructionOperand temps[] = {g.TempSimd128Register()}; \
Emit(kX64##Opcode, g.DefineSameAsFirst(node), \
g.UseUniqueRegister(node->InputAt(0)), \
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps); \
#define VISIT_SIMD_SHIFT(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
X64OperandGenerator g(this); \
if (g.CanBeImmediate(node->InputAt(1))) { \
Emit(kX64##Opcode, g.DefineSameAsFirst(node), \
g.UseRegister(node->InputAt(0)), g.UseImmediate(node->InputAt(1))); \
} else { \
InstructionOperand temps[] = {g.TempSimd128Register()}; \
Emit(kX64##Opcode, g.DefineSameAsFirst(node), \
g.UseUniqueRegister(node->InputAt(0)), \
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps); \
} \
}
SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
#undef VISIT_SIMD_SHIFT
@@ -997,22 +997,28 @@ void RunI64x2ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
// Intentionally shift by 64, should be no-op.
for (int shift = 1; shift <= 64; shift++) {
WasmRunner<int32_t, int64_t> r(execution_tier, lower_simd);
int64_t* g = r.builder().AddGlobal<int64_t>(kWasmS128);
int32_t* memory = r.builder().AddMemoryElems<int32_t>(1);
int64_t* g_imm = r.builder().AddGlobal<int64_t>(kWasmS128);
int64_t* g_mem = r.builder().AddGlobal<int64_t>(kWasmS128);
byte value = 0;
byte shift_index = r.AllocateLocal(kWasmI32);
byte simd1 = r.AllocateLocal(kWasmS128);
BUILD(r,
WASM_SET_LOCAL(simd1, WASM_SIMD_I64x2_SPLAT(WASM_GET_LOCAL(value))),
WASM_SET_LOCAL(shift_index, WASM_I32V(shift)),
WASM_SET_GLOBAL(0, WASM_SIMD_SHIFT_OP(opcode, WASM_GET_LOCAL(simd1),
WASM_GET_LOCAL(shift_index))),
WASM_ONE);
byte simd = r.AllocateLocal(kWasmS128);
// Shift using an immediate, and shift using a value loaded from memory.
BUILD(
r, WASM_SET_LOCAL(simd, WASM_SIMD_I64x2_SPLAT(WASM_GET_LOCAL(value))),
WASM_SET_GLOBAL(0, WASM_SIMD_SHIFT_OP(opcode, WASM_GET_LOCAL(simd),
WASM_I32V(shift))),
WASM_SET_GLOBAL(1, WASM_SIMD_SHIFT_OP(
opcode, WASM_GET_LOCAL(simd),
WASM_LOAD_MEM(MachineType::Int32(), WASM_ZERO))),
WASM_ONE);
r.builder().WriteMemory(&memory[0], shift);
FOR_INT64_INPUTS(x) {
r.Call(x);
int64_t expected = expected_op(x, shift);
for (int i = 0; i < 2; i++) {
CHECK_EQ(expected, ReadLittleEndianValue<int64_t>(&g[i]));
CHECK_EQ(expected, ReadLittleEndianValue<int64_t>(&g_imm[i]));
CHECK_EQ(expected, ReadLittleEndianValue<int64_t>(&g_mem[i]));
}
}
}
@@ -1938,21 +1944,28 @@ void RunI32x4ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
// Intentionally shift by 32, should be no-op.
for (int shift = 1; shift <= 32; shift++) {
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
int32_t* g = r.builder().AddGlobal<int32_t>(kWasmS128);
int32_t* memory = r.builder().AddMemoryElems<int32_t>(1);
int32_t* g_imm = r.builder().AddGlobal<int32_t>(kWasmS128);
int32_t* g_mem = r.builder().AddGlobal<int32_t>(kWasmS128);
byte value = 0;
byte shift_index = r.AllocateLocal(kWasmI32);
byte simd1 = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_SET_LOCAL(shift_index, WASM_I32V(shift)),
WASM_SET_LOCAL(simd1, WASM_SIMD_I32x4_SPLAT(WASM_GET_LOCAL(value))),
WASM_SET_GLOBAL(0, WASM_SIMD_SHIFT_OP(opcode, WASM_GET_LOCAL(simd1),
WASM_GET_LOCAL(shift_index))),
WASM_ONE);
byte simd = r.AllocateLocal(kWasmS128);
// Shift using an immediate, and shift using a value loaded from memory.
BUILD(
r, WASM_SET_LOCAL(simd, WASM_SIMD_I32x4_SPLAT(WASM_GET_LOCAL(value))),
WASM_SET_GLOBAL(0, WASM_SIMD_SHIFT_OP(opcode, WASM_GET_LOCAL(simd),
WASM_I32V(shift))),
WASM_SET_GLOBAL(1, WASM_SIMD_SHIFT_OP(
opcode, WASM_GET_LOCAL(simd),
WASM_LOAD_MEM(MachineType::Int32(), WASM_ZERO))),
WASM_ONE);
r.builder().WriteMemory(&memory[0], shift);
FOR_INT32_INPUTS(x) {
r.Call(x);
int32_t expected = expected_op(x, shift);
for (int i = 0; i < 4; i++) {
CHECK_EQ(expected, ReadLittleEndianValue<int32_t>(&g[i]));
CHECK_EQ(expected, ReadLittleEndianValue<int32_t>(&g_imm[i]));
CHECK_EQ(expected, ReadLittleEndianValue<int32_t>(&g_mem[i]));
}
}
}
@@ -2196,22 +2209,28 @@ void RunI16x8ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
// Intentionally shift by 16, should be no-op.
for (int shift = 1; shift <= 16; shift++) {
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
int16_t* g = r.builder().AddGlobal<int16_t>(kWasmS128);
int32_t* memory = r.builder().AddMemoryElems<int32_t>(1);
int16_t* g_imm = r.builder().AddGlobal<int16_t>(kWasmS128);
int16_t* g_mem = r.builder().AddGlobal<int16_t>(kWasmS128);
byte value = 0;
byte simd1 = r.AllocateLocal(kWasmS128);
byte shift_index = r.AllocateLocal(kWasmI32);
BUILD(r,
WASM_SET_LOCAL(simd1, WASM_SIMD_I16x8_SPLAT(WASM_GET_LOCAL(value))),
WASM_SET_LOCAL(shift_index, WASM_I32V(shift)),
WASM_SET_GLOBAL(0, WASM_SIMD_SHIFT_OP(opcode, WASM_GET_LOCAL(simd1),
WASM_GET_LOCAL(shift_index))),
WASM_ONE);
byte simd = r.AllocateLocal(kWasmS128);
// Shift using an immediate, and shift using a value loaded from memory.
BUILD(
r, WASM_SET_LOCAL(simd, WASM_SIMD_I16x8_SPLAT(WASM_GET_LOCAL(value))),
WASM_SET_GLOBAL(0, WASM_SIMD_SHIFT_OP(opcode, WASM_GET_LOCAL(simd),
WASM_I32V(shift))),
WASM_SET_GLOBAL(1, WASM_SIMD_SHIFT_OP(
opcode, WASM_GET_LOCAL(simd),
WASM_LOAD_MEM(MachineType::Int32(), WASM_ZERO))),
WASM_ONE);
r.builder().WriteMemory(&memory[0], shift);
FOR_INT16_INPUTS(x) {
r.Call(x);
int16_t expected = expected_op(x, shift);
for (int i = 0; i < 8; i++) {
CHECK_EQ(expected, ReadLittleEndianValue<int16_t>(&g[i]));
CHECK_EQ(expected, ReadLittleEndianValue<int16_t>(&g_imm[i]));
CHECK_EQ(expected, ReadLittleEndianValue<int16_t>(&g_mem[i]));
}
}
}