Commit 603ade14, authored by Ng Zhi An and committed by Commit Bot

[wasm-simd] Improve codegen for all_true and any_true

Based on feedback in https://github.com/WebAssembly/simd/issues/189 and
inspired by cranelift's codegen, we reduce instruction count by 1 for
both types of operations - all_true goes from 6 -> 5, any_true from 4 ->
3. The main transformation is to change a sequence of movq + ptest +
cmovq to ptest + setcc. We unfortunately cannot cut down the instruction
counts further, since we need to zero the destination register.

Change-Id: Idc2540dbec755c7a7ff5069955f74e978190161d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2100994
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66710}
parent b53bf811
......@@ -587,18 +587,16 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
__ opcode(i.OutputSimd128Register(), i.InputSimd128Register(1), imm); \
} while (false)
#define ASSEMBLE_SIMD_ALL_TRUE(opcode) \
do { \
CpuFeatureScope sse_scope(tasm(), SSE4_1); \
Register dst = i.OutputRegister(); \
Register tmp1 = i.TempRegister(0); \
XMMRegister tmp2 = i.TempSimd128Register(1); \
__ movq(tmp1, Immediate(1)); \
__ xorq(dst, dst); \
__ Pxor(tmp2, tmp2); \
__ opcode(tmp2, i.InputSimd128Register(0)); \
__ Ptest(tmp2, tmp2); \
__ cmovq(zero, dst, tmp1); \
// Emits the all_true sequence: clears the output register, applies |opcode|
// to a zeroed scratch vector and SIMD input 0, runs Ptest on the scratch, and
// finishes with setcc(equal) into the output register. The output is cleared
// before Ptest so that the flags tested by setcc are left untouched.
// |opcode| is presumably the lane-wise compare-equal for the lane width in
// question (so the scratch ends up marking zero lanes of the input) -- confirm
// at the expansion sites. Requires SSE4_1 and a SIMD temp at index 0.
#define ASSEMBLE_SIMD_ALL_TRUE(opcode)                  \
  do {                                                  \
    CpuFeatureScope sse4_1_scope(tasm(), SSE4_1);       \
    XMMRegister scratch = i.TempSimd128Register(0);     \
    Register dst_reg = i.OutputRegister();              \
    __ xorq(dst_reg, dst_reg);                          \
    __ Pxor(scratch, scratch);                          \
    __ opcode(scratch, i.InputSimd128Register(0));      \
    __ Ptest(scratch, scratch);                         \
    __ setcc(equal, dst_reg);                           \
  } while (false)
// This macro will directly emit the opcode if the shift is an immediate - the
......@@ -3922,11 +3920,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
CpuFeatureScope sse_scope(tasm(), SSE4_1);
Register dst = i.OutputRegister();
XMMRegister src = i.InputSimd128Register(0);
Register tmp = i.TempRegister(0);
__ xorq(tmp, tmp);
__ movq(dst, Immediate(1));
__ ptest(src, src);
__ cmovq(zero, dst, tmp);
__ xorq(dst, dst);
__ Ptest(src, src);
__ setcc(not_equal, dst);
break;
}
// Need to split up all the different lane structures because the
......
......@@ -2860,23 +2860,22 @@ SIMD_BINOP_ONE_TEMP_LIST(VISIT_SIMD_BINOP_ONE_TEMP)
#undef VISIT_SIMD_BINOP_ONE_TEMP
#undef SIMD_BINOP_ONE_TEMP_LIST
#define VISIT_SIMD_ANYTRUE(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
X64OperandGenerator g(this); \
InstructionOperand temps[] = {g.TempRegister()}; \
Emit(kX64##Opcode, g.DefineAsRegister(node), \
g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps); \
// Defines InstructionSelector::Visit<Opcode> for each any_true operation. The
// visitor emits kX64<Opcode> with the node's result defined as a register and
// the node's first input requested through UseUniqueRegister; no temporaries
// are handed to Emit.
#define VISIT_SIMD_ANYTRUE(Opcode)                            \
  void InstructionSelector::Visit##Opcode(Node* node) {       \
    X64OperandGenerator operand_gen(this);                    \
    Node* const vector_input = node->InputAt(0);              \
    Emit(kX64##Opcode, operand_gen.DefineAsRegister(node),    \
         operand_gen.UseUniqueRegister(vector_input));        \
  }
SIMD_ANYTRUE_LIST(VISIT_SIMD_ANYTRUE)
#undef VISIT_SIMD_ANYTRUE
#undef SIMD_ANYTRUE_LIST
#define VISIT_SIMD_ALLTRUE(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
X64OperandGenerator g(this); \
InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()}; \
Emit(kX64##Opcode, g.DefineAsRegister(node), \
g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps); \
// Defines InstructionSelector::Visit<Opcode> for each all_true operation. The
// visitor emits kX64<Opcode> with the node's result defined as a register, the
// node's first input requested through UseUniqueRegister, and a single
// Simd128 temporary register passed to Emit as the only temp.
#define VISIT_SIMD_ALLTRUE(Opcode)                                         \
  void InstructionSelector::Visit##Opcode(Node* node) {                    \
    X64OperandGenerator operand_gen(this);                                 \
    InstructionOperand scratch[] = {operand_gen.TempSimd128Register()};    \
    Node* const vector_input = node->InputAt(0);                           \
    Emit(kX64##Opcode, operand_gen.DefineAsRegister(node),                 \
         operand_gen.UseUniqueRegister(vector_input), arraysize(scratch),  \
         scratch);                                                         \
  }
SIMD_ALLTRUE_LIST(VISIT_SIMD_ALLTRUE)
#undef VISIT_SIMD_ALLTRUE
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment