Commit b435c60b authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Merge SSE/AVX S128 And/Or/Xor

Combine the SSE and AVX versions and delegate to the macro-assembler
functions, which check for AVX support.

Change Pand, Por, Pxor to generate the *ps version of the instruction
when AVX is not supported. The *ps versions are 1 byte shorter, and have
no performance difference on SSE-only processors.

Bug: v8:11589
Bug: v8:11217
Change-Id: I9d51054359dcc909efcbb2c3d3bb63d399cd6721
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3124101
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76733}
parent 64758c63
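To illustrate the dispatch described in the message above, here is a minimal, self-contained sketch of the idea: a macro-assembler-style Pand that picks the three-operand AVX form when AVX is available and otherwise falls back to the shorter *ps SSE form. FakeAssembler, XMMRegister, and the printf "emission" are illustrative assumptions, not V8's real TurboAssembler/AvxHelper API; V8's actual helper (added in the first hunk below) also handles operand aliasing and memory operands.

// Minimal sketch of the merged SSE/AVX dispatch (names are stand-ins).
#include <cstdio>

struct XMMRegister { int code; };

class FakeAssembler {
 public:
  explicit FakeAssembler(bool has_avx) : has_avx_(has_avx) {}

  // One entry point instead of separate SSE and AVX code paths.
  void Pand(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
    if (has_avx_) {
      // Non-destructive three-operand AVX form: dst = src1 & src2.
      std::printf("vpand xmm%d, xmm%d, xmm%d\n", dst.code, src1.code, src2.code);
    } else {
      // Destructive two-operand SSE form: make dst == src1 first, then use
      // andps, which computes the same 128-bit AND as pand but encodes one
      // byte shorter and costs nothing extra on SSE-only processors.
      if (dst.code != src1.code)
        std::printf("movaps xmm%d, xmm%d\n", dst.code, src1.code);
      std::printf("andps xmm%d, xmm%d\n", dst.code, src2.code);
    }
  }

 private:
  bool has_avx_;
};

int main() {
  XMMRegister xmm0{0}, xmm1{1}, xmm2{2};
  FakeAssembler avx(true), sse(false);
  avx.Pand(xmm0, xmm1, xmm2);  // vpand xmm0, xmm1, xmm2
  sse.Pand(xmm0, xmm1, xmm2);  // movaps xmm0, xmm1 ; andps xmm0, xmm2
  return 0;
}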
@@ -158,6 +158,25 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
args...); \
}
+ // Define a macro which uses |avx_name| when AVX is supported, and |sse_name|
+ // when AVX is not supported. This is useful for bit-wise instructions like
+ // andpd/andps, where the behavior is exactly the same, but the *ps
+ // version is 1 byte shorter, and on SSE-only processors there is no
+ // performance difference since those processors don't differentiate integer
+ // and floating-point domains.
+ // Note: we require |avx_name| to be the AVX instruction without the "v"
+ // prefix. If we required the full AVX instruction name and the caller
+ // accidentally passed in an SSE instruction, it would compile without any
+ // issues and generate the SSE instruction. By appending "v" here, we ensure
+ // that we will generate an AVX instruction.
+ #define AVX_OP_WITH_DIFF_SSE_INSTR(macro_name, avx_name, sse_name) \
+   template <typename Dst, typename Arg, typename... Args> \
+   void macro_name(Dst dst, Arg arg, Args... args) { \
+     AvxHelper<Dst, Arg, Args...>{this} \
+         .template emit<&Assembler::v##avx_name, &Assembler::sse_name>( \
+             dst, arg, args...); \
+   }
#define AVX_OP_SSE3(macro_name, name) \
template <typename Dst, typename Arg, typename... Args> \
void macro_name(Dst dst, Arg arg, Args... args) { \
@@ -250,7 +269,6 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Paddusb, paddusb)
AVX_OP(Paddusw, paddusw)
AVX_OP(Paddw, paddw)
- AVX_OP(Pand, pand)
AVX_OP(Pavgb, pavgb)
AVX_OP(Pavgw, pavgw)
AVX_OP(Pcmpgtb, pcmpgtb)
@@ -266,7 +284,6 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Pmovmskb, pmovmskb)
AVX_OP(Pmullw, pmullw)
AVX_OP(Pmuludq, pmuludq)
- AVX_OP(Por, por)
AVX_OP(Pshufd, pshufd)
AVX_OP(Pshufhw, pshufhw)
AVX_OP(Pshuflw, pshuflw)
@@ -294,7 +311,6 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Punpckldq, punpckldq)
AVX_OP(Punpcklqdq, punpcklqdq)
AVX_OP(Punpcklwd, punpcklwd)
- AVX_OP(Pxor, pxor)
AVX_OP(Rcpps, rcpps)
AVX_OP(Rsqrtps, rsqrtps)
AVX_OP(Sqrtpd, sqrtpd)
@@ -309,6 +325,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP(Xorpd, xorpd)
AVX_OP(Xorps, xorps)
+ AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps)
+ AVX_OP_WITH_DIFF_SSE_INSTR(Por, por, orps)
+ AVX_OP_WITH_DIFF_SSE_INSTR(Pxor, pxor, xorps)
AVX_OP_SSE3(Haddps, haddps)
AVX_OP_SSE3(Movddup, movddup)
AVX_OP_SSE3(Movshdup, movshdup)
......
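For readers unfamiliar with the pattern: AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps) above expands to a Pand template member that calls AvxHelper's emit with &Assembler::vpand and &Assembler::andps. The toy program below, using made-up ToyAssembler names rather than V8 code, sketches why the macro pastes the "v" on itself, as the comment in the hunk explains: the AVX slot can only bind to an AVX-named member, so accidentally passing an SSE mnemonic there cannot silently select an SSE encoding.

// Toy sketch of the v## token-pasting safeguard (illustrative names only).
#include <iostream>

struct ToyAssembler {
  void vpand() { std::cout << "emit AVX vpand\n"; }
  void andps() { std::cout << "emit SSE andps\n"; }
};

template <void (ToyAssembler::*kAvx)(), void (ToyAssembler::*kSse)()>
void Emit(ToyAssembler* masm, bool has_avx) {
  (masm->*(has_avx ? kAvx : kSse))();  // pick the AVX member when available
}

// Same shape as AVX_OP_WITH_DIFF_SSE_INSTR: the "v" prefix is appended here,
// so the caller names the instruction without it.
#define TOY_AVX_OP_WITH_DIFF_SSE_INSTR(macro_name, avx_name, sse_name)        \
  void macro_name(ToyAssembler* masm, bool has_avx) {                         \
    Emit<&ToyAssembler::v##avx_name, &ToyAssembler::sse_name>(masm, has_avx); \
  }

TOY_AVX_OP_WITH_DIFF_SSE_INSTR(Pand, pand, andps)

int main() {
  ToyAssembler masm;
  Pand(&masm, true);   // prints: emit AVX vpand
  Pand(&masm, false);  // prints: emit SSE andps
}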
@@ -3131,37 +3131,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
kScratchDoubleReg);
break;
}
- case kSSES128And: {
-   DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-   __ andps(i.OutputSimd128Register(), i.InputOperand(1));
-   break;
- }
- case kAVXS128And: {
-   CpuFeatureScope avx_scope(tasm(), AVX);
-   __ vpand(i.OutputSimd128Register(), i.InputSimd128Register(0),
-            i.InputOperand(1));
-   break;
- }
- case kSSES128Or: {
-   DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-   __ orps(i.OutputSimd128Register(), i.InputOperand(1));
-   break;
- }
- case kAVXS128Or: {
-   CpuFeatureScope avx_scope(tasm(), AVX);
-   __ vpor(i.OutputSimd128Register(), i.InputSimd128Register(0),
-           i.InputOperand(1));
-   break;
- }
- case kSSES128Xor: {
-   DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-   __ xorps(i.OutputSimd128Register(), i.InputOperand(1));
-   break;
- }
- case kAVXS128Xor: {
-   CpuFeatureScope avx_scope(tasm(), AVX);
-   __ vpxor(i.OutputSimd128Register(), i.InputSimd128Register(0),
-            i.InputOperand(1));
-   break;
- }
+ case kIA32S128And: {
+   __ Pand(i.OutputSimd128Register(), i.InputSimd128Register(0),
+           i.InputOperand(1));
+   break;
+ }
+ case kIA32S128Or: {
+   __ Por(i.OutputSimd128Register(), i.InputSimd128Register(0),
+          i.InputOperand(1));
+   break;
+ }
+ case kIA32S128Xor: {
+   __ Pxor(i.OutputSimd128Register(), i.InputSimd128Register(0),
+           i.InputOperand(1));
+   break;
+ }
case kIA32S128Select: {
......
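The merged cases above emit through Pand/Por/Pxor, so both the old andps/orps/xorps SSE path and the old vpand/vpor/vpxor AVX path survive behind one opcode. The "1 byte shorter" claim in the commit message can be checked against the x86 opcode maps: the integer-domain forms carry a mandatory 0x66 prefix that the *ps forms lack. The byte sequences below are the register-register forms with xmm0 as destination and xmm1 as source, listed here purely for illustration.

// Register-register encodings showing the one-byte difference.
#include <cstdint>
#include <cstdio>

const std::uint8_t kPand[]  = {0x66, 0x0F, 0xDB, 0xC1};  // pand  xmm0, xmm1 (4 bytes)
const std::uint8_t kAndps[] = {0x0F, 0x54, 0xC1};        // andps xmm0, xmm1 (3 bytes)
const std::uint8_t kPor[]   = {0x66, 0x0F, 0xEB, 0xC1};  // por   xmm0, xmm1 (4 bytes)
const std::uint8_t kOrps[]  = {0x0F, 0x56, 0xC1};        // orps  xmm0, xmm1 (3 bytes)
const std::uint8_t kPxor[]  = {0x66, 0x0F, 0xEF, 0xC1};  // pxor  xmm0, xmm1 (4 bytes)
const std::uint8_t kXorps[] = {0x0F, 0x57, 0xC1};        // xorps xmm0, xmm1 (3 bytes)

int main() {
  std::printf("pand: %zu bytes, andps: %zu bytes\n", sizeof kPand, sizeof kAndps);
  return 0;
}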
@@ -301,12 +301,9 @@ namespace compiler {
V(IA32S128Zero) \
V(IA32S128AllOnes) \
V(IA32S128Not) \
- V(SSES128And) \
- V(AVXS128And) \
- V(SSES128Or) \
- V(AVXS128Or) \
- V(SSES128Xor) \
- V(AVXS128Xor) \
+ V(IA32S128And) \
+ V(IA32S128Or) \
+ V(IA32S128Xor) \
V(IA32S128Select) \
V(IA32S128AndNot) \
V(IA32I8x16Swizzle) \
......
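The instruction-code list above is an X-macro: the backend declares each opcode once as a V(...) entry, and the list is expanded in several places, for example into an opcode enum and into switch statements like the scheduler's in the next hunk. That is why replacing the six SSE/AVX entries with three IA32 entries touches several files in lockstep. Below is a generic sketch of the pattern with made-up names, not V8's actual TARGET_ARCH_OPCODE_LIST machinery.

// Generic X-macro sketch (MY_OPCODE_LIST and ToFlags are illustrative).
#include <cstdio>

#define MY_OPCODE_LIST(V) \
  V(IA32S128And)          \
  V(IA32S128Or)           \
  V(IA32S128Xor)

// Expansion 1: an enum with one value per entry.
enum Opcode {
#define DECLARE(Name) k##Name,
  MY_OPCODE_LIST(DECLARE)
#undef DECLARE
};

// Expansion 2: a switch that handles every entry, analogous to the
// scheduler's GetTargetInstructionFlags cases in the next hunk.
int ToFlags(Opcode op) {
  switch (op) {
#define CASE(Name) case k##Name:
    MY_OPCODE_LIST(CASE)
#undef CASE
      return 0;  // e.g. "no side effects"
  }
  return -1;
}

int main() {
  std::printf("%d\n", ToFlags(kIA32S128And));  // prints 0
  return 0;
}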
@@ -286,12 +286,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32S128Zero:
case kIA32S128AllOnes:
case kIA32S128Not:
- case kSSES128And:
- case kAVXS128And:
- case kSSES128Or:
- case kAVXS128Or:
- case kSSES128Xor:
- case kAVXS128Xor:
+ case kIA32S128And:
+ case kIA32S128Or:
+ case kIA32S128Xor:
case kIA32S128Select:
case kIA32S128AndNot:
case kIA32I8x16Swizzle:
......
@@ -2260,10 +2260,7 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I8x16Ne) \
V(I8x16GeS) \
V(I8x16GtU) \
- V(I8x16GeU) \
- V(S128And) \
- V(S128Or) \
- V(S128Xor)
+ V(I8x16GeU)
#define SIMD_BINOP_UNIFIED_SSE_AVX_LIST(V) \
V(F32x4Add) \
@@ -2316,7 +2313,10 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I8x16MinU) \
V(I8x16MaxU) \
V(I8x16SConvertI16x8) \
- V(I8x16RoundingAverageU)
+ V(I8x16RoundingAverageU) \
+ V(S128And) \
+ V(S128Or) \
+ V(S128Xor)
// These opcodes require all inputs to be registers because the codegen is
// simpler with all registers.
......
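Moving S128And/Or/Xor from the plain SIMD binop list into SIMD_BINOP_UNIFIED_SSE_AVX_LIST means the instruction selector now emits a single kIA32S128And/Or/Xor opcode instead of choosing between a kSSE* and a kAVX* opcode up front; the SSE-versus-AVX decision is deferred to Pand/Por/Pxor at code-emission time. A rough before/after sketch of that selection step follows; it is simplified and does not reproduce V8's actual visitor helpers.

// Simplified sketch of what the list move changes at instruction-selection
// time (enum values and function names are illustrative).
#include <cstdio>

enum Opcode { kSSES128And, kAVXS128And, kIA32S128And };

// Before: two opcodes, picked at selection time based on CPU features.
Opcode SelectS128AndOld(bool cpu_has_avx) {
  return cpu_has_avx ? kAVXS128And : kSSES128And;
}

// After: one unified opcode; the macro-assembler's Pand decides later.
Opcode SelectS128AndNew() { return kIA32S128And; }

int main() {
  std::printf("%d %d %d\n", SelectS128AndOld(false), SelectS128AndOld(true),
              SelectS128AndNew());
  return 0;
}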