Commit aee85229 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][x64] Prototype extended pairwise addition

Add new macro-assembler instructions that can handle both AVX and SSE.
In the SSE case it checks that dst == src1. (This is different from what
the AvxHelper does, which passes dst as the first operand to AVX
instructions.)

Sorted SSSE3_INSTRUCTION_LIST by instruction code.

Header additions were made by clangd; we were already using declarations
from those headers via transitive includes, so including them explicitly
gets us closer to IWYU.

Codegen sequences are from https://github.com/WebAssembly/simd/pull/380
and also
https://github.com/WebAssembly/simd/pull/380#issuecomment-707440671.
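
A minimal scalar sketch of what those sequences compute (illustrative
only; the function names and test values below are placeholders, not code
from this change):

#include <cstdint>
#include <cstdio>

// i32x4.extadd_pairwise_i16x8_s: add adjacent signed 16-bit lanes into
// 32-bit lanes. pmaddwd against a vector of ones computes exactly this:
// dst[i] = src[2 * i] * 1 + src[2 * i + 1] * 1.
void ExtAddPairwiseI16x8S(const int16_t src[8], int32_t dst[4]) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = int32_t{src[2 * i]} + int32_t{src[2 * i + 1]};
  }
}

// i16x8.extadd_pairwise_i8x16_u: add adjacent unsigned 8-bit lanes into
// 16-bit lanes. pmaddubsw multiplies unsigned bytes of its first source by
// signed bytes of its second and adds adjacent pairs, so with a splat(1)
// operand it reduces to this pairwise sum (the codegen below picks the
// operand order so the input bytes sit on the side with the right
// signedness).
void ExtAddPairwiseI8x16U(const uint8_t src[16], uint16_t dst[8]) {
  for (int i = 0; i < 8; ++i) {
    dst[i] = static_cast<uint16_t>(src[2 * i] + src[2 * i + 1]);
  }
}

int main() {
  const int16_t a[8] = {1, -2, 3, 4, -5, 6, 7, 8};
  int32_t r[4];
  ExtAddPairwiseI16x8S(a, r);
  printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);  // -1 7 1 15
  return 0;
}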

Bug: v8:11086
Change-Id: I4c04f836e471ed8b00f9ff1a1b2e6348a593d4de
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2578797
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71688}
parent 26f9016f
......@@ -1727,6 +1727,29 @@ void TurboAssembler::RetpolineJump(Register reg) {
ret(0);
}
void TurboAssembler::Pmaddwd(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpmaddwd(dst, src1, src2);
} else {
DCHECK_EQ(dst, src1);
pmaddwd(dst, src2);
}
}
void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpmaddubsw(dst, src1, src2);
} else {
CpuFeatureScope ssse3_scope(this, SSSE3);
DCHECK_EQ(dst, src1);
pmaddubsw(dst, src2);
}
}
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src, byte imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
......@@ -1956,11 +1979,16 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
}
void TurboAssembler::Psrld(XMMRegister dst, byte imm8) {
Psrld(dst, dst, imm8);
}
void TurboAssembler::Psrld(XMMRegister dst, XMMRegister src, byte imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsrld(dst, dst, imm8);
vpsrld(dst, src, imm8);
} else {
DCHECK(!IsEnabled(AVX));
DCHECK_EQ(dst, src);
psrld(dst, imm8);
}
}
......
......@@ -208,7 +208,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Psrlw, psrlw)
AVX_OP(Psrld, psrld)
AVX_OP(Psrlq, psrlq)
AVX_OP(Pmaddwd, pmaddwd)
AVX_OP(Paddb, paddb)
AVX_OP(Paddw, paddw)
AVX_OP(Paddd, paddd)
......@@ -522,6 +521,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Trap() override;
void DebugBreak() override;
// Supports both AVX (dst != src1) and SSE (checks that dst == src1).
void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// Shufps that will mov src into dst if AVX is not supported.
void Shufps(XMMRegister dst, XMMRegister src, byte imm8);
......@@ -546,6 +549,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Pslld(XMMRegister dst, byte imm8);
void Psrld(XMMRegister dst, byte imm8);
// Supports both AVX (dst != src1) and SSE (checks that dst == src1).
void Psrld(XMMRegister dst, XMMRegister src, byte imm8);
void Pblendvb(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister mask);
void Blendvps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
......
......@@ -133,9 +133,10 @@
V(maxsd, F2, 0F, 5F)
#define SSSE3_INSTRUCTION_LIST(V) \
V(phaddd, 66, 0F, 38, 02) \
V(phaddw, 66, 0F, 38, 01) \
V(pshufb, 66, 0F, 38, 00) \
V(phaddw, 66, 0F, 38, 01) \
V(phaddd, 66, 0F, 38, 02) \
V(pmaddubsw, 66, 0F, 38, 04) \
V(psignb, 66, 0F, 38, 08) \
V(psignw, 66, 0F, 38, 09) \
V(psignd, 66, 0F, 38, 0A)
......
......@@ -2808,7 +2808,7 @@ void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
}
#endif // !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
// TODO(v8:11086) Prototype extended pairwise add.
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
UNIMPLEMENTED();
......@@ -2822,7 +2822,7 @@ void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
UNIMPLEMENTED();
}
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 && \
!V8_TARGET_ARCH_ARM
......
......@@ -9,6 +9,7 @@
#include "src/codegen/macro-assembler.h"
#include "src/codegen/optimized-compilation-info.h"
#include "src/codegen/x64/assembler-x64.h"
#include "src/codegen/x64/register-x64.h"
#include "src/compiler/backend/code-generator-impl.h"
#include "src/compiler/backend/code-generator.h"
#include "src/compiler/backend/gap-resolver.h"
......@@ -3085,6 +3086,35 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
ASSEMBLE_SIMD_BINOP(pmaddwd);
break;
}
case kX64I32x4ExtAddPairwiseI16x8S: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// kScratchDoubleReg = |1|1|1|1|1|1|1|1|
__ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
__ Psrlw(kScratchDoubleReg, byte{15});
// pmaddwd multiplies signed words in kScratchDoubleReg and src, producing
// signed doublewords, then adds pairwise.
// src = |a|b|c|d|e|f|g|h|
// dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
__ Pmaddwd(dst, src, kScratchDoubleReg);
break;
}
case kX64I32x4ExtAddPairwiseI16x8U: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// src = |a|b|c|d|e|f|g|h|
// kScratchDoubleReg = i32x4.splat(0x0000FFFF)
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ Psrld(kScratchDoubleReg, byte{16});
// kScratchDoubleReg = |0|b|0|d|0|f|0|h|
__ Pand(kScratchDoubleReg, src);
// dst = |0|a|0|c|0|e|0|g|
__ Psrld(dst, src, byte{16});
// dst = |a+b|c+d|e+f|g+h|
__ Paddd(dst, kScratchDoubleReg);
break;
}
case kX64S128Const: {
// Emit code for generic constants; the all-zeros and all-ones cases are
// handled separately by the selector.
......@@ -3297,6 +3327,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw);
break;
}
case kX64I16x8ExtAddPairwiseI8x16S: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
DCHECK_NE(dst, src);
// dst = i8x16.splat(1)
__ Move(dst, uint32_t{0x01010101});
__ Pshufd(dst, dst, byte{0});
__ Pmaddubsw(dst, dst, src);
break;
}
case kX64I16x8ExtAddPairwiseI8x16U: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// kScratchDoubleReg = i8x16.splat(1)
__ Move(kScratchDoubleReg, uint32_t{0x01010101});
__ Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
__ Pmaddubsw(dst, src, kScratchDoubleReg);
break;
}
case kX64I8x16Splat: {
XMMRegister dst = i.OutputSimd128Register();
if (HasRegisterInput(instr, 0)) {
......
......@@ -250,6 +250,8 @@ namespace compiler {
V(X64I32x4ExtMulHighI16x8S) \
V(X64I32x4ExtMulLowI16x8U) \
V(X64I32x4ExtMulHighI16x8U) \
V(X64I32x4ExtAddPairwiseI16x8S) \
V(X64I32x4ExtAddPairwiseI16x8U) \
V(X64I16x8Splat) \
V(X64I16x8ExtractLaneS) \
V(X64I16x8SConvertI8x16Low) \
......@@ -288,6 +290,8 @@ namespace compiler {
V(X64I16x8ExtMulHighI8x16S) \
V(X64I16x8ExtMulLowI8x16U) \
V(X64I16x8ExtMulHighI8x16U) \
V(X64I16x8ExtAddPairwiseI8x16S) \
V(X64I16x8ExtAddPairwiseI8x16U) \
V(X64I8x16Splat) \
V(X64I8x16ExtractLaneS) \
V(X64Pinsrb) \
......
......@@ -226,6 +226,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I32x4ExtMulHighI16x8S:
case kX64I32x4ExtMulLowI16x8U:
case kX64I32x4ExtMulHighI16x8U:
case kX64I32x4ExtAddPairwiseI16x8S:
case kX64I32x4ExtAddPairwiseI16x8U:
case kX64I16x8Splat:
case kX64I16x8ExtractLaneS:
case kX64I16x8SConvertI8x16Low:
......@@ -264,6 +266,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I16x8ExtMulHighI8x16S:
case kX64I16x8ExtMulLowI8x16U:
case kX64I16x8ExtMulHighI8x16U:
case kX64I16x8ExtAddPairwiseI8x16S:
case kX64I16x8ExtAddPairwiseI8x16U:
case kX64I8x16Splat:
case kX64I8x16ExtractLaneS:
case kX64I8x16SConvertI16x8:
......
......@@ -8,6 +8,7 @@
#include "src/base/logging.h"
#include "src/base/overflowing-math.h"
#include "src/base/platform/wrappers.h"
#include "src/codegen/cpu-features.h"
#include "src/codegen/machine-type.h"
#include "src/compiler/backend/instruction-selector-impl.h"
#include "src/compiler/machine-operator.h"
......@@ -3617,6 +3618,37 @@ void InstructionSelector::VisitF64x2Pmax(Node* node) {
VisitPminOrPmax(this, node, kX64F64x2Pmax);
}
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
X64OperandGenerator g(this);
InstructionOperand dst = CpuFeatures::IsSupported(AVX)
? g.DefineAsRegister(node)
: g.DefineSameAsFirst(node);
Emit(kX64I32x4ExtAddPairwiseI16x8S, dst, g.UseRegister(node->InputAt(0)));
}
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8U(Node* node) {
X64OperandGenerator g(this);
InstructionOperand dst = CpuFeatures::IsSupported(AVX)
? g.DefineAsRegister(node)
: g.DefineSameAsFirst(node);
Emit(kX64I32x4ExtAddPairwiseI16x8U, dst, g.UseRegister(node->InputAt(0)));
}
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
X64OperandGenerator g(this);
// Codegen depends on dst != src.
Emit(kX64I16x8ExtAddPairwiseI8x16S, g.DefineAsRegister(node),
g.UseUniqueRegister(node->InputAt(0)));
}
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
X64OperandGenerator g(this);
InstructionOperand dst = CpuFeatures::IsSupported(AVX)
? g.DefineAsRegister(node)
: g.DefineSameAsFirst(node);
Emit(kX64I16x8ExtAddPairwiseI8x16U, dst, g.UseRegister(node->InputAt(0)));
}
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {
......
......@@ -1882,7 +1882,7 @@ WASM_SIMD_TEST(S128Not) {
[](int32_t x) { return ~x; });
}
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
// TODO(v8:11086) Prototype i32x4.extadd_pairwise_i16x8_{s,u}
template <typename Narrow, typename Wide>
void RunExtAddPairwiseTest(TestExecutionTier execution_tier,
......@@ -1931,7 +1931,7 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8ExtAddPairwiseI8x16U) {
kExprI16x8ExtAddPairwiseI8x16U,
kExprI8x16Splat);
}
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
void RunI32x4BinOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, Int32BinOp expected_op) {
......