Commit 758e4931, authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][ia32][liftoff] Implement extended add pairwise

Extract code sequence into macro-assembler for reuse between Liftoff and
TurboFan.

Bug: v8:11086
Change-Id: I914051dd8126e89f297e892da1b5c1917b47d7f1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2707763
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72987}
parent 74924382
......@@ -1132,6 +1132,68 @@ void TurboAssembler::I64x2GeS(XMMRegister dst, XMMRegister src0,
}
}
void TurboAssembler::I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
                                               XMMRegister tmp,
                                               Register scratch) {
  // pmaddubsw treats its first operand as unsigned bytes, so the 0x01 splat
  // (loaded from the external reference) has to be the first operand and
  // |src| the second.
  Operand splat_ones = ExternalReferenceAsOperand(
      ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);

  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    vmovdqa(tmp, splat_ones);
    vpmaddubsw(dst, tmp, src);
    return;
  }

  CpuFeatureScope sse_scope(this, SSSE3);
  // The two-operand form overwrites its first operand. When |dst| aliases
  // |src| the splat is built in |tmp| and the result copied over afterwards;
  // otherwise the splat can be built directly in |dst|.
  const bool aliased = dst == src;
  XMMRegister acc = aliased ? tmp : dst;
  movaps(acc, splat_ones);
  pmaddubsw(acc, src);
  if (aliased) movaps(dst, acc);
}
void TurboAssembler::I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
                                               Register scratch) {
  // |src| is the first pmaddubsw operand and the 0x01 splat (addressed through
  // |scratch|) is the second, the reverse of the signed variant.
  Operand op = ExternalReferenceAsOperand(
      ExternalReference::address_of_wasm_i8x16_splat_0x01(), scratch);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Non-destructive three-operand form: |dst| and |src| may differ freely.
    vpmaddubsw(dst, src, op);
  } else {
    CpuFeatureScope sse_scope(this, SSSE3);
    // The SSSE3 form overwrites its first operand, so bring |src| into |dst|
    // first and emit the two-operand instruction directly instead of passing
    // distinct dst/src registers to the three-operand wrapper.
    if (dst != src) {
      movaps(dst, src);
    }
    pmaddubsw(dst, op);
  }
}
void TurboAssembler::I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
                                               Register scratch) {
  // op = i16x8.splat(1), addressed through |scratch|.
  Operand op = ExternalReferenceAsOperand(
      ExternalReference::address_of_wasm_i16x8_splat_0x0001(), scratch);
  // pmaddwd multiplies signed words in src and op, producing
  // signed doublewords, then adds pairwise.
  // src = |a|b|c|d|e|f|g|h|
  // dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Non-destructive three-operand form: |dst| and |src| may differ freely.
    vpmaddwd(dst, src, op);
  } else {
    // The SSE form overwrites its first operand, so bring |src| into |dst|
    // first and emit the two-operand instruction directly instead of passing
    // distinct dst/src registers to the three-operand wrapper.
    if (dst != src) {
      movaps(dst, src);
    }
    pmaddwd(dst, op);
  }
}
void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
                                               XMMRegister tmp) {
  // |tmp| is written before |src| is last read and must survive the write to
  // |dst|, so it may alias neither of them.
  DCHECK_NE(tmp, src);
  DCHECK_NE(tmp, dst);
  // src = |a|b|c|d|e|f|g|h|
  // tmp = i32x4.splat(0x0000FFFF)
  Pcmpeqd(tmp, tmp);
  Psrld(tmp, tmp, byte{16});
  // tmp =|0|b|0|d|0|f|0|h|
  Pand(tmp, src);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // dst = |0|a|0|c|0|e|0|g|
    vpsrld(dst, src, byte{16});
    // dst = |a+b|c+d|e+f|g+h|
    vpaddd(dst, dst, tmp);
  } else {
    // The SSE shift and add overwrite their first operand, so bring |src|
    // into |dst| first; without this copy a caller passing distinct registers
    // would shift whatever |dst| happened to contain.
    if (dst != src) {
      movaps(dst, src);
    }
    // dst = |0|a|0|c|0|e|0|g|
    psrld(dst, byte{16});
    // dst = |a+b|c+d|e+f|g+h|
    paddd(dst, tmp);
  }
}
void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
DCHECK_GE(63, shift);
if (shift >= 32) {
......
......@@ -562,9 +562,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP3_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, SSE4_1)
AVX_OP3_XO_SSE4(Pmaxsd, pmaxsd)
AVX_OP3_XO_SSE4(Pminsb, pminsb)
AVX_OP3_XO_SSE4(Pmaxsb, pmaxsb)
// Pmaddubsw is generated once per second-operand type (register and memory
// operand); generating the register overload twice would redefine it.
AVX_OP3_WITH_TYPE_SCOPE(Pmaddubsw, pmaddubsw, XMMRegister, XMMRegister, SSSE3)
AVX_OP3_WITH_TYPE_SCOPE(Pmaddubsw, pmaddubsw, XMMRegister, Operand, SSSE3)
#undef AVX_OP3_XO_SSE4
#undef AVX_OP3_WITH_TYPE_SCOPE
......@@ -698,6 +699,14 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
XMMRegister scratch);
void I64x2GeS(XMMRegister dst, XMMRegister src0, XMMRegister src1,
XMMRegister scratch);
// Emits i16x8.extadd_pairwise_i8x16_s: a pmaddubsw of an i8x16 splat of 0x01
// with |src|, written to |dst|. |tmp| is an XMM temporary that holds the splat
// (always with AVX, otherwise only when |dst| aliases |src|). |scratch| is a
// general-purpose register used to address the splat constant.
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src,
XMMRegister tmp, Register scratch);
// Emits i16x8.extadd_pairwise_i8x16_u: a pmaddubsw of |src| with an i8x16
// splat of 0x01, written to |dst|. |scratch| is a general-purpose register
// used to address the splat constant.
void I16x8ExtAddPairwiseI8x16U(XMMRegister dst, XMMRegister src,
Register scratch);
// Emits i32x4.extadd_pairwise_i16x8_s: a pmaddwd of |src| with an i16x8 splat
// of 1, written to |dst|. |scratch| is a general-purpose register used to
// address the splat constant.
void I32x4ExtAddPairwiseI16x8S(XMMRegister dst, XMMRegister src,
Register scratch);
// Emits i32x4.extadd_pairwise_i16x8_u: adds the low and high 16-bit halves of
// every 32-bit lane of |src| into |dst|. |tmp| is clobbered; it holds the
// masked low halves of |src|.
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
XMMRegister tmp);
void Push(Register src) { push(src); }
void Push(Operand src) { push(src); }
......
......@@ -2297,51 +2297,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I32x4ExtAddPairwiseI16x8S: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// kScratchDoubleReg = i16x8.splat(1)
__ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
__ Psrlw(kScratchDoubleReg, byte{15});
// pmaddwd multiplies signed words in kScratchDoubleReg and src, producing
// signed doublewords, then adds pairwise.
// src = |a|b|c|d|e|f|g|h|
// dst = | a*1 + b*1 | c*1 + d*1 | e*1 + f*1 | g*1 + h*1 |
__ Pmaddwd(dst, src, kScratchDoubleReg);
__ I32x4ExtAddPairwiseI16x8S(i.OutputSimd128Register(),
i.InputSimd128Register(0),
i.TempRegister(0));
break;
}
case kIA32I32x4ExtAddPairwiseI16x8U: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// src = |a|b|c|d|e|f|g|h|
// kScratchDoubleReg = i32x4.splat(0x0000FFFF)
__ Pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ Psrld(kScratchDoubleReg, kScratchDoubleReg, uint8_t{16});
// kScratchDoubleReg =|0|b|0|d|0|f|0|h|
__ Pand(kScratchDoubleReg, src);
// dst = |0|a|0|c|0|e|0|g|
__ Psrld(dst, src, byte{16});
// dst = |a+b|c+d|e+f|g+h|
__ Paddd(dst, src, kScratchDoubleReg);
__ I32x4ExtAddPairwiseI16x8U(i.OutputSimd128Register(),
i.InputSimd128Register(0),
kScratchDoubleReg);
break;
}
case kIA32I16x8ExtAddPairwiseI8x16S: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
DCHECK_NE(dst, src);
// dst = i8x16.splat(1)
__ Move(dst, uint32_t{0x01010101});
__ Pshufd(dst, dst, byte{0});
__ Pmaddubsw(dst, dst, src);
break;
__ I16x8ExtAddPairwiseI8x16S(i.OutputSimd128Register(),
i.InputSimd128Register(0), kScratchDoubleReg,
i.TempRegister(0));
break;
}
case kIA32I16x8ExtAddPairwiseI8x16U: {
XMMRegister dst = i.OutputSimd128Register();
// dst = i8x16.splat(1)
__ Move(kScratchDoubleReg, uint32_t{0x01010101});
__ Pshufd(kScratchDoubleReg, kScratchDoubleReg, byte{0});
__ Pmaddubsw(dst, i.InputSimd128Register(0), kScratchDoubleReg);
__ I16x8ExtAddPairwiseI8x16U(i.OutputSimd128Register(),
i.InputSimd128Register(0),
i.TempRegister(0));
break;
}
case kIA32I16x8Q15MulRSatS: {
......
......@@ -3112,22 +3112,37 @@ void InstructionSelector::VisitI64x2SignSelect(Node* node) {
VisitSignSelect(this, node, kIA32I64x2SignSelect);
}
namespace {
// Shared instruction selection for the extadd_pairwise opcodes. The input is
// used in a register; the output is a fresh register when AVX is available and
// is tied to the input otherwise. When |need_temp| is set, one temporary
// register is attached to the emitted instruction.
void VisitExtAddPairwise(InstructionSelector* selector, Node* node,
                         ArchOpcode opcode, bool need_temp) {
  IA32OperandGenerator g(selector);
  InstructionOperand input = g.UseRegister(node->InputAt(0));
  InstructionOperand output = selector->IsSupported(AVX)
                                  ? g.DefineAsRegister(node)
                                  : g.DefineSameAsFirst(node);
  if (!need_temp) {
    selector->Emit(opcode, output, input);
    return;
  }
  InstructionOperand temps[] = {g.TempRegister()};
  selector->Emit(opcode, output, input, arraysize(temps), temps);
}
}  // namespace
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8S(Node* node) {
  // Emit exactly one instruction for the node. A temp register is requested
  // because the code generator passes TempRegister(0) to the macro-assembler.
  VisitExtAddPairwise(this, node, kIA32I32x4ExtAddPairwiseI16x8S, true);
}
void InstructionSelector::VisitI32x4ExtAddPairwiseI16x8U(Node* node) {
  // Emit exactly one instruction for the node. No temp register is requested:
  // the code generator passes kScratchDoubleReg for this opcode.
  VisitExtAddPairwise(this, node, kIA32I32x4ExtAddPairwiseI16x8U, false);
}
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16S(Node* node) {
  // Emit exactly one instruction for the node. A temp register is requested
  // because the code generator passes TempRegister(0) to the macro-assembler.
  VisitExtAddPairwise(this, node, kIA32I16x8ExtAddPairwiseI8x16S, true);
}
void InstructionSelector::VisitI16x8ExtAddPairwiseI8x16U(Node* node) {
  // Emit exactly one instruction for the node. A temp register is requested
  // because the code generator passes TempRegister(0) to the macro-assembler.
  VisitExtAddPairwise(this, node, kIA32I16x8ExtAddPairwiseI8x16U, true);
}
void InstructionSelector::VisitI8x16Popcnt(Node* node) {
......
......@@ -3687,12 +3687,14 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_s(LiftoffRegister dst,
                                                          LiftoffRegister src) {
  // Implemented through the shared macro-assembler sequence, so there is no
  // bailout. liftoff::kScratchDoubleReg is the XMM temporary and an unused
  // general-purpose register is the scratch for addressing the splat constant.
  I16x8ExtAddPairwiseI8x16S(dst.fp(), src.fp(), liftoff::kScratchDoubleReg,
                            GetUnusedRegister(kGpReg, {}).gp());
}
void LiftoffAssembler::emit_i16x8_extadd_pairwise_i8x16_u(LiftoffRegister dst,
                                                          LiftoffRegister src) {
  // Implemented through the shared macro-assembler sequence, so there is no
  // bailout. An unused general-purpose register is the scratch for addressing
  // the splat constant.
  I16x8ExtAddPairwiseI8x16U(dst.fp(), src.fp(),
                            GetUnusedRegister(kGpReg, {}).gp());
}
void LiftoffAssembler::emit_i16x8_extmul_low_i8x16_s(LiftoffRegister dst,
......@@ -3843,12 +3845,13 @@ void LiftoffAssembler::emit_i32x4_dot_i16x8_s(LiftoffRegister dst,
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_s(LiftoffRegister dst,
                                                          LiftoffRegister src) {
  // Implemented through the shared macro-assembler sequence, so there is no
  // bailout. An unused general-purpose register is the scratch for addressing
  // the splat constant.
  I32x4ExtAddPairwiseI16x8S(dst.fp(), src.fp(),
                            GetUnusedRegister(kGpReg, {}).gp());
}
void LiftoffAssembler::emit_i32x4_extadd_pairwise_i16x8_u(LiftoffRegister dst,
                                                          LiftoffRegister src) {
  // Implemented through the shared macro-assembler sequence, so there is no
  // bailout. liftoff::kScratchDoubleReg is the XMM temporary.
  I32x4ExtAddPairwiseI16x8U(dst.fp(), src.fp(), liftoff::kScratchDoubleReg);
}
namespace liftoff {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment