Commit f70cfb88 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Share i8x16.swizzle implementation

Also move the Pshufb definition into the shared macro-assembler. We
define a Pshufb that handles both SSE and AVX; in the SSE case it moves
src to dst if they are not the same register.
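
For illustration only (not part of this commit), here is a standalone
scalar model of the pshufb semantics the shared helper relies on: each
output byte selects an input byte by the low four bits of its mask byte,
and a mask byte with the top bit set yields zero. That is why the
I8x16Swizzle helper adds 112 with a saturating add, so any index > 15
ends up >= 128 and its lane is zeroed. All names below are hypothetical.

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar model of pshufb: select src[mask & 0x0F], or 0 if the mask
// byte has its top bit set.
std::array<uint8_t, 16> PshufbModel(const std::array<uint8_t, 16>& src,
                                    const std::array<uint8_t, 16>& mask) {
  std::array<uint8_t, 16> dst{};
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
  }
  return dst;
}

int main() {
  std::array<uint8_t, 16> src;
  for (int i = 0; i < 16; ++i) src[i] = static_cast<uint8_t>(0xA0 + i);
  std::array<uint8_t, 16> mask{};
  mask[0] = 3;                               // in range: selects src[3]
  mask[1] = static_cast<uint8_t>(20 + 112);  // out of range; +112 sets the
                                             // top bit, so the lane is zeroed
  std::array<uint8_t, 16> out = PshufbModel(src, mask);
  std::printf("%02x %02x\n", out[0], out[1]);  // prints "a3 00"
  return 0;
}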

Define operator== and operator!= in ia32's Operand class to compare
against XMMRegister; we can then use DCHECK_NE to ensure that a register
does not alias an operand wrapping a register.
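
As a standalone illustration (mock types, not V8's real classes) of why
the comparison operator is needed: DCHECK_NE(mask, dst) in the templated
Pshufb compares an ia32 Operand against an XMMRegister, so an operator!=
taking that pair of types must exist.

#include <cassert>

struct XMMRegister { int code; };

struct Operand {
  bool is_xmm_reg = false;
  int reg_code = -1;
  bool is_reg(XMMRegister r) const { return is_xmm_reg && reg_code == r.code; }
};

// Mirrors the shape of the operator declared in this change.
bool operator!=(Operand op, XMMRegister r) { return !op.is_reg(r); }

int main() {
  XMMRegister xmm0{0}, xmm1{1};
  Operand mask{true, 1};     // an operand wrapping xmm1
  assert(mask != xmm0);      // no aliasing: DCHECK_NE would pass
  assert(!(mask != xmm1));   // aliasing: DCHECK_NE(mask, xmm1) would fail
  return 0;
}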

Bug: v8:11589
Change-Id: I7c30881e8a9b322b736bb7301dde0c5424efacdd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3119997
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Adam Klein <adamk@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76515}
parent be74fe5d
@@ -291,6 +291,8 @@ Register Operand::reg() const {
   return Register::from_code(buf_[0] & 0x07);
 }
+bool operator!=(Operand op, XMMRegister r) { return !op.is_reg(r); }
 void Assembler::AllocateAndInstallRequestedHeapObjects(Isolate* isolate) {
   DCHECK_IMPLIES(isolate == nullptr, heap_object_requests_.empty());
   for (auto& request : heap_object_requests_) {
...
@@ -306,6 +306,8 @@ ASSERT_TRIVIALLY_COPYABLE(Operand);
 static_assert(sizeof(Operand) <= 2 * kSystemPointerSize,
               "Operand must be small enough to pass it by value");
+bool operator!=(Operand op, XMMRegister r);
 // -----------------------------------------------------------------------------
 // A Displacement describes the 32bit immediate field of an instruction which
 // may be used together with a Label in order to refer to a yet unknown code
...
@@ -701,33 +701,6 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
   }
 }
-void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
-                                  XMMRegister mask, XMMRegister scratch,
-                                  Register tmp, bool omit_add) {
-  if (omit_add) {
-    Pshufb(dst, src, mask);
-    return;
-  }
-  // Out-of-range indices should return 0, add 112 so that any value > 15
-  // saturates to 128 (top bit set), so pshufb will zero that lane.
-  Operand op = ExternalReferenceAsOperand(
-      ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpaddusb(scratch, mask, op);
-    vpshufb(dst, src, scratch);
-  } else {
-    CpuFeatureScope sse_scope(this, SSSE3);
-    movaps(scratch, op);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    paddusb(scratch, mask);
-    pshufb(dst, scratch);
-  }
-}
 void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
   DCHECK_GE(63, shift);
   if (shift >= 32) {
@@ -1679,22 +1652,6 @@ void TurboAssembler::Move(XMMRegister dst, uint64_t src) {
   }
 }
-void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src, Operand mask) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpshufb(dst, src, mask);
-    return;
-  }
-  // Make sure these are different so that we won't overwrite mask.
-  DCHECK(!mask.is_reg(dst));
-  CpuFeatureScope sse_scope(this, SSSE3);
-  if (dst != src) {
-    movaps(dst, src);
-  }
-  pshufb(dst, mask);
-}
 void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
   if (imm8 == 0) {
     Movd(dst, src);
...
@@ -336,14 +336,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
     SharedTurboAssembler::Movhps(dst, src);
   }
-  void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, dst, src); }
-  void Pshufb(XMMRegister dst, Operand src) { Pshufb(dst, dst, src); }
-  // Handles SSE and AVX. On SSE, moves src to dst if they are not equal.
-  void Pshufb(XMMRegister dst, XMMRegister src, XMMRegister mask) {
-    Pshufb(dst, src, Operand(mask));
-  }
-  void Pshufb(XMMRegister dst, XMMRegister src, Operand mask);
   void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
   void Pinsrb(XMMRegister dst, Register src, int8_t imm8) {
     Pinsrb(dst, Operand(src), imm8);
@@ -396,8 +388,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
   // Defined here to allow usage on both TurboFan and Liftoff.
   void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
                    XMMRegister tmp2, Register scratch);
-  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
-                    XMMRegister scratch, Register tmp, bool omit_add = false);
   void Push(Register src) { push(src); }
   void Push(Operand src) { push(src); }
...
@@ -59,6 +59,28 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
     }
   }
+  // Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
+  template <typename Op>
+  void Pshufb(XMMRegister dst, XMMRegister src, Op mask) {
+    if (CpuFeatures::IsSupported(AVX)) {
+      CpuFeatureScope avx_scope(this, AVX);
+      vpshufb(dst, src, mask);
+    } else {
+      // Make sure these are different so that we won't overwrite mask.
+      DCHECK_NE(mask, dst);
+      if (dst != src) {
+        movaps(dst, src);
+      }
+      CpuFeatureScope sse_scope(this, SSSE3);
+      pshufb(dst, mask);
+    }
+  }
+  template <typename Op>
+  void Pshufb(XMMRegister dst, Op mask) {
+    Pshufb(dst, dst, mask);
+  }
   // Shufps that will mov src1 into dst if AVX is not supported.
   void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
               uint8_t imm8);
@@ -244,7 +266,6 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pmullw, pmullw)
   AVX_OP(Pmuludq, pmuludq)
   AVX_OP(Por, por)
-  AVX_OP(Pshufb, pshufb)
   AVX_OP(Pshufd, pshufd)
   AVX_OP(Pshufhw, pshufhw)
   AVX_OP(Pshuflw, pshuflw)
@@ -575,6 +596,36 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
     }
   }
+  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
+                    XMMRegister scratch, Register tmp, bool omit_add = false) {
+    ASM_CODE_COMMENT(this);
+    if (omit_add) {
+      // We have determined that the indices are immediates, and they are
+      // either within bounds, or the top bit is set, so we can omit the add.
+      Pshufb(dst, src, mask);
+      return;
+    }
+    // Out-of-range indices should return 0, add 112 so that any value > 15
+    // saturates to 128 (top bit set), so pshufb will zero that lane.
+    Operand op = ExternalReferenceAsOperand(
+        ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
+    if (CpuFeatures::IsSupported(AVX)) {
+      CpuFeatureScope avx_scope(this, AVX);
+      vpaddusb(scratch, mask, op);
+      vpshufb(dst, src, scratch);
+    } else {
+      CpuFeatureScope sse_scope(this, SSSE3);
+      movaps(scratch, op);
+      if (dst != src) {
+        DCHECK_NE(dst, mask);
+        movaps(dst, src);
+      }
+      paddusb(scratch, mask);
+      pshufb(dst, scratch);
+    }
+  }
  private:
   // All implementation-specific methods must be called through this.
   Impl* impl() { return static_cast<Impl*>(this); }
...
@@ -2202,22 +2202,6 @@ void TurboAssembler::Blendvpd(XMMRegister dst, XMMRegister src1,
   }
 }
-void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
-                            XMMRegister mask) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpshufb(dst, src, mask);
-  } else {
-    // Make sure these are different so that we won't overwrite mask.
-    DCHECK_NE(dst, mask);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    CpuFeatureScope sse_scope(this, SSSE3);
-    pshufb(dst, mask);
-  }
-}
 void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
                                  XMMRegister tmp) {
   DCHECK_NE(dst, tmp);
@@ -2277,34 +2261,6 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
   }
 }
-void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
-                                  XMMRegister mask, bool omit_add) {
-  if (omit_add) {
-    // We have determined that the indices are immediates, and they are either
-    // within bounds, or the top bit is set, so we can omit the add.
-    Pshufb(dst, src, mask);
-    return;
-  }
-  // Out-of-range indices should return 0, add 112 so that any value > 15
-  // saturates to 128 (top bit set), so pshufb will zero that lane.
-  Operand op = ExternalReferenceAsOperand(
-      ExternalReference::address_of_wasm_i8x16_swizzle_mask());
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpaddusb(kScratchDoubleReg, mask, op);
-    vpshufb(dst, src, kScratchDoubleReg);
-  } else {
-    CpuFeatureScope sse_scope(this, SSSE3);
-    movaps(kScratchDoubleReg, op);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    paddusb(kScratchDoubleReg, mask);
-    pshufb(dst, kScratchDoubleReg);
-  }
-}
 void TurboAssembler::Abspd(XMMRegister dst) {
   Andps(dst, ExternalReferenceAsOperand(
                  ExternalReference::address_of_double_abs_constant()));
...
@@ -90,7 +90,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
   AVX_OP(Movlhps, movlhps)
   AVX_OP_SSSE3(Phaddd, phaddd)
   AVX_OP_SSSE3(Phaddw, phaddw)
-  AVX_OP_SSSE3(Pshufb, pshufb)
   AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
   AVX_OP_SSE4_1(Packusdw, packusdw)
   AVX_OP_SSE4_1(Pminsd, pminsd)
@@ -471,17 +470,11 @@ class V8_EXPORT_PRIVATE TurboAssembler
   void Blendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister mask);
-  // Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
-  void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
   // These Wasm SIMD ops do not have direct lowerings on x64. These
   // helpers are optimized to produce the fastest and smallest codegen.
   // Defined here to allow usage on both TurboFan and Liftoff.
   void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
-  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
-                    bool omit_add = false);
   void Abspd(XMMRegister dst);
   void Negpd(XMMRegister dst);
...
@@ -3664,9 +3664,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kX64I8x16Swizzle: {
-      bool omit_add = MiscField::decode(instr->opcode());
       __ I8x16Swizzle(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                      i.InputSimd128Register(1), omit_add);
+                      i.InputSimd128Register(1), kScratchDoubleReg,
+                      kScratchRegister, MiscField::decode(instr->opcode()));
       break;
     }
     case kX64I8x16Shuffle: {
...
@@ -2482,7 +2482,8 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
 void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
                                           LiftoffRegister lhs,
                                           LiftoffRegister rhs) {
-  I8x16Swizzle(dst.fp(), lhs.fp(), rhs.fp());
+  I8x16Swizzle(dst.fp(), lhs.fp(), rhs.fp(), kScratchDoubleReg,
+               kScratchRegister);
 }
 void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
...