Commit f70cfb88 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd] Share i8x16.swizzle implementation

Also move the Pshufb definition into shared-macro-assembler. We define a
single Pshufb that handles both SSE and AVX; in the SSE case it moves src
to dst if they are not the same register.
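
A minimal sketch of the dispatch idiom this describes (it mirrors the template added to shared-macro-assembler in the diff below; nothing here is new API). The point is that AVX's vpshufb is a non-destructive three-operand instruction, while SSE's pshufb shuffles dst in place, so the SSE path must first copy src into dst:

  void Pshufb(XMMRegister dst, XMMRegister src, XMMRegister mask) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpshufb(dst, src, mask);  // dst = shuffle(src, mask); src is untouched
    } else {
      CpuFeatureScope sse_scope(this, SSSE3);
      DCHECK_NE(mask, dst);              // pshufb would clobber the mask
      if (dst != src) movaps(dst, src);  // dst takes over the role of src
      pshufb(dst, mask);                 // dst = shuffle(dst, mask)
    }
  }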

Define operator== and operator!= on ia32's Operand class that check
against XMMRegister; we can then use DCHECK_NE to ensure that a register
doesn't alias an operand wrapping a register.
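
The diff below only shows operator!=; its operator== counterpart is hidden by the fold. A sketch, assuming the obvious pairing over Operand::is_reg:

  // Assumption: operator== mirrors the operator!= visible in the diff.
  bool operator==(Operand op, XMMRegister r) { return op.is_reg(r); }
  bool operator!=(Operand op, XMMRegister r) { return !op.is_reg(r); }

  // This lets the templated Pshufb below use the same assertion whether the
  // mask is an XMMRegister or an Operand wrapping one:
  //   DCHECK_NE(mask, dst);  // fires if mask aliases the dst register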

Bug: v8:11589
Change-Id: I7c30881e8a9b322b736bb7301dde0c5424efacdd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3119997
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Adam Klein <adamk@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76515}
parent be74fe5d
@@ -291,6 +291,8 @@ Register Operand::reg() const {
   return Register::from_code(buf_[0] & 0x07);
 }
 
+bool operator!=(Operand op, XMMRegister r) { return !op.is_reg(r); }
+
 void Assembler::AllocateAndInstallRequestedHeapObjects(Isolate* isolate) {
   DCHECK_IMPLIES(isolate == nullptr, heap_object_requests_.empty());
   for (auto& request : heap_object_requests_) {
......
@@ -306,6 +306,8 @@ ASSERT_TRIVIALLY_COPYABLE(Operand);
 static_assert(sizeof(Operand) <= 2 * kSystemPointerSize,
               "Operand must be small enough to pass it by value");
 
+bool operator!=(Operand op, XMMRegister r);
+
 // -----------------------------------------------------------------------------
 // A Displacement describes the 32bit immediate field of an instruction which
 // may be used together with a Label in order to refer to a yet unknown code
......
@@ -701,33 +701,6 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
   }
 }
 
-void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
-                                  XMMRegister mask, XMMRegister scratch,
-                                  Register tmp, bool omit_add) {
-  if (omit_add) {
-    Pshufb(dst, src, mask);
-    return;
-  }
-  // Out-of-range indices should return 0, add 112 so that any value > 15
-  // saturates to 128 (top bit set), so pshufb will zero that lane.
-  Operand op = ExternalReferenceAsOperand(
-      ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpaddusb(scratch, mask, op);
-    vpshufb(dst, src, scratch);
-  } else {
-    CpuFeatureScope sse_scope(this, SSSE3);
-    movaps(scratch, op);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    paddusb(scratch, mask);
-    pshufb(dst, scratch);
-  }
-}
-
 void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
   DCHECK_GE(63, shift);
   if (shift >= 32) {
@@ -1679,22 +1652,6 @@ void TurboAssembler::Move(XMMRegister dst, uint64_t src) {
   }
 }
 
-void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src, Operand mask) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpshufb(dst, src, mask);
-    return;
-  }
-  // Make sure these are different so that we won't overwrite mask.
-  DCHECK(!mask.is_reg(dst));
-  CpuFeatureScope sse_scope(this, SSSE3);
-  if (dst != src) {
-    movaps(dst, src);
-  }
-  pshufb(dst, mask);
-}
-
 void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
   if (imm8 == 0) {
     Movd(dst, src);
......
@@ -336,14 +336,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
     SharedTurboAssembler::Movhps(dst, src);
   }
 
-  void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, dst, src); }
-  void Pshufb(XMMRegister dst, Operand src) { Pshufb(dst, dst, src); }
-  // Handles SSE and AVX. On SSE, moves src to dst if they are not equal.
-  void Pshufb(XMMRegister dst, XMMRegister src, XMMRegister mask) {
-    Pshufb(dst, src, Operand(mask));
-  }
-  void Pshufb(XMMRegister dst, XMMRegister src, Operand mask);
-
   void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
   void Pinsrb(XMMRegister dst, Register src, int8_t imm8) {
     Pinsrb(dst, Operand(src), imm8);
@@ -396,8 +388,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
   // Defined here to allow usage on both TurboFan and Liftoff.
   void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp1,
                    XMMRegister tmp2, Register scratch);
-  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
-                    XMMRegister scratch, Register tmp, bool omit_add = false);
 
   void Push(Register src) { push(src); }
   void Push(Operand src) { push(src); }
......
@@ -59,6 +59,28 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
     }
   }
 
+  // Supports both SSE and AVX. On SSE, moves src to dst if they are not equal.
+  template <typename Op>
+  void Pshufb(XMMRegister dst, XMMRegister src, Op mask) {
+    if (CpuFeatures::IsSupported(AVX)) {
+      CpuFeatureScope avx_scope(this, AVX);
+      vpshufb(dst, src, mask);
+    } else {
+      // Make sure these are different so that we won't overwrite mask.
+      DCHECK_NE(mask, dst);
+      if (dst != src) {
+        movaps(dst, src);
+      }
+      CpuFeatureScope sse_scope(this, SSSE3);
+      pshufb(dst, mask);
+    }
+  }
+
+  template <typename Op>
+  void Pshufb(XMMRegister dst, Op mask) {
+    Pshufb(dst, dst, mask);
+  }
+
   // Shufps that will mov src1 into dst if AVX is not supported.
   void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
               uint8_t imm8);
@@ -244,7 +266,6 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pmullw, pmullw)
   AVX_OP(Pmuludq, pmuludq)
   AVX_OP(Por, por)
-  AVX_OP(Pshufb, pshufb)
   AVX_OP(Pshufd, pshufd)
   AVX_OP(Pshufhw, pshufhw)
   AVX_OP(Pshuflw, pshuflw)
@@ -575,6 +596,36 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
     }
   }
 
+  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
+                    XMMRegister scratch, Register tmp, bool omit_add = false) {
+    ASM_CODE_COMMENT(this);
+    if (omit_add) {
+      // We have determined that the indices are immediates, and they are either
+      // within bounds, or the top bit is set, so we can omit the add.
+      Pshufb(dst, src, mask);
+      return;
+    }
+    // Out-of-range indices should return 0, add 112 so that any value > 15
+    // saturates to 128 (top bit set), so pshufb will zero that lane.
+    Operand op = ExternalReferenceAsOperand(
+        ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
+    if (CpuFeatures::IsSupported(AVX)) {
+      CpuFeatureScope avx_scope(this, AVX);
+      vpaddusb(scratch, mask, op);
+      vpshufb(dst, src, scratch);
+    } else {
+      CpuFeatureScope sse_scope(this, SSSE3);
+      movaps(scratch, op);
+      if (dst != src) {
+        DCHECK_NE(dst, mask);
+        movaps(dst, src);
+      }
+      paddusb(scratch, mask);
+      pshufb(dst, scratch);
+    }
+  }
+
  private:
   // All implementation-specific methods must be called through this.
   Impl* impl() { return static_cast<Impl*>(this); }
......
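
The "+112" comment above is the heart of the lowering: paddusb/vpaddusb is an unsigned saturating byte add, and pshufb zeroes any lane whose mask byte has the top bit set. A worked example (illustrative arithmetic, not part of the diff):

  // Per byte, paddusb(scratch, mask) computes min(index + 112, 255):
  //   index 15 (max in range):  15 + 112 = 127  -> bit 7 clear, low nibble 15
  //                                             -> pshufb selects byte 15
  //   index 16 (out of range):  16 + 112 = 128  -> bit 7 set -> lane zeroed
  //   index 200 (out of range): 200 + 112 -> saturates to 255 -> lane zeroed
  // Every index > 15 therefore yields a mask byte with the top bit set,
  // matching wasm i8x16.swizzle semantics for out-of-range indices.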
@@ -2202,22 +2202,6 @@ void TurboAssembler::Blendvpd(XMMRegister dst, XMMRegister src1,
   }
 }
 
-void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
-                            XMMRegister mask) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpshufb(dst, src, mask);
-  } else {
-    // Make sure these are different so that we won't overwrite mask.
-    DCHECK_NE(dst, mask);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    CpuFeatureScope sse_scope(this, SSSE3);
-    pshufb(dst, mask);
-  }
-}
-
 void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
                                  XMMRegister tmp) {
   DCHECK_NE(dst, tmp);
@@ -2277,34 +2261,6 @@ void TurboAssembler::I8x16Popcnt(XMMRegister dst, XMMRegister src,
   }
 }
 
-void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
-                                  XMMRegister mask, bool omit_add) {
-  if (omit_add) {
-    // We have determined that the indices are immediates, and they are either
-    // within bounds, or the top bit is set, so we can omit the add.
-    Pshufb(dst, src, mask);
-    return;
-  }
-  // Out-of-range indices should return 0, add 112 so that any value > 15
-  // saturates to 128 (top bit set), so pshufb will zero that lane.
-  Operand op = ExternalReferenceAsOperand(
-      ExternalReference::address_of_wasm_i8x16_swizzle_mask());
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vpaddusb(kScratchDoubleReg, mask, op);
-    vpshufb(dst, src, kScratchDoubleReg);
-  } else {
-    CpuFeatureScope sse_scope(this, SSSE3);
-    movaps(kScratchDoubleReg, op);
-    if (dst != src) {
-      movaps(dst, src);
-    }
-    paddusb(kScratchDoubleReg, mask);
-    pshufb(dst, kScratchDoubleReg);
-  }
-}
-
 void TurboAssembler::Abspd(XMMRegister dst) {
   Andps(dst, ExternalReferenceAsOperand(
                  ExternalReference::address_of_double_abs_constant()));
......
@@ -90,7 +90,6 @@ class V8_EXPORT_PRIVATE TurboAssembler
   AVX_OP(Movlhps, movlhps)
   AVX_OP_SSSE3(Phaddd, phaddd)
   AVX_OP_SSSE3(Phaddw, phaddw)
-  AVX_OP_SSSE3(Pshufb, pshufb)
   AVX_OP_SSE4_1(Pcmpeqq, pcmpeqq)
   AVX_OP_SSE4_1(Packusdw, packusdw)
   AVX_OP_SSE4_1(Pminsd, pminsd)
@@ -471,17 +470,11 @@ class V8_EXPORT_PRIVATE TurboAssembler
   void Blendvpd(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                 XMMRegister mask);
 
-  // Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
-  void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
-
   // These Wasm SIMD ops do not have direct lowerings on x64. These
   // helpers are optimized to produce the fastest and smallest codegen.
   // Defined here to allow usage on both TurboFan and Liftoff.
   void I8x16Popcnt(XMMRegister dst, XMMRegister src, XMMRegister tmp);
-  void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
-                    bool omit_add = false);
 
   void Abspd(XMMRegister dst);
   void Negpd(XMMRegister dst);
......
@@ -3664,9 +3664,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
    }
    case kX64I8x16Swizzle: {
-      bool omit_add = MiscField::decode(instr->opcode());
      __ I8x16Swizzle(i.OutputSimd128Register(), i.InputSimd128Register(0),
-                      i.InputSimd128Register(1), omit_add);
+                      i.InputSimd128Register(1), kScratchDoubleReg,
+                      kScratchRegister, MiscField::decode(instr->opcode()));
      break;
    }
    case kX64I8x16Shuffle: {
......
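
The last argument above is the omit_add flag, decoded from the opcode's MiscField. The encoding side lives in instruction selection, which this diff does not touch; a hedged sketch of what it presumably looks like (the guard name is hypothetical):

  InstructionCode opcode = kX64I8x16Swizzle;
  if (mask_is_safe_constant) {          // hypothetical: selector proved the
                                        // indices are in bounds or top-bit-set
    opcode |= MiscField::encode(true);  // read back as omit_add above
  }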
@@ -2482,7 +2482,8 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
 void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
                                           LiftoffRegister lhs,
                                           LiftoffRegister rhs) {
-  I8x16Swizzle(dst.fp(), lhs.fp(), rhs.fp());
+  I8x16Swizzle(dst.fp(), lhs.fp(), rhs.fp(), kScratchDoubleReg,
+               kScratchRegister);
 }
 
 void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
......
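
A note on the two call-site hunks above (an inference from the signatures, not stated in the commit message): the shared I8x16Swizzle sits in SharedTurboAssemblerBase and can no longer bake in x64's kScratchDoubleReg and kScratchRegister, because ia32 reserves different scratch registers. Both TurboFan's code generator and Liftoff therefore name the scratch registers explicitly at the call site, which is why the helper grew the extra XMMRegister scratch and Register tmp parameters.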