Commit f0ee5100 authored by Ng Zhi An, committed by V8 LUCI CQ

[wasm-simd][ia32][x64] Share i8x16.shl implementation

Move the implementation into the shared macro-assembler. TurboFan and
Liftoff for both ia32 and x64 can now share the implementation. No
functionality change expected.

Bug: v8:11589
Change-Id: Ia1f680ba139fca627e82e7dc0a9cf1c833e483cf
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3088513
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Cr-Commit-Position: refs/heads/master@{#76268}
parent aa4df5bc
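
Background for the first hunk below: neither SSE nor AVX has an 8-bit shift, so the new shared I8x16Shl helper implements i8x16.shl with an immediate as a 16-bit psllw followed by a pand with a splatted byte mask that discards the bits shifted in from the neighbouring byte lane. A minimal scalar sketch of that per-byte computation (the function name and test value are illustrative only, not part of the CL):

// A scalar model of the per-byte result of the immediate-shift I8x16Shl
// helper: psllw on 16-bit lanes, then pand with a splat of
// bmask = 0xff << shift.  Illustrative only; the names are made up here.
#include <cstdint>
#include <cstdio>

uint8_t I8x16ShlByteModel(uint8_t lane, uint8_t shift_amount) {
  uint8_t shift = shift_amount & 7;  // wasm shift counts are taken mod 8
  uint8_t bmask = static_cast<uint8_t>(0xff << shift);
  // psllw has no per-byte truncation, so without the mask the low `shift`
  // bits of each high byte would hold bits from its neighbour; pand with
  // bmask clears exactly those bits.
  uint16_t widened = static_cast<uint16_t>(lane) << shift;
  return static_cast<uint8_t>(widened) & bmask;
}

int main() {
  // 0xab << 3 in an 8-bit lane is 0x58; the three bits shifted out are lost.
  std::printf("%02x\n", I8x16ShlByteModel(0xab, 3));
  return 0;
}
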
@@ -254,17 +254,64 @@ void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
}
}
void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
uint8_t src2, Register tmp1,
XMMRegister tmp2) {
DCHECK_NE(dst, tmp2);
// Perform 16-bit shift, then mask away low bits.
if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
movaps(dst, src1);
src1 = dst;
}
uint8_t shift = truncate_to_int3(src2);
Psllw(dst, src1, byte{shift});
uint8_t bmask = static_cast<uint8_t>(0xff << shift);
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
Move(tmp1, mask);
Movd(tmp2, tmp1);
Pshufd(tmp2, tmp2, uint8_t{0});
Pand(dst, tmp2);
}
void SharedTurboAssembler::I8x16Shl(XMMRegister dst, XMMRegister src1,
Register src2, Register tmp1,
XMMRegister tmp2, XMMRegister tmp3) {
DCHECK(!AreAliased(dst, tmp2, tmp3));
DCHECK(!AreAliased(src1, tmp2, tmp3));
// Take shift value modulo 8.
Move(tmp1, src2);
And(tmp1, Immediate(7));
Add(tmp1, Immediate(8));
// Create a mask to unset high bits.
Movd(tmp3, tmp1);
Pcmpeqd(tmp2, tmp2);
Psrlw(tmp2, tmp2, tmp3);
Packuswb(tmp2, tmp2);
if (!CpuFeatures::IsSupported(AVX) && (dst != src1)) {
movaps(dst, src1);
src1 = dst;
}
// Mask off the unwanted bits before word-shifting.
Pand(dst, src1, tmp2);
Add(tmp1, Immediate(-8));
Movd(tmp3, tmp1);
Psllw(dst, dst, tmp3);
}
void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
uint8_t src2, XMMRegister tmp2) {
uint8_t src2, XMMRegister tmp) {
// Unpack bytes into words, do word (16-bit) shifts, and repack.
DCHECK_NE(dst, tmp2);
DCHECK_NE(dst, tmp);
uint8_t shift = truncate_to_int3(src2) + 8;
Punpckhbw(tmp2, src1);
Punpckhbw(tmp, src1);
Punpcklbw(dst, src1);
Psraw(tmp2, shift);
Psraw(tmp, shift);
Psraw(dst, shift);
Packsswb(dst, tmp2);
Packsswb(dst, tmp);
}
void SharedTurboAssembler::I8x16ShrS(XMMRegister dst, XMMRegister src1,
......
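
When the shift amount is in a register, the overload added above cannot precompute bmask, so it materialises a mask at run time: pcmpeqd gives all-ones, psrlw by (shift + 8) leaves the low (8 - shift) bits of every 16-bit lane set, and packuswb splats that value into every byte, yielding 0xff >> shift per byte. The source is masked with it before the psllw so no bits cross into the next byte lane. A scalar sketch of this path (illustrative names only):

// A scalar model of the register-shift I8x16Shl overload added above:
// build the byte mask 0xff >> shift, clear the high bits of each byte,
// then do the 16-bit shift.  Illustrative only.
#include <cstdint>
#include <cassert>

uint8_t ShlByteMask(uint32_t shift_amount) {
  uint32_t shift = shift_amount & 7;                   // modulo 8
  uint16_t word = 0xffff;                              // pcmpeqd: all ones
  word = static_cast<uint16_t>(word >> (shift + 8));   // psrlw by shift + 8
  // packuswb saturates each word to a byte; the value already fits, so
  // every byte of the mask ends up as 0xff >> shift.
  return static_cast<uint8_t>(word);
}

uint8_t I8x16ShlVarByteModel(uint8_t lane, uint32_t shift_amount) {
  uint32_t shift = shift_amount & 7;
  uint8_t masked = lane & ShlByteMask(shift_amount);   // pand before shifting
  return static_cast<uint8_t>(static_cast<uint16_t>(masked) << shift);  // psllw
}

int main() {
  assert(ShlByteMask(3) == 0x1f);                 // 0xff >> 3
  assert(I8x16ShlVarByteModel(0xab, 3) == 0x58);  // agrees with the immediate path
  return 0;
}

Masking before the shift lets the run-time mask be 0xff >> shift, a value small enough for packuswb to splat without saturating; the post-shift mask 0xff << shift would saturate in packuswb.
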
@@ -300,8 +300,12 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
void F32x4Splat(XMMRegister dst, DoubleRegister src);
void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I8x16Shl(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
XMMRegister tmp2);
void I8x16Shl(XMMRegister dst, XMMRegister src1, Register src2, Register tmp1,
XMMRegister tmp2, XMMRegister tmp3);
void I8x16ShrS(XMMRegister dst, XMMRegister src1, uint8_t src2,
XMMRegister tmp2);
XMMRegister tmp);
void I8x16ShrS(XMMRegister dst, XMMRegister src1, Register src2,
Register tmp1, XMMRegister tmp2, XMMRegister tmp3);
void I8x16ShrU(XMMRegister dst, XMMRegister src1, uint8_t src2, Register tmp1,
......
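
The I8x16ShrS helper, whose temp parameter is renamed in the hunks above, uses yet another pattern, since there is no 8-bit arithmetic shift either: the bytes are widened into the high half of 16-bit lanes with punpckhbw/punpcklbw, shifted arithmetically by (shift + 8), and repacked with packsswb. A scalar sketch of the per-byte effect (illustrative naming; relies on arithmetic right shift of negative values, which mainstream compilers provide):

// A scalar model of I8x16ShrS: place the byte in the high half of a 16-bit
// lane, do an arithmetic word shift by shift + 8, and pack back down.
// Illustrative only.
#include <cstdint>
#include <cassert>

int8_t I8x16ShrSByteModel(int8_t lane, uint8_t shift_amount) {
  int shift = shift_amount & 7;  // wasm shift counts are taken mod 8
  // punpck*bw puts the source byte in the high byte of the word; the low
  // byte is don't-care because the following shift pushes it out entirely.
  uint16_t widened = static_cast<uint16_t>(static_cast<uint8_t>(lane)) << 8;
  int16_t shifted =
      static_cast<int16_t>(static_cast<int16_t>(widened) >> (shift + 8));
  // packsswb saturates to [-128, 127]; the shifted value already fits, so
  // this is just a narrowing here.
  return static_cast<int8_t>(shifted);
}

int main() {
  assert(I8x16ShrSByteModel(-128, 3) == -16);  // sign bits are replicated
  assert(I8x16ShrSByteModel(127, 7) == 0);
  return 0;
}
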
@@ -3111,37 +3111,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32I8x16Shl: {
XMMRegister dst = i.OutputSimd128Register();
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
Register tmp = i.ToRegister(instr->TempAt(0));
Register tmp = i.TempRegister(0);
XMMRegister tmp_simd = i.TempSimd128Register(1);
if (HasImmediateInput(instr, 1)) {
// Perform 16-bit shift, then mask away low bits.
uint8_t shift = i.InputInt3(1);
__ Psllw(dst, dst, byte{shift});
uint8_t bmask = static_cast<uint8_t>(0xff << shift);
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ mov(tmp, mask);
__ Movd(tmp_simd, tmp);
__ Pshufd(tmp_simd, tmp_simd, uint8_t{0});
__ Pand(dst, tmp_simd);
__ I8x16Shl(dst, i.InputSimd128Register(0), i.InputInt3(1), tmp,
kScratchDoubleReg);
} else {
// Take shift value modulo 8.
__ mov(tmp, i.InputRegister(1));
__ and_(tmp, 7);
// Mask off the unwanted bits before word-shifting.
__ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
__ add(tmp, Immediate(8));
__ Movd(tmp_simd, tmp);
__ Psrlw(kScratchDoubleReg, kScratchDoubleReg, tmp_simd);
__ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
__ Pand(dst, kScratchDoubleReg);
// TODO(zhin): sub here to avoid asking for another temporary register,
// examine codegen for other i8x16 shifts, they use less instructions.
__ sub(tmp, Immediate(8));
__ Movd(tmp_simd, tmp);
__ Psllw(dst, dst, tmp_simd);
__ I8x16Shl(dst, i.InputSimd128Register(0), i.InputRegister(1), tmp,
kScratchDoubleReg, tmp_simd);
}
break;
}
......
@@ -3492,37 +3492,17 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64I8x16Shl: {
XMMRegister dst = i.OutputSimd128Register();
// TODO(zhin): remove this restriction from instruction-selector.
DCHECK_EQ(dst, i.InputSimd128Register(0));
// Temp registers for shift mask and additional moves to XMM registers.
Register tmp = i.ToRegister(instr->TempAt(0));
Register tmp = i.TempRegister(0);
XMMRegister tmp_simd = i.TempSimd128Register(1);
if (HasImmediateInput(instr, 1)) {
// Perform 16-bit shift, then mask away low bits.
uint8_t shift = i.InputInt3(1);
__ Psllw(dst, byte{shift});
uint8_t bmask = static_cast<uint8_t>(0xff << shift);
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ movl(tmp, Immediate(mask));
__ Movd(tmp_simd, tmp);
__ Pshufd(tmp_simd, tmp_simd, uint8_t{0});
__ Pand(dst, tmp_simd);
__ I8x16Shl(dst, i.InputSimd128Register(0), i.InputInt3(1), tmp,
kScratchDoubleReg);
} else {
// Mask off the unwanted bits before word-shifting.
__ Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
// Take shift value modulo 8.
__ movq(tmp, i.InputRegister(1));
__ andq(tmp, Immediate(7));
__ addq(tmp, Immediate(8));
__ Movq(tmp_simd, tmp);
__ Psrlw(kScratchDoubleReg, tmp_simd);
__ Packuswb(kScratchDoubleReg, kScratchDoubleReg);
__ Pand(dst, kScratchDoubleReg);
// TODO(zhin): subq here to avoid asking for another temporary register,
// examine codegen for other i8x16 shifts, they use less instructions.
__ subq(tmp, Immediate(8));
__ Movq(tmp_simd, tmp);
__ Psllw(dst, tmp_simd);
__ I8x16Shl(dst, i.InputSimd128Register(0), i.InputRegister(1), tmp,
kScratchDoubleReg, tmp_simd);
}
break;
}
......
@@ -3332,51 +3332,17 @@ void LiftoffAssembler::emit_i8x16_bitmask(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_rc = reg_class_for(kI32);
static constexpr RegClass tmp_simd_rc = reg_class_for(kS128);
LiftoffRegister tmp = GetUnusedRegister(tmp_rc, LiftoffRegList::ForRegs(rhs));
LiftoffRegister tmp = GetUnusedRegister(kGpReg, LiftoffRegList::ForRegs(rhs));
LiftoffRegister tmp_simd =
GetUnusedRegister(tmp_simd_rc, LiftoffRegList::ForRegs(dst, lhs));
// Mask off the unwanted bits before word-shifting.
Pcmpeqw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
mov(tmp.gp(), rhs.gp());
and_(tmp.gp(), Immediate(7));
add(tmp.gp(), Immediate(8));
Movd(tmp_simd.fp(), tmp.gp());
Psrlw(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, tmp_simd.fp());
Packuswb(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpand(dst.fp(), lhs.fp(), liftoff::kScratchDoubleReg);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
andps(dst.fp(), liftoff::kScratchDoubleReg);
}
sub(tmp.gp(), Immediate(8));
Movd(tmp_simd.fp(), tmp.gp());
Psllw(dst.fp(), dst.fp(), tmp_simd.fp());
GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs));
I8x16Shl(dst.fp(), lhs.fp(), rhs.gp(), tmp.gp(), liftoff::kScratchDoubleReg,
tmp_simd.fp());
}
void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs,
int32_t rhs) {
static constexpr RegClass tmp_rc = reg_class_for(kI32);
LiftoffRegister tmp = GetUnusedRegister(tmp_rc, {});
byte shift = static_cast<byte>(rhs & 0x7);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsllw(dst.fp(), lhs.fp(), shift);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
psllw(dst.fp(), shift);
}
uint8_t bmask = static_cast<uint8_t>(0xff << shift);
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
mov(tmp.gp(), mask);
Movd(liftoff::kScratchDoubleReg, tmp.gp());
Pshufd(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg, uint8_t{0});
Pand(dst.fp(), liftoff::kScratchDoubleReg);
LiftoffRegister tmp = GetUnusedRegister(kGpReg, {});
I8x16Shl(dst.fp(), lhs.fp(), rhs, tmp.gp(), liftoff::kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst,
......
@@ -2904,47 +2904,13 @@ void LiftoffAssembler::emit_i8x16_bitmask(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs) {
static constexpr RegClass tmp_simd_rc = reg_class_for(kS128);
LiftoffRegister tmp_simd =
GetUnusedRegister(tmp_simd_rc, LiftoffRegList::ForRegs(dst, lhs));
// Mask off the unwanted bits before word-shifting.
Pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
movq(kScratchRegister, rhs.gp());
andq(kScratchRegister, Immediate(7));
addq(kScratchRegister, Immediate(8));
Movq(tmp_simd.fp(), kScratchRegister);
Psrlw(kScratchDoubleReg, tmp_simd.fp());
Packuswb(kScratchDoubleReg, kScratchDoubleReg);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpand(dst.fp(), lhs.fp(), kScratchDoubleReg);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
andps(dst.fp(), kScratchDoubleReg);
}
subq(kScratchRegister, Immediate(8));
Movq(tmp_simd.fp(), kScratchRegister);
Psllw(dst.fp(), tmp_simd.fp());
I8x16Shl(dst.fp(), lhs.fp(), rhs.gp(), kScratchRegister, kScratchDoubleReg,
liftoff::kScratchDoubleReg2);
}
void LiftoffAssembler::emit_i8x16_shli(LiftoffRegister dst, LiftoffRegister lhs,
int32_t rhs) {
byte shift = static_cast<byte>(rhs & 0x7);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsllw(dst.fp(), lhs.fp(), shift);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
psllw(dst.fp(), shift);
}
uint8_t bmask = static_cast<uint8_t>(0xff << shift);
uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
movl(kScratchRegister, Immediate(mask));
Movd(kScratchDoubleReg, kScratchRegister);
Pshufd(kScratchDoubleReg, kScratchDoubleReg, uint8_t{0});
Pand(dst.fp(), kScratchDoubleReg);
I8x16Shl(dst.fp(), lhs.fp(), rhs, kScratchRegister, kScratchDoubleReg);
}
void LiftoffAssembler::emit_i8x16_shr_s(LiftoffRegister dst,
......