Commit 5f4b0e47 authored by Zhi An Ng's avatar Zhi An Ng Committed by Commit Bot

[wasm-simd][x64] Fix definition of Shufps

The definition of Shufps is wrong, we are incorrectly passing 0 as the
immediate in all cases. No tests broke because we only used Shufps for
splats, which has imm8 == 0 anyway.

Also, it was using movss, which only moves a single 32-bit. Because we
were using it only for f32x4 splat, this ended up being enough (imm8 ==
0 meant that we only shuffled the low 32-bit). This is fixed to use
movaps, which moves the entire 128-bit register.

Also tweak the definition of Shufps to take 4 arguments. `vshufps dst,
src1, src2, imm8` shuffles src1 and src2 into dst. `shufps dst, src,
imm8`, shuffles dst and src into dst.

So `Shufps(dst, src, imm8)` is ambiguous in the AVX case, it could be:
1. vshufps(dst, src, src, imm8), or
2. vshufps(dst, dst, src, imm8)

2. is more likely to be the intended behavior, but it introduces a false
dependency on the value of dst.

With `Shufps(dst, src1, src2, imm8)`, it is clearer what the behavior
should be:
1. shufps(dst, src2, imm8) matches the AVX behavior IFF dst == src1.

Change-Id: I60dc4ec868023d28d00f2b09d2c53b82a729bc4d
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2591849Reviewed-by: 's avatarClemens Backes <clemensb@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71775}
parent 28740a36
...@@ -1756,15 +1756,16 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1, ...@@ -1756,15 +1756,16 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
} }
} }
void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src, byte imm8) { void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
byte imm8) {
if (CpuFeatures::IsSupported(AVX)) { if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX); CpuFeatureScope avx_scope(this, AVX);
vshufps(dst, src, src, imm8); vshufps(dst, src1, src2, imm8);
} else { } else {
if (dst != src) { if (dst != src1) {
movss(dst, src); movaps(dst, src1);
} }
shufps(dst, src, static_cast<byte>(0)); shufps(dst, src2, imm8);
} }
} }
......
...@@ -526,8 +526,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -526,8 +526,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2); void Pmaddwd(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2); void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// Shufps that will mov src into dst if AVX is not supported. // Shufps that will mov src1 into dst if AVX is not supported.
void Shufps(XMMRegister dst, XMMRegister src, byte imm8); void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8);
// Non-SSE2 instructions. // Non-SSE2 instructions.
void Pextrd(Register dst, XMMRegister src, uint8_t imm8); void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
......
...@@ -2572,7 +2572,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2572,7 +2572,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64F32x4Splat: { case kX64F32x4Splat: {
__ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0), 0); __ Shufps(i.OutputSimd128Register(), i.InputDoubleRegister(0),
i.InputDoubleRegister(0), 0);
break; break;
} }
case kX64F32x4ExtractLane: { case kX64F32x4ExtractLane: {
...@@ -3912,7 +3913,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3912,7 +3913,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
uint8_t mask = i.InputUint8(1); uint8_t mask = i.InputUint8(1);
if (dst == src) { if (dst == src) {
// 1-byte shorter encoding than pshufd. // 1-byte shorter encoding than pshufd.
__ Shufps(dst, src, mask); __ Shufps(dst, src, src, mask);
} else { } else {
__ Pshufd(dst, src, mask); __ Pshufd(dst, src, mask);
} }
......
...@@ -2475,7 +2475,7 @@ void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst, ...@@ -2475,7 +2475,7 @@ void LiftoffAssembler::emit_i64x2_splat(LiftoffRegister dst,
void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst, void LiftoffAssembler::emit_f32x4_splat(LiftoffRegister dst,
LiftoffRegister src) { LiftoffRegister src) {
Shufps(dst.fp(), src.fp(), 0); Shufps(dst.fp(), src.fp(), src.fp(), 0);
} }
void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst, void LiftoffAssembler::emit_f64x2_splat(LiftoffRegister dst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment