Commit d8ce100f authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64][ia32] Factor f32x4.extract_lane into shared code

Bug: v8:11589
Change-Id: I90a0c9f8325eb56c607addf1adde60673dfbc9c7
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2840688
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#74076}
parent 5067f049
@@ -1966,19 +1966,6 @@ void TurboAssembler::Vbroadcastss(XMMRegister dst, Operand src) {
   shufps(dst, dst, static_cast<byte>(0));
 }
 
-void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
-                            uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vshufps(dst, src1, src2, imm8);
-  } else {
-    if (dst != src1) {
-      movaps(dst, src1);
-    }
-    shufps(dst, src2, imm8);
-  }
-}
-
 void TurboAssembler::Lzcnt(Register dst, Operand src) {
   if (CpuFeatures::IsSupported(LZCNT)) {
     CpuFeatureScope scope(this, LZCNT);
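A note on the wrapper being moved here: SSE shufps is destructive (the destination register doubles as the first source), which is why the non-AVX path must copy src1 into dst first, while AVX vshufps takes a separate destination and needs no copy. Below is a minimal host-side sketch of the shuffle semantics using SSE intrinsics; the standalone program and sample values are illustrative, not part of the commit.

#include <xmmintrin.h>
#include <cstdio>

int main() {
  __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);  // lanes hold 0..3
  __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 7.f);  // lanes hold 4..7
  // shufps selects result lanes 0-1 from the first operand and lanes 2-3
  // from the second, two index bits per lane: 0x4E -> {a[2], a[3], b[0], b[1]}.
  __m128 r = _mm_shuffle_ps(a, b, 0x4E);
  float out[4];
  _mm_storeu_ps(out, r);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 2 3 4 5
  return 0;
}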
@@ -364,10 +364,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
   void Pinsrw(XMMRegister dst, XMMRegister src1, Operand src2, int8_t imm8);
   void Vbroadcastss(XMMRegister dst, Operand src);
 
-  // Shufps that will mov src1 into dst if AVX is not supported.
-  void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
-              uint8_t imm8);
-
   // Expression support
   // cvtsi2sd instruction only writes to the low 64-bit of dst register, which
   // hinders register renaming and makes dependence chains longer. So we use
@@ -29,6 +29,19 @@ void SharedTurboAssembler::Movapd(XMMRegister dst, XMMRegister src) {
   }
 }
 
+void SharedTurboAssembler::Shufps(XMMRegister dst, XMMRegister src1,
+                                  XMMRegister src2, uint8_t imm8) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope avx_scope(this, AVX);
+    vshufps(dst, src1, src2, imm8);
+  } else {
+    if (dst != src1) {
+      movaps(dst, src1);
+    }
+    shufps(dst, src2, imm8);
+  }
+}
+
 void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
                                             uint8_t lane) {
   if (lane == 0) {
@@ -64,6 +77,27 @@ void SharedTurboAssembler::F32x4Splat(XMMRegister dst, DoubleRegister src) {
   }
 }
 
+void SharedTurboAssembler::F32x4ExtractLane(FloatRegister dst, XMMRegister src,
+                                            uint8_t lane) {
+  DCHECK_LT(lane, 4);
+  // These instructions are shorter than insertps, but will leave junk in
+  // the top lanes of dst.
+  if (lane == 0) {
+    if (dst != src) {
+      Movaps(dst, src);
+    }
+  } else if (lane == 1) {
+    Movshdup(dst, src);
+  } else if (lane == 2 && dst == src) {
+    // Check dst == src to avoid false dependency on dst.
+    Movhlps(dst, src);
+  } else if (dst == src) {
+    Shufps(dst, src, src, lane);
+  } else {
+    Pshufd(dst, src, lane);
+  }
+}
+
 void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
                                            uint8_t laneidx) {
   if (laneidx == 0) {
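The helper picks the cheapest encoding per lane: lane 0 is a plain register move (or nothing), movshdup duplicates the odd lanes into the even ones so lane 1 lands in lane 0, and movhlps copies the high 64 bits into the low half. Since movhlps leaves the upper half of dst untouched, it would create a false dependency on dst's old value whenever dst != src, hence the dst == src guard; pshufd, which writes all of dst, is used otherwise. Below is a small intrinsics sketch of the same tricks; it is illustrative only (assumes SSE3 for movshdup) and not code from the commit.

#include <pmmintrin.h>  // SSE3: _mm_movehdup_ps; pulls in SSE/SSE2 headers
#include <cstdio>

static float Lane0(__m128 v) { return _mm_cvtss_f32(v); }

int main() {
  __m128 src = _mm_setr_ps(10.f, 11.f, 12.f, 13.f);
  float l0 = Lane0(src);                          // lane 0: plain move
  float l1 = Lane0(_mm_movehdup_ps(src));         // movshdup: lane 1 -> lane 0
  float l2 = Lane0(_mm_movehl_ps(src, src));      // movhlps: lane 2 -> lane 0
  float l3 = Lane0(_mm_shuffle_ps(src, src, 3));  // shufps: lane 3 -> lane 0
  std::printf("%g %g %g %g\n", l0, l1, l2, l3);   // 10 11 12 13
  return 0;
}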
@@ -39,6 +39,10 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
     }
   }
 
+  // Shufps that will mov src1 into dst if AVX is not supported.
+  void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
+              uint8_t imm8);
+
   // Helper struct to implement functions that check for AVX support and
   // dispatch to the appropriate AVX/SSE instruction.
   template <typename Dst, typename Arg, typename... Args>
@@ -273,6 +277,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
   void F32x4Splat(XMMRegister dst, DoubleRegister src);
+  void F32x4ExtractLane(FloatRegister dst, XMMRegister src, uint8_t lane);
   void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
   void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                       XMMRegister scrat, bool is_signed);
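For context on why a declaration here is enough: both the ia32 and x64 TurboAssemblers derive from SharedTurboAssembler, so a helper implemented once against the instruction subset common to the two ports becomes available to every call site below (the TurboFan code generators and Liftoff). A minimal sketch of the pattern, with hypothetical names rather than V8's real class layout:

#include <cstdint>
#include <cstdio>

struct SharedAssembler {
  // Stands in for SharedTurboAssembler::F32x4ExtractLane; uses only
  // instructions available on both ports.
  void F32x4ExtractLane(int dst, int src, uint8_t lane) {
    std::printf("extract lane %u: xmm%d <- xmm%d\n", lane, dst, src);
  }
};

struct IA32Assembler : SharedAssembler {};  // ia32-only helpers would go here
struct X64Assembler : SharedAssembler {};   // x64-only helpers would go here

int main() {
  IA32Assembler ia32;
  X64Assembler x64;
  ia32.F32x4ExtractLane(0, 1, 2);  // same shared implementation...
  x64.F32x4ExtractLane(0, 1, 2);   // ...on both ports
  return 0;
}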
@@ -1886,19 +1886,6 @@ void TurboAssembler::Pmaddubsw(XMMRegister dst, XMMRegister src1,
   }
 }
 
-void TurboAssembler::Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2,
-                            byte imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope avx_scope(this, AVX);
-    vshufps(dst, src1, src2, imm8);
-  } else {
-    if (dst != src1) {
-      movaps(dst, src1);
-    }
-    shufps(dst, src2, imm8);
-  }
-}
-
 void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
   if (imm8 == 0) {
     Movd(dst, src);
@@ -379,9 +379,6 @@ class V8_EXPORT_PRIVATE TurboAssembler : public SharedTurboAssembler {
   void Pmaddubsw(XMMRegister dst, XMMRegister src1, Operand src2);
   void Pmaddubsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
 
-  // Shufps that will mov src1 into dst if AVX is not supported.
-  void Shufps(XMMRegister dst, XMMRegister src1, XMMRegister src2, byte imm8);
-
   // Non-SSE2 instructions.
   void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
@@ -2278,26 +2278,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kIA32F32x4ExtractLane: {
-      XMMRegister dst = i.OutputFloatRegister();
-      XMMRegister src = i.InputSimd128Register(0);
-      uint8_t lane = i.InputUint8(1);
-      DCHECK_LT(lane, 4);
-      // These instructions are shorter than insertps, but will leave junk in
-      // the top lanes of dst.
-      if (lane == 0) {
-        if (dst != src) {
-          __ Movaps(dst, src);
-        }
-      } else if (lane == 1) {
-        __ Movshdup(dst, src);
-      } else if (lane == 2 && dst == src) {
-        // Check dst == src to avoid false dependency on dst.
-        __ Movhlps(dst, src);
-      } else if (dst == src) {
-        __ Shufps(dst, src, src, lane);
-      } else {
-        __ Pshufd(dst, src, lane);
-      }
+      __ F32x4ExtractLane(i.OutputFloatRegister(), i.InputSimd128Register(0),
+                          i.InputUint8(1));
       break;
     }
     case kIA32Insertps: {
@@ -2525,24 +2525,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      break;
    }
    case kX64F32x4ExtractLane: {
-      XMMRegister dst = i.OutputDoubleRegister();
-      XMMRegister src = i.InputSimd128Register(0);
-      uint8_t lane = i.InputUint8(1);
-      DCHECK_LT(lane, 4);
-      // These instructions are shorter than insertps, but will leave junk in
-      // the top lanes of dst.
-      if (lane == 0) {
-        __ Move(dst, src);
-      } else if (lane == 1) {
-        __ Movshdup(dst, src);
-      } else if (lane == 2 && dst == src) {
-        // Check dst == src to avoid false dependency on dst.
-        __ Movhlps(dst, src);
-      } else if (dst == src) {
-        __ Shufps(dst, src, src, lane);
-      } else {
-        __ Pshufd(dst, src, lane);
-      }
+      __ F32x4ExtractLane(i.OutputFloatRegister(), i.InputSimd128Register(0),
+                          i.InputUint8(1));
       break;
     }
     case kX64F32x4ReplaceLane: {
@@ -4605,13 +4605,7 @@ void LiftoffAssembler::emit_i64x2_extract_lane(LiftoffRegister dst,
 void LiftoffAssembler::emit_f32x4_extract_lane(LiftoffRegister dst,
                                                LiftoffRegister lhs,
                                                uint8_t imm_lane_idx) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vshufps(dst.fp(), lhs.fp(), lhs.fp(), imm_lane_idx);
-  } else {
-    if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
-    if (imm_lane_idx != 0) shufps(dst.fp(), dst.fp(), imm_lane_idx);
-  }
+  F32x4ExtractLane(dst.fp(), lhs.fp(), imm_lane_idx);
 }
 
 void LiftoffAssembler::emit_f64x2_extract_lane(LiftoffRegister dst,
@@ -4146,13 +4146,7 @@ void LiftoffAssembler::emit_i64x2_extract_lane(LiftoffRegister dst,
 void LiftoffAssembler::emit_f32x4_extract_lane(LiftoffRegister dst,
                                                LiftoffRegister lhs,
                                                uint8_t imm_lane_idx) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vshufps(dst.fp(), lhs.fp(), lhs.fp(), imm_lane_idx);
-  } else {
-    if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
-    if (imm_lane_idx != 0) shufps(dst.fp(), dst.fp(), imm_lane_idx);
-  }
+  F32x4ExtractLane(dst.fp(), lhs.fp(), imm_lane_idx);
 }
 
 void LiftoffAssembler::emit_f64x2_extract_lane(LiftoffRegister dst,