Commit b824d853 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64][ia32] Factor f64x2.replace_lane into shared code

This pblendw/movlhps combination has lower latency and requires fewer
uops than pinsrq (1 vs. 2).

Bug: v8:11589
Change-Id: I770b0c20a286774afefbac5ef0adffe463318f21
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2828871
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#74049}
parent 090431b0
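
For illustration only (this sketch is not part of the commit): the shared F64x2ReplaceLane helper added below corresponds, roughly, to the following SSE intrinsics. The ReplaceLane0/ReplaceLane1 names are hypothetical, and the code assumes a compiler with SSE4.1 enabled.

#include <emmintrin.h>  // SSE2 cast intrinsics
#include <smmintrin.h>  // SSE4.1: _mm_blend_epi16

// Lane 0: a word blend with mask 0x0F copies rep's low 64 bits (four 16-bit
// words) into the low half of src, i.e. replaces lane 0.
static __m128d ReplaceLane0(__m128d src, __m128d rep) {
  __m128i blended = _mm_blend_epi16(_mm_castpd_si128(src),
                                    _mm_castpd_si128(rep), 0x0F);
  return _mm_castsi128_pd(blended);
}

// Lane 1: movlhps copies rep's low 64 bits into the high half of src,
// i.e. replaces lane 1.
static __m128d ReplaceLane1(__m128d src, __m128d rep) {
  return _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(src), _mm_castpd_ps(rep)));
}
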
@@ -2488,6 +2488,13 @@ void Assembler::movhlps(XMMRegister dst, XMMRegister src) {
emit_sse_operand(dst, src);
}
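// movlhps (NP 0F 16 /r): copies the low 64 bits (two packed floats) of src
// into the high 64 bits of dst; the low 64 bits of dst are unchanged.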
void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
EMIT(0x16);
emit_sse_operand(dst, src);
}
void Assembler::movlps(XMMRegister dst, Operand src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
@@ -2979,6 +2986,10 @@ void Assembler::vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}
void Assembler::vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG);
}
void Assembler::vmovlps(XMMRegister dst, XMMRegister src1, Operand src2) {
vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}
@@ -868,6 +868,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void shufpd(XMMRegister dst, XMMRegister src, byte imm8);
void movhlps(XMMRegister dst, XMMRegister src);
void movlhps(XMMRegister dst, XMMRegister src);
void movlps(XMMRegister dst, Operand src);
void movlps(Operand dst, XMMRegister src);
void movhps(XMMRegister dst, Operand src);
@@ -1398,6 +1399,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vshufpd(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8);
void vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vmovlps(XMMRegister dst, XMMRegister src1, Operand src2);
void vmovlps(Operand dst, XMMRegister src);
void vmovhps(XMMRegister dst, XMMRegister src1, Operand src2);
@@ -47,6 +47,27 @@ void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
}
}
void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
DoubleRegister rep, uint8_t lane) {
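// Lane 0: blend rep's low 64 bits (four 16-bit words) into the low half of
// dst. Lane 1: movlhps copies rep's low 64 bits into the high half of dst.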
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
if (lane == 0) {
vpblendw(dst, src, rep, 0b00001111);
} else {
vmovlhps(dst, src, rep);
}
} else {
CpuFeatureScope scope(this, SSE4_1);
DCHECK_NE(dst, rep);
if (dst != src) movaps(dst, src);
if (lane == 0) {
pblendw(dst, rep, 0b00001111);
} else {
movlhps(dst, rep);
}
}
}
void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
uint8_t laneidx) {
if (laneidx == 0) {
@@ -272,6 +272,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP_SSE4_1(Roundps, roundps)
void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep,
uint8_t lane);
void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
XMMRegister scrat, bool is_signed);
@@ -1874,43 +1874,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputUint8(1));
break;
}
case kSSEF64x2ReplaceLane: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister dst = i.OutputSimd128Register();
int8_t lane = i.InputInt8(1);
DoubleRegister rep = i.InputDoubleRegister(2);
// insertps takes a mask which contains (high to low):
// - 2 bit specifying source float element to copy
// - 2 bit specifying destination float element to write to
// - 4 bits specifying which elements of the destination to zero
DCHECK_LT(lane, 2);
if (lane == 0) {
__ insertps(dst, rep, 0b00000000);
__ insertps(dst, rep, 0b01010000);
} else {
__ insertps(dst, rep, 0b00100000);
__ insertps(dst, rep, 0b01110000);
}
break;
}
case kAVXF64x2ReplaceLane: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
int8_t lane = i.InputInt8(1);
DoubleRegister rep = i.InputDoubleRegister(2);
DCHECK_NE(dst, rep);
DCHECK_LT(lane, 2);
if (lane == 0) {
__ vinsertps(dst, src, rep, 0b00000000);
__ vinsertps(dst, dst, rep, 0b01010000);
} else {
__ vinsertps(dst, src, rep, 0b00100000);
__ vinsertps(dst, dst, rep, 0b01110000);
}
case kF64x2ReplaceLane: {
__ F64x2ReplaceLane(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputDoubleRegister(2), i.InputInt8(1));
break;
}
case kIA32F64x2Sqrt: {
@@ -118,8 +118,7 @@ namespace compiler {
V(IA32Peek) \
V(IA32F64x2Splat) \
V(F64x2ExtractLane) \
V(SSEF64x2ReplaceLane) \
V(AVXF64x2ReplaceLane) \
V(F64x2ReplaceLane) \
V(IA32F64x2Sqrt) \
V(IA32F64x2Add) \
V(IA32F64x2Sub) \
@@ -103,8 +103,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32BitcastIF:
case kIA32F64x2Splat:
case kF64x2ExtractLane:
case kSSEF64x2ReplaceLane:
case kAVXF64x2ReplaceLane:
case kF64x2ReplaceLane:
case kIA32F64x2Sqrt:
case kIA32F64x2Add:
case kIA32F64x2Sub:
@@ -2559,13 +2559,9 @@ SIMD_REPLACE_LANE_TYPE_OP(VISIT_SIMD_REPLACE_LANE)
InstructionOperand operand1 = \
g.UseImmediate(OpParameter<int32_t>(node->op())); \
InstructionOperand operand2 = g.UseUniqueRegister(node->InputAt(1)); \
if (IsSupported(AVX)) { \
Emit(kAVX##Type##ReplaceLane, g.DefineAsRegister(node), operand0, \
operand1, operand2); \
} else { \
Emit(kSSE##Type##ReplaceLane, g.DefineSameAsFirst(node), operand0, \
operand1, operand2); \
} \
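/* The AVX forms are non-destructive (three operands), so the result may */ \
/* be defined in a fresh register; the SSE forms overwrite their first */ \
/* input, so the output must reuse the register of input 0. */ \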
InstructionOperand dst = IsSupported(AVX) ? g.DefineAsRegister(node) \
: g.DefineSameAsFirst(node); \
Emit(k##Type##ReplaceLane, dst, operand0, operand1, operand2); \
}
VISIT_SIMD_REPLACE_LANE_USE_REG(F64x2)
#undef VISIT_SIMD_REPLACE_LANE_USE_REG
@@ -2394,6 +2394,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputUint8(1));
break;
}
case kX64F64x2ReplaceLane: {
__ F64x2ReplaceLane(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputDoubleRegister(2), i.InputInt8(1));
break;
}
case kX64F64x2Sqrt: {
__ Sqrtpd(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
@@ -156,6 +156,7 @@ namespace compiler {
V(X64Peek) \
V(X64F64x2Splat) \
V(X64F64x2ExtractLane) \
V(X64F64x2ReplaceLane) \
V(X64F64x2Abs) \
V(X64F64x2Neg) \
V(X64F64x2Sqrt) \
@@ -132,6 +132,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64Pinsrq:
case kX64F64x2Splat:
case kX64F64x2ExtractLane:
case kX64F64x2ReplaceLane:
case kX64F64x2Abs:
case kX64F64x2Neg:
case kX64F64x2Sqrt:
@@ -3070,7 +3070,7 @@ void InstructionSelector::VisitF32x4ReplaceLane(Node* node) {
}
#define SIMD_TYPES_FOR_REPLACE_LANE(V) \
V(F64x2, kX64Pinsrq) \
V(F64x2, kX64F64x2ReplaceLane) \
V(I64x2, kX64Pinsrq) \
V(I32x4, kX64Pinsrd) \
V(I16x8, kX64Pinsrw) \
@@ -4712,27 +4712,7 @@ void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
// TODO(fanchenk): Use movlhps and blendpd
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
if (imm_lane_idx == 0) {
vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00000000);
vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01010000);
} else {
vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00100000);
vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01110000);
}
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
if (imm_lane_idx == 0) {
insertps(dst.fp(), src2.fp(), 0b00000000);
insertps(dst.fp(), src2.fp(), 0b01010000);
} else {
insertps(dst.fp(), src2.fp(), 0b00100000);
insertps(dst.fp(), src2.fp(), 0b01110000);
}
}
F64x2ReplaceLane(dst.fp(), src1.fp(), src2.fp(), imm_lane_idx);
}
void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
@@ -4242,22 +4242,7 @@ void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
uint8_t imm_lane_idx) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
if (imm_lane_idx == 0) {
vpblendw(dst.fp(), src1.fp(), src2.fp(), 0b00001111);
} else {
vmovlhps(dst.fp(), src1.fp(), src2.fp());
}
} else {
CpuFeatureScope scope(this, SSE4_1);
if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
if (imm_lane_idx == 0) {
pblendw(dst.fp(), src2.fp(), 0b00001111);
} else {
movlhps(dst.fp(), src2.fp());
}
}
F64x2ReplaceLane(dst.fp(), src1.fp(), src2.fp(), imm_lane_idx);
}
void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {