Commit b824d853 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64][ia32] Factor f64x2.replace_lane into shared code

This pblendw/movlhps combination has lower latency and requires fewer
uops than pinsrq (1 vs. 2).
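
For reference, a minimal sketch of the lane semantics behind the chosen sequences, written with SSE intrinsics rather than the V8 assembler; the helper name replace_lane and the use of intrinsics are illustrative only and not part of this CL. pblendw with mask 0b00001111 copies the low 64 bits (lane 0) from the replacement, and movlhps copies the replacement's low 64 bits into the high lane (lane 1).

#include <smmintrin.h>  // SSE4.1: _mm_blend_epi16 (pblendw); needs -msse4.1

// f64x2.replace_lane: overwrite one 64-bit lane of |v| with scalar |x|.
static __m128d replace_lane(__m128d v, int lane, double x) {
  __m128d rep = _mm_set_sd(x);  // {x, 0.0}
  if (lane == 0) {
    // pblendw v, rep, 0b00001111: take the low four 16-bit words (the low
    // 64 bits, i.e. lane 0) from rep, keep the high lane of v.
    return _mm_castsi128_pd(_mm_blend_epi16(
        _mm_castpd_si128(v), _mm_castpd_si128(rep), 0b00001111));
  }
  // movlhps v, rep: high 64 bits of v <- low 64 bits of rep (lane 1).
  return _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(v), _mm_castpd_ps(rep)));
}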

Bug: v8:11589
Change-Id: I770b0c20a286774afefbac5ef0adffe463318f21
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2828871
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#74049}
parent 090431b0
@@ -2488,6 +2488,13 @@ void Assembler::movhlps(XMMRegister dst, XMMRegister src) {
   emit_sse_operand(dst, src);
 }
+void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
+  EnsureSpace ensure_space(this);
+  EMIT(0x0F);
+  EMIT(0x16);
+  emit_sse_operand(dst, src);
+}
 void Assembler::movlps(XMMRegister dst, Operand src) {
   EnsureSpace ensure_space(this);
   EMIT(0x0F);
@@ -2979,6 +2986,10 @@ void Assembler::vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
   vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
 }
+void Assembler::vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
+  vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG);
+}
 void Assembler::vmovlps(XMMRegister dst, XMMRegister src1, Operand src2) {
   vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
 }
...
@@ -868,6 +868,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void shufpd(XMMRegister dst, XMMRegister src, byte imm8);
   void movhlps(XMMRegister dst, XMMRegister src);
+  void movlhps(XMMRegister dst, XMMRegister src);
   void movlps(XMMRegister dst, Operand src);
   void movlps(Operand dst, XMMRegister src);
   void movhps(XMMRegister dst, Operand src);
@@ -1398,6 +1399,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
   void vshufpd(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8);
   void vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2);
+  void vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2);
   void vmovlps(XMMRegister dst, XMMRegister src1, Operand src2);
   void vmovlps(Operand dst, XMMRegister src);
   void vmovhps(XMMRegister dst, XMMRegister src1, Operand src2);
...
@@ -47,6 +47,27 @@ void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
   }
 }
+void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
+                                            DoubleRegister rep, uint8_t lane) {
+  if (CpuFeatures::IsSupported(AVX)) {
+    CpuFeatureScope scope(this, AVX);
+    if (lane == 0) {
+      vpblendw(dst, src, rep, 0b00001111);
+    } else {
+      vmovlhps(dst, src, rep);
+    }
+  } else {
+    CpuFeatureScope scope(this, SSE4_1);
+    DCHECK_NE(dst, rep);
+    if (dst != src) movaps(dst, src);
+    if (lane == 0) {
+      pblendw(dst, rep, 0b00001111);
+    } else {
+      movlhps(dst, rep);
+    }
+  }
+}
 void SharedTurboAssembler::S128Store32Lane(Operand dst, XMMRegister src,
                                            uint8_t laneidx) {
   if (laneidx == 0) {
...
@@ -272,6 +272,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
   AVX_OP_SSE4_1(Roundps, roundps)
   void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
+  void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep,
+                        uint8_t lane);
   void S128Store32Lane(Operand dst, XMMRegister src, uint8_t laneidx);
   void I16x8ExtMulLow(XMMRegister dst, XMMRegister src1, XMMRegister src2,
                       XMMRegister scrat, bool is_signed);
...
@@ -1874,43 +1874,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                            i.InputUint8(1));
       break;
     }
-    case kSSEF64x2ReplaceLane: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      CpuFeatureScope sse_scope(tasm(), SSE4_1);
-      XMMRegister dst = i.OutputSimd128Register();
-      int8_t lane = i.InputInt8(1);
-      DoubleRegister rep = i.InputDoubleRegister(2);
-      // insertps takes a mask which contains (high to low):
-      // - 2 bit specifying source float element to copy
-      // - 2 bit specifying destination float element to write to
-      // - 4 bits specifying which elements of the destination to zero
-      DCHECK_LT(lane, 2);
-      if (lane == 0) {
-        __ insertps(dst, rep, 0b00000000);
-        __ insertps(dst, rep, 0b01010000);
-      } else {
-        __ insertps(dst, rep, 0b00100000);
-        __ insertps(dst, rep, 0b01110000);
-      }
-      break;
-    }
-    case kAVXF64x2ReplaceLane: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
-      XMMRegister dst = i.OutputSimd128Register();
-      XMMRegister src = i.InputSimd128Register(0);
-      int8_t lane = i.InputInt8(1);
-      DoubleRegister rep = i.InputDoubleRegister(2);
-      DCHECK_NE(dst, rep);
-      DCHECK_LT(lane, 2);
-      if (lane == 0) {
-        __ vinsertps(dst, src, rep, 0b00000000);
-        __ vinsertps(dst, dst, rep, 0b01010000);
-      } else {
-        __ vinsertps(dst, src, rep, 0b00100000);
-        __ vinsertps(dst, dst, rep, 0b01110000);
-      }
+    case kF64x2ReplaceLane: {
+      __ F64x2ReplaceLane(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                          i.InputDoubleRegister(2), i.InputInt8(1));
       break;
     }
     case kIA32F64x2Sqrt: {
...
@@ -118,8 +118,7 @@ namespace compiler {
   V(IA32Peek) \
   V(IA32F64x2Splat) \
   V(F64x2ExtractLane) \
-  V(SSEF64x2ReplaceLane) \
-  V(AVXF64x2ReplaceLane) \
+  V(F64x2ReplaceLane) \
   V(IA32F64x2Sqrt) \
   V(IA32F64x2Add) \
   V(IA32F64x2Sub) \
...
@@ -103,8 +103,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kIA32BitcastIF:
     case kIA32F64x2Splat:
     case kF64x2ExtractLane:
-    case kSSEF64x2ReplaceLane:
-    case kAVXF64x2ReplaceLane:
+    case kF64x2ReplaceLane:
     case kIA32F64x2Sqrt:
     case kIA32F64x2Add:
     case kIA32F64x2Sub:
...
@@ -2552,20 +2552,16 @@ SIMD_REPLACE_LANE_TYPE_OP(VISIT_SIMD_REPLACE_LANE)
 // operand2 to be UseRegister, because the codegen relies on insertps using
 // registers.
 // TODO(v8:9764) Remove this UseRegister requirement
 #define VISIT_SIMD_REPLACE_LANE_USE_REG(Type) \
   void InstructionSelector::Visit##Type##ReplaceLane(Node* node) { \
     IA32OperandGenerator g(this); \
     InstructionOperand operand0 = g.UseRegister(node->InputAt(0)); \
     InstructionOperand operand1 = \
         g.UseImmediate(OpParameter<int32_t>(node->op())); \
     InstructionOperand operand2 = g.UseUniqueRegister(node->InputAt(1)); \
-    if (IsSupported(AVX)) { \
-      Emit(kAVX##Type##ReplaceLane, g.DefineAsRegister(node), operand0, \
-           operand1, operand2); \
-    } else { \
-      Emit(kSSE##Type##ReplaceLane, g.DefineSameAsFirst(node), operand0, \
-           operand1, operand2); \
-    } \
+    InstructionOperand dst = IsSupported(AVX) ? g.DefineAsRegister(node) \
+                                              : g.DefineSameAsFirst(node); \
+    Emit(k##Type##ReplaceLane, dst, operand0, operand1, operand2); \
   }
 VISIT_SIMD_REPLACE_LANE_USE_REG(F64x2)
 #undef VISIT_SIMD_REPLACE_LANE_USE_REG
...
@@ -2394,6 +2394,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                            i.InputUint8(1));
       break;
     }
+    case kX64F64x2ReplaceLane: {
+      __ F64x2ReplaceLane(i.OutputSimd128Register(), i.InputSimd128Register(0),
+                          i.InputDoubleRegister(2), i.InputInt8(1));
+      break;
+    }
     case kX64F64x2Sqrt: {
       __ Sqrtpd(i.OutputSimd128Register(), i.InputSimd128Register(0));
       break;
...
@@ -156,6 +156,7 @@ namespace compiler {
   V(X64Peek) \
   V(X64F64x2Splat) \
   V(X64F64x2ExtractLane) \
+  V(X64F64x2ReplaceLane) \
   V(X64F64x2Abs) \
   V(X64F64x2Neg) \
   V(X64F64x2Sqrt) \
...
@@ -132,6 +132,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kX64Pinsrq:
     case kX64F64x2Splat:
     case kX64F64x2ExtractLane:
+    case kX64F64x2ReplaceLane:
     case kX64F64x2Abs:
     case kX64F64x2Neg:
     case kX64F64x2Sqrt:
...
@@ -3070,7 +3070,7 @@ void InstructionSelector::VisitF32x4ReplaceLane(Node* node) {
 }
 #define SIMD_TYPES_FOR_REPLACE_LANE(V) \
-  V(F64x2, kX64Pinsrq) \
+  V(F64x2, kX64F64x2ReplaceLane) \
   V(I64x2, kX64Pinsrq) \
   V(I32x4, kX64Pinsrd) \
   V(I16x8, kX64Pinsrw) \
...
@@ -4712,27 +4712,7 @@ void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
                                                LiftoffRegister src1,
                                                LiftoffRegister src2,
                                                uint8_t imm_lane_idx) {
-  // TODO(fanchenk): Use movlhps and blendpd
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    if (imm_lane_idx == 0) {
-      vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00000000);
-      vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01010000);
-    } else {
-      vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00100000);
-      vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01110000);
-    }
-  } else {
-    CpuFeatureScope scope(this, SSE4_1);
-    if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
-    if (imm_lane_idx == 0) {
-      insertps(dst.fp(), src2.fp(), 0b00000000);
-      insertps(dst.fp(), src2.fp(), 0b01010000);
-    } else {
-      insertps(dst.fp(), src2.fp(), 0b00100000);
-      insertps(dst.fp(), src2.fp(), 0b01110000);
-    }
-  }
+  F64x2ReplaceLane(dst.fp(), src1.fp(), src2.fp(), imm_lane_idx);
 }
 void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
...
@@ -4242,22 +4242,7 @@ void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
                                                LiftoffRegister src1,
                                                LiftoffRegister src2,
                                                uint8_t imm_lane_idx) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    if (imm_lane_idx == 0) {
-      vpblendw(dst.fp(), src1.fp(), src2.fp(), 0b00001111);
-    } else {
-      vmovlhps(dst.fp(), src1.fp(), src2.fp());
-    }
-  } else {
-    CpuFeatureScope scope(this, SSE4_1);
-    if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
-    if (imm_lane_idx == 0) {
-      pblendw(dst.fp(), src2.fp(), 0b00001111);
-    } else {
-      movlhps(dst.fp(), src2.fp());
-    }
-  }
+  F64x2ReplaceLane(dst.fp(), src1.fp(), src2.fp(), imm_lane_idx);
 }
 void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
...