Commit f9a2435a authored by Ng Zhi An, committed by V8 LUCI CQ

Reland "[wasm-simd][x64][ia32] Factor f64x2.replace_lane into shared code"

This is a reland of 3356078a.

The fix is in PS2:
- fix the DCHECK to trigger only if dst != src; the DCHECK is meant to
  prevent rep from being overwritten, which can only happen if dst != src
- fix the instruction selector for f64x2.replace_lane to require
  SameAsFirst only for non-AVX, which makes dst == src and saves a move
- on x64, also require register operands for all inputs, since the
  macro-assembler helper only handles registers

Original change's description:
> [wasm-simd][x64][ia32] Factor f64x2.replace_lane into shared code
>
> This pblendw/movlhps combination has lower latency and requires fewer
> uops than pinsrq (1 vs. 2).
>
> Bug: v8:11589
> Change-Id: I770b0c20a286774afefbac5ef0adffe463318f21
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2828871
> Reviewed-by: Bill Budge <bbudge@chromium.org>
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#74049}

Bug: v8:11589
Change-Id: I51cba0539d5241242dc4d7d971ede1940b9ac1fd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2842264
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#74545}
parent 88a1acef
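A minimal standalone sketch (not part of this CL) of the lane semantics the new helper relies on, written with SSE intrinsics; the function name is hypothetical and chosen only to illustrate the pblendw/movlhps rationale quoted above:

#include <immintrin.h>

// f64x2.replace_lane: return src with lane `lane` (0 or 1) replaced by
// rep's low double, mirroring the SSE path of the shared helper below.
__m128d F64x2ReplaceLaneSketch(__m128d src, __m128d rep, int lane) {
  if (lane == 0) {
    // pblendw with mask 0b00001111 copies the low four 16-bit words
    // (the low 64 bits, i.e. lane 0) from rep into src.
    return _mm_castsi128_pd(_mm_blend_epi16(
        _mm_castpd_si128(src), _mm_castpd_si128(rep), 0b00001111));
  }
  // movlhps writes rep's low 64 bits into the high 64 bits (lane 1) of src.
  return _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(src), _mm_castpd_ps(rep)));
}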
......@@ -2488,6 +2488,13 @@ void Assembler::movhlps(XMMRegister dst, XMMRegister src) {
  emit_sse_operand(dst, src);
}

void Assembler::movlhps(XMMRegister dst, XMMRegister src) {
  EnsureSpace ensure_space(this);
  EMIT(0x0F);
  EMIT(0x16);
  emit_sse_operand(dst, src);
}

void Assembler::movlps(XMMRegister dst, Operand src) {
  EnsureSpace ensure_space(this);
  EMIT(0x0F);
......@@ -2979,6 +2986,10 @@ void Assembler::vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}

void Assembler::vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG);
}

void Assembler::vmovlps(XMMRegister dst, XMMRegister src1, Operand src2) {
  vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}
......
......@@ -868,6 +868,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void shufpd(XMMRegister dst, XMMRegister src, byte imm8);
void movhlps(XMMRegister dst, XMMRegister src);
void movlhps(XMMRegister dst, XMMRegister src);
void movlps(XMMRegister dst, Operand src);
void movlps(Operand dst, XMMRegister src);
void movhps(XMMRegister dst, Operand src);
......@@ -1398,6 +1399,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vshufpd(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8);
void vmovhlps(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vmovlhps(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vmovlps(XMMRegister dst, XMMRegister src1, Operand src2);
void vmovlps(Operand dst, XMMRegister src);
void vmovhps(XMMRegister dst, XMMRegister src1, Operand src2);
......
......@@ -60,6 +60,29 @@ void SharedTurboAssembler::F64x2ExtractLane(DoubleRegister dst, XMMRegister src,
  }
}

void SharedTurboAssembler::F64x2ReplaceLane(XMMRegister dst, XMMRegister src,
                                            DoubleRegister rep, uint8_t lane) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    if (lane == 0) {
      vpblendw(dst, src, rep, 0b00001111);
    } else {
      vmovlhps(dst, src, rep);
    }
  } else {
    CpuFeatureScope scope(this, SSE4_1);
    if (dst != src) {
      DCHECK_NE(dst, rep);  // Ensure rep is not overwritten.
      movaps(dst, src);
    }
    if (lane == 0) {
      pblendw(dst, rep, 0b00001111);
    } else {
      movlhps(dst, rep);
    }
  }
}

void SharedTurboAssembler::F64x2Min(XMMRegister dst, XMMRegister lhs,
                                    XMMRegister rhs, XMMRegister scratch) {
  if (CpuFeatures::IsSupported(AVX)) {
......
......@@ -277,6 +277,8 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
  AVX_OP_SSE4_1(Roundps, roundps)
  void F64x2ExtractLane(DoubleRegister dst, XMMRegister src, uint8_t lane);
  void F64x2ReplaceLane(XMMRegister dst, XMMRegister src, DoubleRegister rep,
                        uint8_t lane);
  void F64x2Min(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
                XMMRegister scratch);
  void F64x2Max(XMMRegister dst, XMMRegister lhs, XMMRegister rhs,
......
......@@ -1877,43 +1877,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                 i.InputUint8(1));
      break;
    }
    case kSSEF64x2ReplaceLane: {
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
      CpuFeatureScope sse_scope(tasm(), SSE4_1);
      XMMRegister dst = i.OutputSimd128Register();
      int8_t lane = i.InputInt8(1);
      DoubleRegister rep = i.InputDoubleRegister(2);
      // insertps takes a mask which contains (high to low):
      // - 2 bit specifying source float element to copy
      // - 2 bit specifying destination float element to write to
      // - 4 bits specifying which elements of the destination to zero
      DCHECK_LT(lane, 2);
      if (lane == 0) {
        __ insertps(dst, rep, 0b00000000);
        __ insertps(dst, rep, 0b01010000);
      } else {
        __ insertps(dst, rep, 0b00100000);
        __ insertps(dst, rep, 0b01110000);
      }
      break;
    }
    case kAVXF64x2ReplaceLane: {
      CpuFeatureScope avx_scope(tasm(), AVX);
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      int8_t lane = i.InputInt8(1);
      DoubleRegister rep = i.InputDoubleRegister(2);
      DCHECK_NE(dst, rep);
      DCHECK_LT(lane, 2);
      if (lane == 0) {
        __ vinsertps(dst, src, rep, 0b00000000);
        __ vinsertps(dst, dst, rep, 0b01010000);
      } else {
        __ vinsertps(dst, src, rep, 0b00100000);
        __ vinsertps(dst, dst, rep, 0b01110000);
      }
    case kF64x2ReplaceLane: {
      __ F64x2ReplaceLane(i.OutputSimd128Register(), i.InputSimd128Register(0),
                          i.InputDoubleRegister(2), i.InputInt8(1));
      break;
    }
case kIA32F64x2Sqrt: {
......
......@@ -118,8 +118,7 @@ namespace compiler {
V(IA32Peek) \
V(IA32F64x2Splat) \
V(F64x2ExtractLane) \
V(SSEF64x2ReplaceLane) \
V(AVXF64x2ReplaceLane) \
V(F64x2ReplaceLane) \
V(IA32F64x2Sqrt) \
V(IA32F64x2Add) \
V(IA32F64x2Sub) \
......
......@@ -103,8 +103,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32BitcastIF:
case kIA32F64x2Splat:
case kF64x2ExtractLane:
case kSSEF64x2ReplaceLane:
case kAVXF64x2ReplaceLane:
case kF64x2ReplaceLane:
case kIA32F64x2Sqrt:
case kIA32F64x2Add:
case kIA32F64x2Sub:
......
......@@ -2551,27 +2551,15 @@ SIMD_REPLACE_LANE_TYPE_OP(VISIT_SIMD_REPLACE_LANE)
#undef VISIT_SIMD_REPLACE_LANE
#undef SIMD_REPLACE_LANE_TYPE_OP
// The difference between this and VISIT_SIMD_REPLACE_LANE is that this forces
// operand2 to be UseRegister, because the codegen relies on insertps using
// registers.
// TODO(v8:9764) Remove this UseRegister requirement
#define VISIT_SIMD_REPLACE_LANE_USE_REG(Type)                             \
  void InstructionSelector::Visit##Type##ReplaceLane(Node* node) {        \
    IA32OperandGenerator g(this);                                         \
    InstructionOperand operand0 = g.UseRegister(node->InputAt(0));        \
    InstructionOperand operand1 =                                         \
        g.UseImmediate(OpParameter<int32_t>(node->op()));                 \
    InstructionOperand operand2 = g.UseUniqueRegister(node->InputAt(1));  \
    if (IsSupported(AVX)) {                                               \
      Emit(kAVX##Type##ReplaceLane, g.DefineAsRegister(node), operand0,   \
           operand1, operand2);                                           \
    } else {                                                              \
      Emit(kSSE##Type##ReplaceLane, g.DefineSameAsFirst(node), operand0,  \
           operand1, operand2);                                           \
    }                                                                     \
  }
VISIT_SIMD_REPLACE_LANE_USE_REG(F64x2)
#undef VISIT_SIMD_REPLACE_LANE_USE_REG

void InstructionSelector::VisitF64x2ReplaceLane(Node* node) {
  IA32OperandGenerator g(this);
  int32_t lane = OpParameter<int32_t>(node->op());
  // When no-AVX, define dst == src to save a move.
  InstructionOperand dst =
      IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
  Emit(kF64x2ReplaceLane, dst, g.UseRegister(node->InputAt(0)),
       g.UseImmediate(lane), g.UseRegister(node->InputAt(1)));
}
#define VISIT_SIMD_SHIFT_UNIFIED_SSE_AVX(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
......
......@@ -2397,6 +2397,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
                 i.InputUint8(1));
      break;
    }
    case kX64F64x2ReplaceLane: {
      __ F64x2ReplaceLane(i.OutputSimd128Register(), i.InputSimd128Register(0),
                          i.InputDoubleRegister(2), i.InputInt8(1));
      break;
    }
    case kX64F64x2Sqrt: {
      __ Sqrtpd(i.OutputSimd128Register(), i.InputSimd128Register(0));
      break;
......
......@@ -156,6 +156,7 @@ namespace compiler {
V(X64Peek) \
V(X64F64x2Splat) \
V(X64F64x2ExtractLane) \
V(X64F64x2ReplaceLane) \
V(X64F64x2Abs) \
V(X64F64x2Neg) \
V(X64F64x2Sqrt) \
......
......@@ -132,6 +132,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64Pinsrq:
case kX64F64x2Splat:
case kX64F64x2ExtractLane:
case kX64F64x2ReplaceLane:
case kX64F64x2Abs:
case kX64F64x2Neg:
case kX64F64x2Sqrt:
......
......@@ -3083,6 +3083,16 @@ void InstructionSelector::VisitF32x4ReplaceLane(Node* node) {
       g.Use(node->InputAt(1)));
}

void InstructionSelector::VisitF64x2ReplaceLane(Node* node) {
  X64OperandGenerator g(this);
  int32_t lane = OpParameter<int32_t>(node->op());
  // When no-AVX, define dst == src to save a move.
  InstructionOperand dst =
      IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
  Emit(kX64F64x2ReplaceLane, dst, g.UseRegister(node->InputAt(0)),
       g.UseImmediate(lane), g.UseRegister(node->InputAt(1)));
}
#define VISIT_SIMD_REPLACE_LANE(TYPE, OPCODE) \
void InstructionSelector::Visit##TYPE##ReplaceLane(Node* node) { \
X64OperandGenerator g(this); \
......@@ -3092,7 +3102,6 @@ void InstructionSelector::VisitF32x4ReplaceLane(Node* node) {
}
#define SIMD_TYPES_FOR_REPLACE_LANE(V) \
V(F64x2, kX64Pinsrq) \
V(I64x2, kX64Pinsrq) \
V(I32x4, kX64Pinsrd) \
V(I16x8, kX64Pinsrw) \
......
......@@ -4630,27 +4630,7 @@ void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
                                                 LiftoffRegister src1,
                                                 LiftoffRegister src2,
                                                 uint8_t imm_lane_idx) {
  // TODO(fanchenk): Use movlhps and blendpd
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    if (imm_lane_idx == 0) {
      vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00000000);
      vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01010000);
    } else {
      vinsertps(dst.fp(), src1.fp(), src2.fp(), 0b00100000);
      vinsertps(dst.fp(), dst.fp(), src2.fp(), 0b01110000);
    }
  } else {
    CpuFeatureScope scope(this, SSE4_1);
    if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
    if (imm_lane_idx == 0) {
      insertps(dst.fp(), src2.fp(), 0b00000000);
      insertps(dst.fp(), src2.fp(), 0b01010000);
    } else {
      insertps(dst.fp(), src2.fp(), 0b00100000);
      insertps(dst.fp(), src2.fp(), 0b01110000);
    }
  }
  F64x2ReplaceLane(dst.fp(), src1.fp(), src2.fp(), imm_lane_idx);
}
void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
......
......@@ -4173,22 +4173,7 @@ void LiftoffAssembler::emit_f64x2_replace_lane(LiftoffRegister dst,
                                                 LiftoffRegister src1,
                                                 LiftoffRegister src2,
                                                 uint8_t imm_lane_idx) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    if (imm_lane_idx == 0) {
      vpblendw(dst.fp(), src1.fp(), src2.fp(), 0b00001111);
    } else {
      vmovlhps(dst.fp(), src1.fp(), src2.fp());
    }
  } else {
    CpuFeatureScope scope(this, SSE4_1);
    if (dst.fp() != src1.fp()) movaps(dst.fp(), src1.fp());
    if (imm_lane_idx == 0) {
      pblendw(dst.fp(), src2.fp(), 0b00001111);
    } else {
      movlhps(dst.fp(), src2.fp());
    }
  }
  F64x2ReplaceLane(dst.fp(), src1.fp(), src2.fp(), imm_lane_idx);
}
void LiftoffAssembler::StackCheck(Label* ool_code, Register limit_address) {
......