Commit 754cb03c authored by Zhi An Ng, committed by Commit Bot

[ia32][wasm-simd] Optimize and unify f32x4.extract_lane SSE and AVX ops

Change the codegen for f32x4.extract_lane from shufps to insertps. They
have the same performance, but shufps has a false dependency on dst (it
shuffles dst and src, but we don't care about dst at all).

We then merge the SSE and AVX opcodes into a single IA32 opcode.

Bug: v8:11217
Change-Id: I7cdbf486573ce3a19881df84400a9c7e09c3ee48
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2585259
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71748}
parent 3ea458be
......@@ -2343,26 +2343,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
}
case kSSEF32x4ExtractLane: {
DCHECK_EQ(i.OutputDoubleRegister(), i.InputDoubleRegister(0));
case kIA32F32x4ExtractLane: {
XMMRegister dst = i.OutputFloatRegister();
int8_t lane = i.InputInt8(1);
if (lane != 0) {
XMMRegister src = i.InputSimd128Register(0);
uint8_t lane = i.InputUint8(1);
DCHECK_LT(lane, 4);
__ shufps(dst, dst, lane);
}
if (lane == 0 && dst == src) {
break;
}
case kAVXF32x4ExtractLane: {
uint8_t zmask = 0xE; // Zero top 3 lanes.
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputFloatRegister();
XMMRegister src = i.InputSimd128Register(0);
int8_t lane = i.InputInt8(1);
if (lane == 0) {
if (dst != src) __ vmovaps(dst, src);
// Use src for both operands to avoid false-dependency on dst.
__ vinsertps(dst, src, src, zmask | (lane << 6));
} else {
DCHECK_LT(lane, 4);
__ vshufps(dst, src, src, lane);
CpuFeatureScope sse_scope(tasm(), SSE4_1);
__ insertps(dst, src, zmask | (lane << 6));
}
break;
}
......
......@@ -155,8 +155,7 @@ namespace compiler {
V(IA32I64x2ExtMulLowI32x4U) \
V(IA32I64x2ExtMulHighI32x4U) \
V(IA32F32x4Splat) \
V(SSEF32x4ExtractLane) \
V(AVXF32x4ExtractLane) \
V(IA32F32x4ExtractLane) \
V(IA32Insertps) \
V(IA32F32x4SConvertI32x4) \
V(IA32F32x4UConvertI32x4) \
......
......@@ -134,8 +134,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I64x2ExtMulLowI32x4U:
case kIA32I64x2ExtMulHighI32x4U:
case kIA32F32x4Splat:
case kSSEF32x4ExtractLane:
case kAVXF32x4ExtractLane:
case kIA32F32x4ExtractLane:
case kIA32Insertps:
case kIA32F32x4SConvertI32x4:
case kIA32F32x4UConvertI32x4:
......
......@@ -2431,7 +2431,11 @@ void InstructionSelector::VisitF32x4Splat(Node* node) {
}
// Selects the instruction for an f32x4.extract_lane node.
// NOTE(review): this listing looks like a diff whose +/- markers were
// stripped; the VisitRRISimd call below is presumably the removed pre-change
// line and the statements after it its replacement — confirm against the
// actual commit before treating both as live code.
void InstructionSelector::VisitF32x4ExtractLane(Node* node) {
VisitRRISimd(this, node, kAVXF32x4ExtractLane, kSSEF32x4ExtractLane);
IA32OperandGenerator g(this);
// Input 0 of the node is requested in a register.
InstructionOperand operand0 = g.UseRegister(node->InputAt(0));
// The operator's int32 parameter (presumably the lane index) is passed as an
// immediate operand.
InstructionOperand operand1 =
g.UseImmediate(OpParameter<int32_t>(node->op()));
// Emit the single unified opcode; the result is defined as a register.
Emit(kIA32F32x4ExtractLane, g.DefineAsRegister(node), operand0, operand1);
}
void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment