Commit c5fd776d authored by Ng Zhi An, committed by Commit Bot

Reland "[wasm-simd][ia32] Implement i64x2 signed compares"

This is a reland of a16add80.

The fixes add disassembly support for pcmpgtq and vpcmpgtq.
While fixing that, I also noticed a mistake in the assembler for
pcmpgtq, which flipped dst and src. I also realized that we don't
detect SSE4.2, so detection for it is added as well.

Patchset 2 (PS2) contains these changes.

Original change's description:
> [wasm-simd][ia32] Implement i64x2 signed compares
>
> The code sequence is exactly the same as x64.
>
> Bug: v8:11415
> Change-Id: I53ed2723eda29c0a250cff514372a3d45b203476
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2683495
> Reviewed-by: Bill Budge <bbudge@chromium.org>
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#72637}

Bug: v8:11415
Change-Id: If6a18af2d7de20ac8ad38f94b6d0220769397194
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2688119
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72721}
parent 9ac0fed4
@@ -130,6 +130,7 @@ void CpuFeatures::ProbeImpl(bool cross_compile) {
// Only use statically determined features for cross compile (snapshot).
if (cross_compile) return;
if (cpu.has_sse42() && FLAG_enable_sse4_2) supported_ |= 1u << SSE4_2;
if (cpu.has_sse41() && FLAG_enable_sse4_1) supported_ |= 1u << SSE4_1;
if (cpu.has_ssse3() && FLAG_enable_ssse3) supported_ |= 1u << SSSE3;
if (cpu.has_sse3() && FLAG_enable_sse3) supported_ |= 1u << SSE3;
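The added probe line follows the existing SSE4.1/SSSE3/SSE3 pattern: one bit in the supported_ mask per detected feature. A minimal sketch of how that bit is queried elsewhere, assuming V8's usual bitmask check (the exact definition lives in the CPU-features header):

    // Sketch only, not part of this diff: querying the SSE4_2 bit set above.
    static bool IsSupported(CpuFeature f) {
      return (supported_ & (1u << f)) != 0;
    }

CpuFeatureScope asserts this before SSE4.2-only instructions such as pcmpgtq are assembled, as in the code-generator and disassembler-test hunks below.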
@@ -2516,6 +2517,14 @@ void Assembler::movdqa(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
void Assembler::movdqa(XMMRegister dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0x66);
EMIT(0x0F);
EMIT(0x6F);
emit_sse_operand(dst, src);
}
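This register-to-register movdqa overload is newly added; the non-SSE4.2 fallback in the code generator below relies on it to copy the inputs into the output and scratch registers before running the destructive SSE2 sequence.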
void Assembler::movdqu(Operand dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0xF3);
@@ -2622,6 +2631,16 @@ void Assembler::extractps(Register dst, XMMRegister src, byte imm8) {
EMIT(imm8);
}
void Assembler::pcmpgtq(XMMRegister dst, XMMRegister src) {
DCHECK(IsEnabled(SSE4_2));
EnsureSpace ensure_space(this);
EMIT(0x66);
EMIT(0x0F);
EMIT(0x38);
EMIT(0x37);
emit_sse_operand(dst, src);
}
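Note the operand order: emit_sse_operand(dst, src) encodes dst in the ModRM reg field and src in r/m, which is correct for pcmpgtq (66 0F 38 37 /r); the originally landed version passed the two swapped, flipping the direction of the compare. A concrete illustration with arbitrarily chosen registers (standard x86 encoding):

    // 66 0F 38 37 CA   pcmpgtq xmm1,xmm2   (ModRM 0xCA: reg=xmm1, rm=xmm2)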
void Assembler::psllw(XMMRegister reg, uint8_t shift) {
EnsureSpace ensure_space(this);
EMIT(0x66);
@@ -3150,6 +3169,10 @@ void Assembler::vextractps(Operand dst, XMMRegister src, byte imm8) {
EMIT(imm8);
}
void Assembler::vpcmpgtq(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vinstr(0x37, dst, src1, src2, k66, k0F38, VexW::kWIG);
}
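The k66/k0F38/kWIG arguments correspond to the SDM encoding VEX.128.66.0F38.WIG 37 /r. One possible byte rendering, assuming the ignored W bit is emitted as 0 and arbitrarily chosen registers:

    // C4 E2 71 37 C2   vpcmpgtq xmm0,xmm1,xmm2   (vvvv = ~xmm1, ModRM 0xC2)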
void Assembler::bmi1(byte op, Register reg, Register vreg, Operand rm) {
DCHECK(IsEnabled(BMI1));
EnsureSpace ensure_space(this);
@@ -990,6 +990,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movdqa(XMMRegister dst, Operand src);
void movdqa(Operand dst, XMMRegister src);
void movdqa(XMMRegister dst, XMMRegister src);
void movdqu(XMMRegister dst, Operand src);
void movdqu(Operand dst, XMMRegister src);
void movdqu(XMMRegister dst, XMMRegister src);
@@ -1016,6 +1017,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void extractps(Operand dst, XMMRegister src, byte imm8);
void extractps(Register dst, XMMRegister src, byte imm8);
void pcmpgtq(XMMRegister dst, XMMRegister src);
void psllw(XMMRegister reg, uint8_t shift);
void pslld(XMMRegister reg, uint8_t shift);
void psrlw(XMMRegister reg, uint8_t shift);
@@ -1369,6 +1372,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vextractps(Operand dst, XMMRegister src, byte imm8);
void vpcmpgtq(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vmovaps(XMMRegister dst, XMMRegister src) { vmovaps(dst, Operand(src)); }
void vmovaps(XMMRegister dst, Operand src) { vps(0x28, dst, xmm0, src); }
void vmovapd(XMMRegister dst, XMMRegister src) { vmovapd(dst, Operand(src)); }
@@ -2240,6 +2240,69 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pxor(i.OutputSimd128Register(), kScratchDoubleReg);
break;
}
case kIA32I64x2GtS: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src0 = i.InputSimd128Register(0);
XMMRegister src1 = i.InputSimd128Register(1);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpcmpgtq(dst, src0, src1);
} else if (CpuFeatures::IsSupported(SSE4_2)) {
CpuFeatureScope sse_scope(tasm(), SSE4_2);
DCHECK_EQ(dst, src0);
__ pcmpgtq(dst, src1);
} else {
DCHECK_NE(dst, src0);
DCHECK_NE(dst, src1);
__ movdqa(dst, src1);
__ movdqa(kScratchDoubleReg, src0);
__ psubq(dst, src0);
__ pcmpeqd(kScratchDoubleReg, src1);
__ pand(dst, kScratchDoubleReg);
__ movdqa(kScratchDoubleReg, src0);
__ pcmpgtd(kScratchDoubleReg, src1);
__ por(dst, kScratchDoubleReg);
__ pshufd(dst, dst, 0xF5);
}
break;
}
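The SSE2 fallback is the classic 64-bit signed compare built from 32-bit operations: the high dwords decide the result unless they are equal, in which case the borrow out of (b - a) decides. A scalar model of one lane, for illustration only (function name is hypothetical, not part of the commit):

    #include <cstdint>

    // Models one i64x2 lane of the SSE2 sequence above; returns the lane mask.
    uint64_t I64x2GtSLane(int64_t a, int64_t b) {
      uint64_t diff = static_cast<uint64_t>(b) - static_cast<uint64_t>(a);  // psubq
      uint32_t a_hi = static_cast<uint64_t>(a) >> 32;
      uint32_t b_hi = static_cast<uint64_t>(b) >> 32;
      uint32_t eq_hi = (a_hi == b_hi) ? 0xFFFFFFFFu : 0;  // pcmpeqd, high dword
      uint32_t gt_hi = (static_cast<int32_t>(a_hi) > static_cast<int32_t>(b_hi))
                           ? 0xFFFFFFFFu
                           : 0;                           // pcmpgtd, high dword
      // pand/por: where the high dwords are equal, (b - a) is negative exactly
      // when a > b, so its high dword is all ones; otherwise gt_hi decides.
      uint32_t hi = (static_cast<uint32_t>(diff >> 32) & eq_hi) | gt_hi;
      // pshufd with 0xF5 replicates the high dword into both halves of the lane.
      return (static_cast<uint64_t>(hi) << 32) | hi;
    }
    // e.g. I64x2GtSLane(1, 0) == ~0ull and I64x2GtSLane(-1, 0) == 0.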
case kIA32I64x2GeS: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src0 = i.InputSimd128Register(0);
XMMRegister src1 = i.InputSimd128Register(1);
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpcmpgtq(dst, src1, src0);
__ vpcmpeqd(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
__ vpxor(dst, dst, kScratchDoubleReg);
} else if (CpuFeatures::IsSupported(SSE4_2)) {
CpuFeatureScope sse_scope(tasm(), SSE4_2);
DCHECK_NE(dst, src0);
if (dst != src1) {
__ movdqa(dst, src1);
}
__ pcmpgtq(dst, src0);
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ pxor(dst, kScratchDoubleReg);
} else {
DCHECK_NE(dst, src0);
DCHECK_NE(dst, src1);
__ movdqa(dst, src0);
__ movdqa(kScratchDoubleReg, src1);
__ psubq(dst, src1);
__ pcmpeqd(kScratchDoubleReg, src0);
__ pand(dst, kScratchDoubleReg);
__ movdqa(kScratchDoubleReg, src1);
__ pcmpgtd(kScratchDoubleReg, src0);
__ por(dst, kScratchDoubleReg);
__ pshufd(dst, dst, 0xF5);
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ pxor(dst, kScratchDoubleReg);
}
break;
}
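All three paths here derive ge from gt with the operands swapped: a >= b is the complement of b > a, with pcmpeqd of the scratch register against itself producing the all-ones mask for the final pxor. In terms of the scalar model above (illustration only):

    uint64_t I64x2GeSLane(int64_t a, int64_t b) {
      return ~I64x2GtSLane(b, a);  // pcmpeqd+pxor complement the swapped compare
    }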
case kIA32I64x2SConvertI32x4Low: {
__ Pmovsxdq(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
@@ -150,6 +150,8 @@ namespace compiler {
V(IA32I64x2BitMask) \
V(IA32I64x2Eq) \
V(IA32I64x2Ne) \
V(IA32I64x2GtS) \
V(IA32I64x2GeS) \
V(IA32I64x2SignSelect) \
V(IA32I64x2ExtMulLowI32x4S) \
V(IA32I64x2ExtMulHighI32x4S) \
@@ -135,6 +135,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I64x2BitMask:
case kIA32I64x2Eq:
case kIA32I64x2Ne:
case kIA32I64x2GtS:
case kIA32I64x2GeS:
case kIA32I64x2SignSelect:
case kIA32I64x2ExtMulLowI32x4S:
case kIA32I64x2ExtMulHighI32x4S:
@@ -3170,6 +3170,37 @@ void InstructionSelector::VisitI32x4TruncSatF64x2UZero(Node* node) {
arraysize(temps), temps);
}
void InstructionSelector::VisitI64x2GtS(Node* node) {
IA32OperandGenerator g(this);
if (CpuFeatures::IsSupported(AVX)) {
Emit(kIA32I64x2GtS, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
} else if (CpuFeatures::IsSupported(SSE4_2)) {
Emit(kIA32I64x2GtS, g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
} else {
Emit(kIA32I64x2GtS, g.DefineAsRegister(node),
g.UseUniqueRegister(node->InputAt(0)),
g.UseUniqueRegister(node->InputAt(1)));
}
}
void InstructionSelector::VisitI64x2GeS(Node* node) {
IA32OperandGenerator g(this);
if (CpuFeatures::IsSupported(AVX)) {
Emit(kIA32I64x2GeS, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
} else if (CpuFeatures::IsSupported(SSE4_2)) {
Emit(kIA32I64x2GeS, g.DefineAsRegister(node),
g.UseUniqueRegister(node->InputAt(0)),
g.UseRegister(node->InputAt(1)));
} else {
Emit(kIA32I64x2GeS, g.DefineAsRegister(node),
g.UseUniqueRegister(node->InputAt(0)),
g.UseUniqueRegister(node->InputAt(1)));
}
}
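The different operand constraints track how destructive each path is: the AVX forms are three-operand, so the output can be any register; SSE4.2 pcmpgtq overwrites its first operand, so GtS pins the output to input 0 (DefineSameAsFirst), while GeS compares in the swapped direction and only needs the output kept apart from input 0 (UseUniqueRegister); the SSE2 fallback reads both inputs after writing the output, so both must live in registers distinct from it.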
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {
@@ -2801,10 +2801,10 @@ void InstructionSelector::VisitI32x4WidenI8x16S(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI32x4WidenI8x16U(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitI64x2GtS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2GeS(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64
#endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_IA32
// TODO(v8:11416) Prototyping i64x2.abs.
#if !V8_TARGET_ARCH_X64
@@ -696,6 +696,11 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
AppendToBuffer("vbroadcastss %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x37:
AppendToBuffer("vpcmpgtq %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current);
break;
case 0x99:
AppendToBuffer("vfmadd132s%c %s,%s,", float_size_code(),
NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
@@ -2265,6 +2270,10 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
data += PrintRightXMMOperand(data);
AppendToBuffer(",xmm0");
break;
case 0x37:
AppendToBuffer("pcmpgtq %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
break;
default:
UnimplementedInstruction();
}
@@ -479,6 +479,7 @@ TEST(DisasmIa320) {
// 128 bit move instructions.
__ movdqa(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movdqa(Operand(ebx, ecx, times_4, 10000), xmm0);
__ movdqa(xmm1, xmm0);
__ movdqu(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movdqu(Operand(ebx, ecx, times_4, 10000), xmm0);
__ movdqu(xmm1, xmm0);
@@ -645,6 +646,13 @@
}
#undef EMIT_SSE34_INSTR
{
if (CpuFeatures::IsSupported(SSE4_2)) {
CpuFeatureScope scope(&assm, SSE4_2);
__ pcmpgtq(xmm0, xmm1);
}
}
// AVX instruction
{
if (CpuFeatures::IsSupported(AVX)) {
@@ -828,6 +836,8 @@
__ vmovmskps(edx, xmm5);
__ vpmovmskb(ebx, xmm1);
__ vpcmpgtq(xmm0, xmm1, xmm2);
#define EMIT_SSE2_AVXINSTR(instruction, notUsed1, notUsed2, notUsed3) \
__ v##instruction(xmm7, xmm5, xmm1); \
__ v##instruction(xmm7, xmm5, Operand(edx, 4));
@@ -1049,7 +1049,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2Ne) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2Ne, NotEqual);
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
WASM_SIMD_TEST_NO_LOWERING(I64x2LtS) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2LtS, Less);
}
@@ -1065,7 +1065,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2GtS) {
WASM_SIMD_TEST_NO_LOWERING(I64x2GeS) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2GeS, GreaterEqual);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
WASM_SIMD_TEST(F64x2Splat) {
WasmRunner<int32_t, double> r(execution_tier, lower_simd);