Commit 890fc4cd authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Implement f64x2 min max for ia32

Bug: v8:9728
Change-Id: I56900b52d37f245cba228ec41a3acbfb7d47363b
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1837718
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#64226}
parent 7fa12e2a
......@@ -2810,6 +2810,12 @@ void Assembler::vpsrld(XMMRegister dst, XMMRegister src, uint8_t imm8) {
EMIT(imm8);
}
void Assembler::vpsrlq(XMMRegister dst, XMMRegister src, uint8_t imm8) {
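// PSRLQ xmm, imm8 is encoded as 66 0F 73 /2 ib; the /2 opcode extension is
// carried in the ModRM reg field, which vinstr takes as an XMMRegister.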
XMMRegister iop = XMMRegister::from_code(2);
vinstr(0x73, iop, dst, Operand(src), k66, k0F, kWIG);
EMIT(imm8);
}
void Assembler::vpsraw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
XMMRegister iop = XMMRegister::from_code(4);
vinstr(0x71, iop, dst, Operand(src), k66, k0F, kWIG);
......
......@@ -884,6 +884,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
V(cmpeq, 0x0) \
V(cmplt, 0x1) \
V(cmple, 0x2) \
V(cmpunord, 0x3) \
V(cmpneq, 0x4)
#define SSE_CMP_P(instr, imm8) \
......@@ -948,6 +949,11 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void roundss(XMMRegister dst, XMMRegister src, RoundingMode mode);
void roundsd(XMMRegister dst, XMMRegister src, RoundingMode mode);
void movapd(XMMRegister dst, XMMRegister src) { movapd(dst, Operand(src)); }
void movapd(XMMRegister dst, Operand src) {
sse2_instr(dst, src, 0x66, 0x0F, 0x28);
}
void movmskpd(Register dst, XMMRegister src);
void movmskps(Register dst, XMMRegister src);
......@@ -1330,6 +1336,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vpsrld(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vpsraw(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vpsrad(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vpsrlq(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vpshufhw(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
vpshufhw(dst, Operand(src), shuffle);
......
......@@ -1482,6 +1482,15 @@ void TurboAssembler::Psrlw(XMMRegister dst, uint8_t shift) {
}
}
void TurboAssembler::Psrlq(XMMRegister dst, uint8_t shift) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpsrlq(dst, dst, shift);
} else {
psrlq(dst, shift);
}
}
void TurboAssembler::Psignb(XMMRegister dst, Operand src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
......
......@@ -237,6 +237,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Pshufd(XMMRegister dst, Operand src, uint8_t shuffle);
void Psraw(XMMRegister dst, uint8_t shift);
void Psrlw(XMMRegister dst, uint8_t shift);
void Psrlq(XMMRegister dst, uint8_t shift);
// SSE/SSE2 instructions with AVX version.
#define AVX_OP2_WITH_TYPE(macro_name, name, dst_type, src_type) \
......@@ -259,6 +260,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP2_WITH_TYPE(Movd, movd, Operand, XMMRegister)
AVX_OP2_WITH_TYPE(Cvtdq2ps, cvtdq2ps, XMMRegister, Operand)
AVX_OP2_WITH_TYPE(Sqrtpd, sqrtpd, XMMRegister, const Operand&)
AVX_OP2_WITH_TYPE(Movapd, movapd, XMMRegister, XMMRegister)
AVX_OP2_WITH_TYPE(Movapd, movapd, XMMRegister, const Operand&)
#undef AVX_OP2_WITH_TYPE
......@@ -296,6 +299,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP3_XO(Xorpd, xorpd)
AVX_OP3_XO(Sqrtss, sqrtss)
AVX_OP3_XO(Sqrtsd, sqrtsd)
AVX_OP3_XO(Orpd, orpd)
AVX_OP3_XO(Andnpd, andnpd)
#undef AVX_OP3_XO
#undef AVX_OP3_WITH_TYPE
......@@ -323,6 +328,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3(Cmpneqpd, cmpneqpd)
AVX_PACKED_OP3(Cmpltpd, cmpltpd)
AVX_PACKED_OP3(Cmplepd, cmplepd)
AVX_PACKED_OP3(Minpd, minpd)
AVX_PACKED_OP3(Maxpd, maxpd)
AVX_PACKED_OP3(Cmpunordpd, cmpunordpd)
#undef AVX_PACKED_OP3
#undef AVX_PACKED_OP3_WITH_TYPE
......
......@@ -1925,6 +1925,47 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputOperand(1));
break;
}
case kIA32F64x2Min: {
Operand src1 = i.InputOperand(1);
XMMRegister dst = i.OutputSimd128Register(),
src = i.InputSimd128Register(0),
tmp = i.TempSimd128Register(0);
// The minpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform minpd in both orders, merge the results, and adjust.
__ Movapd(tmp, src1);
__ Minpd(tmp, tmp, src);
__ Minpd(dst, src, src1);
// propagate -0's and NaNs, which may be non-canonical.
__ Orpd(tmp, dst);
// Canonicalize NaNs by quieting and clearing the payload.
__ Cmpunordpd(dst, dst, tmp);
__ Orpd(tmp, dst);
__ Psrlq(dst, 13);
__ Andnpd(dst, tmp);
break;
}
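For reference, the sequence above targets the following per-lane rule (a standalone sketch, not code from this change; the helper name is made up): Wasm's f64x2.min orders -0 below +0 and yields a quiet NaN with a cleared payload whenever either lane input is NaN. A bare minpd guarantees neither, since it simply returns its second operand when an input is NaN or when both inputs are zeros, which is why the result is computed in both orders, merged with orpd, and then canonicalized. The Psrlq(dst, 13) / Andnpd pair builds a mask that keeps the sign bit, the 11 exponent bits, and the quiet bit (1 + 11 + 1 = 13) while clearing the 51 payload bits.

#include <cmath>
#include <cstdint>
#include <cstring>

// Illustrative sketch of the f64x2.min lane semantics targeted above.
double F64x2MinLane(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) {
    // Quiet NaN with a zero payload (sign left clear here; the hardware
    // sequence may set it, which the spec permits).
    uint64_t bits = 0x7FF8000000000000ull;
    double nan;
    std::memcpy(&nan, &bits, sizeof nan);
    return nan;
  }
  if (a == 0.0 && b == 0.0) {
    return std::signbit(a) ? a : b;  // min(+0, -0) == min(-0, +0) == -0
  }
  return a < b ? a : b;
}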
case kIA32F64x2Max: {
Operand src1 = i.InputOperand(1);
XMMRegister dst = i.OutputSimd128Register(),
src = i.InputSimd128Register(0),
tmp = i.TempSimd128Register(0);
// The maxpd instruction doesn't propagate NaNs and +0's in its first
// operand. Perform maxpd in both orders, merge the results, and adjust.
__ Movapd(tmp, src1);
__ Maxpd(tmp, tmp, src);
__ Maxpd(dst, src, src1);
// Find discrepancies.
__ Xorpd(dst, tmp);
// Propagate NaNs, which may be non-canonical.
__ Orpd(tmp, dst);
// Propagate sign discrepancy and (subtle) quiet NaNs.
__ Subpd(tmp, tmp, dst);
// Canonicalize NaNs by clearing the payload. Sign is non-deterministic.
__ Cmpunordpd(dst, dst, tmp);
__ Psrlq(dst, 13);
__ Andnpd(dst, tmp);
break;
}
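The analogous per-lane rule for the max case (again only an illustrative sketch, helper name made up): +0 is ordered above -0, and any NaN input produces a quiet NaN with a cleared payload. Here the two maxpd results are reconciled with xorpd/orpd/subpd rather than a plain orpd, because OR-ing the two orders would favour the -0 bit pattern, which is the wrong direction for max.

#include <cmath>
#include <cstdint>
#include <cstring>

// Illustrative sketch of the f64x2.max lane semantics targeted above.
double F64x2MaxLane(double a, double b) {
  if (std::isnan(a) || std::isnan(b)) {
    uint64_t bits = 0x7FF8000000000000ull;  // quiet NaN, payload cleared
    double nan;
    std::memcpy(&nan, &bits, sizeof nan);
    return nan;
  }
  if (a == 0.0 && b == 0.0) {
    return std::signbit(a) ? b : a;  // max(+0, -0) == max(-0, +0) == +0
  }
  return a > b ? a : b;
}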
case kIA32F64x2Eq: {
__ Cmpeqpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1));
......
......@@ -127,6 +127,8 @@ namespace compiler {
V(IA32F64x2Sub) \
V(IA32F64x2Mul) \
V(IA32F64x2Div) \
V(IA32F64x2Min) \
V(IA32F64x2Max) \
V(IA32F64x2Eq) \
V(IA32F64x2Ne) \
V(IA32F64x2Lt) \
......
......@@ -108,6 +108,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32F64x2Sub:
case kIA32F64x2Mul:
case kIA32F64x2Div:
case kIA32F64x2Min:
case kIA32F64x2Max:
case kIA32F64x2Eq:
case kIA32F64x2Ne:
case kIA32F64x2Lt:
......
......@@ -2045,6 +2045,35 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I8x16ShrS) \
V(I8x16ShrU)
void InstructionSelector::VisitF64x2Min(Node* node) {
IA32OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
InstructionOperand operand1 = g.UseUnique(node->InputAt(1));
if (IsSupported(AVX)) {
Emit(kIA32F64x2Min, g.DefineAsRegister(node), operand0, operand1,
arraysize(temps), temps);
} else {
Emit(kIA32F64x2Min, g.DefineSameAsFirst(node), operand0, operand1,
arraysize(temps), temps);
}
}
void InstructionSelector::VisitF64x2Max(Node* node) {
IA32OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
InstructionOperand operand1 = g.UseUnique(node->InputAt(1));
if (IsSupported(AVX)) {
Emit(kIA32F64x2Max, g.DefineAsRegister(node), operand0, operand1,
arraysize(temps), temps);
} else {
Emit(kIA32F64x2Max, g.DefineSameAsFirst(node), operand0, operand1,
arraysize(temps), temps);
}
}
void InstructionSelector::VisitF64x2Splat(Node* node) {
VisitRRSimd(this, node, kAVXF64x2Splat, kSSEF64x2Splat);
}
......
......@@ -2638,9 +2638,9 @@ void InstructionSelector::VisitF64x2Eq(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Ne(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Lt(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Le(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitF64x2Min(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Max(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitI64x2Splat(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ExtractLane(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ReplaceLane(Node* node) { UNIMPLEMENTED(); }
......
......@@ -1147,6 +1147,10 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
int mod, regop, rm, vvvv = vex_vreg();
get_modrm(*current, &mod, &regop, &rm);
switch (opcode) {
case 0x28:
AppendToBuffer("vmovapd %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x54:
AppendToBuffer("vandpd %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
......@@ -1219,6 +1223,12 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
current++;
AppendToBuffer(",%u", *current++);
break;
case 0x73:
AppendToBuffer("vps%sq %s,%s", sf_str[regop / 2],
NameOfXMMRegister(vvvv), NameOfXMMRegister(rm));
current++;
AppendToBuffer(",%u", *current++);
break;
case 0x7E:
AppendToBuffer("vmovd ");
current += PrintRightOperand(current);
......@@ -2061,7 +2071,13 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
data += 2;
} else if (*data == 0x0F) {
data++;
if (*data == 0x38) {
if (*data == 0x28) {
data++;
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("movapd %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
} else if (*data == 0x38) {
data++;
byte op = *data;
data++;
......
......@@ -446,6 +446,8 @@ TEST(DisasmIa320) {
__ cmpltps(xmm5, Operand(ebx, ecx, times_4, 10000));
__ cmpleps(xmm5, xmm1);
__ cmpleps(xmm5, Operand(ebx, ecx, times_4, 10000));
__ cmpunordps(xmm5, xmm1);
__ cmpunordps(xmm5, Operand(ebx, ecx, times_4, 10000));
__ cmpneqps(xmm5, xmm1);
__ cmpneqps(xmm5, Operand(ebx, ecx, times_4, 10000));
......@@ -469,6 +471,9 @@ TEST(DisasmIa320) {
__ movdqu(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movdqu(Operand(ebx, ecx, times_4, 10000), xmm0);
__ movapd(xmm0, xmm1);
__ movapd(xmm0, Operand(edx, 4));
__ movd(xmm0, edi);
__ movd(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movd(eax, xmm1);
......@@ -518,6 +523,8 @@ TEST(DisasmIa320) {
__ cmpltpd(xmm5, Operand(ebx, ecx, times_4, 10000));
__ cmplepd(xmm5, xmm1);
__ cmplepd(xmm5, Operand(ebx, ecx, times_4, 10000));
__ cmpunordpd(xmm5, xmm1);
__ cmpunordpd(xmm5, Operand(ebx, ecx, times_4, 10000));
__ cmpneqpd(xmm5, xmm1);
__ cmpneqpd(xmm5, Operand(ebx, ecx, times_4, 10000));
......@@ -676,6 +683,8 @@ TEST(DisasmIa320) {
__ vrsqrtps(xmm1, xmm0);
__ vrsqrtps(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovaps(xmm0, xmm1);
__ vmovapd(xmm0, xmm1);
__ vmovapd(xmm0, Operand(ebx, ecx, times_4, 10000));
__ vshufps(xmm0, xmm1, xmm2, 3);
__ vshufps(xmm0, xmm1, Operand(edx, 4), 3);
__ vhaddps(xmm0, xmm1, xmm2);
......@@ -687,6 +696,8 @@ TEST(DisasmIa320) {
__ vcmpltps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmpleps(xmm5, xmm4, xmm1);
__ vcmpleps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmpunordps(xmm5, xmm4, xmm1);
__ vcmpunordps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmpneqps(xmm5, xmm4, xmm1);
__ vcmpneqps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
......@@ -717,6 +728,8 @@ TEST(DisasmIa320) {
__ vcmpltpd(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmplepd(xmm5, xmm4, xmm1);
__ vcmplepd(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmpunordpd(xmm5, xmm4, xmm1);
__ vcmpunordpd(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmpneqpd(xmm5, xmm4, xmm1);
__ vcmpneqpd(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
......@@ -724,6 +737,7 @@ TEST(DisasmIa320) {
__ vpslld(xmm0, xmm7, 21);
__ vpsrlw(xmm0, xmm7, 21);
__ vpsrld(xmm0, xmm7, 21);
__ vpsrlq(xmm0, xmm7, 21);
__ vpsraw(xmm0, xmm7, 21);
__ vpsrad(xmm0, xmm7, 21);
......
......@@ -1419,7 +1419,6 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Le) {
RunF64x2CompareOpTest(execution_tier, lower_simd, kExprF64x2Le, LessEqual);
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(F64x2Min) {
RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Min, JSMin);
}
......@@ -1428,6 +1427,7 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Max) {
RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Max, JSMax);
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(I64x2Mul) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2Mul,
base::MulWithWraparound);
......