Commit 043ac205 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] Bitmask instructions

Implement i8x16.bitmask, i16x8.bitmask, i32x4.bitmask on x64.

Bug: v8:10308
Change-Id: Id47cb229de77d80d0a7ec91f4862a91258ff1979
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2127317
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#67022}
parent dfdf66cb
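
For orientation before the hunks: each bitmask operation collapses a 128-bit vector into a scalar whose bit i is the top (sign) bit of lane i. A minimal scalar sketch of that semantics, assuming the proposal's one-bit-per-lane definition (illustrative code, not part of the CL):

```cpp
#include <cstdint>
#include <cstring>

// Bit i of the result is the sign bit of lane i (lanes are little-endian).
template <typename Lane, int kLanes>
uint32_t BitMask(const uint8_t vec[16]) {
  static_assert(kLanes * sizeof(Lane) == 16, "must cover the full vector");
  uint32_t mask = 0;
  for (int i = 0; i < kLanes; ++i) {
    Lane lane;
    std::memcpy(&lane, vec + i * sizeof(Lane), sizeof(Lane));
    if (lane < 0) mask |= uint32_t{1} << i;  // lane's top bit was set
  }
  return mask;
}
// i8x16.bitmask -> BitMask<int8_t, 16>, i16x8.bitmask -> BitMask<int16_t, 8>,
// i32x4.bitmask -> BitMask<int32_t, 4>.
```

On x64 this maps directly onto pmovmskb (bytes) and movmskps (32-bit lanes); only the 16-bit variant needs extra work, as the code generator hunks below show.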
@@ -3441,6 +3441,15 @@ void Assembler::movmskps(Register dst, XMMRegister src) {
emit_sse_operand(dst, src);
}
void Assembler::pmovmskb(Register dst, XMMRegister src) {
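// SSE2 encoding 66 0F D7 /r (PMOVMSKB r32, xmm): copies the sign bit of each
// of the 16 source bytes into the low 16 bits of dst.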
EnsureSpace ensure_space(this);
emit_optional_rex_32(dst, src);
emit(0x66);
emit(0x0F);
emit(0xD7);
emit_sse_operand(dst, src);
}
// AVX instructions
void Assembler::vmovddup(XMMRegister dst, XMMRegister src) {
@@ -3634,6 +3643,15 @@ void Assembler::vucomiss(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
void Assembler::vpmovmskb(Register dst, XMMRegister src) {
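// dst is a general-purpose register, but emit_vex_prefix expects XMM
// operands, so wrap the register code (encoding: VEX.128.66.0F.WIG D7 /r).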
XMMRegister idst = XMMRegister::from_code(dst.code());
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
emit_vex_prefix(idst, xmm0, src, kL128, k66, k0F, kWIG);
emit(0xD7);
emit_sse_operand(idst, src);
}
void Assembler::vss(byte op, XMMRegister dst, XMMRegister src1,
                    XMMRegister src2) {
DCHECK(IsEnabled(AVX));
...
@@ -1124,6 +1124,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movmskpd(Register dst, XMMRegister src);
void pmovmskb(Register dst, XMMRegister src);

// SSE 4.1 instruction
void insertps(XMMRegister dst, XMMRegister src, byte imm8);
void insertps(XMMRegister dst, Operand src, byte imm8);
@@ -1393,6 +1395,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
XMMRegister idst = XMMRegister::from_code(dst.code());
vpd(0x50, idst, xmm0, src);
}
void vpmovmskb(Register dst, XMMRegister src);
void vcmpps(XMMRegister dst, XMMRegister src1, XMMRegister src2, int8_t cmp) {
vps(0xC2, dst, src1, src2);
emit(cmp);
...
@@ -141,6 +141,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Movups, movups)
AVX_OP(Movmskps, movmskps)
AVX_OP(Movmskpd, movmskpd)
AVX_OP(Pmovmskb, pmovmskb)
AVX_OP(Movss, movss)
AVX_OP(Movsd, movsd)
AVX_OP(Movdqu, movdqu)
...
@@ -2634,11 +2634,15 @@ void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X

// TODO(v8:10308) Bitmask operations are in prototype now, we can remove these
// guards when they go into the proposal.
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_IA32 && \
!V8_TARGET_ARCH_X64
void InstructionSelector::VisitI8x16BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI16x8BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI32x4BitMask(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_IA32
// && !V8_TARGET_ARCH_X64
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
...
@@ -3085,6 +3085,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pabsd(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kX64I32x4BitMask: {
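// Movmskps copies the sign bit of each of the four 32-bit lanes into the
// low four bits of dst, which is exactly i32x4.bitmask.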
__ Movmskps(i.OutputRegister(), i.InputSimd128Register(0));
break;
}
case kX64S128Zero: {
XMMRegister dst = i.OutputSimd128Register();
__ Xorps(dst, dst);
@@ -3273,6 +3277,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pabsw(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kX64I16x8BitMask: {
Register dst = i.OutputRegister();
XMMRegister tmp = i.TempSimd128Register(0);
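// Packsswb saturates the eight 16-bit input lanes into the high eight bytes
// of tmp (the low eight bytes come from tmp's old contents). Signed
// saturation preserves each lane's sign bit, so Pmovmskb yields the wanted
// mask in bits 8..15, and the shift moves it down to bits 0..7.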
__ Packsswb(tmp, i.InputSimd128Register(0));
__ Pmovmskb(dst, tmp);
__ shrq(dst, Immediate(8));
break;
}
case kX64I8x16Splat: {
XMMRegister dst = i.OutputSimd128Register();
if (HasRegisterInput(instr, 0)) {
@@ -3542,6 +3554,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pabsb(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kX64I8x16BitMask: {
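// Pmovmskb gathers the sign bit of each of the 16 bytes, which is exactly
// i8x16.bitmask.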
__ Pmovmskb(i.OutputRegister(), i.InputSimd128Register(0));
break;
}
case kX64S128And: {
__ Pand(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
...
@@ -241,6 +241,7 @@ namespace compiler {
V(X64I32x4GtU) \
V(X64I32x4GeU) \
V(X64I32x4Abs) \
V(X64I32x4BitMask) \
V(X64I16x8Splat) \
V(X64I16x8ExtractLaneU) \
V(X64I16x8ExtractLaneS) \
@@ -275,6 +276,7 @@ namespace compiler {
V(X64I16x8GeU) \
V(X64I16x8RoundingAverageU) \
V(X64I16x8Abs) \
V(X64I16x8BitMask) \
V(X64I8x16Splat) \
V(X64I8x16ExtractLaneU) \
V(X64I8x16ExtractLaneS) \
@@ -304,6 +306,7 @@ namespace compiler {
V(X64I8x16GeU) \
V(X64I8x16RoundingAverageU) \
V(X64I8x16Abs) \
V(X64I8x16BitMask) \
V(X64S128Zero) \
V(X64S128Not) \
V(X64S128And) \
...
@@ -213,6 +213,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I32x4GtU:
case kX64I32x4GeU:
case kX64I32x4Abs:
case kX64I32x4BitMask:
case kX64I16x8Splat:
case kX64I16x8ExtractLaneU:
case kX64I16x8ExtractLaneS:
@@ -247,6 +248,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I16x8GeU:
case kX64I16x8RoundingAverageU:
case kX64I16x8Abs:
case kX64I16x8BitMask:
case kX64I8x16Splat:
case kX64I8x16ExtractLaneU:
case kX64I8x16ExtractLaneS:
@@ -276,6 +278,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I8x16GeU:
case kX64I8x16RoundingAverageU:
case kX64I8x16Abs:
case kX64I8x16BitMask:
case kX64S128And:
case kX64S128Or:
case kX64S128Xor:
...
@@ -2730,6 +2730,7 @@ VISIT_ATOMIC_BINOP(Xor)
V(I32x4UConvertI16x8Low) \
V(I32x4UConvertI16x8High) \
V(I32x4Abs) \
V(I32x4BitMask) \
V(I16x8SConvertI8x16Low) \
V(I16x8SConvertI8x16High) \
V(I16x8Neg) \
@@ -2738,6 +2739,7 @@ VISIT_ATOMIC_BINOP(Xor)
V(I16x8Abs) \
V(I8x16Neg) \
V(I8x16Abs) \
V(I8x16BitMask) \
V(S128Not)

#define SIMD_SHIFT_OPCODES(V) \
@@ -3033,6 +3035,13 @@ void InstructionSelector::VisitI16x8UConvertI32x4(Node* node) {
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
}
void InstructionSelector::VisitI16x8BitMask(Node* node) {
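// The temp is written by Packsswb before the input's last use, so the input
// is kept in its own register via UseUniqueRegister.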
X64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
Emit(kX64I16x8BitMask, g.DefineAsRegister(node),
g.UseUniqueRegister(node->InputAt(0)), arraysize(temps), temps);
}
void InstructionSelector::VisitI8x16UConvertI16x8(Node* node) {
X64OperandGenerator g(this);
Emit(kX64I8x16UConvertI16x8, g.DefineSameAsFirst(node),
...
@@ -1490,6 +1490,10 @@ int DisassemblerX64::AVXInstruction(byte* data) {
current += PrintRightXMMOperand(current);
AppendToBuffer(",0x%x", *current++);
break;
case 0xD7:
AppendToBuffer("vpmovmskb %s,", NameOfCPURegister(regop));
current += PrintRightXMMOperand(current);
break;
#define DECLARE_SSE_AVX_DIS_CASE(instruction, notUsed1, notUsed2, opcode) \
case 0x##opcode: { \
AppendToBuffer("v" #instruction " %s,%s,", NameOfXMMRegister(regop), \
@@ -2124,7 +2128,10 @@ int DisassemblerX64::TwoByteOpcodeInstruction(byte* data) {
} else {
UnimplementedInstruction();
}
// Not every opcode here has an XMM register as the dst operand.
const char* regop_reg = opcode == 0xD7 ? NameOfCPURegister(regop)
                                       : NameOfXMMRegister(regop);
AppendToBuffer("%s %s,", mnemonic, regop_reg);
current += PrintRightXMMOperand(current);
if (opcode == 0xC2) {
const char* const pseudo_op[] = {"eq", "lt", "le", "unord",
...
@@ -435,6 +435,8 @@ TEST(DisasmX64) {
__ ucomisd(xmm0, xmm1);
__ pmovmskb(rdx, xmm9);
__ pcmpeqd(xmm1, xmm0);
__ punpckldq(xmm1, xmm11);
@@ -650,6 +652,7 @@ TEST(DisasmX64) {
__ vmovupd(xmm0, Operand(rbx, rcx, times_4, 10000));
__ vmovupd(Operand(rbx, rcx, times_4, 10000), xmm0);
__ vmovmskpd(r9, xmm4);
__ vpmovmskb(r10, xmm9);
__ vmovups(xmm5, xmm1);
__ vmovups(xmm5, Operand(rdx, 4));
...
@@ -1661,7 +1661,8 @@ WASM_SIMD_TEST(I16x8ReplaceLane) {
}
}

#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32 || \
    V8_TARGET_ARCH_X64
WASM_SIMD_TEST_NO_LOWERING(I8x16BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
@@ -1721,7 +1722,8 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4BitMask) {
CHECK_EQ(actual, expected);
}
}
#endif  // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_IA32 ||
        // V8_TARGET_ARCH_X64

WASM_SIMD_TEST(I8x16Splat) {
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
...