Commit 8b5480b2 authored by Zhou, Zhiguo, committed by Commit Bot

[wasm-simd] Implement the rest load_extend and load_splat on IA32

This CL implements the 2-lane load_extend and all of the load_splat
operations on IA32. The necessary assembler and disassembler support,
together with tests, is also added in this CL.
The newly added opcodes are: S8x16LoadSplat, S16x8LoadSplat,
S32x4LoadSplat, S64x2LoadSplat, I64x2Load32x2S, and I64x2Load32x2U.

Bug: v8:9886
Change-Id: I0a5dae0a683985c14c433ba9d85acbd1cee6705f
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1982989
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Zhiguo Zhou <zhiguo.zhou@intel.com>
Cr-Commit-Position: refs/heads/master@{#65937}
parent 164a0313
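
For context, here is a minimal C++ sketch (not part of this CL) of the scalar semantics of two of the newly supported operations, assuming little-endian memory; the Simd128 struct and the function names are hypothetical, chosen only for illustration.

#include <cstdint>
#include <cstring>

struct Simd128 {
  uint8_t bytes[16];
};

// S32x4LoadSplat: load one 32-bit value and replicate it into all four lanes.
inline Simd128 Load32Splat(const void* mem) {
  uint32_t v;
  std::memcpy(&v, mem, sizeof(v));
  Simd128 out;
  for (int lane = 0; lane < 4; ++lane) {
    std::memcpy(out.bytes + lane * 4, &v, sizeof(v));
  }
  return out;
}

// I64x2Load32x2S: load two 32-bit values and sign-extend each into a 64-bit lane.
inline Simd128 Load32x2S(const void* mem) {
  int32_t narrow[2];
  std::memcpy(narrow, mem, sizeof(narrow));
  int64_t wide[2] = {narrow[0], narrow[1]};
  Simd128 out;
  std::memcpy(out.bytes, wide, sizeof(wide));
  return out;
}
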
@@ -2312,6 +2312,15 @@ void Assembler::movups(Operand dst, XMMRegister src) {
emit_sse_operand(src, dst);
}
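// movddup xmm, xmm/m64 (SSE3, encoding F2 0F 12 /r): duplicate the low 64 bits
// of the source into both halves of the destination.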
void Assembler::movddup(XMMRegister dst, Operand src) {
DCHECK(IsEnabled(SSE3));
EnsureSpace ensure_space(this);
EMIT(0xF2);
EMIT(0x0F);
EMIT(0x12);
emit_sse_operand(dst, src);
}
void Assembler::shufps(XMMRegister dst, XMMRegister src, byte imm8) {
DCHECK(is_uint8(imm8));
EnsureSpace ensure_space(this);
@@ -1026,6 +1026,10 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
pextrb(Operand(dst), src, offset);
}
void pextrb(Operand dst, XMMRegister src, uint8_t offset);
// SSE3 instructions
void movddup(XMMRegister dst, Operand src);
void movddup(XMMRegister dst, XMMRegister src) { movddup(dst, Operand(src)); }
// Use SSE4_1 encoding for pextrw reg, xmm, imm8 for consistency
void pextrw(Register dst, XMMRegister src, uint8_t offset) {
pextrw(Operand(dst), src, offset);
@@ -1411,6 +1415,15 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
vinstr(0x5B, dst, xmm0, src, kF3, k0F, kWIG);
}
void vmovddup(XMMRegister dst, Operand src) {
vinstr(0x12, dst, xmm0, src, kF2, k0F, kWIG);
}
void vmovddup(XMMRegister dst, XMMRegister src) {
vmovddup(dst, Operand(src));
}
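// Only the memory-operand form of vbroadcastss is provided; the
// register-source form requires AVX2.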
void vbroadcastss(XMMRegister dst, Operand src) {
vinstr(0x18, dst, xmm0, src, k66, k0F38, kW0);
}
void vmovdqu(XMMRegister dst, Operand src) {
vinstr(0x6F, dst, xmm0, src, kF3, k0F, kWIG);
}
@@ -1541,6 +1541,20 @@ void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
add(esp, Immediate(kDoubleSize));
}
void TurboAssembler::Pinsrb(XMMRegister dst, Operand src, int8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrb(dst, dst, src, imm8);
return;
}
if (CpuFeatures::IsSupported(SSE4_1)) {
CpuFeatureScope sse_scope(this, SSE4_1);
pinsrb(dst, src, imm8);
return;
}
FATAL("no AVX or SSE4.1 support");
}
void TurboAssembler::Pinsrd(XMMRegister dst, Operand src, uint8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
@@ -1571,6 +1585,27 @@ void TurboAssembler::Pinsrd(XMMRegister dst, Operand src, uint8_t imm8) {
add(esp, Immediate(kDoubleSize));
}
void TurboAssembler::Pinsrw(XMMRegister dst, Operand src, int8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpinsrw(dst, dst, src, imm8);
return;
} else {
pinsrw(dst, src, imm8);
return;
}
}
void TurboAssembler::Vbroadcastss(XMMRegister dst, Operand src) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vbroadcastss(dst, src);
return;
}
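// SSE fallback: load the 32-bit value and replicate it into all four lanes
// with shufps(0).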
movss(dst, src);
shufps(dst, dst, static_cast<byte>(0));
}
void TurboAssembler::Lzcnt(Register dst, Operand src) {
if (CpuFeatures::IsSupported(LZCNT)) {
CpuFeatureScope scope(this, LZCNT);
@@ -294,6 +294,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP3_XO(Punpcklbw, punpcklbw)
AVX_OP3_XO(Punpckhbw, punpckhbw)
AVX_OP3_XO(Punpckldq, punpckldq)
AVX_OP3_XO(Punpcklqdq, punpcklqdq)
AVX_OP3_XO(Pxor, pxor)
AVX_OP3_XO(Andps, andps)
AVX_OP3_XO(Andnps, andnps)
@@ -362,6 +363,12 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
} \
UNREACHABLE(); \
}
#define AVX_OP2_XO_SSE3(macro_name, name) \
AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, XMMRegister, SSE3) \
AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, SSE3)
AVX_OP2_XO_SSE3(Movddup, movddup)
#undef AVX_OP2_XO_SSE3
#define AVX_OP2_XO_SSE4(macro_name, name) \
AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, XMMRegister, SSE4_1) \
AVX_OP2_WITH_TYPE_SCOPE(macro_name, name, XMMRegister, Operand, SSE4_1)
@@ -369,8 +376,10 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP2_XO_SSE4(Ptest, ptest)
AVX_OP2_XO_SSE4(Pmovsxbw, pmovsxbw)
AVX_OP2_XO_SSE4(Pmovsxwd, pmovsxwd)
AVX_OP2_XO_SSE4(Pmovsxdq, pmovsxdq)
AVX_OP2_XO_SSE4(Pmovzxbw, pmovzxbw)
AVX_OP2_XO_SSE4(Pmovzxwd, pmovzxwd)
AVX_OP2_XO_SSE4(Pmovzxdq, pmovzxdq)
#undef AVX_OP2_WITH_TYPE_SCOPE
#undef AVX_OP2_XO_SSE4
@@ -397,10 +406,19 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Pextrb(Register dst, XMMRegister src, uint8_t imm8);
void Pextrw(Register dst, XMMRegister src, uint8_t imm8);
void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
void Pinsrb(XMMRegister dst, Register src, int8_t imm8) {
Pinsrb(dst, Operand(src), imm8);
}
void Pinsrb(XMMRegister dst, Operand src, int8_t imm8);
void Pinsrd(XMMRegister dst, Register src, uint8_t imm8) {
Pinsrd(dst, Operand(src), imm8);
}
void Pinsrd(XMMRegister dst, Operand src, uint8_t imm8);
void Pinsrw(XMMRegister dst, Register src, int8_t imm8) {
Pinsrw(dst, Operand(src), imm8);
}
void Pinsrw(XMMRegister dst, Operand src, int8_t imm8);
void Vbroadcastss(XMMRegister dst, Operand src);
// Expression support
// cvtsi2sd instruction only writes to the low 64-bit of dst register, which
@@ -82,8 +82,10 @@
#define SSE4_RM_INSTRUCTION_LIST(V) \
V(pmovsxbw, 66, 0F, 38, 20) \
V(pmovsxwd, 66, 0F, 38, 23) \
V(pmovsxdq, 66, 0F, 38, 25) \
V(pmovzxbw, 66, 0F, 38, 30) \
V(pmovzxwd, 66, 0F, 38, 33) \
V(pmovzxdq, 66, 0F, 38, 35) \
V(ptest, 66, 0F, 38, 17)
#endif // V8_CODEGEN_IA32_SSE_INSTR_H_
@@ -3745,6 +3745,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ mov(esp, tmp);
break;
}
case kIA32S8x16LoadSplat: {
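// Broadcast a byte: insert it into lane 0, then shuffle with an all-zero
// pshufb index mask (the zeroed scratch register) so byte 0 fills all 16 lanes.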
__ Pinsrb(i.OutputSimd128Register(), i.MemoryOperand(), 0);
__ Pxor(kScratchDoubleReg, kScratchDoubleReg);
__ Pshufb(i.OutputSimd128Register(), kScratchDoubleReg);
break;
}
case kIA32S16x8LoadSplat: {
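// Broadcast a 16-bit value: insert it into lane 0, copy it across the low
// four lanes with pshuflw(0), then duplicate the low quadword with punpcklqdq.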
__ Pinsrw(i.OutputSimd128Register(), i.MemoryOperand(), 0);
__ Pshuflw(i.OutputSimd128Register(), i.OutputSimd128Register(),
static_cast<uint8_t>(0));
__ Punpcklqdq(i.OutputSimd128Register(), i.OutputSimd128Register());
break;
}
case kIA32S32x4LoadSplat: {
__ Vbroadcastss(i.OutputSimd128Register(), i.MemoryOperand());
break;
}
case kIA32S64x2LoadSplat: {
__ Movddup(i.OutputSimd128Register(), i.MemoryOperand());
break;
}
case kIA32I16x8Load8x8S: {
__ Pmovsxbw(i.OutputSimd128Register(), i.MemoryOperand());
break;
@@ -3761,6 +3782,14 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pmovzxwd(i.OutputSimd128Register(), i.MemoryOperand());
break;
}
case kIA32I64x2Load32x2S: {
__ Pmovsxdq(i.OutputSimd128Register(), i.MemoryOperand());
break;
}
case kIA32I64x2Load32x2U: {
__ Pmovzxdq(i.OutputSimd128Register(), i.MemoryOperand());
break;
}
case kIA32S32x4Swizzle: {
DCHECK_EQ(2, instr->InputCount());
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(1));
@@ -348,10 +348,16 @@ namespace compiler {
V(IA32S128AndNot) \
V(IA32S8x16Swizzle) \
V(IA32S8x16Shuffle) \
V(IA32S8x16LoadSplat) \
V(IA32S16x8LoadSplat) \
V(IA32S32x4LoadSplat) \
V(IA32S64x2LoadSplat) \
V(IA32I16x8Load8x8S) \
V(IA32I16x8Load8x8U) \
V(IA32I32x4Load16x4S) \
V(IA32I32x4Load16x4U) \
V(IA32I64x2Load32x2S) \
V(IA32I64x2Load32x2U) \
V(IA32S32x4Swizzle) \
V(IA32S32x4Shuffle) \
V(IA32S16x8Blend) \
@@ -390,10 +390,16 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32Movsd:
case kIA32Movdqu:
// Moves are used for memory load/store operations.
case kIA32S8x16LoadSplat:
case kIA32S16x8LoadSplat:
case kIA32S32x4LoadSplat:
case kIA32S64x2LoadSplat:
case kIA32I16x8Load8x8S:
case kIA32I16x8Load8x8U:
case kIA32I32x4Load16x4S:
case kIA32I32x4Load16x4U:
case kIA32I64x2Load32x2S:
case kIA32I64x2Load32x2U:
return instr->HasOutput() ? kIsLoadOperation : kHasSideEffect;
case kIA32Peek:
@@ -350,18 +350,16 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
InstructionCode opcode = kArchNop;
switch (params.transformation) {
case LoadTransformation::kS8x16LoadSplat:
// TODO(zhiguo.zhou@intel.com): Implement the rest of load splat and load
// extend operations.
UNIMPLEMENTED();
opcode = kIA32S8x16LoadSplat;
break;
case LoadTransformation::kS16x8LoadSplat:
UNIMPLEMENTED();
opcode = kIA32S16x8LoadSplat;
break;
case LoadTransformation::kS32x4LoadSplat:
UNIMPLEMENTED();
opcode = kIA32S32x4LoadSplat;
break;
case LoadTransformation::kS64x2LoadSplat:
UNIMPLEMENTED();
opcode = kIA32S64x2LoadSplat;
break;
case LoadTransformation::kI16x8Load8x8S:
opcode = kIA32I16x8Load8x8S;
@@ -376,10 +374,10 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
opcode = kIA32I32x4Load16x4U;
break;
case LoadTransformation::kI64x2Load32x2S:
UNIMPLEMENTED();
opcode = kIA32I64x2Load32x2S;
break;
case LoadTransformation::kI64x2Load32x2U:
UNIMPLEMENTED();
opcode = kIA32I64x2Load32x2U;
break;
default:
UNREACHABLE();
@@ -692,6 +692,10 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
int mod, regop, rm, vvvv = vex_vreg();
get_modrm(*current, &mod, &regop, &rm);
switch (opcode) {
case 0x18:
AppendToBuffer("vbroadcastss %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x99:
AppendToBuffer("vfmadd132s%c %s,%s,", float_size_code(),
NameOfXMMRegister(regop), NameOfXMMRegister(vvvv));
@@ -846,6 +850,10 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
int mod, regop, rm, vvvv = vex_vreg();
get_modrm(*current, &mod, &regop, &rm);
switch (opcode) {
case 0x12:
AppendToBuffer("vmovddup %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x51:
AppendToBuffer("vsqrtsd %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
@@ -2430,6 +2438,12 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("movsd %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
} else if (b2 == 0x12) {
data += 3;
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("movddup %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
} else if (b2 == 0x5A) {
data += 3;
int mod, regop, rm;
@@ -579,6 +579,8 @@ TEST(DisasmIa320) {
CpuFeatureScope scope(&assm, SSE3);
__ haddps(xmm1, xmm0);
__ haddps(xmm1, Operand(ebx, ecx, times_4, 10000));
__ movddup(xmm1, Operand(eax, 5));
__ movddup(xmm1, xmm2);
}
}
@@ -770,6 +772,9 @@ TEST(DisasmIa320) {
__ vcvttps2dq(xmm1, xmm0);
__ vcvttps2dq(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovddup(xmm1, xmm2);
__ vmovddup(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vbroadcastss(xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovdqu(xmm0, Operand(ebx, ecx, times_4, 10000));
__ vmovdqu(Operand(ebx, ecx, times_4, 10000), xmm0);
__ vmovd(xmm0, edi);
@@ -3304,9 +3304,6 @@ WASM_SIMD_TEST(SimdLoadStoreLoadMemargOffset) {
}
}
#if !V8_TARGET_ARCH_IA32
// TODO(zhiguo.zhou@intel.com): Add the tests on IA32 once these operations are
// implemented.
template <typename T>
void RunLoadSplatTest(ExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode op) {
@@ -3343,7 +3340,6 @@ WASM_SIMD_TEST_NO_LOWERING(S32x4LoadSplat) {
WASM_SIMD_TEST_NO_LOWERING(S64x2LoadSplat) {
RunLoadSplatTest<int64_t>(execution_tier, lower_simd, kExprS64x2LoadSplat);
}
#endif // !V8_TARGET_ARCH_IA32
template <typename S, typename T>
void RunLoadExtendTest(ExecutionTier execution_tier, LowerSimd lower_simd,
@@ -3388,7 +3384,6 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4Load16x4S) {
kExprI32x4Load16x4S);
}
#if !V8_TARGET_ARCH_IA32
WASM_SIMD_TEST_NO_LOWERING(I64x2Load32x2U) {
RunLoadExtendTest<uint32_t, uint64_t>(execution_tier, lower_simd,
kExprI64x2Load32x2U);
@@ -3398,7 +3393,6 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2Load32x2S) {
RunLoadExtendTest<int32_t, int64_t>(execution_tier, lower_simd,
kExprI64x2Load32x2S);
}
#endif // !V8_TARGET_ARCH_IA32
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 || \
V8_TARGET_ARCH_ARM