Commit fa54ae1d authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][ia32] Prototype extended multiply

The implementation is almost identical to x64, except that in the
instruction selector we allow the second operand to be a slot when AVX
is supported, and therefore read it with InputOperand in the codegen.
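
For context, a minimal sketch of the selection pattern this describes,
with hypothetical helper and visitor names (the actual selector changes
are elided from the diff below):

// Hypothetical sketch, not this CL's actual code: how an ext-mul visitor
// can relax its operand constraints under AVX.
void VisitExtMul(InstructionSelector* selector, Node* node,
                 ArchOpcode opcode) {
  IA32OperandGenerator g(selector);
  if (CpuFeatures::IsSupported(AVX)) {
    // Non-destructive AVX form: the second input may be a stack slot,
    // read in the codegen via i.InputOperand(1).
    selector->Emit(opcode, g.DefineAsRegister(node),
                   g.UseRegister(node->InputAt(0)), g.Use(node->InputAt(1)));
  } else {
    // Destructive SSE form: dst must alias the first input and both
    // inputs must be in registers.
    selector->Emit(opcode, g.DefineSameAsFirst(node),
                   g.UseRegister(node->InputAt(0)),
                   g.UseRegister(node->InputAt(1)));
  }
}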

Bug: v8:11008
Change-Id: I5b5ea4b5058dc0bf5ff1c24a67f9b787c5312106
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2576887
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71705}
parent 2fe326ce
......@@ -2474,6 +2474,14 @@ void Assembler::movdqu(XMMRegister dst, Operand src) {
emit_sse_operand(dst, src);
}
void Assembler::movdqu(XMMRegister dst, XMMRegister src) {
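// Register-to-register move: reuse the store encoding (F3 0F 7F is
// movdqu xmm/m128, xmm), so src is encoded in the ModRM reg field and dst in
// the r/m field.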
EnsureSpace ensure_space(this);
EMIT(0xF3);
EMIT(0x0F);
EMIT(0x7F);
emit_sse_operand(src, dst);
}
void Assembler::prefetch(Operand src, int level) {
DCHECK(is_uint2(level));
EnsureSpace ensure_space(this);
......
......@@ -987,6 +987,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void movdqa(Operand dst, XMMRegister src);
void movdqu(XMMRegister dst, Operand src);
void movdqu(Operand dst, XMMRegister src);
void movdqu(XMMRegister dst, XMMRegister src);
void movdq(bool aligned, XMMRegister dst, Operand src) {
if (aligned) {
movdqa(dst, src);
......
......@@ -360,6 +360,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP3_XO(Orps, orps)
AVX_OP3_XO(Orpd, orpd)
AVX_OP3_XO(Andnpd, andnpd)
AVX_OP3_XO(Pmullw, pmullw)
#undef AVX_OP3_XO
#undef AVX_OP3_WITH_TYPE
......
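The AVX_OP3_XO entries wrap an SSE instruction and its AVX counterpart behind
a single macro-assembler name; a minimal sketch of how such a wrapper can look
(the exact V8 macro definitions are not part of this diff, so the shape below
is an assumption):

// Sketch only: generates e.g. Pmullw(dst, src), which emits vpmullw under AVX
// (non-destructive, with dst doubling as the first source) and falls back to
// plain pmullw otherwise.
#define AVX_OP3_WITH_TYPE(macro_name, name, dst_type, src_type) \
  void macro_name(dst_type dst, src_type src) {                 \
    if (CpuFeatures::IsSupported(AVX)) {                        \
      CpuFeatureScope scope(this, AVX);                         \
      v##name(dst, dst, src);                                   \
    } else {                                                    \
      name(dst, src);                                           \
    }                                                           \
  }
#define AVX_OP3_XO(macro_name, name)                            \
  AVX_OP3_WITH_TYPE(macro_name, name, XMMRegister, XMMRegister) \
  AVX_OP3_WITH_TYPE(macro_name, name, XMMRegister, Operand)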
......@@ -39,6 +39,8 @@
V(psraw, 66, 0F, E1) \
V(psrad, 66, 0F, E2) \
V(pavgw, 66, 0F, E3) \
V(pmulhuw, 66, 0F, E4) \
V(pmulhw, 66, 0F, E5) \
V(psrlw, 66, 0F, D1) \
V(psrld, 66, 0F, D2) \
V(psrlq, 66, 0F, D3) \
......@@ -75,6 +77,7 @@
V(pabsd, 66, 0F, 38, 1E)
#define SSE4_INSTRUCTION_LIST(V) \
V(pmuldq, 66, 0F, 38, 28) \
V(pcmpeqq, 66, 0F, 38, 29) \
V(packusdw, 66, 0F, 38, 2B) \
V(pminsb, 66, 0F, 38, 38) \
......
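Each V(...) entry in these lists supplies a mnemonic and its encoding bytes
(mandatory prefix, opcode escape(s), opcode). The lists are expanded into
assembler methods elsewhere; a plausible expansion, with assumed helper names
(the real declaration macro is not shown in this diff):

// Sketch only: one possible expansion of the instruction lists above.
#define DECLARE_SSE2_INSTRUCTION(instruction, prefix, escape, opcode) \
  void instruction(XMMRegister dst, XMMRegister src) {                \
    sse2_instr(dst, src, 0x##prefix, 0x##escape, 0x##opcode);         \
  }                                                                   \
  void instruction(XMMRegister dst, Operand src) {                    \
    sse2_instr(dst, src, 0x##prefix, 0x##escape, 0x##opcode);         \
  }
SSE2_INSTRUCTION_LIST(DECLARE_SSE2_INSTRUCTION)
#undef DECLARE_SSE2_INSTRUCTION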
......@@ -561,6 +561,74 @@ class OutOfLineRecordWrite final : public OutOfLineCode {
} \
} while (false)
// 1. Unpack src1 with itself into the even-numbered dword lanes of scratch.
// 2. Unpack src0 with itself into the even-numbered dword lanes of dst.
// 3. Multiply dst by scratch; pmuldq/pmuludq only read the even dword lanes.
// For non-AVX, use the non-destructive pshufd instead of punpckldq/punpckhdq,
// since dst aliases src0 and src1 must not be clobbered.
// We only need SSE4_1 for pmuldq (signed ext mul), but require it in both the
// signed and unsigned cases to reduce macro duplication.
#define ASSEMBLE_SIMD_I64X2_EXT_MUL(UNPACK_INSTR, MUL_INSTR, SHUFFLE_CONST) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
XMMRegister src0 = i.InputSimd128Register(0); \
Operand src1 = i.InputOperand(1); \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(tasm(), AVX); \
__ movdqu(kScratchDoubleReg, src1); \
__ v##UNPACK_INSTR(kScratchDoubleReg, kScratchDoubleReg, \
kScratchDoubleReg); \
__ v##UNPACK_INSTR(dst, src0, src0); \
__ v##MUL_INSTR(dst, kScratchDoubleReg, dst); \
} else { \
CpuFeatureScope sse4_scope(tasm(), SSE4_1); \
DCHECK_EQ(dst, src0); \
__ pshufd(kScratchDoubleReg, src0, SHUFFLE_CONST); \
__ pshufd(dst, src1, SHUFFLE_CONST); \
__ MUL_INSTR(dst, kScratchDoubleReg); \
} \
} while (false)
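// Illustrative scalar model (an editor's sketch, not part of the V8 sources):
// pshufd with 0x50 yields {a0, a0, a1, a1}, so the two low dwords land in the
// even lanes that pmuldq/pmuludq read; 0xFA yields {a2, a2, a3, a3} for the
// high half. The "low, signed" case thus computes:
static void ExtMulLowI32x4SScalarModel(const int32_t a[4], const int32_t b[4],
                                       int64_t out[2]) {
  out[0] = int64_t{a[0]} * int64_t{b[0]};  // 64-bit product of lane 0
  out[1] = int64_t{a[1]} * int64_t{b[1]};  // 64-bit product of lane 1
}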
// 1. Multiply into scratch, keeping the low 16 bits of each product (pmullw).
// 2. Multiply into dst, keeping the high 16 bits (signed or unsigned).
// 3. Unpack and interleave scratch and dst into dst, forming the full 32-bit
//    products.
#define ASSEMBLE_SIMD_I32X4_EXT_MUL(MUL_HIGH_INSTR, UNPACK_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
XMMRegister src0 = i.InputSimd128Register(0); \
Operand src1 = i.InputOperand(1); \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(tasm(), AVX); \
__ vpmullw(kScratchDoubleReg, src0, src1); \
__ v##MUL_HIGH_INSTR(dst, src0, src1); \
__ v##UNPACK_INSTR(dst, kScratchDoubleReg, dst); \
} else { \
DCHECK_EQ(dst, src0); \
__ movdqu(kScratchDoubleReg, src0); \
__ pmullw(kScratchDoubleReg, src1); \
__ MUL_HIGH_INSTR(dst, src1); \
__ UNPACK_INSTR(kScratchDoubleReg, dst); \
__ movdqu(dst, kScratchDoubleReg); \
} \
} while (false)
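// Illustrative scalar model (an editor's sketch, not part of the V8 sources):
// interleaving pmullw's low 16 bits with pmulhw/pmulhuw's high 16 bits
// reconstructs the full 32-bit product of two 16-bit lanes.
static int32_t ExtMulI16LaneScalarModel(int16_t a, int16_t b) {
  int32_t product = int32_t{a} * int32_t{b};
  uint16_t lo = static_cast<uint16_t>(product);        // pmullw result lane
  uint16_t hi = static_cast<uint16_t>(product >> 16);  // pmulhw result lane
  return static_cast<int32_t>(uint32_t{hi} << 16 | lo);  // punpck interleave
}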
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(EXTEND_MACRO_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
__ EXTEND_MACRO_INSTR(kScratchDoubleReg, i.InputSimd128Register(0)); \
__ EXTEND_MACRO_INSTR(dst, i.InputOperand(1)); \
__ Pmullw(dst, kScratchDoubleReg); \
} while (false)
#define ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(EXTEND_MACRO_INSTR) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
__ Palignr(kScratchDoubleReg, i.InputSimd128Register(0), uint8_t{8}); \
__ EXTEND_MACRO_INSTR(kScratchDoubleReg, kScratchDoubleReg); \
__ Palignr(dst, i.InputOperand(1), uint8_t{8}); \
__ EXTEND_MACRO_INSTR(dst, dst); \
__ Pmullw(dst, kScratchDoubleReg); \
} while (false)
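// Illustrative scalar model (an editor's sketch, not part of the V8 sources):
// the low variant widens the low eight bytes of each input with
// pmovsxbw/pmovzxbw and multiplies with pmullw; the high variant first uses
// palignr by 8 to move bytes 8..15 into the low half before widening.
static void ExtMulHighI8x16SScalarModel(const int8_t a[16], const int8_t b[16],
                                        int16_t out[8]) {
  for (int i = 0; i < 8; ++i) {
    out[i] = static_cast<int16_t>(int32_t{a[8 + i]} * int32_t{b[8 + i]});
  }
}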
void CodeGenerator::AssembleDeconstructFrame() {
__ mov(esp, ebp);
__ pop(ebp);
......@@ -2091,6 +2159,54 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Roundpd(i.OutputSimd128Register(), i.InputDoubleRegister(0), mode);
break;
}
case kIA32I64x2ExtMulLowI32x4S: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuldq, 0x50);
break;
}
case kIA32I64x2ExtMulHighI32x4S: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuldq, 0xFA);
break;
}
case kIA32I64x2ExtMulLowI32x4U: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckldq, pmuludq, 0x50);
break;
}
case kIA32I64x2ExtMulHighI32x4U: {
ASSEMBLE_SIMD_I64X2_EXT_MUL(punpckhdq, pmuludq, 0xFA);
break;
}
case kIA32I32x4ExtMulLowI16x8S: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpcklwd);
break;
}
case kIA32I32x4ExtMulHighI16x8S: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhw, punpckhwd);
break;
}
case kIA32I32x4ExtMulLowI16x8U: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpcklwd);
break;
}
case kIA32I32x4ExtMulHighI16x8U: {
ASSEMBLE_SIMD_I32X4_EXT_MUL(pmulhuw, punpckhwd);
break;
}
case kIA32I16x8ExtMulLowI8x16S: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovsxbw);
break;
}
case kIA32I16x8ExtMulHighI8x16S: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovsxbw);
break;
}
case kIA32I16x8ExtMulLowI8x16U: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW(Pmovzxbw);
break;
}
case kIA32I16x8ExtMulHighI8x16U: {
ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH(Pmovzxbw);
break;
}
case kIA32I64x2SplatI32Pair: {
XMMRegister dst = i.OutputSimd128Register();
__ Pinsrd(dst, i.InputRegister(0), 0);
......@@ -5238,6 +5354,12 @@ void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) {
#undef ASSEMBLE_SIMD_IMM_SHUFFLE
#undef ASSEMBLE_SIMD_ALL_TRUE
#undef ASSEMBLE_SIMD_SHIFT
#undef ASSEMBLE_SIMD_PINSR
#undef ASSEMBLE_SIMD_SIGN_SELECT
#undef ASSEMBLE_SIMD_I64X2_EXT_MUL
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_LOW
#undef ASSEMBLE_SIMD_I16X8_EXT_MUL_HIGH
#undef ASSEMBLE_SIMD_I32X4_EXT_MUL
} // namespace compiler
} // namespace internal
......
......@@ -150,6 +150,10 @@ namespace compiler {
V(IA32I64x2BitMask) \
V(IA32I64x2Eq) \
V(IA32I64x2SignSelect) \
V(IA32I64x2ExtMulLowI32x4S) \
V(IA32I64x2ExtMulHighI32x4S) \
V(IA32I64x2ExtMulLowI32x4U) \
V(IA32I64x2ExtMulHighI32x4U) \
V(IA32F32x4Splat) \
V(SSEF32x4ExtractLane) \
V(AVXF32x4ExtractLane) \
......@@ -235,6 +239,10 @@ namespace compiler {
V(IA32I32x4BitMask) \
V(IA32I32x4DotI16x8S) \
V(IA32I32x4SignSelect) \
V(IA32I32x4ExtMulLowI16x8S) \
V(IA32I32x4ExtMulHighI16x8S) \
V(IA32I32x4ExtMulLowI16x8U) \
V(IA32I32x4ExtMulHighI16x8U) \
V(IA32I16x8Splat) \
V(IA32I16x8ExtractLaneS) \
V(IA32I16x8SConvertI8x16Low) \
......@@ -289,6 +297,10 @@ namespace compiler {
V(IA32I16x8Abs) \
V(IA32I16x8BitMask) \
V(IA32I16x8SignSelect) \
V(IA32I16x8ExtMulLowI8x16S) \
V(IA32I16x8ExtMulHighI8x16S) \
V(IA32I16x8ExtMulLowI8x16U) \
V(IA32I16x8ExtMulHighI8x16U) \
V(IA32I8x16Splat) \
V(IA32I8x16ExtractLaneS) \
V(IA32Pinsrb) \
......
......@@ -129,6 +129,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I64x2BitMask:
case kIA32I64x2Eq:
case kIA32I64x2SignSelect:
case kIA32I64x2ExtMulLowI32x4S:
case kIA32I64x2ExtMulHighI32x4S:
case kIA32I64x2ExtMulLowI32x4U:
case kIA32I64x2ExtMulHighI32x4U:
case kIA32F32x4Splat:
case kSSEF32x4ExtractLane:
case kAVXF32x4ExtractLane:
......@@ -214,6 +218,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I32x4BitMask:
case kIA32I32x4DotI16x8S:
case kIA32I32x4SignSelect:
case kIA32I32x4ExtMulLowI16x8S:
case kIA32I32x4ExtMulHighI16x8S:
case kIA32I32x4ExtMulLowI16x8U:
case kIA32I32x4ExtMulHighI16x8U:
case kIA32I16x8Splat:
case kIA32I16x8ExtractLaneS:
case kIA32I16x8SConvertI8x16Low:
......@@ -268,6 +276,10 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I16x8Abs:
case kIA32I16x8BitMask:
case kIA32I16x8SignSelect:
case kIA32I16x8ExtMulLowI8x16S:
case kIA32I16x8ExtMulHighI8x16S:
case kIA32I16x8ExtMulLowI8x16U:
case kIA32I16x8ExtMulHighI8x16U:
case kIA32I8x16Splat:
case kIA32I8x16ExtractLaneS:
case kIA32Pinsrb:
......
......@@ -2206,8 +2206,20 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I64x2Add) \
V(I64x2Sub) \
V(I64x2Eq) \
V(I64x2ExtMulLowI32x4S) \
V(I64x2ExtMulHighI32x4S) \
V(I64x2ExtMulLowI32x4U) \
V(I64x2ExtMulHighI32x4U) \
V(I32x4DotI16x8S) \
V(I32x4ExtMulLowI16x8S) \
V(I32x4ExtMulHighI16x8S) \
V(I32x4ExtMulLowI16x8U) \
V(I32x4ExtMulHighI16x8U) \
V(I16x8RoundingAverageU) \
V(I16x8ExtMulLowI8x16S) \
V(I16x8ExtMulHighI8x16S) \
V(I16x8ExtMulLowI8x16U) \
V(I16x8ExtMulHighI8x16U) \
V(I8x16RoundingAverageU)
#define SIMD_UNOP_LIST(V) \
......
......@@ -2739,7 +2739,8 @@ void InstructionSelector::VisitI64x2Eq(Node* node) { UNIMPLEMENTED(); }
// && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM
// && !V8_TARGET_ARCH_MIPS64 && !V8_TARGET_ARCH_MIPS
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64 && \
!V8_TARGET_ARCH_IA32
// TODO(v8:11008) Prototype extended multiplication.
void InstructionSelector::VisitI64x2ExtMulLowI32x4S(Node* node) {
UNIMPLEMENTED();
......@@ -2778,6 +2779,7 @@ void InstructionSelector::VisitI16x8ExtMulHighI8x16U(Node* node) {
UNIMPLEMENTED();
}
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM && !V8_TARGET_ARCH_X64
// && !V8_TARGET_ARCH_IA32
#if !V8_TARGET_ARCH_ARM64
// TODO(v8:10971) Prototype i16x8.q15mulr_sat_s
......
......@@ -475,6 +475,7 @@ TEST(DisasmIa320) {
__ movdqa(Operand(ebx, ecx, times_4, 10000), xmm0);
__ movdqu(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movdqu(Operand(ebx, ecx, times_4, 10000), xmm0);
__ movdqu(xmm1, xmm0);
__ movapd(xmm0, xmm1);
__ movapd(xmm0, Operand(edx, 4));
......
......@@ -2335,7 +2335,8 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8Q15MulRSatS) {
}
#endif // V8_TARGET_ARCH_ARM64
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64 || \
V8_TARGET_ARCH_IA32
// TODO(v8:11008) Prototype extended multiplication.
namespace {
enum class MulHalf { kLow, kHigh };
......@@ -2447,7 +2448,8 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2ExtMulHighI32x4U) {
kExprI64x2ExtMulHighI32x4U, MultiplyLong,
kExprI32x4Splat, MulHalf::kHigh);
}
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_X64 ||
// V8_TARGET_ARCH_IA32
WASM_SIMD_TEST(I32x4DotI16x8S) {
WasmRunner<int32_t, int16_t, int16_t> r(execution_tier, lower_simd);
......