Commit 80b97562 authored by Ng Zhi An, committed by Commit Bot

Reland "[wasm-simd][x64] Prototype i32x4.widen_i8x16_{s,u}"

This is a reland of 5a0938e5

The fix is in instruction-selector-x64.cc: the OpParameter is a
uint8_t, but I had typo-ed it as int8_t.

Drive-by fix to macro-assembler-x64.cc to use movaps instead of movapd.

Original change's description:
> [wasm-simd][x64] Prototype i32x4.widen_i8x16_{s,u}
>
> This prototypes i32x4.widen_i8x16_s and i32x4.widen_i8x16_u for x64. It
> uses some masks and pshufb for the widening.  These masks (3 for each
> instruction) are stored as external references.
>
> Bug: v8:11297
> Change-Id: I6c8f55426bbb44b16ed552f393762c34c2524b55
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2617389
> Commit-Queue: Zhi An Ng <zhin@chromium.org>
> Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
> Reviewed-by: Georg Neis <neis@chromium.org>
> Reviewed-by: Andreas Haas <ahaas@chromium.org>
> Cr-Commit-Position: refs/heads/master@{#72301}

Bug: v8:11297
Change-Id: Ie1df32bd4ef3c71532cab6f82a515f619b6a2b67
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2648967
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Georg Neis <neis@chromium.org>
Reviewed-by: Andreas Haas <ahaas@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72330}
parent c5c5d144
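
For orientation, the semantics implied by the masks and the pmovsxbd/pmovzxbd fast path in the diff below can be modeled with a scalar sketch. The helpers here are hypothetical and not part of this change: lane group laneidx in [0, 3] selects bytes 4*laneidx .. 4*laneidx+3 of the i8x16 input and sign- or zero-extends each into a 32-bit lane.

// Hypothetical scalar model of i32x4.widen_i8x16_{s,u} (illustration only,
// not part of this CL).
#include <array>
#include <cstdint>

std::array<int32_t, 4> ScalarWidenI8x16S(const std::array<int8_t, 16>& src,
                                         uint8_t laneidx) {
  std::array<int32_t, 4> dst{};
  for (int i = 0; i < 4; ++i) {
    dst[i] = static_cast<int32_t>(src[4 * laneidx + i]);  // sign-extend
  }
  return dst;
}

std::array<uint32_t, 4> ScalarWidenI8x16U(const std::array<uint8_t, 16>& src,
                                          uint8_t laneidx) {
  std::array<uint32_t, 4> dst{};
  for (int i = 0; i < 4; ++i) {
    dst[i] = static_cast<uint32_t>(src[4 * laneidx + i]);  // zero-extend
  }
  return dst;
}
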
......@@ -105,6 +105,43 @@ constexpr struct alignas(16) {
} wasm_uint32_max_as_double = {uint64_t{0x41efffffffe00000},
uint64_t{0x41efffffffe00000}};
// Helper masks used for i32x4.widen_i8x16_{s,u}.
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
} i32x4_widen_i8x16_s1_mask = {uint64_t{0x05FFFFFF'04FFFFFF},
uint64_t{0x07FFFFFF'06FFFFFF}};
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
} i32x4_widen_i8x16_s2_mask = {uint64_t{0x09FFFFFF'08FFFFFF},
uint64_t{0x0BFFFFFF'0AFFFFFF}};
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
} i32x4_widen_i8x16_s3_mask = {uint64_t{0x0DFFFFFF'0CFFFFFF},
uint64_t{0x0FFFFFFF'0EFFFFFF}};
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
} i32x4_widen_i8x16_u1_mask = {uint64_t{0xFFFFFF05'FFFFFF04},
uint64_t{0xFFFFFF07'FFFFFF06}};
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
} i32x4_widen_i8x16_u2_mask = {uint64_t{0xFFFFFF09'FFFFFF08},
uint64_t{0xFFFFFF0B'FFFFFF0A}};
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
} i32x4_widen_i8x16_u3_mask = {uint64_t{0xFFFFFF0D'FFFFFF0C},
uint64_t{0xFFFFFF0F'FFFFFF0E}};
// Implementation of ExternalReference
static ExternalReference::Type BuiltinCallTypeForResultSize(int result_size) {
......@@ -539,6 +576,36 @@ ExternalReference ExternalReference::address_of_wasm_uint32_max_as_double() {
reinterpret_cast<Address>(&wasm_uint32_max_as_double));
}
ExternalReference ExternalReference::address_of_i32x4_widen_i8x16_s1_mask() {
return ExternalReference(
reinterpret_cast<Address>(&i32x4_widen_i8x16_s1_mask));
}
ExternalReference ExternalReference::address_of_i32x4_widen_i8x16_s2_mask() {
return ExternalReference(
reinterpret_cast<Address>(&i32x4_widen_i8x16_s2_mask));
}
ExternalReference ExternalReference::address_of_i32x4_widen_i8x16_s3_mask() {
return ExternalReference(
reinterpret_cast<Address>(&i32x4_widen_i8x16_s3_mask));
}
ExternalReference ExternalReference::address_of_i32x4_widen_i8x16_u1_mask() {
return ExternalReference(
reinterpret_cast<Address>(&i32x4_widen_i8x16_u1_mask));
}
ExternalReference ExternalReference::address_of_i32x4_widen_i8x16_u2_mask() {
return ExternalReference(
reinterpret_cast<Address>(&i32x4_widen_i8x16_u2_mask));
}
ExternalReference ExternalReference::address_of_i32x4_widen_i8x16_u3_mask() {
return ExternalReference(
reinterpret_cast<Address>(&i32x4_widen_i8x16_u3_mask));
}
ExternalReference
ExternalReference::address_of_enable_experimental_regexp_engine() {
return ExternalReference(&FLAG_enable_experimental_regexp_engine);
......
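
The masks above are pshufb control vectors: a control byte with its top bit set produces a zero byte, otherwise its low nibble indexes a source byte. A minimal sketch of that rule (hypothetical emulation helper, not part of this change) shows why the signed masks park each selected byte in the top byte of its dword, so that a later psrad by 24 sign-extends it, while the unsigned masks park it in the low byte and need no shift:

// Sketch only: scalar emulation of the pshufb byte-selection rule.
#include <cstdint>

void EmulatePshufb(const uint8_t src[16], const uint8_t mask[16],
                   uint8_t dst[16]) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (mask[i] & 0x80) ? 0 : src[mask[i] & 0x0F];
  }
}

// i32x4_widen_i8x16_s1_mask has the in-memory byte layout
//   FF FF FF 04  FF FF FF 05  FF FF FF 06  FF FF FF 07
// so dword k of the result is {0, 0, 0, src[4 + k]}: source byte 4+k lands in
// bits 24..31, and a following psrad(dst, 24) yields the sign-extended value.
// i32x4_widen_i8x16_u1_mask instead starts 04 FF FF FF ..., so the byte lands
// in bits 0..7 already zero-extended.
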
......@@ -241,6 +241,12 @@ class StatsCounter;
V(address_of_wasm_double_2_power_52, "wasm_double_2_power_52") \
V(address_of_wasm_int32_max_as_double, "wasm_int32_max_as_double") \
V(address_of_wasm_uint32_max_as_double, "wasm_uint32_max_as_double") \
V(address_of_i32x4_widen_i8x16_u1_mask, "i32x4_widen_i8x16_u1_mask") \
V(address_of_i32x4_widen_i8x16_u2_mask, "i32x4_widen_i8x16_u2_mask") \
V(address_of_i32x4_widen_i8x16_u3_mask, "i32x4_widen_i8x16_u3_mask") \
V(address_of_i32x4_widen_i8x16_s1_mask, "i32x4_widen_i8x16_s1_mask") \
V(address_of_i32x4_widen_i8x16_s2_mask, "i32x4_widen_i8x16_s2_mask") \
V(address_of_i32x4_widen_i8x16_s3_mask, "i32x4_widen_i8x16_s3_mask") \
V(write_barrier_marking_from_code_function, "WriteBarrier::MarkingFromCode") \
V(call_enqueue_microtask_function, "MicrotaskQueue::CallEnqueueMicrotask") \
V(call_enter_context_function, "call_enter_context_function") \
......
......@@ -2022,6 +2022,19 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
}
}
void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src, Operand mask) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(this, AVX);
vpshufb(dst, src, mask);
} else {
if (dst != src) {
movaps(dst, src);
}
CpuFeatureScope sse_scope(this, SSSE3);
pshufb(dst, mask);
}
}
void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
if (CpuFeatures::IsSupported(AVX)) {
......
......@@ -285,9 +285,11 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP_SSE4_1(Pblendw, pblendw)
AVX_OP_SSE4_1(Ptest, ptest)
AVX_OP_SSE4_1(Pmovsxbw, pmovsxbw)
AVX_OP_SSE4_1(Pmovsxbd, pmovsxbd)
AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)
AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw)
AVX_OP_SSE4_1(Pmovzxbd, pmovzxbd)
AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
AVX_OP_SSE4_1(Pextrb, pextrb)
......@@ -573,6 +575,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
// Supports both SSE and AVX. Move src1 to dst if they are not equal on SSE.
void Pshufb(XMMRegister dst, XMMRegister src1, XMMRegister src2);
void Pshufb(XMMRegister dst, XMMRegister src1, Operand src2);
void Pmulhrsw(XMMRegister dst, XMMRegister src1, XMMRegister src2);
// These Wasm SIMD ops do not have direct lowerings on x64. These
......
......@@ -172,9 +172,11 @@
#define SSE4_UNOP_INSTRUCTION_LIST(V) \
V(ptest, 66, 0F, 38, 17) \
V(pmovsxbw, 66, 0F, 38, 20) \
V(pmovsxbd, 66, 0F, 38, 21) \
V(pmovsxwd, 66, 0F, 38, 23) \
V(pmovsxdq, 66, 0F, 38, 25) \
V(pmovzxbw, 66, 0F, 38, 30) \
V(pmovzxbd, 66, 0F, 38, 31) \
V(pmovzxwd, 66, 0F, 38, 33) \
V(pmovzxdq, 66, 0F, 38, 35)
......
......@@ -2130,6 +2130,10 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI32x4TruncSatF64x2SZero(node);
case IrOpcode::kI32x4TruncSatF64x2UZero:
return MarkAsSimd128(node), VisitI32x4TruncSatF64x2UZero(node);
case IrOpcode::kI32x4WidenI8x16S:
return MarkAsSimd128(node), VisitI32x4WidenI8x16S(node);
case IrOpcode::kI32x4WidenI8x16U:
return MarkAsSimd128(node), VisitI32x4WidenI8x16U(node);
case IrOpcode::kI16x8Splat:
return MarkAsSimd128(node), VisitI16x8Splat(node);
case IrOpcode::kI16x8ExtractLaneU:
......@@ -2825,6 +2829,13 @@ void InstructionSelector::VisitI32x4TruncSatF64x2UZero(Node* node) {
}
#endif //! V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_X64
// TODO(v8:11297) Prototype i32x4.widen_i8x16_{s,u}
void InstructionSelector::VisitI32x4WidenI8x16S(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI32x4WidenI8x16U(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
void InstructionSelector::VisitParameter(Node* node) {
......
......@@ -3776,6 +3776,49 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
/*is_signed=*/false);
break;
}
case kX64I32x4WidenI8x16S: {
uint8_t laneidx = static_cast<uint8_t>(MiscField::decode(opcode));
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
if (laneidx == 0) {
__ Pmovsxbd(dst, src);
break;
}
ExternalReference mask;
if (laneidx == 1) {
mask = ExternalReference::address_of_i32x4_widen_i8x16_s1_mask();
} else if (laneidx == 2) {
mask = ExternalReference::address_of_i32x4_widen_i8x16_s2_mask();
} else {
DCHECK_EQ(3, laneidx);
mask = ExternalReference::address_of_i32x4_widen_i8x16_s3_mask();
}
__ Pshufb(dst, src, __ ExternalReferenceAsOperand(mask));
__ Psrad(dst, byte{24});
break;
}
case kX64I32x4WidenI8x16U: {
uint8_t laneidx = static_cast<uint8_t>(MiscField::decode(opcode));
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
if (laneidx == 0) {
__ Pmovzxbd(dst, src);
break;
}
ExternalReference mask;
if (laneidx == 1) {
mask = ExternalReference::address_of_i32x4_widen_i8x16_u1_mask();
} else if (laneidx == 2) {
mask = ExternalReference::address_of_i32x4_widen_i8x16_u2_mask();
} else {
DCHECK_EQ(3, laneidx);
mask = ExternalReference::address_of_i32x4_widen_i8x16_u3_mask();
}
__ Pshufb(dst, src, __ ExternalReferenceAsOperand(mask));
break;
}
case kX64I64x2SignSelect: {
__ Blendvpd(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), i.InputSimd128Register(2));
......
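
To make the signed path above concrete: for lane groups 1-3 the Pshufb leaves the selected byte in bits 24..31 of its dword with every other byte zeroed, and the Psrad by 24 is what turns that into sign extension; the unsigned path needs no shift because its masks already leave the byte zero-extended in bits 0..7. A one-lane scalar sketch, assuming C++20's arithmetic right shift of signed values:

// Sketch of the per-lane effect of Pshufb + Psrad(24) on the signed path.
#include <cstdint>

int32_t WidenOneLaneSigned(uint8_t selected_byte) {
  uint32_t after_pshufb = static_cast<uint32_t>(selected_byte) << 24;  // byte in bits 24..31
  int32_t lane = static_cast<int32_t>(after_pshufb);
  return lane >> 24;  // psrad 24: arithmetic shift, e.g. 0x80 -> 0xFFFFFF80 (-128)
}
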
......@@ -262,6 +262,8 @@ namespace compiler {
V(X64I32x4ExtAddPairwiseI16x8U) \
V(X64I32x4TruncSatF64x2SZero) \
V(X64I32x4TruncSatF64x2UZero) \
V(X64I32x4WidenI8x16S) \
V(X64I32x4WidenI8x16U) \
V(X64I16x8Splat) \
V(X64I16x8ExtractLaneS) \
V(X64I16x8SConvertI8x16Low) \
......
......@@ -238,6 +238,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I32x4ExtAddPairwiseI16x8U:
case kX64I32x4TruncSatF64x2SZero:
case kX64I32x4TruncSatF64x2UZero:
case kX64I32x4WidenI8x16S:
case kX64I32x4WidenI8x16U:
case kX64I16x8Splat:
case kX64I16x8ExtractLaneS:
case kX64I16x8SConvertI8x16Low:
......
......@@ -3753,6 +3753,26 @@ void InstructionSelector::VisitI32x4TruncSatF64x2UZero(Node* node) {
Emit(kX64I32x4TruncSatF64x2UZero, dst, g.UseRegister(node->InputAt(0)));
}
namespace {
void VisitWiden(InstructionSelector* selector, Node* node, ArchOpcode opcode) {
X64OperandGenerator g(selector);
uint8_t laneidx = OpParameter<uint8_t>(node->op());
InstructionOperand dst = CpuFeatures::IsSupported(AVX)
? g.DefineAsRegister(node)
: g.DefineSameAsFirst(node);
selector->Emit(opcode | MiscField::encode(laneidx), dst,
g.UseRegister(node->InputAt(0)));
}
} // namespace
void InstructionSelector::VisitI32x4WidenI8x16S(Node* node) {
VisitWiden(this, node, kX64I32x4WidenI8x16S);
}
void InstructionSelector::VisitI32x4WidenI8x16U(Node* node) {
VisitWiden(this, node, kX64I32x4WidenI8x16U);
}
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {
......
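
Two details in VisitWiden above: the reland fix is reading the lane immediate as OpParameter<uint8_t> (it was mistakenly int8_t before), and the output is DefineSameAsFirst on the non-AVX path because the SSE pshufb/psrad forms overwrite their first operand. MiscField::encode(laneidx) simply packs the lane index into spare bits of the InstructionCode so the code generator can decode it later; a rough sketch of that idea, with an assumed (not V8's actual) field position:

// Illustrative only: packing a small immediate into unused opcode bits.
#include <cstdint>

constexpr int kAssumedMiscShift = 22;  // hypothetical bit position

constexpr uint32_t EncodeLane(uint32_t arch_opcode, uint8_t laneidx) {
  return arch_opcode | (static_cast<uint32_t>(laneidx) << kAssumedMiscShift);
}

constexpr uint8_t DecodeLane(uint32_t instruction_code) {
  return static_cast<uint8_t>((instruction_code >> kAssumedMiscShift) & 0x3);
}
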
......@@ -1707,6 +1707,18 @@ const Operator* MachineOperatorBuilder::Word64PoisonOnSpeculation() {
return GetCachedOperator<Word64PoisonOnSpeculationOperator>();
}
const Operator* MachineOperatorBuilder::I32x4WidenI8x16S(uint8_t laneidx) {
return zone_->New<Operator1<uint8_t>>(IrOpcode::kI32x4WidenI8x16S,
Operator::kPure, "I32x4WidenI8x16S", 1,
0, 0, 1, 0, 0, laneidx);
}
const Operator* MachineOperatorBuilder::I32x4WidenI8x16U(uint8_t laneidx) {
return zone_->New<Operator1<uint8_t>>(IrOpcode::kI32x4WidenI8x16U,
Operator::kPure, "I32x4WidenI8x16U", 1,
0, 0, 1, 0, 0, laneidx);
}
#define EXTRACT_LANE_OP(Type, Sign, lane_count) \
const Operator* MachineOperatorBuilder::Type##ExtractLane##Sign( \
int32_t lane_index) { \
......
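
For readers unfamiliar with Operator1, the integer arguments after the mnemonic are input/output counts; read against the usual Operator constructor signature (a hedged annotation, not text from this CL) they decode as:

// Operator1<uint8_t>(opcode, properties, mnemonic,
//                    value_in, effect_in, control_in,
//                    value_out, effect_out, control_out, parameter)
// So "1, 0, 0, 1, 0, 0, laneidx" means: one value input (the i8x16 vector),
// one value output (the widened i32x4), no effect or control edges, and the
// lane index stored as the operator's parameter, later retrieved with
// OpParameter<uint8_t>(node->op()) in the instruction selector.
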
......@@ -724,6 +724,8 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I32x4ExtAddPairwiseI16x8U();
const Operator* I32x4TruncSatF64x2SZero();
const Operator* I32x4TruncSatF64x2UZero();
const Operator* I32x4WidenI8x16S(uint8_t laneidx);
const Operator* I32x4WidenI8x16U(uint8_t laneidx);
const Operator* I16x8Splat();
const Operator* I16x8ExtractLaneU(int32_t);
......
......@@ -883,6 +883,8 @@
V(I32x4ExtAddPairwiseI16x8U) \
V(I32x4TruncSatF64x2SZero) \
V(I32x4TruncSatF64x2UZero) \
V(I32x4WidenI8x16S) \
V(I32x4WidenI8x16U) \
V(I16x8Splat) \
V(I16x8ExtractLaneU) \
V(I16x8ExtractLaneS) \
......
......@@ -5235,6 +5235,12 @@ Node* WasmGraphBuilder::SimdLaneOp(wasm::WasmOpcode opcode, uint8_t lane,
Node* const* inputs) {
has_simd_ = true;
switch (opcode) {
case wasm::kExprI32x4WidenI8x16S:
return graph()->NewNode(mcgraph()->machine()->I32x4WidenI8x16S(lane),
inputs[0]);
case wasm::kExprI32x4WidenI8x16U:
return graph()->NewNode(mcgraph()->machine()->I32x4WidenI8x16U(lane),
inputs[0]);
case wasm::kExprF64x2ExtractLane:
return graph()->NewNode(mcgraph()->machine()->F64x2ExtractLane(lane),
inputs[0]);
......
......@@ -2225,8 +2225,6 @@ WASM_SIMD_TEST(I32x4ShrU) {
#if V8_TARGET_ARCH_X64
// TODO(v8:11297) Prototype i32x4.widen_i8x16_{u,s}
WASM_SIMD_TEST_NO_LOWERING(I32x4WidenI8x16U) {
// TODO(zhin): Add TurboFan support.
if (execution_tier != TestExecutionTier::kInterpreter) return;
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<uint32_t, uint32_t> r(execution_tier, lower_simd);
......@@ -2264,8 +2262,6 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4WidenI8x16U) {
}
WASM_SIMD_TEST_NO_LOWERING(I32x4WidenI8x16S) {
// TODO(zhin): Add TurboFan support.
if (execution_tier != TestExecutionTier::kInterpreter) return;
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
......