Commit a52b44f0 authored by Ilya Rezvov, committed by V8 LUCI CQ

[wasm-simd] Prototype relaxed integer Dot product instructions

Prototype the instructions in the interpreter and on Arm64. Details of the
instruction lowerings on all relevant architectures can be found at:
https://github.com/WebAssembly/relaxed-simd/issues/52
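
For reference, a minimal scalar sketch of the per-lane semantics (matching
what the interpreter change below implements; the helper names are
illustrative only, not part of this CL):

  #include <cstdint>

  // i16x8.dot_i8x16_i7x16_s: each output lane is the sum of two adjacent
  // signed 8-bit x 7-bit products, accumulated into 16 bits.
  void I16x8DotI8x16I7x16S(const int8_t a[16], const int8_t b[16],
                           int16_t out[8]) {
    for (int i = 0; i < 8; ++i) {
      out[i] = static_cast<int16_t>(a[2 * i] * b[2 * i] +
                                    a[2 * i + 1] * b[2 * i + 1]);
    }
  }

  // i32x4.dot_i8x16_i7x16_add_s: four adjacent products per lane, plus the
  // 32-bit accumulator lane (the final add may wrap).
  void I32x4DotI8x16I7x16AddS(const int8_t a[16], const int8_t b[16],
                              const int32_t acc[4], int32_t out[4]) {
    for (int i = 0; i < 4; ++i) {
      int32_t sum = 0;
      for (int j = 0; j < 4; ++j) sum += a[4 * i + j] * b[4 * i + j];
      out[i] = sum + acc[i];
    }
  }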

Bug: v8:12908
Change-Id: If8ffb82c38042191c67c9b5c23a231877d4f2159
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3679848
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Commit-Queue: Ilya Rezvov <irezvov@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/main@{#80924}
parent 90c80f7a
@@ -2488,6 +2488,31 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Addp(i.OutputSimd128Register().V4S(), tmp1, tmp2);
break;
}
case kArm64I16x8DotI8x16S: {
UseScratchRegisterScope scope(tasm());
VRegister lhs = i.InputSimd128Register(0);
VRegister rhs = i.InputSimd128Register(1);
VRegister tmp1 = scope.AcquireV(kFormat8H);
VRegister tmp2 = scope.AcquireV(kFormat8H);
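// Smull/Smull2 widen-multiply the low and high halves of the inputs; Addp
// then sums adjacent 16-bit products into the eight result lanes.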
__ Smull(tmp1, lhs.V8B(), rhs.V8B());
__ Smull2(tmp2, lhs.V16B(), rhs.V16B());
__ Addp(i.OutputSimd128Register().V8H(), tmp1, tmp2);
break;
}
case kArm64I32x4DotI8x16AddS: {
UseScratchRegisterScope scope(tasm());
VRegister lhs = i.InputSimd128Register(0);
VRegister rhs = i.InputSimd128Register(1);
VRegister tmp1 = scope.AcquireV(kFormat8H);
VRegister tmp2 = scope.AcquireV(kFormat8H);
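// Smull/Smull2 produce the sixteen widened products; Addp sums adjacent
// products, Saddlp widens and sums the resulting pairs into 32-bit lanes,
// and the final Add folds in the accumulator operand.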
__ Smull(tmp1, lhs.V8B(), rhs.V8B());
__ Smull2(tmp2, lhs.V16B(), rhs.V16B());
__ Addp(tmp1, tmp1, tmp2);
__ Saddlp(tmp1.V4S(), tmp1);
__ Add(i.OutputSimd128Register().V4S(), tmp1.V4S(),
i.InputSimd128Register(2).V4S());
break;
}
case kArm64IExtractLaneU: {
VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
__ Umov(i.OutputRegister32(), i.InputSimd128Register(0).Format(f),
......
@@ -276,6 +276,8 @@ namespace compiler {
V(Arm64IGeU) \
V(Arm64I32x4BitMask) \
V(Arm64I32x4DotI16x8S) \
V(Arm64I16x8DotI8x16S) \
V(Arm64I32x4DotI8x16AddS) \
V(Arm64I32x4TruncSatF64x2SZero) \
V(Arm64I32x4TruncSatF64x2UZero) \
V(Arm64IExtractLaneU) \
......
@@ -225,6 +225,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64IGeU:
case kArm64I32x4BitMask:
case kArm64I32x4DotI16x8S:
case kArm64I16x8DotI8x16S:
case kArm64I32x4DotI8x16AddS:
case kArm64I32x4TruncSatF64x2SZero:
case kArm64I32x4TruncSatF64x2UZero:
case kArm64IExtractLaneU:
......
@@ -3538,6 +3538,7 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
#define SIMD_BINOP_LIST(V) \
V(I32x4Mul, kArm64I32x4Mul) \
V(I32x4DotI16x8S, kArm64I32x4DotI16x8S) \
V(I16x8DotI8x16I7x16S, kArm64I16x8DotI8x16S) \
V(I16x8SConvertI32x4, kArm64I16x8SConvertI32x4) \
V(I16x8Mul, kArm64I16x8Mul) \
V(I16x8UConvertI32x4, kArm64I16x8UConvertI32x4) \
@@ -3724,6 +3725,13 @@ void InstructionSelector::VisitS128Zero(Node* node) {
Emit(kArm64S128Zero, g.DefineAsRegister(node));
}
void InstructionSelector::VisitI32x4DotI8x16I7x16AddS(Node* node) {
Arm64OperandGenerator g(this);
Emit(kArm64I32x4DotI8x16AddS, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseRegister(node->InputAt(2)));
}
#define SIMD_VISIT_EXTRACT_LANE(Type, T, Sign, LaneSize) \
void InstructionSelector::Visit##Type##ExtractLane##Sign(Node* node) { \
VisitRRI(this, \
......
@@ -2372,6 +2372,10 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitI32x4RelaxedTruncF32x4U(node);
case IrOpcode::kI16x8RelaxedQ15MulRS:
return MarkAsSimd128(node), VisitI16x8RelaxedQ15MulRS(node);
case IrOpcode::kI16x8DotI8x16I7x16S:
return MarkAsSimd128(node), VisitI16x8DotI8x16I7x16S(node);
case IrOpcode::kI32x4DotI8x16I7x16AddS:
return MarkAsSimd128(node), VisitI32x4DotI8x16I7x16AddS(node);
default:
FATAL("Unexpected operator #%d:%s @ node #%d", node->opcode(),
node->op()->mnemonic(), node->id());
@@ -2830,6 +2834,16 @@ void InstructionSelector::VisitI16x8RelaxedQ15MulRS(Node* node) {
}
#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitI16x8DotI8x16I7x16S(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI32x4DotI8x16I7x16AddS(Node* node) {
UNIMPLEMENTED();
}
#endif // !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
void InstructionSelector::VisitParameter(Node* node) {
......
@@ -608,7 +608,9 @@ std::ostream& operator<<(std::ostream& os, TruncateKind kind) {
V(I32x4RelaxedTruncF32x4U, Operator::kNoProperties, 1, 0, 1) \
V(I32x4RelaxedTruncF64x2SZero, Operator::kNoProperties, 1, 0, 1) \
V(I32x4RelaxedTruncF64x2UZero, Operator::kNoProperties, 1, 0, 1) \
V(I16x8RelaxedQ15MulRS, Operator::kCommutative, 2, 0, 1)
V(I16x8RelaxedQ15MulRS, Operator::kCommutative, 2, 0, 1) \
V(I16x8DotI8x16I7x16S, Operator::kCommutative, 2, 0, 1) \
V(I32x4DotI8x16I7x16AddS, Operator::kNoProperties, 3, 0, 1)
// The format is:
// V(Name, properties, value_input_count, control_input_count, output_count)
......
@@ -925,6 +925,8 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* I32x4RelaxedTruncF64x2SZero();
const Operator* I32x4RelaxedTruncF64x2UZero();
const Operator* I16x8RelaxedQ15MulRS();
const Operator* I16x8DotI8x16I7x16S();
const Operator* I32x4DotI8x16I7x16AddS();
// load [base + index]
const Operator* Load(LoadRepresentation rep);
......
@@ -1001,6 +1001,8 @@
V(I32x4RelaxedTruncF64x2SZero) \
V(I32x4RelaxedTruncF64x2UZero) \
V(I16x8RelaxedQ15MulRS) \
V(I16x8DotI8x16I7x16S) \
V(I32x4DotI8x16I7x16AddS) \
V(I8x16Shuffle) \
V(V128AnyTrue) \
V(I64x2AllTrue) \
......
@@ -4476,6 +4476,12 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
case wasm::kExprI16x8RelaxedQ15MulRS:
return graph()->NewNode(mcgraph()->machine()->I16x8RelaxedQ15MulRS(),
inputs[0], inputs[1]);
case wasm::kExprI16x8DotI8x16I7x16S:
return graph()->NewNode(mcgraph()->machine()->I16x8DotI8x16I7x16S(),
inputs[0], inputs[1]);
case wasm::kExprI32x4DotI8x16I7x16AddS:
return graph()->NewNode(mcgraph()->machine()->I32x4DotI8x16I7x16AddS(),
inputs[0], inputs[1], inputs[2]);
case wasm::kExprI16x8Abs:
return graph()->NewNode(mcgraph()->machine()->I16x8Abs(), inputs[0]);
case wasm::kExprI16x8BitMask:
......
@@ -3510,6 +3510,19 @@ void LiftoffAssembler::emit_i16x8_relaxed_q15mulr_s(LiftoffRegister dst,
liftoff::GetSimd128Register(src2));
}
void LiftoffAssembler::emit_i16x8_dot_i8x16_i7x16_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "emit_i16x8_dot_i8x16_i7x16_s");
}
void LiftoffAssembler::emit_i32x4_dot_i8x16_i7x16_add_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
LiftoffRegister acc) {
bailout(kSimd, "emit_i32x4_dot_i8x16_i7x16_add_s");
}
void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
......
@@ -3175,6 +3175,31 @@ void LiftoffAssembler::emit_i16x8_relaxed_q15mulr_s(LiftoffRegister dst,
Sqrdmulh(dst.fp().V8H(), src1.fp().V8H(), src2.fp().V8H());
}
void LiftoffAssembler::emit_i16x8_dot_i8x16_i7x16_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
UseScratchRegisterScope scope(this);
VRegister tmp1 = scope.AcquireV(kFormat8H);
VRegister tmp2 = scope.AcquireV(kFormat8H);
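// Same lowering as the TurboFan code generator: widen-multiply both halves,
// then pairwise-add adjacent products.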
Smull(tmp1, lhs.fp().V8B(), rhs.fp().V8B());
Smull2(tmp2, lhs.fp().V16B(), rhs.fp().V16B());
Addp(dst.fp().V8H(), tmp1, tmp2);
}
void LiftoffAssembler::emit_i32x4_dot_i8x16_i7x16_add_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
LiftoffRegister acc) {
UseScratchRegisterScope scope(this);
VRegister tmp1 = scope.AcquireV(kFormat8H);
VRegister tmp2 = scope.AcquireV(kFormat8H);
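// Pairwise-add the widened products, widen the pair sums to 32 bits, then
// add the accumulator.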
Smull(tmp1, lhs.fp().V8B(), rhs.fp().V8B());
Smull2(tmp2, lhs.fp().V16B(), rhs.fp().V16B());
Addp(tmp1, tmp1, tmp2);
Saddlp(tmp1.V4S(), tmp1);
Add(dst.fp().V4S(), tmp1.V4S(), acc.fp().V4S());
}
void LiftoffAssembler::emit_i32x4_abs(LiftoffRegister dst,
LiftoffRegister src) {
Abs(dst.fp().V4S(), src.fp().V4S());
......
@@ -3657,6 +3657,19 @@ void LiftoffAssembler::emit_i16x8_relaxed_q15mulr_s(LiftoffRegister dst,
bailout(kRelaxedSimd, "emit_i16x8_relaxed_q15mulr_s");
}
void LiftoffAssembler::emit_i16x8_dot_i8x16_i7x16_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "emit_i16x8_dot_i8x16_i7x16_s");
}
void LiftoffAssembler::emit_i32x4_dot_i8x16_i7x16_add_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
LiftoffRegister acc) {
bailout(kSimd, "emit_i32x4_dot_i8x16_i7x16_add_s");
}
void LiftoffAssembler::emit_i32x4_neg(LiftoffRegister dst,
LiftoffRegister src) {
if (dst.fp() == src.fp()) {
......
@@ -1259,6 +1259,13 @@ class LiftoffAssembler : public TurboAssembler {
inline void emit_i16x8_relaxed_q15mulr_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2);
inline void emit_i16x8_dot_i8x16_i7x16_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2);
inline void emit_i32x4_dot_i8x16_i7x16_add_s(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
LiftoffRegister acc);
inline void emit_i32x4_neg(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i32x4_alltrue(LiftoffRegister dst, LiftoffRegister src);
inline void emit_i32x4_bitmask(LiftoffRegister dst, LiftoffRegister src);
......
@@ -4102,6 +4102,22 @@ class LiftoffCompiler {
case wasm::kExprI32x4RelaxedTruncF64x2UZero:
return EmitUnOp<kS128, kS128>(
&LiftoffAssembler::emit_i32x4_relaxed_trunc_f64x2_u_zero);
case wasm::kExprI16x8DotI8x16I7x16S:
return EmitBinOp<kS128, kS128>(
&LiftoffAssembler::emit_i16x8_dot_i8x16_i7x16_s);
case wasm::kExprI32x4DotI8x16I7x16AddS: {
// There is no helper for an instruction with 3 SIMD operands
// and we do not expect to add any more, so it is inlined here.
static constexpr RegClass res_rc = reg_class_for(kS128);
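// Pop all three operands, keeping already-popped registers pinned so later
// pops do not clobber them, then pick a destination register.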
LiftoffRegister acc = __ PopToRegister();
LiftoffRegister rhs = __ PopToRegister(LiftoffRegList{acc});
LiftoffRegister lhs = __ PopToRegister(LiftoffRegList{rhs, acc});
LiftoffRegister dst = __ GetUnusedRegister(res_rc, {lhs, rhs, acc}, {});
__ emit_i32x4_dot_i8x16_i7x16_add_s(dst, lhs, rhs, acc);
__ PushRegister(kS128, dst);
return;
}
default:
unsupported(decoder, kSimd, "simd");
}
......
@@ -3231,6 +3231,19 @@ void LiftoffAssembler::emit_i16x8_relaxed_q15mulr_s(LiftoffRegister dst,
bailout(kRelaxedSimd, "emit_i16x8_relaxed_q15mulr_s");
}
void LiftoffAssembler::emit_i16x8_dot_i8x16_i7x16_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
bailout(kSimd, "emit_i16x8_dot_i8x16_i7x16_s");
}
void LiftoffAssembler::emit_i32x4_dot_i8x16_i7x16_add_s(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
LiftoffRegister acc) {
bailout(kSimd, "emit_i32x4_dot_i8x16_i7x16_add_s");
}
void LiftoffAssembler::emit_i32x4_neg(LiftoffRegister dst,
LiftoffRegister src) {
if (dst.fp() == src.fp()) {
......
@@ -377,6 +377,8 @@ constexpr const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
CASE_I32x4_OP(RelaxedTruncF64x2SZero, "relaxed_trunc_f64x2_s_zero");
CASE_I32x4_OP(RelaxedTruncF64x2UZero, "relaxed_trunc_f64x2_u_zero");
CASE_I16x8_OP(RelaxedQ15MulRS, "relaxed_q15mulr_s")
CASE_I16x8_OP(DotI8x16I7x16S, "dot_i8x16_i7x16_s")
CASE_I32x4_OP(DotI8x16I7x16AddS, "dot_i8x16_i7x16_add_s")
// Atomic operations.
CASE_OP(AtomicNotify, "atomic.notify")
......
@@ -541,7 +541,9 @@ bool V8_EXPORT_PRIVATE IsJSCompatibleSignature(const FunctionSig* sig,
V(F32x4RelaxedMax, 0xfd10e, s_ss) \
V(F64x2RelaxedMin, 0xfd10f, s_ss) \
V(F64x2RelaxedMax, 0xfd110, s_ss) \
V(I16x8RelaxedQ15MulRS, 0xfd111, s_ss)
V(I16x8RelaxedQ15MulRS, 0xfd111, s_ss) \
V(I16x8DotI8x16I7x16S, 0xfd112, s_ss) \
V(I32x4DotI8x16I7x16AddS, 0xfd113, s_sss)
#define FOREACH_SIMD_1_OPERAND_1_PARAM_OPCODE(V) \
V(I8x16ExtractLaneS, 0xfd15, _) \
......
@@ -411,8 +411,66 @@ WASM_RELAXED_SIMD_TEST(I16x8RelaxedQ15MulRS) {
RunI16x8BinOpTest<int16_t>(execution_tier, kExprI16x8RelaxedQ15MulRS,
SaturateRoundingQMul<int16_t>);
}
#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
#if V8_TARGET_ARCH_ARM64
WASM_RELAXED_SIMD_TEST(I16x8DotI8x16I7x16S) {
WasmRunner<int32_t, int8_t, int8_t> r(execution_tier);
int16_t* g = r.builder().template AddGlobal<int16_t>(kWasmS128);
byte value1 = 0, value2 = 1;
byte temp1 = r.AllocateLocal(kWasmS128);
byte temp2 = r.AllocateLocal(kWasmS128);
BUILD(r, WASM_LOCAL_SET(temp1, WASM_SIMD_I8x16_SPLAT(WASM_LOCAL_GET(value1))),
WASM_LOCAL_SET(temp2, WASM_SIMD_I8x16_SPLAT(WASM_LOCAL_GET(value2))),
WASM_GLOBAL_SET(
0, WASM_SIMD_BINOP(kExprI16x8DotI8x16I7x16S, WASM_LOCAL_GET(temp1),
WASM_LOCAL_GET(temp2))),
WASM_ONE);
for (int8_t x : compiler::ValueHelper::GetVector<int8_t>()) {
for (int8_t y : compiler::ValueHelper::GetVector<int8_t>()) {
r.Call(x, y & 0x7F);
// * 2 because each lane is (x*y) + (x*y) = 2*x*y
int16_t expected = base::MulWithWraparound(x * (y & 0x7F), 2);
for (int i = 0; i < 8; i++) {
CHECK_EQ(expected, LANE(g, i));
}
}
}
}
WASM_RELAXED_SIMD_TEST(I32x4DotI8x16I7x16AddS) {
WasmRunner<int32_t, int8_t, int8_t, int32_t> r(execution_tier);
int32_t* g = r.builder().template AddGlobal<int32_t>(kWasmS128);
byte value1 = 0, value2 = 1, value3 = 2;
byte temp1 = r.AllocateLocal(kWasmS128);
byte temp2 = r.AllocateLocal(kWasmS128);
byte temp3 = r.AllocateLocal(kWasmS128);
BUILD(
r, WASM_LOCAL_SET(temp1, WASM_SIMD_I8x16_SPLAT(WASM_LOCAL_GET(value1))),
WASM_LOCAL_SET(temp2, WASM_SIMD_I8x16_SPLAT(WASM_LOCAL_GET(value2))),
WASM_LOCAL_SET(temp3, WASM_SIMD_I32x4_SPLAT(WASM_LOCAL_GET(value3))),
WASM_GLOBAL_SET(0, WASM_SIMD_TERNOP(
kExprI32x4DotI8x16I7x16AddS, WASM_LOCAL_GET(temp1),
WASM_LOCAL_GET(temp2), WASM_LOCAL_GET(temp3))),
WASM_ONE);
for (int8_t x : compiler::ValueHelper::GetVector<int8_t>()) {
for (int8_t y : compiler::ValueHelper::GetVector<int8_t>()) {
for (int32_t z : compiler::ValueHelper::GetVector<int32_t>()) {
r.Call(x, y & 0x7F, z);
int32_t expected = base::AddWithWraparound(
base::MulWithWraparound(x * (y & 0x7F), 4), z);
for (int i = 0; i < 4; i++) {
CHECK_EQ(expected, LANE(g, i));
}
}
}
}
}
#endif // V8_TARGET_ARCH_ARM64
#undef WASM_RELAXED_SIMD_TEST
} // namespace test_run_wasm_relaxed_simd
} // namespace wasm
......
@@ -2782,6 +2782,39 @@ class WasmInterpreterInternals {
*len += 16;
return true;
}
case kExprI16x8DotI8x16I7x16S: {
int16 v2 = Pop().to_s128().to_i8x16();
int16 v1 = Pop().to_s128().to_i8x16();
int8 res;
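// Each output lane is the sum of two adjacent signed 8-bit products. With
// the second operand in the 7-bit range the sum cannot overflow; otherwise
// it wraps here.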
for (size_t i = 0; i < 8; i++) {
int16_t lo = (v1.val[LANE(i * 2, v1)] * v2.val[LANE(i * 2, v2)]);
int16_t hi =
(v1.val[LANE(i * 2 + 1, v1)] * v2.val[LANE(i * 2 + 1, v2)]);
res.val[LANE(i, res)] = base::AddWithWraparound(lo, hi);
}
Push(WasmValue(Simd128(res)));
return true;
}
case kExprI32x4DotI8x16I7x16AddS: {
int4 v3 = Pop().to_s128().to_i32x4();
int16 v2 = Pop().to_s128().to_i8x16();
int16 v1 = Pop().to_s128().to_i8x16();
int4 res;
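// Each output lane sums four adjacent 8-bit products and then adds the
// 32-bit accumulator lane; only the final add can wrap.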
for (size_t i = 0; i < 4; i++) {
int32_t a = (v1.val[LANE(i * 4, v1)] * v2.val[LANE(i * 4, v2)]);
int32_t b =
(v1.val[LANE(i * 4 + 1, v1)] * v2.val[LANE(i * 4 + 1, v2)]);
int32_t c =
(v1.val[LANE(i * 4 + 2, v1)] * v2.val[LANE(i * 4 + 2, v2)]);
int32_t d =
(v1.val[LANE(i * 4 + 3, v1)] * v2.val[LANE(i * 4 + 3, v2)]);
int32_t acc = v3.val[LANE(i, v3)];
// a + b + c + d should not wrap
res.val[LANE(i, res)] = base::AddWithWraparound(a + b + c + d, acc);
}
Push(WasmValue(Simd128(res)));
return true;
}
case kExprI8x16RelaxedSwizzle:
case kExprI8x16Swizzle: {
int16 v2 = Pop().to_s128().to_i8x16();
......
@@ -927,6 +927,7 @@ inline uint16_t ExtractPrefixedOpcodeBytes(WasmOpcode opcode) {
#define WASM_SIMD_SPLAT(Type, ...) __VA_ARGS__, WASM_SIMD_OP(kExpr##Type##Splat)
#define WASM_SIMD_UNOP(op, x) x, WASM_SIMD_OP(op)
#define WASM_SIMD_BINOP(op, x, y) x, y, WASM_SIMD_OP(op)
#define WASM_SIMD_TERNOP(op, x, y, z) x, y, z, WASM_SIMD_OP(op)
#define WASM_SIMD_SHIFT_OP(op, x, y) x, y, WASM_SIMD_OP(op)
#define WASM_SIMD_CONCAT_OP(op, bytes, x, y) \
x, y, WASM_SIMD_OP(op), TO_BYTE(bytes)
......