Commit 0d591e91 authored by George Wort, committed by V8 LUCI CQ

Reland "[turbofan][arm64] Emit Lsl for Int32MulWithOverflow when possible"

This is a reland of commit aa541f1c

Original change's description:
> [turbofan][arm64] Emit Lsl for Int32MulWithOverflow when possible
>
> Int32MulWithOverflow on arm64 uses a cmp to set flags rather than
> the multiply instruction itself, thus we can use a left shift when
> the multiplication is by a power of two.
>
> This provides a 0.15% improvement on Speedometer2 on a Neoverse-N1
> machine, with the React subtest being improved by 0.45%.
>
> Change-Id: Ic8db42ecc7cb14cf1ac7bbbeab0e9d8359104351
> Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3829472
> Commit-Queue: George Wort <george.wort@arm.com>
> Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
> Cr-Commit-Position: refs/heads/main@{#82499}

Change-Id: Ib8f387bd41d283df551299f7ee98e72d39e2a3bd
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3865484
Commit-Queue: George Wort <george.wort@arm.com>
Reviewed-by: Nico Hartmann <nicohartmann@chromium.org>
Cr-Commit-Position: refs/heads/main@{#82909}
parent 1c84fedb
......@@ -903,13 +903,6 @@ void TurboAssembler::Ror(const Register& rd, const Register& rn,
rorv(rd, rn, rm);
}
// Sbfiz — signed bitfield insert in zeros: copies `width` bits of rn into rd
// starting at bit `lsb`, sign-extends, and zeroes the remaining low bits.
// NOTE(review): this hunk (@@ -903,13 +903,6) REMOVES the MacroAssembler
// definition; the commit re-adds it on TurboAssembler (see the +983 hunk)
// so that TurboFan code generation can emit it.
void MacroAssembler::Sbfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width) {
// Macro instructions must be permitted in the current assembler scope.
DCHECK(allow_macro_instructions());
// The zero register is not a valid destination for this macro.
DCHECK(!rd.IsZero());
sbfiz(rd, rn, lsb, width);
}
void TurboAssembler::Sbfx(const Register& rd, const Register& rn, unsigned lsb,
unsigned width) {
DCHECK(allow_macro_instructions());
......@@ -990,6 +983,13 @@ void TurboAssembler::Ubfiz(const Register& rd, const Register& rn, unsigned lsb,
ubfiz(rd, rn, lsb, width);
}
// Sbfiz — signed bitfield insert in zeros: copies `width` bits of rn into rd
// starting at bit `lsb`, sign-extends, and zeroes the remaining low bits.
// NOTE(review): added here (TurboAssembler, +983 hunk) after being removed
// from MacroAssembler above, making it available to the kArm64Sbfiz
// code-generation case introduced by this commit.
void TurboAssembler::Sbfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width) {
// Macro instructions must be permitted in the current assembler scope.
DCHECK(allow_macro_instructions());
// The zero register is not a valid destination for this macro.
DCHECK(!rd.IsZero());
sbfiz(rd, rn, lsb, width);
}
void TurboAssembler::Ubfx(const Register& rd, const Register& rn, unsigned lsb,
unsigned width) {
DCHECK(allow_macro_instructions());
......
......@@ -1063,6 +1063,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
inline void Sxtw(const Register& rd, const Register& rn);
inline void Ubfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width);
inline void Sbfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width);
inline void Ubfx(const Register& rd, const Register& rn, unsigned lsb,
unsigned width);
inline void Lsr(const Register& rd, const Register& rn, unsigned shift);
......@@ -1624,8 +1626,6 @@ class V8_EXPORT_PRIVATE MacroAssembler : public TurboAssembler {
mvni(vd, imm8, shift, shift_amount);
}
inline void Rev(const Register& rd, const Register& rn);
inline void Sbfiz(const Register& rd, const Register& rn, unsigned lsb,
unsigned width);
inline void Smaddl(const Register& rd, const Register& rn, const Register& rm,
const Register& ra);
inline void Smsubl(const Register& rd, const Register& rn, const Register& rm,
......
......@@ -1474,6 +1474,10 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Ubfiz(i.OutputRegister32(), i.InputRegister32(0), i.InputInt5(1),
i.InputInt5(2));
break;
case kArm64Sbfiz:
__ Sbfiz(i.OutputRegister(), i.InputRegister(0), i.InputInt6(1),
i.InputInt6(2));
break;
case kArm64Bfi:
__ Bfi(i.OutputRegister(), i.InputRegister(1), i.InputInt6(2),
i.InputInt6(3));
......
......@@ -123,6 +123,7 @@ namespace compiler {
V(Arm64Ubfx) \
V(Arm64Ubfx32) \
V(Arm64Ubfiz32) \
V(Arm64Sbfiz) \
V(Arm64Bfi) \
V(Arm64Rbit) \
V(Arm64Rbit32) \
......
......@@ -89,6 +89,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64Ubfx:
case kArm64Ubfx32:
case kArm64Ubfiz32:
case kArm64Sbfiz:
case kArm64Bfi:
case kArm64Rbit:
case kArm64Rbit32:
......@@ -412,6 +413,7 @@ int InstructionScheduler::GetInstructionLatency(const Instruction* instr) {
case kArm64Sxth32:
case kArm64Sxtw:
case kArm64Ubfiz32:
case kArm64Sbfiz:
case kArm64Ubfx:
case kArm64Ubfx32:
return 1;
......
......@@ -1669,8 +1669,17 @@ void EmitInt32MulWithOverflow(InstructionSelector* selector, Node* node,
Int32BinopMatcher m(node);
InstructionOperand result = g.DefineAsRegister(node);
InstructionOperand left = g.UseRegister(m.left().node());
InstructionOperand right = g.UseRegister(m.right().node());
selector->Emit(kArm64Smull, result, left, right);
if (m.right().HasResolvedValue() &&
base::bits::IsPowerOfTwo(m.right().ResolvedValue())) {
// Sign extend the bottom 32 bits and shift left.
int32_t shift = base::bits::WhichPowerOfTwo(m.right().ResolvedValue());
selector->Emit(kArm64Sbfiz, result, left, g.TempImmediate(shift),
g.TempImmediate(32));
} else {
InstructionOperand right = g.UseRegister(m.right().node());
selector->Emit(kArm64Smull, result, left, right);
}
InstructionCode opcode =
kArm64Cmp | AddressingModeField::encode(kMode_Operand2_R_SXTW);
......
......@@ -201,19 +201,19 @@ class ValueHelper {
}
static constexpr uint32_t uint32_array[] = {
// 0x00000000, 0x00000001, 0xFFFFFFFF, 0x1B09788B, 0x04C5FCE8, 0xCC0DE5BF,
// // This row is useful for testing lea optimizations on intel.
// 0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000008, 0x00000009,
// 0x273A798E, 0x187937A3, 0xECE3AF83, 0x5495A16B, 0x0B668ECC, 0x11223344,
// 0x0000009E, 0x00000043, 0x0000AF73, 0x0000116B, 0x00658ECC, 0x002B3B4C,
// 0x88776655, 0x70000000, 0x07200000, 0x7FFFFFFF, 0x56123761, 0x7FFFFF00,
// 0x761C4761, 0x80000000, 0x88888888, 0xA0000000, 0xDDDDDDDD, 0xE0000000,
// 0xEEEEEEEE, 0xFFFFFFFD, 0xF0000000, 0x007FFFFF, 0x003FFFFF, 0x001FFFFF,
// 0x000FFFFF, 0x0007FFFF, 0x0003FFFF, 0x0001FFFF, 0x0000FFFF, 0x00007FFF,
// 0x00003FFF, 0x00001FFF, 0x00000FFF, 0x000007FF, 0x000003FF, 0x000001FF,
0x00000000, 0x00000001, 0xFFFFFFFF, 0x1B09788B, 0x04C5FCE8, 0xCC0DE5BF,
// This row is useful for testing lea optimizations on intel.
0x00000002, 0x00000003, 0x00000004, 0x00000005, 0x00000008, 0x00000009,
0x273A798E, 0x187937A3, 0xECE3AF83, 0x5495A16B, 0x0B668ECC, 0x11223344,
0x0000009E, 0x00000043, 0x0000AF73, 0x0000116B, 0x00658ECC, 0x002B3B4C,
0x88776655, 0x70000000, 0x07200000, 0x7FFFFFFF, 0x56123761, 0x7FFFFF00,
0x761C4761, 0x80000000, 0x88888888, 0xA0000000, 0xDDDDDDDD, 0xE0000000,
0xEEEEEEEE, 0xFFFFFFFD, 0xF0000000, 0x007FFFFF, 0x003FFFFF, 0x001FFFFF,
0x000FFFFF, 0x0007FFFF, 0x0003FFFF, 0x0001FFFF, 0x0000FFFF, 0x00007FFF,
0x00003FFF, 0x00001FFF, 0x00000FFF, 0x000007FF, 0x000003FF, 0x000001FF,
// Bit pattern of a quiet NaN and signaling NaN, with or without
// additional payload.
0x7F876543};
0x7FC00000, 0x7F800000, 0x7FFFFFFF, 0x7F876543};
static constexpr base::Vector<const uint32_t> uint32_vector() {
return base::ArrayVector(uint32_array);
......
......@@ -1930,6 +1930,24 @@ TEST_F(InstructionSelectorTest, OvfBranchWithImmediateOnLeft) {
}
}
// Checks that Int32MulWithOverflow by a power-of-two constant is selected as
// Sbfiz (sign-extend low 32 bits and shift left) followed by Cmp, instead of
// Smull + Cmp — the optimization introduced by this commit.
// The range stops at 30 because 1 << 31 is not a positive power of two.
// NOTE(review): the constant is written on the LEFT here although the test
// name says "OnRight" — presumably the binop matcher canonicalizes constants
// to the right-hand side; confirm against Int32BinopMatcher.
TEST_F(InstructionSelectorTest, OvfValMulImmediateOnRight) {
TRACED_FORRANGE(int32_t, shift, 0, 30) {
StreamBuilder m(this, MachineType::Int32(), MachineType::Int32());
m.Return(m.Projection(0, m.Int32MulWithOverflow(m.Int32Constant(1 << shift),
m.Parameter(0))));
Stream s = m.Build();
// Exactly two instructions: the shift and the overflow-check compare.
ASSERT_EQ(2U, s.size());
EXPECT_EQ(kArm64Sbfiz, s[0]->arch_opcode());
EXPECT_EQ(kArm64Cmp, s[1]->arch_opcode());
// Sbfiz inputs: source register, lsb (= shift amount), width (= 32).
ASSERT_EQ(3U, s[0]->InputCount());
EXPECT_EQ(shift, s.ToInt32(s[0]->InputAt(1)));
EXPECT_LE(1U, s[0]->OutputCount());
EXPECT_EQ(32, s.ToInt32(s[0]->InputAt(2)));
// The flags are produced by the Cmp, not by the Sbfiz itself.
EXPECT_EQ(kFlags_none, s[0]->flags_mode());
}
}
// -----------------------------------------------------------------------------
// Shift instructions.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment