[arm][turbofan] Use NEON for unaligned float64 memory accesses

When available, we use the NEON instructions vld1.8 and vst1.8 to implement unaligned loads and stores of float64 values.

R=bmeurer@chromium.org, v8-arm-ports@googlegroups.com

Review-Url: https://codereview.chromium.org/2769723003
Cr-Commit-Position: refs/heads/master@{#44063}

[arm][turbofan] Use NEON for unaligned float64 memory accesses
When available, we use the NEON instructions vld1.8 and vst1.8 to implement unaligned loads and stores of float64 values.

R=bmeurer@chromium.org, v8-arm-ports@googlegroups.com

Review-Url: https://codereview.chromium.org/2769723003
Cr-Commit-Position: refs/heads/master@{#44063}
ae8bc6ed · ahaas · Commit bot · 118f09f1 · ae8bc6ed · ae8bc6ed
Commit ae8bc6ed authored Mar 23, 2017 by ahaas Committed by Commit bot Mar 23, 2017
5 changed files
--- a/src/arm/assembler-arm.cc
+++ b/src/arm/assembler-arm.cc
@@ -468,7 +468,6 @@ NeonMemOperand::NeonMemOperand(Register rn, Register rm, int align) {
  SetAlignment(align);
 }

-
 void NeonMemOperand::SetAlignment(int align) {
  switch (align) {
    case 0:

--- a/src/compiler/arm/code-generator-arm.cc
+++ b/src/compiler/arm/code-generator-arm.cc
@@ -1443,6 +1443,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ vstr(i.InputFloatRegister(0), i.InputOffset(1));
      DCHECK_EQ(LeaveCC, i.OutputSBit());
      break;
+    case kArmVld1F64: {
+      __ vld1(NeonSize::Neon8, NeonListOperand(i.OutputDoubleRegister()),
+              NeonMemOperand(i.InputRegister(0)));
+      break;
+    }
+    case kArmVst1F64: {
+      __ vst1(Neon8, NeonListOperand(i.InputDoubleRegister(0)),
+              NeonMemOperand(i.InputRegister(1)));
+      break;
+    }
    case kArmVldrF64:
      __ vldr(i.OutputDoubleRegister(), i.InputOffset());
      DCHECK_EQ(LeaveCC, i.OutputSBit());

--- a/src/compiler/arm/instruction-codes-arm.h
+++ b/src/compiler/arm/instruction-codes-arm.h
@@ -104,7 +104,9 @@ namespace compiler {
  V(ArmVldrF32)                    \
  V(ArmVstrF32)                    \
  V(ArmVldrF64)                    \
+  V(ArmVld1F64)                    \
  V(ArmVstrF64)                    \
+  V(ArmVst1F64)                    \
  V(ArmFloat32Max)                 \
  V(ArmFloat64Max)                 \
  V(ArmFloat32Min)                 \

--- a/src/compiler/arm/instruction-scheduler-arm.cc
+++ b/src/compiler/arm/instruction-scheduler-arm.cc
@@ -216,6 +216,7 @@ int InstructionScheduler::GetTargetInstructionFlags(

    case kArmVldrF32:
    case kArmVldrF64:
+    case kArmVld1F64:
    case kArmLdrb:
    case kArmLdrsb:
    case kArmLdrh:
@@ -225,6 +226,7 @@ int InstructionScheduler::GetTargetInstructionFlags(

    case kArmVstrF32:
    case kArmVstrF64:
+    case kArmVst1F64:
    case kArmStrb:
    case kArmStrh:
    case kArmStr:

--- a/src/compiler/arm/instruction-selector-arm.cc
+++ b/src/compiler/arm/instruction-selector-arm.cc
@@ -560,7 +560,6 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
      return;
    }
    case MachineRepresentation::kFloat64: {
-      // TODO(arm): use vld1.8 for this when NEON is available.
      // Compute the address of the least-significant half of the FP value.
      // We assume that the base node is unlikely to be an encodable immediate
      // or the result of a shift operation, so only consider the addressing
@@ -572,8 +571,8 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
      size_t input_count;
      if (TryMatchImmediateOrShift(this, &add_opcode, index, &input_count,
                                   &inputs[1])) {
-        // input_count has been set by TryMatchImmediateOrShift(), so increment
-        // it to account for the base register in inputs[0].
+        // input_count has been set by TryMatchImmediateOrShift(), so
+        // increment it to account for the base register in inputs[0].
        input_count++;
      } else {
        add_opcode |= AddressingModeField::encode(kMode_Operand2_R);
@@ -584,13 +583,18 @@ void InstructionSelector::VisitUnalignedLoad(Node* node) {
      InstructionOperand addr = g.TempRegister();
      Emit(add_opcode, 1, &addr, input_count, inputs);

-      // Load both halves and move to an FP register.
-      InstructionOperand fp_lo = g.TempRegister();
-      InstructionOperand fp_hi = g.TempRegister();
-      opcode |= AddressingModeField::encode(kMode_Offset_RI);
-      Emit(opcode, fp_lo, addr, g.TempImmediate(0));
-      Emit(opcode, fp_hi, addr, g.TempImmediate(4));
-      Emit(kArmVmovF64U32U32, g.DefineAsRegister(node), fp_lo, fp_hi);
+      if (CpuFeatures::IsSupported(NEON)) {
+        // With NEON we can load directly from the calculated address.
+        Emit(kArmVld1F64, g.DefineAsRegister(node), addr);
+      } else {
+        // Load both halves and move to an FP register.
+        InstructionOperand fp_lo = g.TempRegister();
+        InstructionOperand fp_hi = g.TempRegister();
+        opcode |= AddressingModeField::encode(kMode_Offset_RI);
+        Emit(opcode, fp_lo, addr, g.TempImmediate(0));
+        Emit(opcode, fp_hi, addr, g.TempImmediate(4));
+        Emit(kArmVmovF64U32U32, g.DefineAsRegister(node), fp_lo, fp_hi);
+      }
      return;
    }
    default:
@@ -624,30 +628,57 @@ void InstructionSelector::VisitUnalignedStore(Node* node) {
      return;
    }
    case MachineRepresentation::kFloat64: {
-      // TODO(arm): use vst1.8 for this when NEON is available.
-      // Store a 64-bit floating point value using two 32-bit integer stores.
-      // Computing the store address here would require three live temporary
-      // registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after
-      // storing the least-significant half of the value.
-
-      // First, move the 64-bit FP value into two temporary integer registers.
-      InstructionOperand fp[] = {g.TempRegister(), g.TempRegister()};
-      inputs[input_count++] = g.UseRegister(value);
-      Emit(kArmVmovU32U32F64, arraysize(fp), fp, input_count,
-           inputs);
-
-      // Store the least-significant half.
-      inputs[0] = fp[0];  // Low 32-bits of FP value.
-      inputs[input_count++] = g.UseRegister(base);  // First store base address.
-      EmitStore(this, kArmStr, input_count, inputs, index);
+      if (CpuFeatures::IsSupported(NEON)) {
+        InstructionOperand address = g.TempRegister();
+        {
+          // First we have to calculate the actual address.
+          InstructionCode add_opcode = kArmAdd;
+          InstructionOperand inputs[3];
+          inputs[0] = g.UseRegister(base);
+
+          size_t input_count;
+          if (TryMatchImmediateOrShift(this, &add_opcode, index, &input_count,
+                                       &inputs[1])) {
+            // input_count has been set by TryMatchImmediateOrShift(), so
+            // increment it to account for the base register in inputs[0].
+            input_count++;
+          } else {
+            add_opcode |= AddressingModeField::encode(kMode_Operand2_R);
+            inputs[1] = g.UseRegister(index);
+            input_count = 2;  // Base register and index.
+          }

-      // Store the most-significant half.
-      InstructionOperand base4 = g.TempRegister();
-      Emit(kArmAdd | AddressingModeField::encode(kMode_Operand2_I), base4,
-           g.UseRegister(base), g.TempImmediate(4));  // Compute base + 4.
-      inputs[0] = fp[1];  // High 32-bits of FP value.
-      inputs[1] = base4;  // Second store base + 4 address.
-      EmitStore(this, kArmStr, input_count, inputs, index);
+          Emit(add_opcode, 1, &address, input_count, inputs);
+        }
+
+        inputs[input_count++] = g.UseRegister(value);
+        inputs[input_count++] = address;
+        Emit(kArmVst1F64, 0, nullptr, input_count, inputs);
+      } else {
+        // Store a 64-bit floating point value using two 32-bit integer stores.
+        // Computing the store address here would require three live temporary
+        // registers (fp<63:32>, fp<31:0>, address), so compute base + 4 after
+        // storing the least-significant half of the value.
+
+        // First, move the 64-bit FP value into two temporary integer registers.
+        InstructionOperand fp[] = {g.TempRegister(), g.TempRegister()};
+        inputs[input_count++] = g.UseRegister(value);
+        Emit(kArmVmovU32U32F64, arraysize(fp), fp, input_count, inputs);
+
+        // Store the least-significant half.
+        inputs[0] = fp[0];  // Low 32-bits of FP value.
+        inputs[input_count++] =
+            g.UseRegister(base);  // First store base address.
+        EmitStore(this, kArmStr, input_count, inputs, index);
+
+        // Store the most-significant half.
+        InstructionOperand base4 = g.TempRegister();
+        Emit(kArmAdd | AddressingModeField::encode(kMode_Operand2_I), base4,
+             g.UseRegister(base), g.TempImmediate(4));  // Compute base + 4.
+        inputs[0] = fp[1];  // High 32-bits of FP value.
+        inputs[1] = base4;  // Second store base + 4 address.
+        EmitStore(this, kArmStr, input_count, inputs, index);
+      }
      return;
    }
    default: