Commit 62554760 authored by sgjesse@chromium.org's avatar sgjesse@chromium.org

ARM: Add support for loading/storing multiple VFP registers

Enter/exit frames that save doubles now use these instructions instead of emitting 16 individual load/store instructions.

R=karlklose@chromium.org, rodolph.perfetta@gmail.com

BUG=
TEST=

Review URL: http://codereview.chromium.org//6691057

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@7509 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 03fd5602
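The core of the change is visible in MacroAssembler::EnterExitFrame further down; as a quick orientation, here is a before/after sketch of the double-register save assembled from that diff (a vstm with db_w writeback replaces the per-register vstr loop):

  // Before: adjust sp and emit one vstr per double register.
  sub(sp, sp, Operand(DwVfpRegister::kNumRegisters * kDoubleSize));
  const int offset = -2 * kPointerSize;
  for (int i = 0; i < DwVfpRegister::kNumRegisters; i++) {
    DwVfpRegister reg = DwVfpRegister::from_code(i);
    vstr(reg, fp, offset - ((i + 1) * kDoubleSize));
  }

  // After: a single store-multiple; db_w decrements sp as it stores.
  DwVfpRegister first = d0;
  DwVfpRegister last =
      DwVfpRegister::from_code(DwVfpRegister::kNumRegisters - 1);
  vstm(db_w, sp, first, last);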
@@ -32,7 +32,7 @@
// The original source code covered by the above license above has been
// modified significantly by Google Inc.
// Copyright 2010 the V8 project authors. All rights reserved.
// Copyright 2011 the V8 project authors. All rights reserved.
#include "v8.h"
@@ -2035,6 +2035,88 @@ void Assembler::vstr(const SwVfpRegister src,
}
void Assembler::vldm(BlockAddrMode am,
Register base,
DwVfpRegister first,
DwVfpRegister last,
Condition cond) {
// Instruction details available in ARM DDI 0406A, A8-626.
// cond(31-28) | 110(27-25)| PUDW1(24-20) | Rbase(19-16) |
// first(15-12) | 1011(11-8) | (count * 2)
ASSERT(CpuFeatures::IsEnabled(VFP3));
ASSERT_LE(first.code(), last.code());
ASSERT(am == ia || am == ia_w || am == db_w);
ASSERT(!base.is(pc));
int sd, d;
first.split_code(&sd, &d);
int count = last.code() - first.code() + 1;
emit(cond | B27 | B26 | am | d*B22 | B20 | base.code()*B16 | sd*B12 |
0xB*B8 | count*2);
}
void Assembler::vstm(BlockAddrMode am,
Register base,
DwVfpRegister first,
DwVfpRegister last,
Condition cond) {
// Instruction details available in ARM DDI 0406A, A8-784.
// cond(31-28) | 110(27-25)| PUDW0(24-20) | Rbase(19-16) |
// first(15-12) | 1011(11-8) | (count * 2)
ASSERT(CpuFeatures::IsEnabled(VFP3));
ASSERT_LE(first.code(), last.code());
ASSERT(am == ia || am == ia_w || am == db_w);
ASSERT(!base.is(pc));
int sd, d;
first.split_code(&sd, &d);
int count = last.code() - first.code() + 1;
emit(cond | B27 | B26 | am | d*B22 | base.code()*B16 | sd*B12 |
0xB*B8 | count*2);
}
void Assembler::vldm(BlockAddrMode am,
Register base,
SwVfpRegister first,
SwVfpRegister last,
Condition cond) {
// Instruction details available in ARM DDI 0406A, A8-626.
// cond(31-28) | 110(27-25)| PUDW1(24-20) | Rbase(19-16) |
// first(15-12) | 1010(11-8) | (count)
ASSERT(CpuFeatures::IsEnabled(VFP3));
ASSERT_LE(first.code(), last.code());
ASSERT(am == ia || am == ia_w || am == db_w);
ASSERT(!base.is(pc));
int sd, d;
first.split_code(&sd, &d);
int count = last.code() - first.code() + 1;
emit(cond | B27 | B26 | am | d*B22 | B20 | base.code()*B16 | sd*B12 |
0xA*B8 | count);
}
void Assembler::vstm(BlockAddrMode am,
Register base,
SwVfpRegister first,
SwVfpRegister last,
Condition cond) {
// Instruction details available in ARM DDI 0406A, A8-784.
// cond(31-28) | 110(27-25)| PUDW0(24-20) | Rbase(19-16) |
// first(15-12) | 1010(11-8) | (count)
ASSERT(CpuFeatures::IsEnabled(VFP3));
ASSERT_LE(first.code(), last.code());
ASSERT(am == ia || am == ia_w || am == db_w);
ASSERT(!base.is(pc));
int sd, d;
first.split_code(&sd, &d);
int count = last.code() - first.code() + 1;
emit(cond | B27 | B26 | am | d*B22 | base.code()*B16 | sd*B12 |
0xA*B8 | count);
}
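As a cross-check of the encodings above (illustrative, not part of the committed code), the bit fields for one of the cases exercised by the new disassembler test assemble like this:

// vldm(ia, r1, d2, d5)
//   cond = al                 -> 0xE << 28
//   110                       -> B27 | B26
//   am = ia (P=0, U=1, W=0)   -> 1 << 23
//   L = 1 (load)              -> B20
//   Rbase = r1                -> 1 << 16
//   first = d2 (D=0, Vd=2)    -> 2 << 12
//   1011                      -> 0xB << 8
//   count * 2 = 8             -> 0x08
// which ORs together to 0xec912b08, the value the disassembler test
// below expects for "vldmia r1, {d2-d5}".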
static void DoubleAsTwoUInt32(double d, uint32_t* lo, uint32_t* hi) {
uint64_t i;
memcpy(&i, &d, 8);
...
@@ -32,7 +32,7 @@
// The original source code covered by the above license above has been
// modified significantly by Google Inc.
// Copyright 2010 the V8 project authors. All rights reserved.
// Copyright 2011 the V8 project authors. All rights reserved.
// A light-weight ARM Assembler
// Generates user mode instructions for the ARM architecture up to version 5
@@ -995,6 +995,30 @@ class Assembler : public AssemblerBase {
const MemOperand& dst,
const Condition cond = al);
void vldm(BlockAddrMode am,
Register base,
DwVfpRegister first,
DwVfpRegister last,
Condition cond = al);
void vstm(BlockAddrMode am,
Register base,
DwVfpRegister first,
DwVfpRegister last,
Condition cond = al);
void vldm(BlockAddrMode am,
Register base,
SwVfpRegister first,
SwVfpRegister last,
Condition cond = al);
void vstm(BlockAddrMode am,
Register base,
SwVfpRegister first,
SwVfpRegister last,
Condition cond = al);
void vmov(const DwVfpRegister dst,
double imm,
const Condition cond = al);
...
// Copyright 2010 the V8 project authors. All rights reserved.
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -346,7 +346,9 @@ enum BlockAddrMode {
da_x = (0|0|0) << 21, // Decrement after.
ia_x = (0|4|0) << 21, // Increment after.
db_x = (8|0|0) << 21, // Decrement before.
ib_x = (8|4|0) << 21 // Increment before.
ib_x = (8|4|0) << 21, // Increment before.
kBlockAddrModeMask = (8|4|1) << 21
};
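For reference, a small illustrative decomposition (the _w values are assumed to follow the same pattern with the low bit set; they are not shown in this hunk): the new mask isolates the P (bit 24), U (bit 23) and W (bit 21) bits that make up the addressing mode.

//   ia_x = (0|4|0) << 21                 // P=0, U=1: increment after, no writeback
//   db_x = (8|0|0) << 21                 // P=1, U=0: decrement before, no writeback
//   db_w = (8|0|1) << 21                 // assumed: as db_x plus W=1 (writeback)
//   (db_w & kBlockAddrModeMask) == db_w  // only the P, U and W bits are set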
...
// Copyright 2010 the V8 project authors. All rights reserved.
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -371,25 +371,34 @@ int Decoder::FormatRegister(Instruction* instr, const char* format) {
int Decoder::FormatVFPRegister(Instruction* instr, const char* format) {
ASSERT((format[0] == 'S') || (format[0] == 'D'));
VFPRegPrecision precision =
format[0] == 'D' ? kDoublePrecision : kSinglePrecision;
int retval = 2;
int reg = -1;
if (format[1] == 'n') {
int reg = instr->VnValue();
if (format[0] == 'S') PrintSRegister(((reg << 1) | instr->NValue()));
if (format[0] == 'D') PrintDRegister(reg);
return 2;
reg = instr->VFPNRegValue(precision);
} else if (format[1] == 'm') {
int reg = instr->VmValue();
if (format[0] == 'S') PrintSRegister(((reg << 1) | instr->MValue()));
if (format[0] == 'D') PrintDRegister(reg);
return 2;
reg = instr->VFPMRegValue(precision);
} else if (format[1] == 'd') {
int reg = instr->VdValue();
if (format[0] == 'S') PrintSRegister(((reg << 1) | instr->DValue()));
if (format[0] == 'D') PrintDRegister(reg);
return 2;
reg = instr->VFPDRegValue(precision);
if (format[2] == '+') {
int immed8 = instr->Immed8Value();
if (format[0] == 'S') reg += immed8 - 1;
if (format[0] == 'D') reg += (immed8 / 2 - 1);
}
if (format[2] == '+') retval = 3;
} else {
UNREACHABLE();
}
UNREACHABLE();
return -1;
if (precision == kSinglePrecision) {
PrintSRegister(reg);
} else {
PrintDRegister(reg);
}
return retval;
}
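Concretely (an illustrative trace, not part of the commit), when this formatter handles the new vldm/vstm format strings further down for the instruction 0xec912b08 (vldmia r1, {d2-d5}):

//   'Dd   -> reg = VFPDRegValue(kDoublePrecision) = 2, so "d2" is printed
//   'Dd+  -> reg = 2 + (Immed8Value() / 2 - 1) = 2 + 3 = 5, so "d5" is printed
// giving the operand list "{d2-d5}" that the new disassembler test expects.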
@@ -1273,9 +1282,22 @@ void Decoder::DecodeType6CoprocessorIns(Instruction* instr) {
Format(instr, "vstr'cond 'Sd, ['rn + 4*'imm08@00]");
}
break;
case 0x4:
case 0x5:
case 0x6:
case 0x7:
case 0x9:
case 0xB: {
bool to_vfp_register = (instr->VLValue() == 0x1);
if (to_vfp_register) {
Format(instr, "vldm'cond'pu 'rn'w, {'Sd-'Sd+}");
} else {
Format(instr, "vstm'cond'pu 'rn'w, {'Sd-'Sd+}");
}
break;
}
default:
Unknown(instr); // Not used by V8.
break;
}
} else if (instr->CoprocessorValue() == 0xB) {
switch (instr->OpcodeValue()) {
@@ -1303,9 +1325,19 @@ void Decoder::DecodeType6CoprocessorIns(Instruction* instr) {
Format(instr, "vstr'cond 'Dd, ['rn + 4*'imm08@00]");
}
break;
case 0x4:
case 0x5:
case 0x9: {
bool to_vfp_register = (instr->VLValue() == 0x1);
if (to_vfp_register) {
Format(instr, "vldm'cond'pu 'rn'w, {'Dd-'Dd+}");
} else {
Format(instr, "vstm'cond'pu 'rn'w, {'Dd-'Dd+}");
}
break;
}
default:
Unknown(instr); // Not used by V8.
break;
}
} else {
Unknown(instr); // Not used by V8.
...
@@ -749,12 +749,10 @@ void MacroAssembler::EnterExitFrame(bool save_doubles, int stack_space) {
// Optionally save all double registers.
if (save_doubles) {
sub(sp, sp, Operand(DwVfpRegister::kNumRegisters * kDoubleSize));
const int offset = -2 * kPointerSize;
for (int i = 0; i < DwVfpRegister::kNumRegisters; i++) {
DwVfpRegister reg = DwVfpRegister::from_code(i);
vstr(reg, fp, offset - ((i + 1) * kDoubleSize));
}
DwVfpRegister first = d0;
DwVfpRegister last =
DwVfpRegister::from_code(DwVfpRegister::kNumRegisters - 1);
vstm(db_w, sp, first, last);
// Note that d0 will be accessible at
// fp - 2 * kPointerSize - DwVfpRegister::kNumRegisters * kDoubleSize,
// since the sp slot and code slot were pushed after the fp.
@@ -811,11 +809,13 @@ void MacroAssembler::LeaveExitFrame(bool save_doubles,
Register argument_count) {
// Optionally restore all double registers.
if (save_doubles) {
for (int i = 0; i < DwVfpRegister::kNumRegisters; i++) {
DwVfpRegister reg = DwVfpRegister::from_code(i);
const int offset = -2 * kPointerSize;
vldr(reg, fp, offset - ((i + 1) * kDoubleSize));
}
// Calculate the stack location of the saved doubles and restore them.
const int offset = 2 * kPointerSize;
sub(r3, fp, Operand(offset + DwVfpRegister::kNumRegisters * kDoubleSize));
DwVfpRegister first = d0;
DwVfpRegister last =
DwVfpRegister::from_code(DwVfpRegister::kNumRegisters - 1);
vldm(ia, r3, first, last);
}
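A quick arithmetic check of the save/restore addresses, assuming the usual ARM values kPointerSize = 4, kDoubleSize = 8 and DwVfpRegister::kNumRegisters = 16:

// EnterExitFrame: vstm(db_w, sp, d0, d15) stores the 16 doubles below sp,
// leaving d0 at fp - 2 * kPointerSize - 16 * kDoubleSize = fp - 136.
// LeaveExitFrame: r3 = fp - (2 * kPointerSize + 16 * kDoubleSize) = fp - 136,
// so vldm(ia, r3, d0, d15) reads the registers back from the same slots.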
// Clear top frame.
...
// Copyright 2010 the V8 project authors. All rights reserved.
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -1504,36 +1504,34 @@ static int count_bits(int bit_vector) {
}
// Addressing Mode 4 - Load and Store Multiple
void Simulator::HandleRList(Instruction* instr, bool load) {
void Simulator::ProcessPUW(Instruction* instr,
int num_regs,
int reg_size,
intptr_t* start_address,
intptr_t* end_address) {
int rn = instr->RnValue();
int32_t rn_val = get_register(rn);
int rlist = instr->RlistValue();
int num_regs = count_bits(rlist);
intptr_t start_address = 0;
intptr_t end_address = 0;
switch (instr->PUField()) {
case da_x: {
UNIMPLEMENTED();
break;
}
case ia_x: {
start_address = rn_val;
end_address = rn_val + (num_regs * 4) - 4;
rn_val = rn_val + (num_regs * 4);
*start_address = rn_val;
*end_address = rn_val + (num_regs * reg_size) - reg_size;
rn_val = rn_val + (num_regs * reg_size);
break;
}
case db_x: {
start_address = rn_val - (num_regs * 4);
end_address = rn_val - 4;
rn_val = start_address;
*start_address = rn_val - (num_regs * reg_size);
*end_address = rn_val - reg_size;
rn_val = *start_address;
break;
}
case ib_x: {
start_address = rn_val + 4;
end_address = rn_val + (num_regs * 4);
rn_val = end_address;
*start_address = rn_val + reg_size;
*end_address = rn_val + (num_regs * reg_size);
rn_val = *end_address;
break;
}
default: {
@@ -1544,6 +1542,17 @@ void Simulator::HandleRList(Instruction* instr, bool load) {
if (instr->HasW()) {
set_register(rn, rn_val);
}
}
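To make the address computation above concrete, a worked example with illustrative values rn_val = 0x1000, num_regs = 4 and reg_size = 8 (four double registers):

//   ia_x / ia_w: start = 0x1000, end = 0x1018, rn written back as 0x1020
//   db_x / db_w: start = 0x0fe0, end = 0x0ff8, rn written back as 0x0fe0
//   ib_x / ib_w: start = 0x1008, end = 0x1020, rn written back as 0x1020
// start and end are the addresses of the first and last reg_size-byte slots
// touched; the base register is only updated when instr->HasW() is true.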
// Addressing Mode 4 - Load and Store Multiple
void Simulator::HandleRList(Instruction* instr, bool load) {
int rlist = instr->RlistValue();
int num_regs = count_bits(rlist);
intptr_t start_address = 0;
intptr_t end_address = 0;
ProcessPUW(instr, num_regs, kPointerSize, &start_address, &end_address);
intptr_t* address = reinterpret_cast<intptr_t*>(start_address);
int reg = 0;
while (rlist != 0) {
@@ -1562,6 +1571,57 @@ void Simulator::HandleRList(Instruction* instr, bool load) {
}
// Addressing Mode 6 - Load and Store Multiple Coprocessor registers.
void Simulator::HandleVList(Instruction* instr) {
VFPRegPrecision precision =
(instr->SzValue() == 0) ? kSinglePrecision : kDoublePrecision;
int operand_size = (precision == kSinglePrecision) ? 4 : 8;
bool load = (instr->VLValue() == 0x1);
int vd;
int num_regs;
vd = instr->VFPDRegValue(precision);
if (precision == kSinglePrecision) {
num_regs = instr->Immed8Value();
} else {
num_regs = instr->Immed8Value() / 2;
}
intptr_t start_address = 0;
intptr_t end_address = 0;
ProcessPUW(instr, num_regs, operand_size, &start_address, &end_address);
intptr_t* address = reinterpret_cast<intptr_t*>(start_address);
for (int reg = vd; reg < vd + num_regs; reg++) {
if (precision == kSinglePrecision) {
if (load) {
set_s_register_from_sinteger(
reg, ReadW(reinterpret_cast<int32_t>(address), instr));
} else {
WriteW(reinterpret_cast<int32_t>(address),
get_sinteger_from_s_register(reg), instr);
}
address += 1;
} else {
if (load) {
set_s_register_from_sinteger(
2 * reg, ReadW(reinterpret_cast<int32_t>(address), instr));
set_s_register_from_sinteger(
2 * reg + 1, ReadW(reinterpret_cast<int32_t>(address + 1), instr));
} else {
WriteW(reinterpret_cast<int32_t>(address),
get_sinteger_from_s_register(2 * reg), instr);
WriteW(reinterpret_cast<int32_t>(address + 1),
get_sinteger_from_s_register(2 * reg + 1), instr);
}
address += 2;
}
}
ASSERT_EQ(((intptr_t)address) - operand_size, end_address);
}
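For a small worked example of the loop above:

// A double-precision vldm of d2-d5 decodes to vd = 2 and
// num_regs = Immed8Value() / 2 = 4. The loop visits reg = 2..5 and fills
// s-registers 2*reg and 2*reg + 1, i.e. s4/s5, s6/s7, s8/s9 and s10/s11,
// reading two consecutive words from memory for each double register.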
// Calls into the V8 runtime are based on this very simple interface.
// Note: To be able to return two values from some calls the code in runtime.cc
// uses the ObjectPair which is essentially two 32-bit values stuffed into a
@@ -2945,9 +3005,17 @@ void Simulator::DecodeType6CoprocessorIns(Instruction* instr) {
}
break;
}
case 0x4:
case 0x5:
case 0x6:
case 0x7:
case 0x9:
case 0xB:
// Load/store multiple single from memory: vldm/vstm.
HandleVList(instr);
break;
default:
UNIMPLEMENTED(); // Not used by V8.
break;
}
} else if (instr->CoprocessorValue() == 0xB) {
switch (instr->OpcodeValue()) {
@@ -2994,9 +3062,14 @@ void Simulator::DecodeType6CoprocessorIns(Instruction* instr) {
}
break;
}
case 0x4:
case 0x5:
case 0x9:
// Load/store multiple double from memory: vldm/vstm.
HandleVList(instr);
break;
default:
UNIMPLEMENTED(); // Not used by V8.
break;
}
} else {
UNIMPLEMENTED(); // Not used by V8.
...
// Copyright 2009 the V8 project authors. All rights reserved.
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -236,7 +236,13 @@ class Simulator {
// Helper functions to decode common "addressing" modes
int32_t GetShiftRm(Instruction* instr, bool* carry_out);
int32_t GetImm(Instruction* instr, bool* carry_out);
void ProcessPUW(Instruction* instr,
int num_regs,
int operand_size,
intptr_t* start_address,
intptr_t* end_address);
void HandleRList(Instruction* instr, bool load);
void HandleVList(Instruction* inst);
void SoftwareInterrupt(Instruction* instr);
// Stop helper functions.
...
// Copyright 2010 the V8 project authors. All rights reserved.
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -39,7 +39,8 @@ using namespace v8::internal;
// Define these function prototypes to match JSEntryFunction in execution.cc.
typedef Object* (*F1)(int x, int p1, int p2, int p3, int p4);
typedef Object* (*F2)(int x, int y, int p2, int p3, int p4);
typedef Object* (*F3)(void* p, int p1, int p2, int p3, int p4);
typedef Object* (*F3)(void* p0, int p1, int p2, int p3, int p4);
typedef Object* (*F4)(void* p0, void* p1, int p2, int p3, int p4);
static v8::Persistent<v8::Context> env;
@@ -608,4 +609,340 @@ TEST(7) {
TestRoundingMode(u32_f64, RN, (kMaxUInt + 1.0), kMaxUInt, true);
}
TEST(8) {
// Test VFP multi load/store with ia_w.
InitializeVM();
v8::HandleScope scope;
typedef struct {
double a;
double b;
double c;
double d;
double e;
double f;
double g;
double h;
} D;
D d;
typedef struct {
float a;
float b;
float c;
float d;
float e;
float f;
float g;
float h;
} F;
F f;
// Create a function that uses vldm/vstm to move some double and
// single precision values around in memory.
Assembler assm(Isolate::Current(), NULL, 0);
if (CpuFeatures::IsSupported(VFP3)) {
CpuFeatures::Scope scope(VFP3);
__ mov(ip, Operand(sp));
__ stm(db_w, sp, r4.bit() | fp.bit() | lr.bit());
__ sub(fp, ip, Operand(4));
__ add(r4, r0, Operand(OFFSET_OF(D, a)));
__ vldm(ia_w, r4, d0, d3);
__ vldm(ia_w, r4, d4, d7);
__ add(r4, r0, Operand(OFFSET_OF(D, a)));
__ vstm(ia_w, r4, d6, d7);
__ vstm(ia_w, r4, d0, d5);
__ add(r4, r1, Operand(OFFSET_OF(F, a)));
__ vldm(ia_w, r4, s0, s3);
__ vldm(ia_w, r4, s4, s7);
__ add(r4, r1, Operand(OFFSET_OF(F, a)));
__ vstm(ia_w, r4, s6, s7);
__ vstm(ia_w, r4, s0, s5);
__ ldm(ia_w, sp, r4.bit() | fp.bit() | pc.bit());
CodeDesc desc;
assm.GetCode(&desc);
Object* code = HEAP->CreateCode(
desc,
Code::ComputeFlags(Code::STUB),
Handle<Object>(HEAP->undefined_value()))->ToObjectChecked();
CHECK(code->IsCode());
#ifdef DEBUG
Code::cast(code)->Print();
#endif
F4 fn = FUNCTION_CAST<F4>(Code::cast(code)->entry());
d.a = 1.1;
d.b = 2.2;
d.c = 3.3;
d.d = 4.4;
d.e = 5.5;
d.f = 6.6;
d.g = 7.7;
d.h = 8.8;
f.a = 1.0;
f.b = 2.0;
f.c = 3.0;
f.d = 4.0;
f.e = 5.0;
f.f = 6.0;
f.g = 7.0;
f.h = 8.0;
Object* dummy = CALL_GENERATED_CODE(fn, &d, &f, 0, 0, 0);
USE(dummy);
CHECK_EQ(7.7, d.a);
CHECK_EQ(8.8, d.b);
CHECK_EQ(1.1, d.c);
CHECK_EQ(2.2, d.d);
CHECK_EQ(3.3, d.e);
CHECK_EQ(4.4, d.f);
CHECK_EQ(5.5, d.g);
CHECK_EQ(6.6, d.h);
CHECK_EQ(7.0, f.a);
CHECK_EQ(8.0, f.b);
CHECK_EQ(1.0, f.c);
CHECK_EQ(2.0, f.d);
CHECK_EQ(3.0, f.e);
CHECK_EQ(4.0, f.f);
CHECK_EQ(5.0, f.g);
CHECK_EQ(6.0, f.h);
}
}
TEST(9) {
// Test VFP multi load/store with ia.
InitializeVM();
v8::HandleScope scope;
typedef struct {
double a;
double b;
double c;
double d;
double e;
double f;
double g;
double h;
} D;
D d;
typedef struct {
float a;
float b;
float c;
float d;
float e;
float f;
float g;
float h;
} F;
F f;
// Create a function that uses vldm/vstm to move some double and
// single precision values around in memory.
Assembler assm(Isolate::Current(), NULL, 0);
if (CpuFeatures::IsSupported(VFP3)) {
CpuFeatures::Scope scope(VFP3);
__ mov(ip, Operand(sp));
__ stm(db_w, sp, r4.bit() | fp.bit() | lr.bit());
__ sub(fp, ip, Operand(4));
__ add(r4, r0, Operand(OFFSET_OF(D, a)));
__ vldm(ia, r4, d0, d3);
__ add(r4, r4, Operand(4 * 8));
__ vldm(ia, r4, d4, d7);
__ add(r4, r0, Operand(OFFSET_OF(D, a)));
__ vstm(ia, r4, d6, d7);
__ add(r4, r4, Operand(2 * 8));
__ vstm(ia, r4, d0, d5);
__ add(r4, r1, Operand(OFFSET_OF(F, a)));
__ vldm(ia, r4, s0, s3);
__ add(r4, r4, Operand(4 * 4));
__ vldm(ia, r4, s4, s7);
__ add(r4, r1, Operand(OFFSET_OF(F, a)));
__ vstm(ia, r4, s6, s7);
__ add(r4, r4, Operand(2 * 4));
__ vstm(ia, r4, s0, s5);
__ ldm(ia_w, sp, r4.bit() | fp.bit() | pc.bit());
CodeDesc desc;
assm.GetCode(&desc);
Object* code = HEAP->CreateCode(
desc,
Code::ComputeFlags(Code::STUB),
Handle<Object>(HEAP->undefined_value()))->ToObjectChecked();
CHECK(code->IsCode());
#ifdef DEBUG
Code::cast(code)->Print();
#endif
F4 fn = FUNCTION_CAST<F4>(Code::cast(code)->entry());
d.a = 1.1;
d.b = 2.2;
d.c = 3.3;
d.d = 4.4;
d.e = 5.5;
d.f = 6.6;
d.g = 7.7;
d.h = 8.8;
f.a = 1.0;
f.b = 2.0;
f.c = 3.0;
f.d = 4.0;
f.e = 5.0;
f.f = 6.0;
f.g = 7.0;
f.h = 8.0;
Object* dummy = CALL_GENERATED_CODE(fn, &d, &f, 0, 0, 0);
USE(dummy);
CHECK_EQ(7.7, d.a);
CHECK_EQ(8.8, d.b);
CHECK_EQ(1.1, d.c);
CHECK_EQ(2.2, d.d);
CHECK_EQ(3.3, d.e);
CHECK_EQ(4.4, d.f);
CHECK_EQ(5.5, d.g);
CHECK_EQ(6.6, d.h);
CHECK_EQ(7.0, f.a);
CHECK_EQ(8.0, f.b);
CHECK_EQ(1.0, f.c);
CHECK_EQ(2.0, f.d);
CHECK_EQ(3.0, f.e);
CHECK_EQ(4.0, f.f);
CHECK_EQ(5.0, f.g);
CHECK_EQ(6.0, f.h);
}
}
TEST(10) {
// Test VFP multi load/store with db_w.
InitializeVM();
v8::HandleScope scope;
typedef struct {
double a;
double b;
double c;
double d;
double e;
double f;
double g;
double h;
} D;
D d;
typedef struct {
float a;
float b;
float c;
float d;
float e;
float f;
float g;
float h;
} F;
F f;
// Create a function that uses vldm/vstm to move some double and
// single precision values around in memory.
Assembler assm(Isolate::Current(), NULL, 0);
if (CpuFeatures::IsSupported(VFP3)) {
CpuFeatures::Scope scope(VFP3);
__ mov(ip, Operand(sp));
__ stm(db_w, sp, r4.bit() | fp.bit() | lr.bit());
__ sub(fp, ip, Operand(4));
__ add(r4, r0, Operand(OFFSET_OF(D, h) + 8));
__ vldm(db_w, r4, d4, d7);
__ vldm(db_w, r4, d0, d3);
__ add(r4, r0, Operand(OFFSET_OF(D, h) + 8));
__ vstm(db_w, r4, d0, d5);
__ vstm(db_w, r4, d6, d7);
__ add(r4, r1, Operand(OFFSET_OF(F, h) + 4));
__ vldm(db_w, r4, s4, s7);
__ vldm(db_w, r4, s0, s3);
__ add(r4, r1, Operand(OFFSET_OF(F, h) + 4));
__ vstm(db_w, r4, s0, s5);
__ vstm(db_w, r4, s6, s7);
__ ldm(ia_w, sp, r4.bit() | fp.bit() | pc.bit());
CodeDesc desc;
assm.GetCode(&desc);
Object* code = HEAP->CreateCode(
desc,
Code::ComputeFlags(Code::STUB),
Handle<Object>(HEAP->undefined_value()))->ToObjectChecked();
CHECK(code->IsCode());
#ifdef DEBUG
Code::cast(code)->Print();
#endif
F4 fn = FUNCTION_CAST<F4>(Code::cast(code)->entry());
d.a = 1.1;
d.b = 2.2;
d.c = 3.3;
d.d = 4.4;
d.e = 5.5;
d.f = 6.6;
d.g = 7.7;
d.h = 8.8;
f.a = 1.0;
f.b = 2.0;
f.c = 3.0;
f.d = 4.0;
f.e = 5.0;
f.f = 6.0;
f.g = 7.0;
f.h = 8.0;
Object* dummy = CALL_GENERATED_CODE(fn, &d, &f, 0, 0, 0);
USE(dummy);
CHECK_EQ(7.7, d.a);
CHECK_EQ(8.8, d.b);
CHECK_EQ(1.1, d.c);
CHECK_EQ(2.2, d.d);
CHECK_EQ(3.3, d.e);
CHECK_EQ(4.4, d.f);
CHECK_EQ(5.5, d.g);
CHECK_EQ(6.6, d.h);
CHECK_EQ(7.0, f.a);
CHECK_EQ(8.0, f.b);
CHECK_EQ(1.0, f.c);
CHECK_EQ(2.0, f.d);
CHECK_EQ(3.0, f.e);
CHECK_EQ(4.0, f.f);
CHECK_EQ(5.0, f.g);
CHECK_EQ(6.0, f.h);
}
}
#undef __
// Copyright 2007-2008 the V8 project authors. All rights reserved.
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
@@ -522,6 +522,23 @@ TEST(Vfp) {
"aef1aa10 vmrsge r10, FPSCR");
COMPARE(vmrs(pc),
"eef1fa10 vmrs APSR, FPSCR");
COMPARE(vstm(ia, r0, d1, d3),
"ec801b06 vstmia r0, {d1-d3}");
COMPARE(vldm(ia, r1, d2, d5),
"ec912b08 vldmia r1, {d2-d5}");
COMPARE(vstm(ia, r2, d0, d15),
"ec820b20 vstmia r2, {d0-d15}");
COMPARE(vldm(ia, r3, d0, d15),
"ec930b20 vldmia r3, {d0-d15}");
COMPARE(vstm(ia, r4, s1, s3),
"ecc40a03 vstmia r4, {s1-s3}");
COMPARE(vldm(ia, r5, s2, s5),
"ec951a04 vldmia r5, {s2-s5}");
COMPARE(vstm(ia, r6, s0, s31),
"ec860a20 vstmia r6, {s0-s31}");
COMPARE(vldm(ia, r7, s0, s31),
"ec970a20 vldmia r7, {s0-s31}");
}
VERIFY_RUN();
...