/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

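@ ff_clear_block_neon: zero one 8x8 block of 16-bit coefficients (128 bytes at
@ r0).  ff_clear_blocks_neon below does the same for six consecutive blocks.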
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

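@ pixels16: copy (avg=0) or average into the destination (avg=1) a
@ 16-pixel-wide block, four rows per iteration.
@ r0 = dst, r1 = src, r2 = line stride, r3 = height.
@ The rnd argument only matters for the interpolating variants below, where it
@ selects rounding vs truncating averages via the avg/shrn/NRND helper macros.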
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
        vld1.8          {q2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},     [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},     [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10},    [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11},    [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},     [r0,:128], r2
        vst1.64         {q1},     [r0,:128], r2
        vst1.64         {q2},     [r0,:128], r2
        vst1.64         {q3},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

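@ pixels16_x2: 16-wide horizontal half-pel interpolation -- each output pixel
@ is the average of two horizontally adjacent source pixels (rounded or
@ truncated depending on rnd); with avg=1 the result is further averaged with
@ the existing destination.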
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},     [r0,:128], r2
        vld1.8          {q3},     [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},     [r0,:128], r2
        vst1.8          {q2},     [r0,:128], r2
        bne             1b
        bx              lr
.endm

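@ pixels16_y2: 16-wide vertical half-pel interpolation, averaging each pair of
@ vertically adjacent source rows; two rows per iteration.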
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},     [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},     [r0,:128], r2
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},     [r0,:128], r2
        vst1.8          {q3},     [r0,:128], r2

        bx              lr
.endm

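@ pixels16_xy2: 16-wide two-dimensional half-pel interpolation using widened
@ 16-bit sums of a 2x2 neighbourhood.  The NRND-prefixed instructions are only
@ emitted for the no-rounding (rnd=0) variant, where they add the +1 bias
@ before the truncating shift.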
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vld1.8          {d4-d6},  [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4},  [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15},    [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2},  [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},     [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14},    [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},     [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15},    [r0,:128], r2

        bx              lr
.endm

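@ pixels8 / pixels8_x2 / pixels8_y2 / pixels8_xy2: 8-pixel-wide counterparts
@ of the pixels16* macros above, with the same register usage and parameters.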
.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
        vld1.8          {d2},     [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},     [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},     [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},     [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        vst1.8          {d2},     [r0,:64], r2
        vst1.8          {d3},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},     [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},     [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},     [r0,:64], r2
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},     [r0,:64], r2
        vst1.8          {d1},     [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},     [r1], r2
        vld1.8          {d1},     [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},     [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},     [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},     [r0,:64], r2
        vld1.8          {d3},     [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},     [r0,:64], r2
        vst1.8          {d5},     [r0,:64], r2

        bx              lr
.endm

.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        vld1.8          {q1},     [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},     [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},     [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},     [r0,:64], r2
        bgt             1b

        vld1.8          {q0},     [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},     [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},     [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},     [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},     [r0,:64], r2

        bx              lr
.endm

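@ pixfunc: instantiate ff_<pfx><name><suf>_neon from one of the macros above,
@ first defining the avg/shrn/NRND helpers according to rnd (rounding vs
@ no-rounding).  pixfunc2 emits both the rounding and the _no_rnd flavour.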
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

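@ The h264 qpel mc00 (full-pel copy) entry points only set the block height in
@ r3 and then fall through into the ff_*_pixels* code generated immediately
@ below, so there is deliberately no return instruction before endfunc.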
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1

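@ ff_put_pixels_clamped_neon: saturate a block of 64 16-bit coefficients to
@ unsigned 8-bit and store it as an 8x8 pixel block.
@ r0 = coefficients, r1 = destination, r2 = destination line stride.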
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.8          {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

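@ ff_put_signed_pixels_clamped_neon: as above, but the coefficients are first
@ saturated to signed 8-bit and then offset by +128.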
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.8          {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc

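@ ff_add_pixels_clamped_neon: add a block of 64 16-bit coefficients to an
@ existing 8x8 pixel block with unsigned saturation.
@ r0 = coefficients, r1 = pixels (read, and written back via r3), r2 = stride.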
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.8          {d16},   [r1,:64], r2
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2},    [r3,:64], r2
        vld1.8          {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4},    [r3,:64], r2
        vld1.8          {d17},   [r1,:64], r2
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.8          {d4},    [r3,:64], r2
        vst1.8          {d6},    [r3,:64], r2
        bx              lr
endfunc

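@ ff_vector_clipf_neon: clip a vector of floats to [min, max].
@ r0 = dst, r1 = src; with a hard-float ABI (VFP lines) min/max arrive in
@ s0/s1, otherwise (NOVFP) in r2/r3 with the element count on the stack.
@ Eight elements are processed per iteration, so the count is assumed to be
@ a multiple of 8.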
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc

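@ ff_apply_window_int16_neon: multiply an int16 vector by a symmetric Q15
@ window (vqrdmulh), processing the vector from both ends at once and reusing
@ the first half of the window, mirrored, for the tail.
@ r0 = output, r1 = input, r2 = window, r3 = length; the loop handles 16
@ samples per iteration (8 from each end).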
function ff_apply_window_int16_neon, export=1
        push            {r4,lr}
        add             r4,  r1,  r3,  lsl #1
        add             lr,  r0,  r3,  lsl #1
        sub             r4,  r4,  #16
        sub             lr,  lr,  #16
        mov             r12, #-16
1:
        vld1.16         {q0},     [r1,:128]!
        vld1.16         {q2},     [r2,:128]!
        vld1.16         {q1},     [r4,:128], r12
        vrev64.16       q3,  q2
        vqrdmulh.s16    q0,  q0,  q2
        vqrdmulh.s16    d2,  d2,  d7
        vqrdmulh.s16    d3,  d3,  d6
        vst1.16         {q0},     [r0,:128]!
        vst1.16         {q1},     [lr,:128], r12
        subs            r3,  r3,  #16
        bgt             1b

        pop             {r4,pc}
endfunc

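@ ff_vector_clip_int32_neon: clip a vector of int32 values to [min, max].
@ r0 = dst, r1 = src, r2 = min, r3 = max, element count on the stack;
@ eight elements are processed per iteration.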
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2
        vdup.32         q1,  r3
        ldr             r2,  [sp]
1:
        vld1.32         {q2-q3},  [r1,:128]!
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {q2-q3},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc