/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

/*
 * ff_clear_block_neon
 *
 * Writes 128 zero bytes starting at r0, as eight 16-byte NEON stores with
 * post-increment.
 *
 * In:       r0 = destination; must be 16-byte aligned, since every store
 *                carries the :128 alignment qualifier
 * Out:      r0 = destination + 128
 * Clobbers: q0 (d0-d1)
 */
function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0                 /* d0-d1 = all zero */
        .rept           8                       /* 8 stores x 16 bytes */
        vst1.16         {d0-d1}, [r0,:128]!
        .endr
        bx              lr
endfunc

/*
 * ff_clear_blocks_neon
 *
 * Writes 768 zero bytes starting at r0, as 8*6 = 48 consecutive 16-byte
 * NEON stores with post-increment (presumably six 128-byte blocks back to
 * back -- confirm against the caller).
 *
 * In:       r0 = destination; must be 16-byte aligned, since every store
 *                carries the :128 alignment qualifier
 * Out:      r0 = destination + 768
 * Clobbers: q0 (d0-d1)
 */
function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0                 /* d0-d1 = all zero */
        .rept           8*6                     /* 48 stores x 16 bytes */
        vst1.16         {d0-d1}, [r0,:128]!
        .endr
        bx              lr
endfunc
/*
 * ff_put_pixels_clamped_neon
 *
 * Reads 64 signed 16-bit values from r0, narrows each one to an unsigned
 * byte with saturation (vqmovun.s16 clamps to 0..255) and writes the result
 * as eight rows of 8 bytes at r1, advancing r1 by r2 after every row.
 *
 * In:       r0 = source; 16-byte aligned (:128), advanced by 128 bytes
 *           r1 = destination; every row address must be 8-byte aligned (:64)
 *           r2 = byte step between destination rows
 * Clobbers: q0-q3 (d0-d7), q8-q15 (d16-d31), r0, r1
 *
 * Loads, narrowing and stores are interleaved (presumably for scheduling);
 * the instruction order is kept exactly as written.
 */
function ff_put_pixels_clamped_neon, export=1
        vld1.16         {d16-d19}, [r0,:128]!   /* input rows 0-1 -> q8,  q9  */
        vqmovun.s16     d0, q8
        vld1.16         {d20-d23}, [r0,:128]!   /* input rows 2-3 -> q10, q11 */
        vqmovun.s16     d1, q9
        vld1.16         {d24-d27}, [r0,:128]!   /* input rows 4-5 -> q12, q13 */
        vqmovun.s16     d2, q10
        vld1.16         {d28-d31}, [r0,:128]!   /* input rows 6-7 -> q14, q15 */
        vqmovun.s16     d3, q11
        vst1.8          {d0},      [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.8          {d1},      [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.8          {d2},      [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.8          {d3},      [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.8          {d4},      [r1,:64], r2
        vst1.8          {d5},      [r1,:64], r2
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc
/*
 * ff_put_signed_pixels_clamped_neon
 *
 * Reads 64 signed 16-bit values from r0, narrows each one to a signed byte
 * with saturation (vqmovn.s16 clamps to -128..127), adds 128 with byte
 * wrap-around (vadd.u8) so the result lies in 0..255, and writes eight rows
 * of 8 bytes at r1, advancing r1 by r2 after every row.
 *
 * In:       r0 = source; 16-byte aligned (:128), advanced by 128 bytes
 *           r1 = destination; every row address must be 8-byte aligned (:64)
 *           r2 = byte step between destination rows
 * Clobbers: q0-q3 (d0-d7), q8-q13 (d16-d27), d31, r0, r1
 *
 * q8 and q9 are each loaded twice (input rows 0/2 and 1/3); a reload only
 * happens after the previous contents have been narrowed. Loads, arithmetic
 * and stores are interleaved (presumably for scheduling); the instruction
 * order is kept exactly as written.
 */
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128               /* bias added to every byte */
        vld1.16         {d16-d17}, [r0,:128]!   /* row 0 -> q8  */
        vqmovn.s16      d0, q8
        vld1.16         {d18-d19}, [r0,:128]!   /* row 1 -> q9  */
        vqmovn.s16      d1, q9
        vld1.16         {d16-d17}, [r0,:128]!   /* row 2 -> q8 (reused) */
        vqmovn.s16      d2, q8
        vld1.16         {d18-d19}, [r0,:128]!   /* row 3 -> q9 (reused) */
        vadd.u8         d0, d0, d31
        vld1.16         {d20-d21}, [r0,:128]!   /* row 4 -> q10 */
        vadd.u8         d1, d1, d31
        vld1.16         {d22-d23}, [r0,:128]!   /* row 5 -> q11 */
        vadd.u8         d2, d2, d31
        vst1.8          {d0},      [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.8          {d1},      [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.8          {d2},      [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.16         {d24-d25}, [r0,:128]!   /* row 6 -> q12 */
        vadd.u8         d3, d3, d31
        vld1.16         {d26-d27}, [r0,:128]!   /* row 7 -> q13 */
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.8          {d3},      [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.8          {d4},      [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.8          {d5},      [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.8          {d6},      [r1,:64], r2
        vst1.8          {d7},      [r1,:64], r2
        bx              lr
endfunc
/*
 * ff_add_pixels_clamped_neon
 *
 * For eight rows of 8 pixels: reads 8 unsigned bytes from the pixel buffer,
 * zero-extends them and adds them to 8 16-bit values read from r0
 * (vaddw.u8), narrows the sums to unsigned bytes with saturation
 * (vqmovun.s16 clamps to 0..255) and writes them back to the same pixel row.
 *
 * In:       r0 = 16-bit values to add; 16-byte aligned (:128), advanced by
 *                128 bytes
 *           r1 = pixel buffer, read and written in place; every row address
 *                must be 8-byte aligned (:64)
 *           r2 = byte step between pixel rows
 * Clobbers: q0-q3 (d0-d7), q8-q9 (d16-d19), r0, r1, r3
 *
 * r1 is the read cursor and r3 (a copy of the initial r1) the write cursor;
 * both step by r2, and reads run ahead of writes. Loads, arithmetic and
 * stores are interleaved (presumably for scheduling); the instruction order
 * is kept exactly as written.
 */
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1                  /* r3 = write cursor */
        vld1.8          {d16},   [r1,:64], r2   /* pixels, row 0 */
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.8          {d17},   [r1,:64], r2   /* pixels, row 1 */
        vld1.16         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2   /* pixels, row 2 */
        vaddw.u8        q1, q1, d17
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2   /* result, row 0 */
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2   /* pixels, row 3 */
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.8          {d2},    [r3,:64], r2   /* result, row 1 */
        vld1.8          {d16},   [r1,:64], r2   /* pixels, row 4 */
        vqmovun.s16     d6, q3
        vld1.16         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.8          {d4},    [r3,:64], r2   /* result, row 2 */
        vld1.8          {d17},   [r1,:64], r2   /* pixels, row 5 */
        vld1.16         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.8          {d6},    [r3,:64], r2   /* result, row 3 */
        vqmovun.s16     d0, q0
        vld1.8          {d18},   [r1,:64], r2   /* pixels, row 6 */
        vld1.16         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.8          {d0},    [r3,:64], r2   /* result, row 4 */
        vqmovun.s16     d2, q1
        vld1.8          {d19},   [r1,:64], r2   /* pixels, row 7 */
        vqmovun.s16     d4, q2
        vld1.16         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.8          {d2},    [r3,:64], r2   /* result, row 5 */
        vqmovun.s16     d6, q3
        vst1.8          {d4},    [r3,:64], r2   /* result, row 6 */
        vst1.8          {d6},    [r3,:64], r2   /* result, row 7 */
        bx              lr
endfunc
/*
 * ff_vector_clipf_neon
 *
 * Clamps single-precision floats read from r1 to a lower and an upper bound
 * and stores the results at r0, eight elements per loop iteration.
 *
 * VFP and NOVFP are line-prefix macros from the included asm.S; presumably
 * they select the variant for float arguments passed in VFP registers versus
 * core registers -- confirm against asm.S.
 *
 * In:       r0 = destination; 16-byte aligned (:128)
 *           r1 = source; 16-byte aligned (:128)
 *   VFP:    d0[0] = lower bound, d0[1] = upper bound, r2 = element count
 *   NOVFP:  r2 = lower bound, r3 = upper bound, [sp] = element count
 * Clobbers: q0-q3, q8-q11, r0, r1, r2, flags
 *
 * The loop is software-pipelined: the first eight elements are loaded and
 * upper-clamped before the loop, and each iteration stores the previous
 * group while preparing the next. The loop exits only when the count reaches
 * exactly zero after subtracting 8, so the count must be a positive multiple
 * of 8.
 */
function ff_vector_clipf_neon, export=1
/* q1 must be filled from d0[1] first: q0 overlaps d0, so writing q0 first
 * would destroy the upper bound. */
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]               /* r2 is free again: load count */
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1            /* apply upper bound */
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0            /* apply lower bound */
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f                      /* last group: store and return */
        vld1.f32        {q2},[r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3},[r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        b               1b
2:      vst1.f32        {q8},[r0,:128]!
        vst1.f32        {q9},[r0,:128]!
        bx              lr
endfunc
/*
 * ff_vector_clip_int32_neon
 *
 * Clamps signed 32-bit integers read from r1 to a lower and an upper bound
 * and stores the results at r0, eight elements (32 bytes) per iteration.
 *
 * In:       r0   = destination; 16-byte aligned (:128)
 *           r1   = source; 16-byte aligned (:128)
 *           r2   = lower bound
 *           r3   = upper bound
 *           [sp] = element count
 * Clobbers: q0-q3, r0, r1, r2, flags
 *
 * The count is tested after the body with a signed "greater than zero"
 * branch, so at least one group of eight is always processed and a count
 * that is not a multiple of 8 is effectively rounded up.
 */
function ff_vector_clip_int32_neon, export=1
        vdup.32         q0,  r2                 /* q0 = lower bound, all lanes */
        vdup.32         q1,  r3                 /* q1 = upper bound, all lanes */
        ldr             r2,  [sp]               /* r2 = remaining elements */
1:
        vld1.32         {d4-d7},  [r1,:128]!    /* q2-q3 = next 8 elements */
        vmin.s32        q2,  q2,  q1
        vmin.s32        q3,  q3,  q1
        vmax.s32        q2,  q2,  q0
        vmax.s32        q3,  q3,  q0
        vst1.32         {d4-d7},  [r0,:128]!
        subs            r2,  r2,  #8
        bgt             1b
        bx              lr
endfunc