;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"
SECTION_RODATA 32

; Largest 10-bit sample value ((1 << 10) - 1), replicated in all eight words.
pw_pixel_max: times 8 dw (1 << 10) - 1

; 128-bit integer constant 1. Added to a vector register whose low element
; holds log2_denom, turning it into the log2_denom+1 shift count.
sq_1:         dq 1, 0

cextern pw_1

SECTION .text

;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
;                  int weight, int offset);
;-----------------------------------------------------------------------------
; Common entry sequence for the h264_weight_*_10 functions.
; Declares the register frame (no auto-loaded arguments, 6 GPRs, 8 vector
; registers) and then fetches the arguments the kernels keep in registers:
;   r0 = dst, r1 = stride, r2 = height, r4 = weight, r5 = offset
; log2_denom (argument 3) is not loaded here; WEIGHT_SETUP reads it via r3m.
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
%if ARCH_X86_32 == 0
    ; stride is an int, so only its low 32 bits are defined on 64-bit targets,
    ; yet the row loops add the full-width r1 to the dst pointer.
    movsxd     r1, r1d
%endif
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
%endmacro

; Builds the loop-invariant vectors used by WEIGHT_OP.
; In:  r3m = log2_denom, r4 = weight, r5 = offset
; Out: m0 = 1<<log2_denom in every word
;      m2 = log2_denom+1 (shift count for psrad)
;      m3 = (weight<<1) in the low word and 1+(offset<<3) in the high word of
;           every dword, i.e. the pmaddwd multipliers for (pixel, m0) pairs
;      m4 = pw_pixel_max
;      m7 = 0 (only when SSE4 is not available; used by CLIPW)
; Clobbers r4 and r5.
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    shl        r5, 19       ; offset*8, moved to the upper half of the dword
    add       r4d, r4d      ; weight<<1
    movzx     r4d, r4w      ; drop the sign extension: a negative weight must
                            ; not borrow from the upper word and cancel its +1
    lea        r5, [r5+r4+0x10000]
    movd       m3, r5d      ; (weight<<1) | (1+(offset<<3))<<16
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; Weights eight 16-bit pixels and leaves the clipped result in m5.
;   WEIGHT_OP off         one aligned 16-byte load from [r0+off]
;   WEIGHT_OP off0, off1  two 8-byte loads, from [r0+off0] and [r0+off1]
; Each pixel is interleaved with the matching word of m0 so that a single
; pmaddwd against m3 yields pixel*m3.lo + m0*m3.hi per dword; the sums are
; then shifted right arithmetically by m2 and packed back to words.
; Uses m0, m2, m3, m4 (and m7 without SSE4) as prepared by WEIGHT_SETUP.
; Clobbers m5 and m6.
%macro WEIGHT_OP 1-2
%if %0 == 2
    movq        m5, [r0+%1]
    punpcklwd   m5, m0
    movq        m6, [r0+%2]
    punpcklwd   m6, m0
%else
    mova        m5, [r0+%1]
    punpckhwd   m6, m5, m0
    punpcklwd   m5, m0
%endif
    pmaddwd     m5, m3
    psrad       m5, m2
    pmaddwd     m6, m3
    psrad       m6, m2
%if notcpuflag(sse4)
    packssdw    m5, m6
    CLIPW       m5, m7, m4  ; presumably clamps to [m7, m4] = [0, pixel_max]; see x86util
%else
    packusdw    m5, m6      ; unsigned saturation handles the lower bound
    pminsw      m5, m4      ; upper bound: pixel_max
%endif
%endmacro

; h264_weight_16_10: each row is processed as two 16-byte halves
; ([r0] and [r0+16]); r1 is the row stride and r2 the remaining row count.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.row_loop:
    WEIGHT_OP    0
    mova      [r0], m5
    WEIGHT_OP   16
    mova   [r0+16], m5
    add         r0, r1      ; advance dst to the next row
    dec        r2d
    jnz .row_loop
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL


; h264_weight_8_10: one 16-byte vector per row; r1 is the row stride and
; r2 the remaining row count.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.row_loop:
    WEIGHT_OP    0
    mova      [r0], m5
    add         r0, r1      ; advance dst to the next row
    dec        r2d
    jnz .row_loop
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM


; h264_weight_4_10: rows are 8 bytes wide, so two rows share one vector.
; The row count is halved up front and dst advances by two strides per
; iteration.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    sar        r2d, 1       ; iterations = height/2
    WEIGHT_SETUP
    lea         r3, [r1+r1] ; two-row step; r3 is free once WEIGHT_SETUP has read r3m
.row_pair:
    WEIGHT_OP    0, r1
    movh      [r0], m5      ; low half  -> first row
    movhps [r0+r1], m5      ; high half -> second row
    add         r0, r3
    dec        r2d
    jnz .row_pair
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM


;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
;                    int log2_denom, int weightd, int weights, int offset);
;-----------------------------------------------------------------------------
; t0 is the temporary that BIWEIGHT_PROLOGUE fills with the offset argument
; (r7m). It is declared as register 7 unless building for x86-32, where it
; is register 3; BIWEIGHT_SETUP therefore loads r3 (height) only after it has
; finished with t0.
%if ARCH_X86_32 == 0
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 3
%endif

; Common entry sequence for the h264_biweight_*_10 functions.
; Declares the register frame (no auto-loaded arguments, 8 GPRs, 8 vector
; registers) and then fetches:
;   r0 = dst, r1 = src, r2 = stride, r5 = weightd, r6 = weights, t0 = offset
; height (r3m) and log2_denom (r4m) are read later by BIWEIGHT_SETUP.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
%if ARCH_X86_32 == 0
    ; stride is an int, so only its low 32 bits are defined on 64-bit targets,
    ; yet the row loops add the full-width r2 to both pointers.
    movsxd     r2, r2d
%endif
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
%endmacro

; Builds the loop-invariant vectors used by BIWEIGHT.
; In:  r5 = weightd, r6 = weights, t0 = offset, r3m = height, r4m = log2_denom
; Out: m4 = weightd in the low word and weights in the high word of every
;           dword (pmaddwd multipliers for interleaved dst/src pixels)
;      m5 = (((offset<<2)+1)|1) << log2_denom in every dword
;      m6 = log2_denom+1 (shift count for psrad)
;      m3 = pw_pixel_max
;      m7 = 0 (only when SSE4 is not available; used by CLIPW)
;      r3 = height (loaded last, after t0 has been consumed)
; Clobbers r5, r6 and t0.
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1
    or         t0, 1
    shl        r6, 16       ; weights -> high word
    movzx     r5d, r5w      ; drop weightd's sign extension so a negative
                            ; weightd cannot overwrite weights in the high word
    or         r5, r6
    movd       m4, r5d      ; weightd | weights<<16
    movd       m5, t0d      ; ((offset<<2)+1)|1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; (((offset<<2)+1)|1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; Blends eight 16-bit pixels from dst (r0) and src (r1) and leaves the
; clipped result in m0.
;   BIWEIGHT off         one aligned 16-byte load per source at +off
;   BIWEIGHT off0, off1  two 8-byte loads per source, at +off0 and +off1
; dst and src pixels are interleaved so that pmaddwd against m4 yields
; dst*m4.lo + src*m4.hi per dword; m5 is added and the sum is shifted right
; arithmetically by m6 before packing back to words.
; Uses m3..m6 (and m7 without SSE4) as prepared by BIWEIGHT_SETUP.
; Clobbers m0, m1 and m2.
%macro BIWEIGHT 1-2
%if %0 == 2
    movq       m0, [r0+%1]
    movq       m2, [r0+%2]
    movq       m1, [r1+%1]
    punpcklwd  m0, m1
    movq       m1, [r1+%2]
    punpcklwd  m2, m1
%else
    mova       m1, [r1+%1]
    mova       m0, [r0+%1]
    punpckhwd  m2, m0, m1
    punpcklwd  m0, m1
%endif
    pmaddwd    m0, m4
    paddd      m0, m5
    psrad      m0, m6
    pmaddwd    m2, m4
    paddd      m2, m5
    psrad      m2, m6
%if notcpuflag(sse4)
    packssdw   m0, m2
    CLIPW      m0, m7, m3   ; presumably clamps to [m7, m3] = [0, pixel_max]; see x86util
%else
    packusdw   m0, m2       ; unsigned saturation handles the lower bound
    pminsw     m0, m3       ; upper bound: pixel_max
%endif
%endmacro

; h264_biweight_16_10: each row is processed as two 16-byte halves; the
; result overwrites dst. r2 is the stride shared by dst and src, r3 the
; remaining row count.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.row_loop:
    BIWEIGHT     0
    mova      [r0], m0
    BIWEIGHT    16
    mova   [r0+16], m0
    add         r1, r2      ; next src row
    add         r0, r2      ; next dst row
    dec        r3d
    jnz .row_loop
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL

; h264_biweight_8_10: one 16-byte vector per row; the result overwrites dst.
; r2 is the stride shared by dst and src, r3 the remaining row count.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.row_loop:
    BIWEIGHT     0
    mova      [r0], m0
    add         r1, r2      ; next src row
    add         r0, r2      ; next dst row
    dec        r3d
    jnz .row_loop
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC

; h264_biweight_4_10: rows are 8 bytes wide, so two rows share one vector.
; The row count is halved and both pointers advance by two strides per
; iteration.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
    lea         r4, [r2+r2] ; two-row step; r4 is free once BIWEIGHT_SETUP has read r4m
    sar        r3d, 1       ; iterations = height/2
.row_pair:
    BIWEIGHT     0, r2
    movh      [r0], m0      ; low half  -> first row
    movhps [r0+r2], m0      ; high half -> second row
    add         r1, r4
    add         r0, r4
    dec        r3d
    jnz .row_pair
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF