;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; 128-bit constant whose low quadword is 1 and whose high quadword is 0.
; The setup macros add it to a register holding log2_denom in its low lane
; to obtain the shift count log2_denom+1.
sq_1: dq 1
      dq 0

; Constants defined outside this file (presumably vectors of 16-bit words
; holding 1 and 1023 respectively -- confirm against their definition).
cextern pw_1
cextern pw_1023
; Upper clamp applied to every output sample; 1023 is the largest value
; representable in 10 bits.
%define pw_pixel_max pw_1023

SECTION .text
;-----------------------------------------------------------------------------
; void ff_h264_weight_16_10(uint8_t *dst, int stride, int height,
;                           int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
; WEIGHT_PROLOGUE: common entry sequence of the h264_weight_*_10 functions.
; Requests 6 general-purpose and 8 vector registers, then makes sure the
; arguments used by the loops live in registers:
;   r0  = dst      r1d = stride    r2d = height
;   r4d = weight   r5d = offset
; log2_denom (argument 3) is deliberately not loaded here: WEIGHT_SETUP reads
; it straight from its argument slot (r3m), which leaves r3 free as a scratch
; register afterwards.
; NOTE(review): stride is fetched as a 32-bit value (r1d) while the row loops
; add the full-width r1 to the pointer; confirm that callers never rely on a
; negative stride, or sign-extend it here.
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
%endmacro

; WEIGHT_SETUP: build the loop-invariant vector constants used by WEIGHT_OP.
;   in:  r3m = log2_denom, r4d = weight, r5d = offset
;   out: m0 = 1<<log2_denom in every word
;        m2 = log2_denom+1 (shift count, low lane)
;        m3 = every dword is { low word: weight*2, high word: (offset<<3)+1 }
;        m4 = pw_pixel_max (upper clamp)
;        m7 = 0 (lower clamp for CLIPW; only set up without SSE4)
;   clobbers: r4, r5
; WEIGHT_OP multiplies {sample, 1<<log2_denom} word pairs by m3 and shifts by
; m2, i.e. (sample*weight*2 + ((offset<<3)+1)<<log2_denom) >> (log2_denom+1);
; the "+1" in the high word is what supplies the rounding term.
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    ; The doubled weight is truncated to 16 bits before it is merged with the
    ; offset word.  Adding a negative, sign-extended weight directly would
    ; borrow from the high word and silently cancel the "+1" rounding term.
    add       r4d, r4d      ; weight*2
    movzx     r4d, r4w      ; keep only the low word
    shl        r5, 19       ; offset*8, moved to the upper half of the dword
    lea        r5, [r5+r4+0x10000]
    movd       m3, r5d      ; ((weight<<1)&0xffff) | ((offset<<3)+1)<<16
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; WEIGHT_OP: weight one vector of 16-bit samples read from dst (r0).
;   WEIGHT_OP off        - 8 samples from one 16-byte load at [r0+off]
;   WEIGHT_OP off1, off2 - 4 samples from [r0+off1] (low half of the result)
;                          and 4 samples from [r0+off2] (high half)
;   expects the registers prepared by WEIGHT_SETUP (m0, m2, m3, m4 and, without
;   SSE4, m7)
;   out: m5 = weighted samples, clamped to at most pw_pixel_max
;   clobbers: m6
%macro WEIGHT_OP 1-2
%if %0==1
    mova        m5, [r0+%1]
    punpckhwd   m6, m5, m0  ; pair every sample with 1<<log2_denom
    punpcklwd   m5, m0
%else
    movq        m5, [r0+%1]
    movq        m6, [r0+%2]
    punpcklwd   m5, m0
    punpcklwd   m6, m0
%endif
    pmaddwd     m5, m3      ; sample*low(m3) + (1<<log2_denom)*high(m3)
    pmaddwd     m6, m3
    psrad       m5, m2      ; >> (log2_denom+1)
    psrad       m6, m2
%if cpuflag(sse4)
    packusdw    m5, m6      ; unsigned saturation removes negative results
    pminsw      m5, m4
%else
    packssdw    m5, m6
    CLIPW       m5, m7, m4  ; clamp between m7 (0) and m4 (pw_pixel_max)
%endif
%endmacro

; WEIGHT_FUNC_DBL: emits h264_weight_16_10 for the instruction set selected by
; the preceding INIT_XMM.  Each row is processed in place as two 16-byte
; vectors (16 samples of 16 bits).
; The row counter is decremented before it is tested, so height must be
; non-zero.
; NOTE(review): the mova loads/stores presumably require every row of dst to
; be 16-byte aligned -- confirm with callers.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0
    mova [r0   ], m5
    WEIGHT_OP 16
    mova [r0+16], m5
    add       r0, r1        ; advance dst by one stride
    dec       r2d           ; rows remaining
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL


; WEIGHT_FUNC_MM: emits h264_weight_8_10 for the instruction set selected by
; the preceding INIT_XMM.  Each row is a single 16-byte vector (8 samples of
; 16 bits), weighted in place.
; The row counter is decremented before it is tested, so height must be
; non-zero.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP   0
    mova     [r0], m5
    add        r0, r1       ; advance dst by one stride
    dec        r2d          ; rows remaining
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM


; WEIGHT_FUNC_HALF_MM: emits h264_weight_4_10 for the instruction set selected
; by the preceding INIT_XMM.  A row is only 8 bytes (4 samples of 16 bits), so
; two rows are packed into one vector per iteration: the loop counter is
; height/2 and the pointer advances by 2*stride.
; With an odd height the last row is left untouched (sar discards the low
; bit); height must be at least 2 because the counter is decremented before
; it is tested.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    sar         r2d, 1      ; number of row pairs
    WEIGHT_SETUP
    lea         r3, [r1*2]  ; r3 is free once WEIGHT_SETUP has read log2_denom
.nextrow:
    WEIGHT_OP    0, r1      ; low half = row at r0, high half = row at r0+stride
    movh      [r0], m5
    movhps [r0+r1], m5
    add         r0, r3
    dec         r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM


;-----------------------------------------------------------------------------
; void ff_h264_biweight_16_10(uint8_t *dst, uint8_t *src, int stride,
;                             int height, int log2_denom, int weightd,
;                             int weights, int offset);
;-----------------------------------------------------------------------------
; t0 is the scratch register that receives `offset` (argument 7).  On x86-32
; it aliases r3 (presumably because r7 is not available as a register there),
; which is why height (r3) is only loaded at the end of BIWEIGHT_SETUP, after
; the offset has been moved into a vector register.
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif

; BIWEIGHT_PROLOGUE: common entry sequence of the h264_biweight_*_10 functions.
; Requests 8 general-purpose and 8 vector registers, then loads:
;   r0  = dst       r1  = src        r2d = stride
;   r5d = weightd   r6d = weights    t0d = offset
; height (r3) and log2_denom (r4) are picked up later by BIWEIGHT_SETUP.
; NOTE(review): stride is fetched as a 32-bit value (r2d) while the row loops
; add the full-width r2 to both pointers; confirm that callers never rely on a
; negative stride, or sign-extend it here.
%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
%endmacro

; BIWEIGHT_SETUP: build the loop-invariant vector constants used by BIWEIGHT.
;   in:  r5d = weightd, r6d = weights, t0d = offset, r4m = log2_denom,
;        r3m = height
;   out: m3 = pw_pixel_max (upper clamp)
;        m4 = every dword is { low word: weightd, high word: weights }
;        m5 = every dword is ((offset<<2)+1) << log2_denom
;        m6 = log2_denom+1 (shift count, low lane)
;        m7 = 0 (lower clamp for CLIPW; only set up without SSE4)
;        r3d = height
;   clobbers: r5, r6, t0
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1; always odd, so no extra "|1" is needed
    ; weightd is truncated to 16 bits before the merge: a negative value has
    ; all ones in bits 16..31 and would otherwise overwrite the weights word.
    movzx     r5d, r5w
    shl        r6, 16
    or         r5, r6
    movd       m4, r5d      ; weightd | weights<<16
    movd       m5, t0d      ; (offset<<2)+1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; ((offset<<2)+1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m      ; height; loaded last because t0 may alias r3
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; BIWEIGHT: blend one vector of 16-bit samples from dst (r0) and src (r1).
;   BIWEIGHT off        - 8 samples from one 16-byte load at offset off
;   BIWEIGHT off1, off2 - 4 samples from offset off1 (low half of the result)
;                         and 4 samples from offset off2 (high half)
;   expects the registers prepared by BIWEIGHT_SETUP (m3, m4, m5, m6 and,
;   without SSE4, m7)
;   out: m0 = (dst*low(m4) + src*high(m4) + m5) >> m6, clamped to at most
;        pw_pixel_max
;   clobbers: m1, m2
%macro BIWEIGHT 1-2
%if %0==1
    mova       m0, [r0+%1]
    mova       m1, [r1+%1]
    punpckhwd  m2, m0, m1   ; interleave into {dst, src} word pairs
    punpcklwd  m0, m1
%else
    movq       m0, [r0+%1]
    movq       m1, [r1+%1]
    punpcklwd  m0, m1
    movq       m2, [r0+%2]
    movq       m1, [r1+%2]
    punpcklwd  m2, m1
%endif
    pmaddwd    m0, m4       ; dst*low(m4) + src*high(m4)
    pmaddwd    m2, m4
    paddd      m0, m5       ; + rounding/offset term
    paddd      m2, m5
    psrad      m0, m6       ; >> (log2_denom+1)
    psrad      m2, m6
%if cpuflag(sse4)
    packusdw   m0, m2       ; unsigned saturation removes negative results
    pminsw     m0, m3
%else
    packssdw   m0, m2
    CLIPW      m0, m7, m3   ; clamp between m7 (0) and m3 (pw_pixel_max)
%endif
%endmacro

; BIWEIGHT_FUNC_DBL: emits h264_biweight_16_10 for the instruction set selected
; by the preceding INIT_XMM.  Each row is blended as two 16-byte vectors
; (16 samples of 16 bits); the result is written back to dst.
; The row counter is decremented before it is tested, so height must be
; non-zero.
; NOTE(review): the mova loads/stores presumably require every row of dst and
; src to be 16-byte aligned -- confirm with callers.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT   0
    mova [r0   ], m0
    BIWEIGHT  16
    mova [r0+16], m0
    add       r0, r2        ; advance dst and src by one stride
    add       r1, r2
    dec       r3d           ; rows remaining
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL

; BIWEIGHT_FUNC: emits h264_biweight_8_10 for the instruction set selected by
; the preceding INIT_XMM.  Each row is a single 16-byte vector (8 samples of
; 16 bits); the blended result is written back to dst.
; The row counter is decremented before it is tested, so height must be
; non-zero.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT  0
    mova   [r0], m0
    add      r0, r2         ; advance dst and src by one stride
    add      r1, r2
    dec      r3d            ; rows remaining
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC

; BIWEIGHT_FUNC_HALF: emits h264_biweight_4_10 for the instruction set selected
; by the preceding INIT_XMM.  A row is only 8 bytes (4 samples of 16 bits), so
; two rows are packed into one vector per iteration: the loop counter is
; height/2 and both pointers advance by 2*stride.
; With an odd height the last row is left untouched (sar discards the low
; bit); height must be at least 2 because the counter is decremented before
; it is tested.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
    sar        r3d, 1       ; number of row pairs
    lea        r4, [r2*2]   ; r4 is free once BIWEIGHT_SETUP has read log2_denom
.nextrow:
    BIWEIGHT     0, r2      ; low half = current row, high half = next row
    movh   [r0   ], m0
    movhps [r0+r2], m0
    add         r0, r4
    add         r1, r4
    dec         r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF