;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

; Largest sample value for 10-bit content, replicated into 8 words.
pw_pixel_max: times 8 dw ((1 << 10)-1)
; 128-bit constant with the value 1 in the low qword; added to the shift
; count register to turn log2_denom into log2_denom+1.
sq_1: dq 1
      dq 0

cextern pw_1

SECTION .text

;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
;                  int weight, int offset);
;
; Per 16-bit sample, as implemented by WEIGHT_SETUP + WEIGHT_OP:
;   dst = clip(0, pixel_max,
;              (2*weight*dst + (((offset<<3)+1) << log2_denom))
;                  >> (log2_denom+1))
;
; Register roles after WEIGHT_PROLOGUE:
;   r0 = dst, r1 = stride, r2d = height, r4d = weight, r5d = offset
;   (log2_denom is read straight from its argument slot r3m)
;
; NOTE(review): r1 is used at full register width (add r0, r1) although the
; signature above declares stride as int - confirm that the C prototype
; passes a pointer-sized stride on 64-bit targets.
;-----------------------------------------------------------------------------
%macro WEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,6,8
    movifnidn  r0, r0mp
    movifnidn r1d, r1m
    movifnidn r2d, r2m
    movifnidn r4d, r4m
    movifnidn r5d, r5m
%endmacro

; Builds the loop-invariant vectors used by WEIGHT_OP:
;   m0 = 1<<log2_denom in every word
;   m2 = log2_denom+1 (shift count for psrad)
;   m3 = per dword: low word = weight<<1, high word = (offset<<3)+1
;   m4 = pixel maximum
;   m7 = 0 (only when SSE4 is not available, lower clip bound for CLIPW)
; Clobbers r4d and r5d.
%macro WEIGHT_SETUP 0
    mova       m0, [pw_1]
    movd       m2, r3m
    pslld      m0, m2       ; 1<<log2_denom
    SPLATW     m0, m0
    shl       r5d, 19       ; offset*8, moved to the upper half of the dword
    add       r5d, 0x10000  ; upper word = (offset<<3)+1
    add       r4d, r4d      ; weight<<1
    ; A negative weight is sign-extended through bits 16..31; merging it by
    ; addition would borrow from the upper word and drop the "+1" rounding
    ; term. Keep only the 16 bits pmaddwd reads, then merge with OR.
    and       r4d, 0xffff
    or        r5d, r4d
    movd       m3, r5d      ; weight<<1 | 1+(offset<<(3))
    pshufd     m3, m3, 0
    mova       m4, [pw_pixel_max]
    paddw      m2, [sq_1]   ; log2_denom+1
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; WEIGHT_OP off          - weight 8 samples loaded from [r0+off]
; WEIGHT_OP off1, off2   - weight 4 samples from [r0+off1] (low half of the
;                          result) and 4 samples from [r0+off2] (high half)
; Each sample is interleaved with the 1<<log2_denom word from m0 so that one
; pmaddwd against m3 yields sample*(weight<<1) + (1<<log2_denom)*((offset<<3)+1).
; Result: 8 clipped words in m5. Clobbers m6.
%macro WEIGHT_OP 1-2
%if %0==1
    mova        m5, [r0+%1]
    punpckhwd   m6, m5, m0
    punpcklwd   m5, m0
%else
    movq        m5, [r0+%1]
    movq        m6, [r0+%2]
    punpcklwd   m5, m0
    punpcklwd   m6, m0
%endif
    pmaddwd     m5, m3
    pmaddwd     m6, m3
    psrad       m5, m2
    psrad       m6, m2
%if cpuflag(sse4)
    packusdw    m5, m6      ; unsigned saturation handles the lower bound
    pminsw      m5, m4
%else
    packssdw    m5, m6
    CLIPW       m5, m7, m4
%endif
%endmacro

; 16 samples (32 bytes) per row.
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0
    mova [r0   ], m5
    WEIGHT_OP 16
    mova [r0+16], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL


; 8 samples (16 bytes) per row.
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
    WEIGHT_PROLOGUE
    WEIGHT_SETUP
.nextrow:
    WEIGHT_OP  0
    mova     [r0], m5
    add       r0, r1
    dec       r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM


; 4 samples (8 bytes) per row; two rows are processed per iteration, so the
; row counter is halved and r3 holds 2*stride.
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
    WEIGHT_PROLOGUE
    sar         r2d, 1
    WEIGHT_SETUP
    lea          r3, [r1*2]
.nextrow:
    WEIGHT_OP     0, r1
    movh       [r0], m5
    movhps  [r0+r1], m5
    add          r0, r3
    dec         r2d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM


;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
;                    int log2_denom, int weightd, int weights, int offset);
;
; Per 16-bit sample, as implemented by BIWEIGHT_SETUP + BIWEIGHT:
;   dst = clip(0, pixel_max,
;              (dst*weightd + src*weights
;               + (((offset<<2)+1) << log2_denom)) >> (log2_denom+1))
;
; Register roles after BIWEIGHT_PROLOGUE:
;   r0 = dst, r1 = src, r2 = stride, r5d = weightd, r6d = weights,
;   t0d = offset (log2_denom is read straight from r4m)
; t0 aliases r3 on x86-32, which is why height (r3d) is only loaded at the
; end of BIWEIGHT_SETUP, once the offset in t0 has been consumed.
;
; NOTE(review): r2 is used at full register width (add r0, r2) although the
; signature above declares stride as int - confirm that the C prototype
; passes a pointer-sized stride on 64-bit targets.
;-----------------------------------------------------------------------------
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif

%macro BIWEIGHT_PROLOGUE 0
.prologue:
    PROLOGUE 0,8,8
    movifnidn  r0, r0mp
    movifnidn  r1, r1mp
    movifnidn r2d, r2m
    movifnidn r5d, r5m
    movifnidn r6d, r6m
    movifnidn t0d, r7m
%endmacro

; Builds the loop-invariant vectors used by BIWEIGHT:
;   m3 = pixel maximum
;   m4 = per dword: low word = weightd, high word = weights
;   m5 = ((offset<<2)+1) << log2_denom in every dword
;   m6 = log2_denom+1 (shift count for psrad)
;   m7 = 0 (only when SSE4 is not available, lower clip bound for CLIPW)
; Also loads height into r3d. Clobbers r5d, r6d and t0.
%macro BIWEIGHT_SETUP 0
    lea        t0, [t0*4+1] ; (offset<<2)+1, always odd
    ; A negative weightd is sign-extended through bits 16..31; OR-ing the
    ; shifted weights on top of it would force the weights half to 0xFFFF.
    ; Keep only the 16 bits pmaddwd reads before merging.
    and       r5d, 0xffff
    shl       r6d, 16
    or        r5d, r6d
    movd       m4, r5d      ; weightd | weights
    movd       m5, t0d      ; (offset<<2)+1
    movd       m6, r4m      ; log2_denom
    pslld      m5, m6       ; ((offset<<2)+1)<<log2_denom
    paddd      m6, [sq_1]   ; log2_denom+1
    pshufd     m4, m4, 0
    pshufd     m5, m5, 0
    mova       m3, [pw_pixel_max]
    movifnidn r3d, r3m
%if notcpuflag(sse4)
    pxor       m7, m7
%endif
%endmacro

; BIWEIGHT off          - blend 8 samples from [r0+off] with 8 from [r1+off]
; BIWEIGHT off1, off2   - blend 4 samples at off1 (low half of the result)
;                         and 4 samples at off2 (high half)
; dst and src samples are interleaved so that one pmaddwd against m4 yields
; dst*weightd + src*weights per dword.
; Result: 8 clipped words in m0. Clobbers m1 and m2.
%macro BIWEIGHT 1-2
%if %0==1
    mova        m0, [r0+%1]
    mova        m1, [r1+%1]
    punpckhwd   m2, m0, m1
    punpcklwd   m0, m1
%else
    movq        m0, [r0+%1]
    movq        m1, [r1+%1]
    punpcklwd   m0, m1
    movq        m2, [r0+%2]
    movq        m1, [r1+%2]
    punpcklwd   m2, m1
%endif
    pmaddwd     m0, m4
    pmaddwd     m2, m4
    paddd       m0, m5
    paddd       m2, m5
    psrad       m0, m6
    psrad       m2, m6
%if cpuflag(sse4)
    packusdw    m0, m2      ; unsigned saturation handles the lower bound
    pminsw      m0, m3
%else
    packssdw    m0, m2
    CLIPW       m0, m7, m3
%endif
%endmacro

; 16 samples (32 bytes) per row.
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT   0
    mova [r0   ], m0
    BIWEIGHT  16
    mova [r0+16], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL


; 8 samples (16 bytes) per row.
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
.nextrow:
    BIWEIGHT   0
    mova     [r0], m0
    add       r0, r2
    add       r1, r2
    dec       r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC


; 4 samples (8 bytes) per row; two rows are processed per iteration, so the
; row counter is halved (after BIWEIGHT_SETUP has loaded it) and r4 holds
; 2*stride.
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
    BIWEIGHT_PROLOGUE
    BIWEIGHT_SETUP
    sar         r3d, 1
    lea          r4, [r2*2]
.nextrow:
    BIWEIGHT      0, r2
    movh    [r0   ], m0
    movhps  [r0+r2], m0
    add          r0, r4
    add          r1, r4
    dec         r3d
    jnz .nextrow
    REP_RET
%endmacro

INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF