Commit c2d33742 authored by Ronald S. Bultje

H264: change weight/biweight functions to take a height argument.

Neon parts by Mans Rullgard <mans@mansr.com>.
parent 229d263c
......@@ -32,47 +32,22 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0);
void ff_weight_h264_pixels_16x16_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_16x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_8x16_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_8x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_8x4_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x8_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x4_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_4x2_neon(uint8_t *ds, int stride, int log2_den,
int weight, int offset);
void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_weight_h264_pixels_8_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_weight_h264_pixels_4_neon(uint8_t *dst, int stride, int height,
int log2_den, int weight, int offset);
void ff_biweight_h264_pixels_16x16_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_16x8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_8x16_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_8x8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_8x4_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x8_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x4_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_4x2_neon(uint8_t *dst, uint8_t *src, int stride,
int log2_den, int weightd, int weights,
int offset);
void ff_biweight_h264_pixels_16_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_biweight_h264_pixels_8_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_biweight_h264_pixels_4_neon(uint8_t *dst, uint8_t *src, int stride,
int height, int log2_den, int weightd,
int weights, int offset);
void ff_h264_idct_add_neon(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_neon(uint8_t *dst, DCTELEM *block, int stride);
......@@ -100,23 +75,13 @@ static void ff_h264dsp_init_neon(H264DSPContext *c, const int bit_depth, const i
c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16x16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_16x8_neon;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_8x16_neon;
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels_8x8_neon;
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels_8x4_neon;
c->weight_h264_pixels_tab[5] = ff_weight_h264_pixels_4x8_neon;
c->weight_h264_pixels_tab[6] = ff_weight_h264_pixels_4x4_neon;
c->weight_h264_pixels_tab[7] = ff_weight_h264_pixels_4x2_neon;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels_4_neon;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16x16_neon;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_16x8_neon;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_8x16_neon;
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels_8x8_neon;
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels_8x4_neon;
c->biweight_h264_pixels_tab[5] = ff_biweight_h264_pixels_4x8_neon;
c->biweight_h264_pixels_tab[6] = ff_biweight_h264_pixels_4x4_neon;
c->biweight_h264_pixels_tab[7] = ff_biweight_h264_pixels_4x2_neon;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels_16_neon;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels_8_neon;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels_4_neon;
c->h264_idct_add = ff_h264_idct_add_neon;
c->h264_idct_dc_add = ff_h264_idct_dc_add_neon;
......
......@@ -1592,7 +1592,7 @@ endfunc
vdup.8 d1, r5
vmov q2, q8
vmov q3, q8
1: subs ip, ip, #2
1: subs r3, r3, #2
vld1.8 {d20-d21},[r0,:128], r2
\macd q2, d0, d20
pld [r0]
......@@ -1632,7 +1632,7 @@ endfunc
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
1: subs ip, ip, #2
1: subs r3, r3, #2
vld1.8 {d4},[r0,:64], r2
\macd q1, d0, d4
pld [r0]
......@@ -1662,7 +1662,7 @@ endfunc
vdup.8 d1, r5
vmov q1, q8
vmov q10, q8
1: subs ip, ip, #4
1: subs r3, r3, #4
vld1.32 {d4[0]},[r0,:32], r2
vld1.32 {d4[1]},[r0,:32], r2
\macd q1, d0, d4
......@@ -1700,16 +1700,17 @@ endfunc
.endm
.macro biweight_func w
function biweight_h264_pixels_\w\()_neon
function ff_biweight_h264_pixels_\w\()_neon, export=1
push {r4-r6, lr}
add r4, sp, #16
ldr r12, [sp, #16]
add r4, sp, #20
ldm r4, {r4-r6}
lsr lr, r4, #31
add r6, r6, #1
eors lr, lr, r5, lsr #30
orr r6, r6, #1
vdup.16 q9, r3
lsl r6, r6, r3
vdup.16 q9, r12
lsl r6, r6, r12
vmvn q9, q9
vdup.16 q8, r6
mov r6, r0
......@@ -1730,34 +1731,15 @@ function biweight_h264_pixels_\w\()_neon
endfunc
.endm
.macro biweight_entry w, h, b=1
function ff_biweight_h264_pixels_\w\()x\h\()_neon, export=1
mov ip, #\h
.if \b
b biweight_h264_pixels_\w\()_neon
.endif
endfunc
.endm
biweight_entry 16, 8
biweight_entry 16, 16, b=0
biweight_func 16
biweight_entry 8, 16
biweight_entry 8, 4
biweight_entry 8, 8, b=0
biweight_func 8
biweight_entry 4, 8
biweight_entry 4, 2
biweight_entry 4, 4, b=0
biweight_func 4
@ Weighted prediction
.macro weight_16 add
vdup.8 d0, r3
1: subs ip, ip, #2
vdup.8 d0, r12
1: subs r2, r2, #2
vld1.8 {d20-d21},[r0,:128], r1
vmull.u8 q2, d0, d20
pld [r0]
......@@ -1785,8 +1767,8 @@ endfunc
.endm
.macro weight_8 add
vdup.8 d0, r3
1: subs ip, ip, #2
vdup.8 d0, r12
1: subs r2, r2, #2
vld1.8 {d4},[r0,:64], r1
vmull.u8 q1, d0, d4
pld [r0]
......@@ -1806,10 +1788,10 @@ endfunc
.endm
.macro weight_4 add
vdup.8 d0, r3
vdup.8 d0, r12
vmov q1, q8
vmov q10, q8
1: subs ip, ip, #4
1: subs r2, r2, #4
vld1.32 {d4[0]},[r0,:32], r1
vld1.32 {d4[1]},[r0,:32], r1
vmull.u8 q1, d0, d4
......@@ -1842,50 +1824,32 @@ endfunc
.endm
.macro weight_func w
function weight_h264_pixels_\w\()_neon
function ff_weight_h264_pixels_\w\()_neon, export=1
push {r4, lr}
ldr r4, [sp, #8]
cmp r2, #1
lsl r4, r4, r2
ldr r12, [sp, #8]
ldr r4, [sp, #12]
cmp r3, #1
lsl r4, r4, r3
vdup.16 q8, r4
mov r4, r0
ble 20f
rsb lr, r2, #1
rsb lr, r3, #1
vdup.16 q9, lr
cmp r3, #0
cmp r12, #0
blt 10f
weight_\w vhadd.s16
10: rsb r3, r3, #0
10: rsb r12, r12, #0
weight_\w vhsub.s16
20: rsb lr, r2, #0
20: rsb lr, r3, #0
vdup.16 q9, lr
cmp r3, #0
cmp r12, #0
blt 10f
weight_\w vadd.s16
10: rsb r3, r3, #0
10: rsb r12, r12, #0
weight_\w vsub.s16
endfunc
.endm
.macro weight_entry w, h, b=1
function ff_weight_h264_pixels_\w\()x\h\()_neon, export=1
mov ip, #\h
.if \b
b weight_h264_pixels_\w\()_neon
.endif
endfunc
.endm
weight_entry 16, 8
weight_entry 16, 16, b=0
weight_func 16
weight_entry 8, 16
weight_entry 8, 4
weight_entry 8, 8, b=0
weight_func 8
weight_entry 4, 8
weight_entry 4, 2
weight_entry 4, 4, b=0
weight_func 4
This diff is collapsed.
......@@ -64,26 +64,14 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
else\
c->h264_chroma_dc_dequant_idct= FUNC(ff_h264_chroma422_dc_dequant_idct, depth);\
\
c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16x16, depth);\
c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels16x8, depth);\
c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels8x16, depth);\
c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels8x8, depth);\
c->weight_h264_pixels_tab[4]= FUNC(weight_h264_pixels8x4, depth);\
c->weight_h264_pixels_tab[5]= FUNC(weight_h264_pixels4x8, depth);\
c->weight_h264_pixels_tab[6]= FUNC(weight_h264_pixels4x4, depth);\
c->weight_h264_pixels_tab[7]= FUNC(weight_h264_pixels4x2, depth);\
c->weight_h264_pixels_tab[8]= FUNC(weight_h264_pixels2x4, depth);\
c->weight_h264_pixels_tab[9]= FUNC(weight_h264_pixels2x2, depth);\
c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16x16, depth);\
c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels16x8, depth);\
c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels8x16, depth);\
c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels8x8, depth);\
c->biweight_h264_pixels_tab[4]= FUNC(biweight_h264_pixels8x4, depth);\
c->biweight_h264_pixels_tab[5]= FUNC(biweight_h264_pixels4x8, depth);\
c->biweight_h264_pixels_tab[6]= FUNC(biweight_h264_pixels4x4, depth);\
c->biweight_h264_pixels_tab[7]= FUNC(biweight_h264_pixels4x2, depth);\
c->biweight_h264_pixels_tab[8]= FUNC(biweight_h264_pixels2x4, depth);\
c->biweight_h264_pixels_tab[9]= FUNC(biweight_h264_pixels2x2, depth);\
c->weight_h264_pixels_tab[0]= FUNC(weight_h264_pixels16, depth);\
c->weight_h264_pixels_tab[1]= FUNC(weight_h264_pixels8, depth);\
c->weight_h264_pixels_tab[2]= FUNC(weight_h264_pixels4, depth);\
c->weight_h264_pixels_tab[3]= FUNC(weight_h264_pixels2, depth);\
c->biweight_h264_pixels_tab[0]= FUNC(biweight_h264_pixels16, depth);\
c->biweight_h264_pixels_tab[1]= FUNC(biweight_h264_pixels8, depth);\
c->biweight_h264_pixels_tab[2]= FUNC(biweight_h264_pixels4, depth);\
c->biweight_h264_pixels_tab[3]= FUNC(biweight_h264_pixels2, depth);\
\
c->h264_v_loop_filter_luma= FUNC(h264_v_loop_filter_luma, depth);\
c->h264_h_loop_filter_luma= FUNC(h264_h_loop_filter_luma, depth);\
......
......@@ -31,16 +31,18 @@
#include "dsputil.h"
//typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);
typedef void (*h264_weight_func)(uint8_t *block, int stride, int height,
int log2_denom, int weight, int offset);
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int height,
int log2_denom, int weightd, int weights, int offset);
/**
* Context for storing H.264 DSP functions
*/
typedef struct H264DSPContext{
/* weighted MC */
h264_weight_func weight_h264_pixels_tab[10];
h264_biweight_func biweight_h264_pixels_tab[10];
h264_weight_func weight_h264_pixels_tab[4];
h264_biweight_func biweight_h264_pixels_tab[4];
/* loop filter */
void (*h264_v_loop_filter_luma)(uint8_t *pix/*align 16*/, int stride, int alpha, int beta, int8_t *tc0);
......
......@@ -29,14 +29,16 @@
#define op_scale1(x) block[x] = av_clip_pixel( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_pixel( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride, int log2_denom, int weight, int offset){ \
#define H264_WEIGHT(W) \
static void FUNCC(weight_h264_pixels ## W)(uint8_t *_block, int stride, int height, \
int log2_denom, int weight, int offset) \
{ \
int y; \
pixel *block = (pixel*)_block; \
stride /= sizeof(pixel); \
offset <<= (log2_denom + (BIT_DEPTH-8)); \
if(log2_denom) offset += 1<<(log2_denom-1); \
for(y=0; y<H; y++, block += stride){ \
for (y = 0; y < height; y++, block += stride) { \
op_scale1(0); \
op_scale1(1); \
if(W==2) continue; \
......@@ -58,14 +60,16 @@ static void FUNCC(weight_h264_pixels ## W ## x ## H)(uint8_t *_block, int stride
op_scale1(15); \
} \
} \
static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_src, int stride, int log2_denom, int weightd, int weights, int offset){ \
static void FUNCC(biweight_h264_pixels ## W)(uint8_t *_dst, uint8_t *_src, int stride, int height, \
int log2_denom, int weightd, int weights, int offset) \
{ \
int y; \
pixel *dst = (pixel*)_dst; \
pixel *src = (pixel*)_src; \
stride /= sizeof(pixel); \
offset <<= (BIT_DEPTH-8); \
offset = ((offset + 1) | 1) << log2_denom; \
for(y=0; y<H; y++, dst += stride, src += stride){ \
for (y = 0; y < height; y++, dst += stride, src += stride) { \
op_scale2(0); \
op_scale2(1); \
if(W==2) continue; \
......@@ -88,16 +92,10 @@ static void FUNCC(biweight_h264_pixels ## W ## x ## H)(uint8_t *_dst, uint8_t *_
} \
}
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)
H264_WEIGHT(16)
H264_WEIGHT(8)
H264_WEIGHT(4)
H264_WEIGHT(2)
#undef op_scale1
#undef op_scale2
......
......@@ -843,7 +843,8 @@ static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,
}
static av_always_inline
void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset, int w, int h)
void weight_h264_W_altivec(uint8_t *block, int stride, int height,
int log2_denom, int weight, int offset, int w)
{
int y, aligned;
vec_u8 vblock;
......@@ -864,7 +865,7 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
voffset = vec_splat(vtemp, 5);
aligned = !((unsigned long)block & 0xf);
for (y=0; y<h; y++) {
for (y = 0; y < height; y++) {
vblock = vec_ld(0, block);
v0 = (vec_s16)vec_mergeh(zero_u8v, vblock);
......@@ -888,8 +889,8 @@ void weight_h264_WxH_altivec(uint8_t *block, int stride, int log2_denom, int wei
}
static av_always_inline
void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
int weightd, int weights, int offset, int w, int h)
void biweight_h264_W_altivec(uint8_t *dst, uint8_t *src, int stride, int height,
int log2_denom, int weightd, int weights, int offset, int w)
{
int y, dst_aligned, src_aligned;
vec_u8 vsrc, vdst;
......@@ -912,7 +913,7 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
dst_aligned = !((unsigned long)dst & 0xf);
src_aligned = !((unsigned long)src & 0xf);
for (y=0; y<h; y++) {
for (y = 0; y < height; y++) {
vdst = vec_ld(0, dst);
vsrc = vec_ld(0, src);
......@@ -952,19 +953,18 @@ void biweight_h264_WxH_altivec(uint8_t *dst, uint8_t *src, int stride, int log2_
}
}
/*
 * Per-width entry points: each wrapper forwards to the always-inline
 * weight_h264_W_altivec() / biweight_h264_W_altivec() helper, passing the
 * block width W as a constant and the row count at run time via `height`.
 * The height-taking wrappers must call the *_W_* helpers; the *_WxH_* names
 * belong to the previous signature, which took the height as a trailing
 * constant instead of a run-time argument.
 */
#define H264_WEIGHT(W,H) \
static void ff_weight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
weight_h264_WxH_altivec(block, stride, log2_denom, weight, offset, W, H); \
#define H264_WEIGHT(W) \
static void ff_weight_h264_pixels ## W ## _altivec(uint8_t *block, int stride, int height, \
int log2_denom, int weight, int offset){ \
weight_h264_W_altivec(block, stride, height, log2_denom, weight, offset, W); \
}\
static void ff_biweight_h264_pixels ## W ## x ## H ## _altivec(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
biweight_h264_WxH_altivec(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
static void ff_biweight_h264_pixels ## W ## _altivec(uint8_t *dst, uint8_t *src, int stride, int height, \
int log2_denom, int weightd, int weights, int offset){ \
biweight_h264_W_altivec(dst, src, stride, height, log2_denom, weightd, weights, offset, W); \
}
H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT(16)
H264_WEIGHT( 8)
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
......@@ -1015,16 +1015,10 @@ void ff_h264dsp_init_ppc(H264DSPContext *c, const int bit_depth, const int chrom
c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_altivec;
c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_altivec;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16x16_altivec;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels16x8_altivec;
c->weight_h264_pixels_tab[2] = ff_weight_h264_pixels8x16_altivec;
c->weight_h264_pixels_tab[3] = ff_weight_h264_pixels8x8_altivec;
c->weight_h264_pixels_tab[4] = ff_weight_h264_pixels8x4_altivec;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16x16_altivec;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels16x8_altivec;
c->biweight_h264_pixels_tab[2] = ff_biweight_h264_pixels8x16_altivec;
c->biweight_h264_pixels_tab[3] = ff_biweight_h264_pixels8x8_altivec;
c->biweight_h264_pixels_tab[4] = ff_biweight_h264_pixels8x4_altivec;
c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels16_altivec;
c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels8_altivec;
c->biweight_h264_pixels_tab[0] = ff_biweight_h264_pixels16_altivec;
c->biweight_h264_pixels_tab[1] = ff_biweight_h264_pixels8_altivec;
}
}
}
......@@ -28,21 +28,20 @@ SECTION .text
;-----------------------------------------------------------------------------
; biweight pred:
;
; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
; int log2_denom, int weightd, int weights,
; int offset);
; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
; int height, int log2_denom, int weightd,
; int weights, int offset);
; and
; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
; int log2_denom, int weight,
; int offset);
; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
; int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
%macro WEIGHT_SETUP 0
add r4, r4
inc r4
movd m3, r3d
movd m5, r4d
movd m6, r2d
add r5, r5
inc r5
movd m3, r4d
movd m5, r5d
movd m6, r3d
pslld m5, m6
psrld m5, 1
%if mmsize == 16
......@@ -71,60 +70,41 @@ SECTION .text
packuswb m0, m1
%endmacro
%macro WEIGHT_FUNC_DBL_MM 1
cglobal h264_weight_16x%1_mmx2, 5, 5, 0
INIT_MMX
cglobal h264_weight_16_mmx2, 6, 6, 0
WEIGHT_SETUP
mov r2, %1
%if %1 == 16
.nextrow
WEIGHT_OP 0, 4
mova [r0 ], m0
WEIGHT_OP 8, 12
mova [r0+8], m0
add r0, r1
dec r2
dec r2d
jnz .nextrow
REP_RET
%else
jmp mangle(ff_h264_weight_16x16_mmx2.nextrow)
%endif
%endmacro
INIT_MMX
WEIGHT_FUNC_DBL_MM 16
WEIGHT_FUNC_DBL_MM 8
%macro WEIGHT_FUNC_MM 4
cglobal h264_weight_%1x%2_%4, 7, 7, %3
%macro WEIGHT_FUNC_MM 3
cglobal h264_weight_%1_%3, 6, 6, %2
WEIGHT_SETUP
mov r2, %2
%if %2 == 16
.nextrow
WEIGHT_OP 0, mmsize/2
mova [r0], m0
add r0, r1
dec r2
dec r2d
jnz .nextrow
REP_RET
%else
jmp mangle(ff_h264_weight_%1x16_%4.nextrow)
%endif
%endmacro
INIT_MMX
WEIGHT_FUNC_MM 8, 16, 0, mmx2
WEIGHT_FUNC_MM 8, 8, 0, mmx2
WEIGHT_FUNC_MM 8, 4, 0, mmx2
WEIGHT_FUNC_MM 8, 0, mmx2
INIT_XMM
WEIGHT_FUNC_MM 16, 16, 8, sse2
WEIGHT_FUNC_MM 16, 8, 8, sse2
WEIGHT_FUNC_MM 16, 8, sse2
%macro WEIGHT_FUNC_HALF_MM 5
cglobal h264_weight_%1x%2_%5, 5, 5, %4
%macro WEIGHT_FUNC_HALF_MM 3
cglobal h264_weight_%1_%3, 6, 6, %2
WEIGHT_SETUP
mov r2, %2/2
sar r2d, 1
lea r3, [r1*2]
%if %2 == mmsize
.nextrow
WEIGHT_OP 0, r1
movh [r0], m0
......@@ -135,31 +115,34 @@ cglobal h264_weight_%1x%2_%5, 5, 5, %4
movh [r0+r1], m0
%endif
add r0, r3
dec r2
dec r2d
jnz .nextrow
REP_RET
%else
jmp mangle(ff_h264_weight_%1x%3_%5.nextrow)
%endif
%endmacro
INIT_MMX
WEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 0, mmx2
WEIGHT_FUNC_HALF_MM 4, 0, mmx2
INIT_XMM
WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 8, sse2
WEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SETUP 0
add r6, 1
or r6, 1
add r3, 1
movd m3, r4d
movd m4, r5d
movd m5, r6d
movd m6, r3d
%ifdef ARCH_X86_64
%define off_regd r11d
%else
%define off_regd r3d
%endif
mov off_regd, r7m
add off_regd, 1
or off_regd, 1
add r4, 1
movd m3, r5d
movd m4, r6d
movd m5, off_regd
movd m6, r4d
pslld m5, m6
psrld m5, 1
%if mmsize == 16
......@@ -195,11 +178,10 @@ WEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
packuswb m0, m1
%endmacro
%macro BIWEIGHT_FUNC_DBL_MM 1
cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
INIT_MMX
cglobal h264_biweight_16_mmx2, 7, 7, 0
BIWEIGHT_SETUP
mov r3, %1
%if %1 == 16
movifnidn r3d, r3m
.nextrow
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4
......@@ -211,23 +193,14 @@ cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
mova [r0+8], m0
add r0, r2
add r1, r2
dec r3
dec r3d
jnz .nextrow
REP_RET
%else
jmp mangle(ff_h264_biweight_16x16_mmx2.nextrow)
%endif
%endmacro
INIT_MMX
BIWEIGHT_FUNC_DBL_MM 16
BIWEIGHT_FUNC_DBL_MM 8
%macro BIWEIGHT_FUNC_MM 4
cglobal h264_biweight_%1x%2_%4, 7, 7, %3
%macro BIWEIGHT_FUNC_MM 3
cglobal h264_biweight_%1_%3, 7, 7, %2
BIWEIGHT_SETUP
mov r3, %2
%if %2 == 16
movifnidn r3d, r3m
.nextrow
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, mmsize/2
......@@ -235,28 +208,22 @@ cglobal h264_biweight_%1x%2_%4, 7, 7, %3
mova [r0], m0
add r0, r2
add r1, r2
dec r3
dec r3d
jnz .nextrow
REP_RET
%else
jmp mangle(ff_h264_biweight_%1x16_%4.nextrow)
%endif
%endmacro
INIT_MMX
BIWEIGHT_FUNC_MM 8, 16, 0, mmx2
BIWEIGHT_FUNC_MM 8, 8, 0, mmx2
BIWEIGHT_FUNC_MM 8, 4, 0, mmx2
BIWEIGHT_FUNC_MM 8, 0, mmx2
INIT_XMM
BIWEIGHT_FUNC_MM 16, 16, 8, sse2
BIWEIGHT_FUNC_MM 16, 8, 8, sse2
BIWEIGHT_FUNC_MM 16, 8, sse2
; BIWEIGHT_FUNC_HALF_MM width, xmm-count, cpu-suffix
; Emits h264_biweight_<width>_<suffix>. Each loop iteration handles two rows
; (offsets 0 and r2, then both pointers advance by r4 = 2*stride), so the
; run-time height in r3 is halved before the loop.
%macro BIWEIGHT_FUNC_HALF_MM 5
cglobal h264_biweight_%1x%2_%5, 7, 7, %4
%macro BIWEIGHT_FUNC_HALF_MM 3
cglobal h264_biweight_%1_%3, 7, 7, %2
BIWEIGHT_SETUP
mov r3, %2/2
movifnidn r3d, r3m
; height is a 32-bit int: shift the 32-bit view so that undefined upper bits
; of the 64-bit register cannot be shifted into the counter "dec r3d" uses
sar r3d, 1
lea r4, [r2*2]
%if %2 == mmsize
.nextrow
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, r2
......@@ -270,31 +237,30 @@ cglobal h264_biweight_%1x%2_%5, 7, 7, %4
%endif
add r0, r4
add r1, r4
dec r3
dec r3d
jnz .nextrow
REP_RET
%else
jmp mangle(ff_h264_biweight_%1x%3_%5.nextrow)
%endif
%endmacro
INIT_MMX
BIWEIGHT_FUNC_HALF_MM 4, 8, 8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4, 4, 8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4, 2, 8, 0, mmx2
BIWEIGHT_FUNC_HALF_MM 4, 0, mmx2
INIT_XMM
BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8, 8, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
BIWEIGHT_FUNC_HALF_MM 8, 8, sse2
%macro BIWEIGHT_SSSE3_SETUP 0
add r6, 1
or r6, 1
add r3, 1
movd m4, r4d
movd m0, r5d
movd m5, r6d
movd m6, r3d
%ifdef ARCH_X86_64
%define off_regd r11d
%else
%define off_regd r3d
%endif
mov off_regd, r7m
add off_regd, 1
or off_regd, 1
add r4, 1
movd m4, r5d
movd m0, r6d
movd m5, off_regd
movd m6, r4d
pslld m5, m6
psrld m5, 1
punpcklbw m4, m0
......@@ -314,12 +280,11 @@ BIWEIGHT_FUNC_HALF_MM 8, 4, 16, 8, sse2
packuswb m0, m2
%endmacro
%macro BIWEIGHT_SSSE3_16 1
cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
INIT_XMM
cglobal h264_biweight_16_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_SETUP
mov r3, %1
movifnidn r3d, r3m
%if %1 == 16
.nextrow
movh m0, [r0]
movh m2, [r0+8]
......@@ -330,25 +295,17 @@ cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
mova [r0], m0
add r0, r2
add r1, r2
dec r3
dec r3d
jnz .nextrow
REP_RET
%else
jmp mangle(ff_h264_biweight_16x16_ssse3.nextrow)
%endif
%endmacro
INIT_XMM
BIWEIGHT_SSSE3_16 16
BIWEIGHT_SSSE3_16 8
; h264_biweight_8_ssse3: 8-pixel-wide bi-weighted prediction.
; Each loop iteration handles two rows (the second is stored at r0+r2, then
; both pointers advance by r4 = 2*stride), so the run-time height in r3 is
; halved before the loop.
%macro BIWEIGHT_SSSE3_8 1
cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
cglobal h264_biweight_8_ssse3, 7, 7, 8
BIWEIGHT_SSSE3_SETUP
mov r3, %1/2
movifnidn r3d, r3m
; height is a 32-bit int: shift the 32-bit view so that undefined upper bits
; of the 64-bit register cannot be shifted into the counter "dec r3d" uses
sar r3d, 1
lea r4, [r2*2]
%if %1 == 16
.nextrow
movh m0, [r0]
movh m1, [r1]
......@@ -361,15 +318,6 @@ cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
movhps [r0+r2], m0
add r0, r4
add r1, r4
dec r3
dec r3d
jnz .nextrow
REP_RET
%else
jmp mangle(ff_h264_biweight_8x16_ssse3.nextrow)
%endif
%endmacro
INIT_XMM
BIWEIGHT_SSSE3_8 16
BIWEIGHT_SSSE3_8 8
BIWEIGHT_SSSE3_8 4
......@@ -36,33 +36,26 @@ cextern pw_1
SECTION .text
;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int log2_denom,
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
; int weight, int offset);
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_32
DECLARE_REG_TMP 2
%else
DECLARE_REG_TMP 10
%endif
; WEIGHT_PROLOGUE: shared entry sequence for the 10-bit weight functions.
; PROLOGUE is invoked with 0 auto-loaded arguments, so every argument that is
; later used from a register is fetched explicitly with movifnidn (which emits
; nothing when the argument already lives in that register).
%macro WEIGHT_PROLOGUE 1
mov t0, %1
%macro WEIGHT_PROLOGUE 0
.prologue
PROLOGUE 0,5,8
PROLOGUE 0,6,8
movifnidn r0, r0mp
movifnidn r1d, r1m
; height: the row loops count it down with "dec r2d", so it must be in a
; register on targets that pass arguments on the stack
movifnidn r2d, r2m
movifnidn r3d, r3m
movifnidn r4d, r4m
movifnidn r5d, r5m
%endmacro
%macro WEIGHT_SETUP 1
mova m0, [pw_1]
movd m2, r2m
movd m2, r3m
pslld m0, m2 ; 1<<log2_denom
SPLATW m0, m0
shl r4, 19 ; *8, move to upper half of dword
lea r4, [r4+r3*2+0x10000]
movd m3, r4d ; weight<<1 | 1+(offset<<(3))
shl r5, 19 ; *8, move to upper half of dword
lea r5, [r5+r4*2+0x10000]
movd m3, r5d ; weight<<1 | 1+(offset<<(3))
pshufd m3, m3, 0
mova m4, [pw_pixel_max]
paddw m2, [sq_1] ; log2_denom+1
......@@ -96,8 +89,8 @@ DECLARE_REG_TMP 10
%endmacro
%macro WEIGHT_FUNC_DBL 1
cglobal h264_weight_16x16_10_%1
WEIGHT_PROLOGUE 16
cglobal h264_weight_16_10_%1
WEIGHT_PROLOGUE
WEIGHT_SETUP %1
.nextrow
WEIGHT_OP %1, 0
......@@ -105,13 +98,9 @@ cglobal h264_weight_16x16_10_%1
WEIGHT_OP %1, 16
mova [r0+16], m5
add r0, r1
dec t0
dec r2d
jnz .nextrow
REP_RET
cglobal h264_weight_16x8_10_%1
mov t0, 8
jmp mangle(ff_h264_weight_16x16_10_%1.prologue)
%endmacro
INIT_XMM
......@@ -120,24 +109,16 @@ WEIGHT_FUNC_DBL sse4
%macro WEIGHT_FUNC_MM 1
cglobal h264_weight_8x16_10_%1
WEIGHT_PROLOGUE 16
cglobal h264_weight_8_10_%1
WEIGHT_PROLOGUE
WEIGHT_SETUP %1
.nextrow
WEIGHT_OP %1, 0
mova [r0], m5
add r0, r1
dec t0
dec r2d
jnz .nextrow
REP_RET
cglobal h264_weight_8x8_10_%1
mov t0, 8
jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
cglobal h264_weight_8x4_10_%1
mov t0, 4
jmp mangle(ff_h264_weight_8x16_10_%1.prologue)
%endmacro
INIT_XMM
......@@ -146,8 +127,9 @@ WEIGHT_FUNC_MM sse4
%macro WEIGHT_FUNC_HALF_MM 1
cglobal h264_weight_4x8_10_%1
WEIGHT_PROLOGUE 4
cglobal h264_weight_4_10_%1
WEIGHT_PROLOGUE
sar r2d, 1
WEIGHT_SETUP %1
lea r3, [r1*2]
.nextrow
......@@ -155,17 +137,9 @@ cglobal h264_weight_4x8_10_%1
movh [r0], m5
movhps [r0+r1], m5
add r0, r3
dec t0
dec r2d
jnz .nextrow
REP_RET
cglobal h264_weight_4x4_10_%1
mov t0, 2
jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
cglobal h264_weight_4x2_10_%1
mov t0, 1
jmp mangle(ff_h264_weight_4x8_10_%1.prologue)
%endmacro
INIT_XMM
......@@ -174,40 +148,40 @@ WEIGHT_FUNC_HALF_MM sse4
;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int log2_denom,
; int weightd, int weights, int offset);
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
; int log2_denom, int weightd, int weights, int offset);
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_32
DECLARE_REG_TMP 2,3
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 10,2
DECLARE_REG_TMP 10
%endif
%macro BIWEIGHT_PROLOGUE 1
mov t0, %1
%macro BIWEIGHT_PROLOGUE 0
.prologue
PROLOGUE 0,7,8
movifnidn r0, r0mp
movifnidn r1, r1mp
movifnidn t1d, r2m
movifnidn r4d, r4m
movifnidn r2d, r2m
movifnidn r5d, r5m
movifnidn r6d, r6m
movifnidn t0d, r7m
%endmacro
%macro BIWEIGHT_SETUP 1
lea r6, [r6*4+1] ; (offset<<2)+1
or r6, 1
shl r5, 16
or r4, r5
movd m4, r4d ; weightd | weights
movd m5, r6d ; (offset+1)|1
movd m6, r3m ; log2_denom
lea t0, [t0*4+1] ; (offset<<2)+1
or t0, 1
shl r6, 16
or r5, r6
movd m4, r5d ; weightd | weights
movd m5, t0d ; (offset+1)|1
movd m6, r4m ; log2_denom
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
paddd m6, [sq_1]
pshufd m4, m4, 0
pshufd m5, m5, 0
mova m3, [pw_pixel_max]
movifnidn r3d, r3m
%ifnidn %1, sse4
pxor m7, m7
%endif
......@@ -243,23 +217,19 @@ DECLARE_REG_TMP 10,2
%endmacro
%macro BIWEIGHT_FUNC_DBL 1
cglobal h264_biweight_16x16_10_%1
BIWEIGHT_PROLOGUE 16
cglobal h264_biweight_16_10_%1
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1
.nextrow
BIWEIGHT %1, 0
mova [r0 ], m0
BIWEIGHT %1, 16
mova [r0+16], m0
add r0, t1
add r1, t1
dec t0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
cglobal h264_biweight_16x8_10_%1
mov t0, 8
jmp mangle(ff_h264_biweight_16x16_10_%1.prologue)
%endmacro
INIT_XMM
......@@ -267,25 +237,17 @@ BIWEIGHT_FUNC_DBL sse2
BIWEIGHT_FUNC_DBL sse4
%macro BIWEIGHT_FUNC 1
cglobal h264_biweight_8x16_10_%1
BIWEIGHT_PROLOGUE 16
cglobal h264_biweight_8_10_%1
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1
.nextrow
BIWEIGHT %1, 0
mova [r0], m0
add r0, t1
add r1, t1
dec t0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
cglobal h264_biweight_8x8_10_%1
mov t0, 8
jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
cglobal h264_biweight_8x4_10_%1
mov t0, 4
jmp mangle(ff_h264_biweight_8x16_10_%1.prologue)
%endmacro
INIT_XMM
......@@ -293,27 +255,20 @@ BIWEIGHT_FUNC sse2
BIWEIGHT_FUNC sse4
%macro BIWEIGHT_FUNC_HALF 1
cglobal h264_biweight_4x8_10_%1
BIWEIGHT_PROLOGUE 4
cglobal h264_biweight_4_10_%1
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP %1
lea r4, [t1*2]
sar r3d, 1
lea r4, [r2*2]
.nextrow
BIWEIGHT %1, 0, t1
BIWEIGHT %1, 0, r2
movh [r0 ], m0
movhps [r0+t1], m0
movhps [r0+r2], m0
add r0, r4
add r1, r4
dec t0
dec r3d
jnz .nextrow
REP_RET
cglobal h264_biweight_4x4_10_%1
mov t0, 2
jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
cglobal h264_biweight_4x2_10_%1
mov t0, 1
jmp mangle(ff_h264_biweight_4x8_10_%1.prologue)
%endmacro
INIT_XMM
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment