Commit 272b252c authored by Christophe GISQUET, committed by Ronald S. Bultje

rv40dsp: implement prescaled versions for biweight.

Quite often, the original weights are multiples of 512. By prescaling them
by 1/512 when they are computed (once per frame), no intermediate shifting
is needed, and no prescaling on each call either.

The x86 code already used that trick.
Signed-off-by: Ronald S. Bultje <rsbultje@gmail.com>
parent d3c59d50
...@@ -128,8 +128,8 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp) ...@@ -128,8 +128,8 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon; c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon; c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon; c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_16_neon;
c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon; c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_8_neon;
c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon; c->rv40_loop_filter_strength[0] = ff_rv40_h_loop_filter_strength_neon;
c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon; c->rv40_loop_filter_strength[1] = ff_rv40_v_loop_filter_strength_neon;
......
...@@ -521,7 +521,7 @@ static void rv34_pred_mv(RV34DecContext *r, int block_type, int subblock_no, int ...@@ -521,7 +521,7 @@ static void rv34_pred_mv(RV34DecContext *r, int block_type, int subblock_no, int
*/ */
static int calc_add_mv(RV34DecContext *r, int dir, int val) static int calc_add_mv(RV34DecContext *r, int dir, int val)
{ {
int mul = dir ? -r->weight2 : r->weight1; int mul = dir ? -r->mv_weight2 : r->mv_weight1;
return (val * mul + 0x2000) >> 14; return (val * mul + 0x2000) >> 14;
} }
...@@ -776,24 +776,24 @@ static void rv34_mc_1mv(RV34DecContext *r, const int block_type, ...@@ -776,24 +776,24 @@ static void rv34_mc_1mv(RV34DecContext *r, const int block_type,
static void rv4_weight(RV34DecContext *r) static void rv4_weight(RV34DecContext *r)
{ {
r->rdsp.rv40_weight_pixels_tab[0](r->s.dest[0], r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][0](r->s.dest[0],
r->tmp_b_block_y[0], r->tmp_b_block_y[0],
r->tmp_b_block_y[1], r->tmp_b_block_y[1],
r->weight1, r->weight1,
r->weight2, r->weight2,
r->s.linesize); r->s.linesize);
r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[1], r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[1],
r->tmp_b_block_uv[0], r->tmp_b_block_uv[0],
r->tmp_b_block_uv[2], r->tmp_b_block_uv[2],
r->weight1, r->weight1,
r->weight2, r->weight2,
r->s.uvlinesize); r->s.uvlinesize);
r->rdsp.rv40_weight_pixels_tab[1](r->s.dest[2], r->rdsp.rv40_weight_pixels_tab[r->scaled_weight][1](r->s.dest[2],
r->tmp_b_block_uv[1], r->tmp_b_block_uv[1],
r->tmp_b_block_uv[3], r->tmp_b_block_uv[3],
r->weight1, r->weight1,
r->weight2, r->weight2,
r->s.uvlinesize); r->s.uvlinesize);
} }
static void rv34_mc_2mv(RV34DecContext *r, const int block_type) static void rv34_mc_2mv(RV34DecContext *r, const int block_type)
...@@ -1703,11 +1703,21 @@ int ff_rv34_decode_frame(AVCodecContext *avctx, ...@@ -1703,11 +1703,21 @@ int ff_rv34_decode_frame(AVCodecContext *avctx,
int dist0 = GET_PTS_DIFF(r->cur_pts, r->last_pts); int dist0 = GET_PTS_DIFF(r->cur_pts, r->last_pts);
int dist1 = GET_PTS_DIFF(r->next_pts, r->cur_pts); int dist1 = GET_PTS_DIFF(r->next_pts, r->cur_pts);
if (!refdist) { if(!refdist){
r->weight1 = r->weight2 = 8192; r->mv_weight1 = r->mv_weight2 = r->weight1 = r->weight2 = 8192;
} else { r->scaled_weight = 0;
r->weight1 = (dist0 << 14) / refdist; }else{
r->weight2 = (dist1 << 14) / refdist; r->mv_weight1 = (dist0 << 14) / refdist;
r->mv_weight2 = (dist1 << 14) / refdist;
if((r->mv_weight1|r->mv_weight2) & 511){
r->weight1 = r->mv_weight1;
r->weight2 = r->mv_weight2;
r->scaled_weight = 0;
}else{
r->weight1 = r->mv_weight1 >> 9;
r->weight2 = r->mv_weight2 >> 9;
r->scaled_weight = 1;
}
} }
} }
s->mb_x = s->mb_y = 0; s->mb_x = s->mb_y = 0;
......
...@@ -106,7 +106,9 @@ typedef struct RV34DecContext{ ...@@ -106,7 +106,9 @@ typedef struct RV34DecContext{
int rpr; ///< one field size in RV30 slice header int rpr; ///< one field size in RV30 slice header
int cur_pts, last_pts, next_pts; int cur_pts, last_pts, next_pts;
int scaled_weight;
int weight1, weight2; ///< B frame distance fractions (0.14) used in motion compensation int weight1, weight2; ///< B frame distance fractions (0.14) used in motion compensation
int mv_weight1, mv_weight2;
uint16_t *cbp_luma; ///< CBP values for luma subblocks uint16_t *cbp_luma; ///< CBP values for luma subblocks
uint8_t *cbp_chroma; ///< CBP values for chroma subblocks uint8_t *cbp_chroma; ///< CBP values for chroma subblocks
......
...@@ -58,7 +58,12 @@ typedef struct RV34DSPContext { ...@@ -58,7 +58,12 @@ typedef struct RV34DSPContext {
qpel_mc_func avg_pixels_tab[4][16]; qpel_mc_func avg_pixels_tab[4][16];
h264_chroma_mc_func put_chroma_pixels_tab[3]; h264_chroma_mc_func put_chroma_pixels_tab[3];
h264_chroma_mc_func avg_chroma_pixels_tab[3]; h264_chroma_mc_func avg_chroma_pixels_tab[3];
rv40_weight_func rv40_weight_pixels_tab[2]; /**
* Biweight functions, first dimension is whether the weight is
* prescaled by 1/512 to skip the intermediate shifting, second
* is transform size (16/8).
*/
rv40_weight_func rv40_weight_pixels_tab[2][2];
rv34_inv_transform_func rv34_inv_transform; rv34_inv_transform_func rv34_inv_transform;
rv34_inv_transform_func rv34_inv_transform_dc; rv34_inv_transform_func rv34_inv_transform_dc;
rv34_idct_add_func rv34_idct_add; rv34_idct_add_func rv34_idct_add;
......
...@@ -278,7 +278,7 @@ RV40_CHROMA_MC(put_, op_put) ...@@ -278,7 +278,7 @@ RV40_CHROMA_MC(put_, op_put)
RV40_CHROMA_MC(avg_, op_avg) RV40_CHROMA_MC(avg_, op_avg)
#define RV40_WEIGHT_FUNC(size) \ #define RV40_WEIGHT_FUNC(size) \
static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\ static void rv40_weight_func_rnd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
{\ {\
int i, j;\ int i, j;\
\ \
...@@ -289,6 +289,18 @@ static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src ...@@ -289,6 +289,18 @@ static void rv40_weight_func_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src
src2 += stride;\ src2 += stride;\
dst += stride;\ dst += stride;\
}\ }\
}\
static void rv40_weight_func_nornd_ ## size (uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride)\
{\
int i, j;\
\
for (j = 0; j < size; j++) {\
for (i = 0; i < size; i++)\
dst[i] = (w2 * src1[i] + w1 * src2[i] + 0x10) >> 5;\
src1 += stride;\
src2 += stride;\
dst += stride;\
}\
} }
RV40_WEIGHT_FUNC(16) RV40_WEIGHT_FUNC(16)
...@@ -578,8 +590,10 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) { ...@@ -578,8 +590,10 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) {
c->avg_chroma_pixels_tab[0] = avg_rv40_chroma_mc8_c; c->avg_chroma_pixels_tab[0] = avg_rv40_chroma_mc8_c;
c->avg_chroma_pixels_tab[1] = avg_rv40_chroma_mc4_c; c->avg_chroma_pixels_tab[1] = avg_rv40_chroma_mc4_c;
c->rv40_weight_pixels_tab[0] = rv40_weight_func_16; c->rv40_weight_pixels_tab[0][0] = rv40_weight_func_rnd_16;
c->rv40_weight_pixels_tab[1] = rv40_weight_func_8; c->rv40_weight_pixels_tab[0][1] = rv40_weight_func_rnd_8;
c->rv40_weight_pixels_tab[1][0] = rv40_weight_func_nornd_16;
c->rv40_weight_pixels_tab[1][1] = rv40_weight_func_nornd_8;
c->rv40_weak_loop_filter[0] = rv40_h_weak_loop_filter; c->rv40_weak_loop_filter[0] = rv40_h_weak_loop_filter;
c->rv40_weak_loop_filter[1] = rv40_v_weak_loop_filter; c->rv40_weak_loop_filter[1] = rv40_v_weak_loop_filter;
......
...@@ -139,69 +139,61 @@ SECTION .text ...@@ -139,69 +139,61 @@ SECTION .text
; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) ; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
; %1=size %2=num of xmm regs ; %1=size %2=num of xmm regs
%macro RV40_WEIGHT 2 ; The weights are FP0.14 notation of fractions depending on pts.
cglobal rv40_weight_func_%1, 6, 7, %2 ; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, the caller checks whether they are multiples of 2^9 and
; picks the prescaled (nornd) variant for those simplifications to occur.
%macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, %3
%if cpuflag(ssse3) %if cpuflag(ssse3)
mova m1, [shift_round] mova m1, [shift_round]
%else %else
mova m1, [pw_16] mova m1, [pw_16]
%endif %endif
pxor m0, m0 pxor m0, m0
mov r6, r3
or r6, r4
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, we check here whether they are multiples of 2^9 for
; those simplifications to occur.
and r6, 0x1FF
; Set loop counter and increments ; Set loop counter and increments
%if mmsize == 8 %if mmsize == 8
mov r6, %1 mov r6, %2
%else %else
mov r6, (%1 * %1) / mmsize mov r6, (%2 * %2) / mmsize
%endif %endif
; Use result of test now
jz .loop_512
movd m2, r3 movd m2, r3
movd m3, r4 movd m3, r4
%ifidn %1,rnd
%define RND 0
SPLATW m2, m2 SPLATW m2, m2
SPLATW m3, m3 %else
%define RND 1
.loop:
MAIN_LOOP %1, 0
jnz .loop
REP_RET
; Weights are multiple of 512, which allows some shortcuts
.loop_512:
sar r3, 9
sar r4, 9
movd m2, r3
movd m3, r4
%if cpuflag(ssse3) %if cpuflag(ssse3)
punpcklbw m3, m2 punpcklbw m3, m2
SPLATW m3, m3
%else %else
SPLATW m2, m2 SPLATW m2, m2
SPLATW m3, m3
%endif %endif
.loop2: %endif
MAIN_LOOP %1, 1 SPLATW m3, m3
jnz .loop2
REP_RET
.loop:
MAIN_LOOP %2, RND
jnz .loop
REP_RET
%endmacro %endmacro
INIT_MMX mmx INIT_MMX mmx
RV40_WEIGHT 8, 0 RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT 16, 0 RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
INIT_XMM sse2 INIT_XMM sse2
RV40_WEIGHT 8, 8 RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT 16, 8 RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
INIT_XMM ssse3 INIT_XMM ssse3
RV40_WEIGHT 8, 8 RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT 16, 8 RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
...@@ -41,10 +41,14 @@ void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src, ...@@ -41,10 +41,14 @@ void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y); int stride, int h, int x, int y);
#define DECLARE_WEIGHT(opt) \ #define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \ int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmx) DECLARE_WEIGHT(mmx)
DECLARE_WEIGHT(sse2) DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3) DECLARE_WEIGHT(ssse3)
...@@ -57,8 +61,10 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) ...@@ -57,8 +61,10 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
if (mm_flags & AV_CPU_FLAG_MMX) { if (mm_flags & AV_CPU_FLAG_MMX) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_mmx; c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmx;
c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_mmx; c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmx;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmx;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmx;
} }
if (mm_flags & AV_CPU_FLAG_MMX2) { if (mm_flags & AV_CPU_FLAG_MMX2) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2; c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmx2;
...@@ -68,12 +74,16 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp) ...@@ -68,12 +74,16 @@ void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp)
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
} }
if (mm_flags & AV_CPU_FLAG_SSE2) { if (mm_flags & AV_CPU_FLAG_SSE2) {
c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_sse2; c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_sse2; c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
} }
if (mm_flags & AV_CPU_FLAG_SSSE3) { if (mm_flags & AV_CPU_FLAG_SSSE3) {
c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_ssse3; c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_ssse3; c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
} }
#endif #endif
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment