Commit 1acd7d59 authored by Ronald S. Bultje, committed by Michael Niedermayer

h264: integrate clear_blocks calls with IDCT.

The non-intra-pcm branch in hl_decode_mb (simple, 8bpp) goes from 700
to 672 cycles, and the complete loop of decode_mb_cabac and hl_decode_mb
(in the decode_slice loop) goes from 1759 to 1733 cycles on the clip
tested (cathedral), i.e. almost 30 cycles per mb faster.
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent a1f1ca96
...@@ -22,9 +22,12 @@ ...@@ -22,9 +22,12 @@
function ff_h264_idct_add_neon, export=1 function ff_h264_idct_add_neon, export=1
vld1.64 {d0-d3}, [r1,:128] vld1.64 {d0-d3}, [r1,:128]
vmov.i16 q15, #0
vswp d1, d2 vswp d1, d2
vst1.16 {q15}, [r1,:128]!
vadd.i16 d4, d0, d1 vadd.i16 d4, d0, d1
vst1.16 {q15}, [r1,:128]!
vshr.s16 q8, q1, #1 vshr.s16 q8, q1, #1
vsub.i16 d5, d0, d1 vsub.i16 d5, d0, d1
vadd.i16 d6, d2, d17 vadd.i16 d6, d2, d17
...@@ -65,11 +68,14 @@ function ff_h264_idct_add_neon, export=1 ...@@ -65,11 +68,14 @@ function ff_h264_idct_add_neon, export=1
vst1.32 {d0[1]}, [r0,:32], r2 vst1.32 {d0[1]}, [r0,:32], r2
vst1.32 {d1[0]}, [r0,:32], r2 vst1.32 {d1[0]}, [r0,:32], r2
sub r1, r1, #32
bx lr bx lr
endfunc endfunc
function ff_h264_idct_dc_add_neon, export=1 function ff_h264_idct_dc_add_neon, export=1
mov r3, #0
vld1.16 {d2[],d3[]}, [r1,:16] vld1.16 {d2[],d3[]}, [r1,:16]
strh r3, [r1]
vrshr.s16 q1, q1, #6 vrshr.s16 q1, q1, #6
vld1.32 {d0[0]}, [r0,:32], r2 vld1.32 {d0[0]}, [r0,:32], r2
vld1.32 {d0[1]}, [r0,:32], r2 vld1.32 {d0[1]}, [r0,:32], r2
...@@ -148,7 +154,7 @@ function ff_h264_idct_add8_neon, export=1 ...@@ -148,7 +154,7 @@ function ff_h264_idct_add8_neon, export=1
add r5, r1, #16*4 add r5, r1, #16*4
add r1, r2, #16*32 add r1, r2, #16*32
mov r2, r3 mov r2, r3
mov r3, r1 mov r10, r1
ldr r6, [sp, #32] ldr r6, [sp, #32]
movrel r7, scan8+16 movrel r7, scan8+16
mov r12, #0 mov r12, #0
...@@ -156,7 +162,7 @@ function ff_h264_idct_add8_neon, export=1 ...@@ -156,7 +162,7 @@ function ff_h264_idct_add8_neon, export=1
ldr r0, [r5, r12, lsl #2] ldr r0, [r5, r12, lsl #2]
ldrb r8, [r6, r8] ldrb r8, [r6, r8]
add r0, r0, r4 add r0, r0, r4
add r1, r3, r12, lsl #5 add r1, r10, r12, lsl #5
cmp r8, #0 cmp r8, #0
ldrsh r8, [r1] ldrsh r8, [r1]
iteet ne iteet ne
...@@ -180,7 +186,9 @@ endfunc ...@@ -180,7 +186,9 @@ endfunc
qb .req q14 qb .req q14
vshr.s16 q2, q10, #1 vshr.s16 q2, q10, #1
vadd.i16 q0, q8, q12 vadd.i16 q0, q8, q12
vld1.16 {q14-q15},[r1,:128]! vld1.16 {q14-q15},[r1,:128]
vst1.16 {q7}, [r1,:128]!
vst1.16 {q7}, [r1,:128]!
vsub.i16 q1, q8, q12 vsub.i16 q1, q8, q12
vshr.s16 q3, q14, #1 vshr.s16 q3, q14, #1
vsub.i16 q2, q2, q14 vsub.i16 q2, q2, q14
...@@ -259,9 +267,16 @@ endfunc ...@@ -259,9 +267,16 @@ endfunc
.endm .endm
function ff_h264_idct8_add_neon, export=1 function ff_h264_idct8_add_neon, export=1
vld1.16 {q8-q9}, [r1,:128]! vmov.i16 q7, #0
vld1.16 {q10-q11},[r1,:128]! vld1.16 {q8-q9}, [r1,:128]
vld1.16 {q12-q13},[r1,:128]! vst1.16 {q7}, [r1,:128]!
vst1.16 {q7}, [r1,:128]!
vld1.16 {q10-q11},[r1,:128]
vst1.16 {q7}, [r1,:128]!
vst1.16 {q7}, [r1,:128]!
vld1.16 {q12-q13},[r1,:128]
vst1.16 {q7}, [r1,:128]!
vst1.16 {q7}, [r1,:128]!
idct8x8_cols 0 idct8x8_cols 0
idct8x8_cols 1 idct8x8_cols 1
...@@ -313,7 +328,9 @@ function ff_h264_idct8_add_neon, export=1 ...@@ -313,7 +328,9 @@ function ff_h264_idct8_add_neon, export=1
endfunc endfunc
function ff_h264_idct8_dc_add_neon, export=1 function ff_h264_idct8_dc_add_neon, export=1
mov r3, #0
vld1.16 {d30[],d31[]},[r1,:16] vld1.16 {d30[],d31[]},[r1,:16]
strh r3, [r1]
vld1.32 {d0}, [r0,:64], r2 vld1.32 {d0}, [r0,:64], r2
vrshr.s16 q15, q15, #6 vrshr.s16 q15, q15, #6
vld1.32 {d1}, [r0,:64], r2 vld1.32 {d1}, [r0,:64], r2
......
...@@ -2191,7 +2191,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, ...@@ -2191,7 +2191,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h,
if (IS_8x8DCT(mb_type)) { if (IS_8x8DCT(mb_type)) {
if (transform_bypass) { if (transform_bypass) {
idct_dc_add = idct_dc_add =
idct_add = h->h264dsp.h264_add_pixels8; idct_add = h->h264dsp.h264_add_pixels8_clear;
} else { } else {
idct_dc_add = h->h264dsp.h264_idct8_dc_add; idct_dc_add = h->h264dsp.h264_idct8_dc_add;
idct_add = h->h264dsp.h264_idct8_add; idct_add = h->h264dsp.h264_idct8_add;
...@@ -2216,7 +2216,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h, ...@@ -2216,7 +2216,7 @@ static av_always_inline void hl_decode_mb_predict_luma(H264Context *h,
} else { } else {
if (transform_bypass) { if (transform_bypass) {
idct_dc_add = idct_dc_add =
idct_add = h->h264dsp.h264_add_pixels4; idct_add = h->h264dsp.h264_add_pixels4_clear;
} else { } else {
idct_dc_add = h->h264dsp.h264_idct_dc_add; idct_dc_add = h->h264dsp.h264_idct_dc_add;
idct_add = h->h264dsp.h264_idct_add; idct_add = h->h264dsp.h264_idct_add;
...@@ -2313,7 +2313,7 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, ...@@ -2313,7 +2313,7 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
for (i = 0; i < 16; i++) for (i = 0; i < 16; i++)
if (h->non_zero_count_cache[scan8[i + p * 16]] || if (h->non_zero_count_cache[scan8[i + p * 16]] ||
dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256)) dctcoef_get(h->mb, pixel_shift, i * 16 + p * 256))
h->h264dsp.h264_add_pixels4(dest_y + block_offset[i], h->h264dsp.h264_add_pixels4_clear(dest_y + block_offset[i],
h->mb + (i * 16 + p * 256 << pixel_shift), h->mb + (i * 16 + p * 256 << pixel_shift),
linesize); linesize);
} }
...@@ -2326,8 +2326,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type, ...@@ -2326,8 +2326,8 @@ static av_always_inline void hl_decode_mb_idct_luma(H264Context *h, int mb_type,
} else if (h->cbp & 15) { } else if (h->cbp & 15) {
if (transform_bypass) { if (transform_bypass) {
const int di = IS_8x8DCT(mb_type) ? 4 : 1; const int di = IS_8x8DCT(mb_type) ? 4 : 1;
idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8 idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8_clear
: h->h264dsp.h264_add_pixels4; : h->h264dsp.h264_add_pixels4_clear;
for (i = 0; i < 16; i += di) for (i = 0; i < 16; i += di)
if (h->non_zero_count_cache[scan8[i + p * 16]]) if (h->non_zero_count_cache[scan8[i + p * 16]])
idct_add(dest_y + block_offset[i], idct_add(dest_y + block_offset[i],
......
...@@ -204,7 +204,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h) ...@@ -204,7 +204,7 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
h->mb + (16 * 16 * 2 << PIXEL_SHIFT), h->mb + (16 * 16 * 2 << PIXEL_SHIFT),
uvlinesize); uvlinesize);
} else { } else {
idct_add = h->h264dsp.h264_add_pixels4; idct_add = h->h264dsp.h264_add_pixels4_clear;
for (j = 1; j < 3; j++) { for (j = 1; j < 3; j++) {
for (i = j * 16; i < j * 16 + 4; i++) for (i = j * 16; i < j * 16 + 4; i++)
if (h->non_zero_count_cache[scan8[i]] || if (h->non_zero_count_cache[scan8[i]] ||
...@@ -258,10 +258,6 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h) ...@@ -258,10 +258,6 @@ static av_noinline void FUNC(hl_decode_mb)(H264Context *h)
} }
} }
} }
if (h->cbp || IS_INTRA(mb_type)) {
h->dsp.clear_blocks(h->mb);
h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
}
} }
} }
...@@ -365,11 +361,6 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h) ...@@ -365,11 +361,6 @@ static av_noinline void FUNC(hl_decode_mb_444)(H264Context *h)
hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass, hl_decode_mb_idct_luma(h, mb_type, 1, SIMPLE, transform_bypass,
PIXEL_SHIFT, block_offset, linesize, PIXEL_SHIFT, block_offset, linesize,
dest[p], p); dest[p], p);
if (h->cbp || IS_INTRA(mb_type)) {
h->dsp.clear_blocks(h->mb);
h->dsp.clear_blocks(h->mb + (24 * 16 << PIXEL_SHIFT));
}
} }
} }
......
...@@ -43,6 +43,8 @@ static void FUNCC(ff_h264_add_pixels4)(uint8_t *_dst, int16_t *_src, int stride) ...@@ -43,6 +43,8 @@ static void FUNCC(ff_h264_add_pixels4)(uint8_t *_dst, int16_t *_src, int stride)
dst += stride; dst += stride;
src += 4; src += 4;
} }
memset(_src, 0, sizeof(dctcoef) * 16);
} }
static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride) static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride)
...@@ -65,4 +67,6 @@ static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride) ...@@ -65,4 +67,6 @@ static void FUNCC(ff_h264_add_pixels8)(uint8_t *_dst, int16_t *_src, int stride)
dst += stride; dst += stride;
src += 8; src += 8;
} }
memset(_src, 0, sizeof(dctcoef) * 64);
} }
...@@ -66,8 +66,8 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo ...@@ -66,8 +66,8 @@ void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, const int chroma_fo
#define FUNC(a, depth) a ## _ ## depth ## _c #define FUNC(a, depth) a ## _ ## depth ## _c
#define ADDPX_DSP(depth) \ #define ADDPX_DSP(depth) \
c->h264_add_pixels4 = FUNC(ff_h264_add_pixels4, depth);\ c->h264_add_pixels4_clear = FUNC(ff_h264_add_pixels4, depth);\
c->h264_add_pixels8 = FUNC(ff_h264_add_pixels8, depth) c->h264_add_pixels8_clear = FUNC(ff_h264_add_pixels8, depth)
if (bit_depth > 8 && bit_depth <= 16) { if (bit_depth > 8 && bit_depth <= 16) {
ADDPX_DSP(16); ADDPX_DSP(16);
......
...@@ -103,8 +103,8 @@ typedef struct H264DSPContext { ...@@ -103,8 +103,8 @@ typedef struct H264DSPContext {
void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul); void (*h264_chroma_dc_dequant_idct)(int16_t *block, int qmul);
/* bypass-transform */ /* bypass-transform */
void (*h264_add_pixels8)(uint8_t *dst, int16_t *block, int stride); void (*h264_add_pixels8_clear)(uint8_t *dst, int16_t *block, int stride);
void (*h264_add_pixels4)(uint8_t *dst, int16_t *block, int stride); void (*h264_add_pixels4_clear)(uint8_t *dst, int16_t *block, int stride);
} H264DSPContext; } H264DSPContext;
void ff_h264dsp_init(H264DSPContext *c, const int bit_depth, void ff_h264dsp_init(H264DSPContext *c, const int bit_depth,
......
...@@ -61,6 +61,8 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride) ...@@ -61,6 +61,8 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, int16_t *_block, int stride)
dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6)); dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6));
dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6)); dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6));
} }
memset(block, 0, 16 * sizeof(dctcoef));
} }
void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
...@@ -133,14 +135,18 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){ ...@@ -133,14 +135,18 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, int16_t *_block, int stride){
dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) ); dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) );
dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) ); dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) );
} }
memset(block, 0, 64 * sizeof(dctcoef));
} }
// assumes all AC coefs are 0 // assumes all AC coefs are 0
void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){ void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *_block, int stride){
int i, j; int i, j;
int dc = (((dctcoef*)block)[0] + 32) >> 6;
pixel *dst = (pixel*)_dst; pixel *dst = (pixel*)_dst;
dctcoef *block = (dctcoef*)_block;
int dc = (block[0] + 32) >> 6;
stride >>= sizeof(pixel)-1; stride >>= sizeof(pixel)-1;
block[0] = 0;
for( j = 0; j < 4; j++ ) for( j = 0; j < 4; j++ )
{ {
for( i = 0; i < 4; i++ ) for( i = 0; i < 4; i++ )
...@@ -149,10 +155,12 @@ void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){ ...@@ -149,10 +155,12 @@ void FUNCC(ff_h264_idct_dc_add)(uint8_t *_dst, int16_t *block, int stride){
} }
} }
void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *block, int stride){ void FUNCC(ff_h264_idct8_dc_add)(uint8_t *_dst, int16_t *_block, int stride){
int i, j; int i, j;
int dc = (((dctcoef*)block)[0] + 32) >> 6;
pixel *dst = (pixel*)_dst; pixel *dst = (pixel*)_dst;
dctcoef *block = (dctcoef*)_block;
int dc = (block[0] + 32) >> 6;
block[0] = 0;
stride >>= sizeof(pixel)-1; stride >>= sizeof(pixel)-1;
for( j = 0; j < 8; j++ ) for( j = 0; j < 8; j++ )
{ {
......
...@@ -98,15 +98,15 @@ typedef struct H264PredContext { ...@@ -98,15 +98,15 @@ typedef struct H264PredContext {
void(*pred16x16[4 + 3 + 2])(uint8_t *src, ptrdiff_t stride); void(*pred16x16[4 + 3 + 2])(uint8_t *src, ptrdiff_t stride);
void(*pred4x4_add[2])(uint8_t *pix /*align 4*/, void(*pred4x4_add[2])(uint8_t *pix /*align 4*/,
const int16_t *block /*align 16*/, ptrdiff_t stride); int16_t *block /*align 16*/, ptrdiff_t stride);
void(*pred8x8l_add[2])(uint8_t *pix /*align 8*/, void(*pred8x8l_add[2])(uint8_t *pix /*align 8*/,
const int16_t *block /*align 16*/, ptrdiff_t stride); int16_t *block /*align 16*/, ptrdiff_t stride);
void(*pred8x8_add[3])(uint8_t *pix /*align 8*/, void(*pred8x8_add[3])(uint8_t *pix /*align 8*/,
const int *block_offset, const int *block_offset,
const int16_t *block /*align 16*/, ptrdiff_t stride); int16_t *block /*align 16*/, ptrdiff_t stride);
void(*pred16x16_add[3])(uint8_t *pix /*align 16*/, void(*pred16x16_add[3])(uint8_t *pix /*align 16*/,
const int *block_offset, const int *block_offset,
const int16_t *block /*align 16*/, ptrdiff_t stride); int16_t *block /*align 16*/, ptrdiff_t stride);
} H264PredContext; } H264PredContext;
void ff_h264_pred_init(H264PredContext *h, int codec_id, void ff_h264_pred_init(H264PredContext *h, int codec_id,
......
...@@ -1132,7 +1132,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft, ...@@ -1132,7 +1132,7 @@ static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
#undef PL #undef PL
#undef SRC #undef SRC
static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block, static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride) ptrdiff_t stride)
{ {
int i; int i;
...@@ -1149,9 +1149,11 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block, ...@@ -1149,9 +1149,11 @@ static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, const int16_t *_block,
pix++; pix++;
block++; block++;
} }
memset(_block, 0, sizeof(dctcoef) * 16);
} }
static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block, static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride) ptrdiff_t stride)
{ {
int i; int i;
...@@ -1167,9 +1169,11 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block, ...@@ -1167,9 +1169,11 @@ static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, const int16_t *_block,
pix+= stride; pix+= stride;
block+= 4; block+= 4;
} }
memset(_block, 0, sizeof(dctcoef) * 16);
} }
static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block, static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride) ptrdiff_t stride)
{ {
int i; int i;
...@@ -1190,9 +1194,11 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block, ...@@ -1190,9 +1194,11 @@ static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, const int16_t *_block,
pix++; pix++;
block++; block++;
} }
memset(_block, 0, sizeof(dctcoef) * 64);
} }
static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block, static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
ptrdiff_t stride) ptrdiff_t stride)
{ {
int i; int i;
...@@ -1212,10 +1218,12 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block, ...@@ -1212,10 +1218,12 @@ static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, const int16_t *_block,
pix+= stride; pix+= stride;
block+= 8; block+= 8;
} }
memset(_block, 0, sizeof(dctcoef) * 64);
} }
static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
const int16_t *block, int16_t *block,
ptrdiff_t stride) ptrdiff_t stride)
{ {
int i; int i;
...@@ -1225,7 +1233,7 @@ static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset, ...@@ -1225,7 +1233,7 @@ static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
const int *block_offset, const int *block_offset,
const int16_t *block, int16_t *block,
ptrdiff_t stride) ptrdiff_t stride)
{ {
int i; int i;
...@@ -1234,7 +1242,7 @@ static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix, ...@@ -1234,7 +1242,7 @@ static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
} }
static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
const int16_t *block, ptrdiff_t stride) int16_t *block, ptrdiff_t stride)
{ {
int i; int i;
for(i=0; i<4; i++) for(i=0; i<4; i++)
...@@ -1242,7 +1250,7 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset, ...@@ -1242,7 +1250,7 @@ static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
} }
static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
const int16_t *block, ptrdiff_t stride) int16_t *block, ptrdiff_t stride)
{ {
int i; int i;
for(i=0; i<4; i++) for(i=0; i<4; i++)
...@@ -1252,7 +1260,7 @@ static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset, ...@@ -1252,7 +1260,7 @@ static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
} }
static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
const int16_t *block, int16_t *block,
ptrdiff_t stride) ptrdiff_t stride)
{ {
int i; int i;
...@@ -1262,7 +1270,7 @@ static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset, ...@@ -1262,7 +1270,7 @@ static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix, static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix,
const int *block_offset, const int *block_offset,
const int16_t *block, ptrdiff_t stride) int16_t *block, ptrdiff_t stride)
{ {
int i; int i;
for(i=0; i<4; i++) for(i=0; i<4; i++)
......
...@@ -87,6 +87,7 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride) ...@@ -87,6 +87,7 @@ static void ff_h264_idct_add_altivec(uint8_t *dst, int16_t *block, int stride)
vtmp1 = vec_sld(vtmp0, vtmp0, 8); vtmp1 = vec_sld(vtmp0, vtmp0, 8);
vtmp2 = vec_ld(16,block); vtmp2 = vec_ld(16,block);
vtmp3 = vec_sld(vtmp2, vtmp2, 8); vtmp3 = vec_sld(vtmp2, vtmp2, 8);
memset(block, 0, 16 * sizeof(int16_t));
VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3); VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3); VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
...@@ -206,6 +207,7 @@ static void ff_h264_idct8_add_altivec( uint8_t *dst, int16_t *dct, int stride ) ...@@ -206,6 +207,7 @@ static void ff_h264_idct8_add_altivec( uint8_t *dst, int16_t *dct, int stride )
s5 = vec_ld(0x50, (int16_t*)dct); s5 = vec_ld(0x50, (int16_t*)dct);
s6 = vec_ld(0x60, (int16_t*)dct); s6 = vec_ld(0x60, (int16_t*)dct);
s7 = vec_ld(0x70, (int16_t*)dct); s7 = vec_ld(0x70, (int16_t*)dct);
memset(dct, 0, 64 * sizeof(int16_t));
IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
d0, d1, d2, d3, d4, d5, d6, d7); d0, d1, d2, d3, d4, d5, d6, d7);
...@@ -234,6 +236,7 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *bl ...@@ -234,6 +236,7 @@ static av_always_inline void h264_idct_dc_add_internal(uint8_t *dst, int16_t *bl
int i; int i;
dc = (block[0] + 32) >> 6; dc = (block[0] + 32) >> 6;
block[0] = 0;
dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1); dc16 = vec_splat((vec_s16) vec_lde(0, &dc), 1);
if (size == 4) if (size == 4)
......
...@@ -219,6 +219,8 @@ void ff_svq3_add_idct_c(uint8_t *dst, int16_t *block, ...@@ -219,6 +219,8 @@ void ff_svq3_add_idct_c(uint8_t *dst, int16_t *block,
dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20)); dst[i + stride * 2] = av_clip_uint8(dst[i + stride * 2] + ((z1 - z2) * qmul + rr >> 20));
dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20)); dst[i + stride * 3] = av_clip_uint8(dst[i + stride * 3] + ((z0 - z3) * qmul + rr >> 20));
} }
memset(block, 0, 16 * sizeof(int16_t));
} }
static inline int svq3_decode_block(GetBitContext *gb, int16_t *block, static inline int svq3_decode_block(GetBitContext *gb, int16_t *block,
...@@ -669,8 +671,6 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type) ...@@ -669,8 +671,6 @@ static int svq3_decode_mb(SVQ3Context *s, unsigned int mb_type)
} }
if (!IS_SKIP(mb_type) || h->pict_type == AV_PICTURE_TYPE_B) { if (!IS_SKIP(mb_type) || h->pict_type == AV_PICTURE_TYPE_B) {
memset(h->non_zero_count_cache + 8, 0, 14 * 8 * sizeof(uint8_t)); memset(h->non_zero_count_cache + 8, 0, 14 * 8 * sizeof(uint8_t));
h->dsp.clear_blocks(h->mb + 0);
h->dsp.clear_blocks(h->mb + 384);
} }
if (!IS_INTRA16x16(mb_type) && if (!IS_INTRA16x16(mb_type) &&
......
...@@ -70,6 +70,10 @@ SECTION .text ...@@ -70,6 +70,10 @@ SECTION .text
paddw m0, m6 paddw m0, m6
IDCT4_1D w, 0, 1, 2, 3, 4, 5 IDCT4_1D w, 0, 1, 2, 3, 4, 5
pxor m7, m7 pxor m7, m7
movq [%2+ 0], m7
movq [%2+ 8], m7
movq [%2+16], m7
movq [%2+24], m7
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
lea %1, [%1+%3*2] lea %1, [%1+%3*2]
...@@ -161,13 +165,31 @@ cglobal h264_idct_add_8, 3, 3, 0 ...@@ -161,13 +165,31 @@ cglobal h264_idct_add_8, 3, 3, 0
%endmacro %endmacro
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3 %macro IDCT8_ADD_MMX_END 3-4
IDCT8_1D_FULL %2 IDCT8_1D_FULL %2
mova [%2 ], m5 mova [%2 ], m5
mova [%2+16], m6 mova [%2+16], m6
mova [%2+32], m7 mova [%2+32], m7
pxor m7, m7 pxor m7, m7
%if %0 == 4
movq [%4+ 0], m7
movq [%4+ 8], m7
movq [%4+ 16], m7
movq [%4+ 24], m7
movq [%4+ 32], m7
movq [%4+ 40], m7
movq [%4+ 48], m7
movq [%4+ 56], m7
movq [%4+ 64], m7
movq [%4+ 72], m7
movq [%4+ 80], m7
movq [%4+ 88], m7
movq [%4+ 96], m7
movq [%4+104], m7
movq [%4+112], m7
movq [%4+120], m7
%endif
STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
lea %1, [%1+%3*2] lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
...@@ -190,7 +212,7 @@ cglobal h264_idct8_add_8, 3, 4, 0 ...@@ -190,7 +212,7 @@ cglobal h264_idct8_add_8, 3, 4, 0
IDCT8_ADD_MMX_START r1 , rsp IDCT8_ADD_MMX_START r1 , rsp
IDCT8_ADD_MMX_START r1+8, rsp+64 IDCT8_ADD_MMX_START r1+8, rsp+64
lea r3, [r0+4] lea r3, [r0+4]
IDCT8_ADD_MMX_END r0 , rsp, r2 IDCT8_ADD_MMX_END r0 , rsp, r2, r1
IDCT8_ADD_MMX_END r3 , rsp+8, r2 IDCT8_ADD_MMX_END r3 , rsp+8, r2
ADD rsp, pad ADD rsp, pad
...@@ -233,6 +255,14 @@ cglobal h264_idct8_add_8, 3, 4, 0 ...@@ -233,6 +255,14 @@ cglobal h264_idct8_add_8, 3, 4, 0
SWAP 0, 8 SWAP 0, 8
SWAP 1, 9 SWAP 1, 9
%endif %endif
mova [%2+ 0], m7
mova [%2+ 16], m7
mova [%2+ 32], m7
mova [%2+ 48], m7
mova [%2+ 64], m7
mova [%2+ 80], m7
mova [%2+ 96], m7
mova [%2+112], m7
lea %1, [%1+%3*4] lea %1, [%1+%3*4]
STORE_DIFF m4, m6, m7, [%1 ] STORE_DIFF m4, m6, m7, [%1 ]
STORE_DIFF m5, m6, m7, [%1+%3 ] STORE_DIFF m5, m6, m7, [%1+%3 ]
...@@ -246,19 +276,11 @@ cglobal h264_idct8_add_8, 3, 4, 10 ...@@ -246,19 +276,11 @@ cglobal h264_idct8_add_8, 3, 4, 10
IDCT8_ADD_SSE r0, r1, r2, r3 IDCT8_ADD_SSE r0, r1, r2, r3
RET RET
%macro DC_ADD_MMXEXT_INIT 2-3 %macro DC_ADD_MMXEXT_INIT 2
%if %0 == 2
movsx %1, word [%1]
add %1, 32 add %1, 32
sar %1, 6 sar %1, 6
movd m0, %1d movd m0, %1d
lea %1, [%2*3] lea %1, [%2*3]
%else
add %3, 32
sar %3, 6
movd m0, %3d
lea %3, [%2*3]
%endif
pshufw m0, m0, 0 pshufw m0, m0, 0
pxor m1, m1 pxor m1, m1
psubw m1, m0 psubw m1, m0
...@@ -287,19 +309,44 @@ cglobal h264_idct8_add_8, 3, 4, 10 ...@@ -287,19 +309,44 @@ cglobal h264_idct8_add_8, 3, 4, 10
INIT_MMX mmxext INIT_MMX mmxext
; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) ; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 3, 3, 0 %if ARCH_X86_64
DC_ADD_MMXEXT_INIT r1, r2 cglobal h264_idct_dc_add_8, 3, 4, 0
DC_ADD_MMXEXT_OP movh, r0, r2, r1 movsx r3, word [r1]
mov word [r1], 0
DC_ADD_MMXEXT_INIT r3, r2
DC_ADD_MMXEXT_OP movh, r0, r2, r3
RET RET
; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride) ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 3, 0 cglobal h264_idct8_dc_add_8, 3, 4, 0
DC_ADD_MMXEXT_INIT r1, r2 movsx r3, word [r1]
DC_ADD_MMXEXT_OP mova, r0, r2, r1 mov word [r1], 0
DC_ADD_MMXEXT_INIT r3, r2
DC_ADD_MMXEXT_OP mova, r0, r2, r3
lea r0, [r0+r2*4] lea r0, [r0+r2*4]
DC_ADD_MMXEXT_OP mova, r0, r2, r1 DC_ADD_MMXEXT_OP mova, r0, r2, r3
RET
%else
cglobal h264_idct_dc_add_8, 2, 3, 0
movsx r2, word [r1]
mov word [r1], 0
mov r1, r2m
DC_ADD_MMXEXT_INIT r2, r1
DC_ADD_MMXEXT_OP movh, r0, r1, r2
RET RET
; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
movsx r2, word [r1]
mov word [r1], 0
mov r1, r2m
DC_ADD_MMXEXT_INIT r2, r1
DC_ADD_MMXEXT_OP mova, r0, r1, r2
lea r0, [r0+r1*4]
DC_ADD_MMXEXT_OP mova, r0, r1, r2
RET
%endif
INIT_MMX mmx INIT_MMX mmx
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride, const uint8_t nnzc[6*8]) ; int16_t *block, int stride, const uint8_t nnzc[6*8])
...@@ -343,7 +390,7 @@ cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, ...@@ -343,7 +390,7 @@ cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride,
add word [r2], 32 add word [r2], 32
IDCT8_ADD_MMX_START r2 , rsp IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64 IDCT8_ADD_MMX_START r2+8, rsp+64
IDCT8_ADD_MMX_END r6 , rsp, r3 IDCT8_ADD_MMX_END r6 , rsp, r3, r2
mov r6d, dword [r1+r5*4] mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4] lea r6, [r0+r6+4]
IDCT8_ADD_MMX_END r6 , rsp+8, r3 IDCT8_ADD_MMX_END r6 , rsp+8, r3
...@@ -373,7 +420,8 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride ...@@ -373,7 +420,8 @@ cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
movsx r6, word [r2] movsx r6, word [r2]
test r6, r6 test r6, r6
jz .no_dc jz .no_dc
DC_ADD_MMXEXT_INIT r2, r3, r6 mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0 %if ARCH_X86_64 == 0
%define dst2q r1 %define dst2q r1
%define dst2d r1d %define dst2d r1d
...@@ -450,7 +498,8 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s ...@@ -450,7 +498,8 @@ cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s
movsx r6, word [r2] movsx r6, word [r2]
test r6, r6 test r6, r6
jz .skipblock jz .skipblock
DC_ADD_MMXEXT_INIT r2, r3, r6 mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0 %if ARCH_X86_64 == 0
%define dst2q r1 %define dst2q r1
%define dst2d r1d %define dst2d r1d
...@@ -489,7 +538,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride ...@@ -489,7 +538,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
movsx r6, word [r2] movsx r6, word [r2]
test r6, r6 test r6, r6
jz .no_dc jz .no_dc
DC_ADD_MMXEXT_INIT r2, r3, r6 mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0 %if ARCH_X86_64 == 0
%define dst2q r1 %define dst2q r1
%define dst2d r1d %define dst2d r1d
...@@ -515,7 +565,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride ...@@ -515,7 +565,7 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
add word [r2], 32 add word [r2], 32
IDCT8_ADD_MMX_START r2 , rsp IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64 IDCT8_ADD_MMX_START r2+8, rsp+64
IDCT8_ADD_MMX_END r6 , rsp, r3 IDCT8_ADD_MMX_END r6 , rsp, r3, r2
mov r6d, dword [r1+r5*4] mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4] lea r6, [r0+r6+4]
IDCT8_ADD_MMX_END r6 , rsp+8, r3 IDCT8_ADD_MMX_END r6 , rsp+8, r3
...@@ -547,7 +597,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, strid ...@@ -547,7 +597,8 @@ cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, strid
test r6, r6 test r6, r6
jz .no_dc jz .no_dc
INIT_MMX cpuname INIT_MMX cpuname
DC_ADD_MMXEXT_INIT r2, r3, r6 mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0 %if ARCH_X86_64 == 0
%define dst2q r1 %define dst2q r1
%define dst2d r1d %define dst2d r1d
...@@ -650,7 +701,8 @@ h264_idct_add8_mmxext_plane: ...@@ -650,7 +701,8 @@ h264_idct_add8_mmxext_plane:
movsx r6, word [r2] movsx r6, word [r2]
test r6, r6 test r6, r6
jz .skipblock jz .skipblock
DC_ADD_MMXEXT_INIT r2, r3, r6 mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 %if ARCH_X86_64
mov r0d, dword [r1+r5*4] mov r0d, dword [r1+r5*4]
add r0, [dst2q] add r0, [dst2q]
...@@ -693,7 +745,9 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, ...@@ -693,7 +745,9 @@ cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext: h264_idct_dc_add8_mmxext:
movd m0, [r2 ] ; 0 0 X D movd m0, [r2 ] ; 0 0 X D
mov word [r2+ 0], 0
punpcklwd m0, [r2+32] ; x X d D punpcklwd m0, [r2+32] ; x X d D
mov word [r2+32], 0
paddsw m0, [pw_32] paddsw m0, [pw_32]
psraw m0, 6 psraw m0, 6
punpcklwd m0, m0 ; d d D D punpcklwd m0, m0 ; d d D D
...@@ -723,6 +777,10 @@ h264_add8x4_idct_sse2: ...@@ -723,6 +777,10 @@ h264_add8x4_idct_sse2:
paddw m0, [pw_32] paddw m0, [pw_32]
IDCT4_1D w,0,1,2,3,4,5 IDCT4_1D w,0,1,2,3,4,5
pxor m7, m7 pxor m7, m7
mova [r2+ 0], m7
mova [r2+16], m7
mova [r2+32], m7
mova [r2+48], m7
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
lea r0, [r0+r3*2] lea r0, [r0+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
......
...@@ -66,6 +66,10 @@ SECTION .text ...@@ -66,6 +66,10 @@ SECTION .text
paddd m0, [pd_32] paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5 IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5 pxor m5, m5
mova [%2+ 0], m5
mova [%2+16], m5
mova [%2+32], m5
mova [%2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, %1, %3 STORE_DIFFx2 m0, m1, m4, m5, %1, %3
lea %1, [%1+%3*2] lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, %1, %3 STORE_DIFFx2 m2, m3, m4, m5, %1, %3
...@@ -100,6 +104,10 @@ add4x4_idct %+ SUFFIX: ...@@ -100,6 +104,10 @@ add4x4_idct %+ SUFFIX:
paddd m0, [pd_32] paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5 IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5 pxor m5, m5
mova [r2+ 0], m5
mova [r2+16], m5
mova [r2+32], m5
mova [r2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, r5, r3 STORE_DIFFx2 m0, m1, m4, m5, r5, r3
lea r5, [r5+r3*2] lea r5, [r5+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, r5, r3 STORE_DIFFx2 m2, m3, m4, m5, r5, r3
...@@ -187,6 +195,7 @@ IDCT_ADD16_10 ...@@ -187,6 +195,7 @@ IDCT_ADD16_10
INIT_MMX mmxext INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3 cglobal h264_idct_dc_add_10,3,3
movd m0, [r1] movd m0, [r1]
mov dword [r1], 0
paddd m0, [pd_32] paddd m0, [pd_32]
psrad m0, 6 psrad m0, 6
lea r1, [r2*3] lea r1, [r2*3]
...@@ -199,11 +208,11 @@ cglobal h264_idct_dc_add_10,3,3 ...@@ -199,11 +208,11 @@ cglobal h264_idct_dc_add_10,3,3
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;----------------------------------------------------------------------------- ;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0 %macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,3,7 cglobal h264_idct8_dc_add_10,3,4,7
mov r1d, [r1] movd m0, [r1]
add r1, 32 mov dword[r1], 0
sar r1, 6 paddd m0, [pd_32]
movd m0, r1d psrad m0, 6
lea r1, [r2*3] lea r1, [r2*3]
SPLATW m0, m0, 0 SPLATW m0, m0, 0
mova m6, [pw_pixel_max] mova m6, [pw_pixel_max]
...@@ -255,6 +264,8 @@ idct_dc_add %+ SUFFIX: ...@@ -255,6 +264,8 @@ idct_dc_add %+ SUFFIX:
add r5, r0 add r5, r0
movq m0, [r2+ 0] movq m0, [r2+ 0]
movhps m0, [r2+64] movhps m0, [r2+64]
mov dword [r2+ 0], 0
mov dword [r2+64], 0
paddd m0, [pd_32] paddd m0, [pd_32]
psrad m0, 6 psrad m0, 6
pshufhw m0, m0, 0 pshufhw m0, m0, 0
...@@ -473,6 +484,22 @@ h264_idct8_add1_10 %+ SUFFIX: ...@@ -473,6 +484,22 @@ h264_idct8_add1_10 %+ SUFFIX:
packssdw m8, m0 packssdw m8, m0
paddsw m8, [r0] paddsw m8, [r0]
pxor m0, m0 pxor m0, m0
mova [r1+ 0], m0
mova [r1+ 16], m0
mova [r1+ 32], m0
mova [r1+ 48], m0
mova [r1+ 64], m0
mova [r1+ 80], m0
mova [r1+ 96], m0
mova [r1+112], m0
mova [r1+128], m0
mova [r1+144], m0
mova [r1+160], m0
mova [r1+176], m0
mova [r1+192], m0
mova [r1+208], m0
mova [r1+224], m0
mova [r1+240], m0
CLIPW m8, m0, [pw_pixel_max] CLIPW m8, m0, [pw_pixel_max]
mova [r0], m8 mova [r0], m8
mova m8, [pw_pixel_max] mova m8, [pw_pixel_max]
...@@ -492,6 +519,22 @@ h264_idct8_add1_10 %+ SUFFIX: ...@@ -492,6 +519,22 @@ h264_idct8_add1_10 %+ SUFFIX:
lea r3, [r0+8] lea r3, [r0+8]
IDCT8_ADD_SSE_END r0, rsp, r2 IDCT8_ADD_SSE_END r0, rsp, r2
IDCT8_ADD_SSE_END r3, rsp+16, r2 IDCT8_ADD_SSE_END r3, rsp+16, r2
mova [r1+ 0], m7
mova [r1+ 16], m7
mova [r1+ 32], m7
mova [r1+ 48], m7
mova [r1+ 64], m7
mova [r1+ 80], m7
mova [r1+ 96], m7
mova [r1+112], m7
mova [r1+128], m7
mova [r1+144], m7
mova [r1+160], m7
mova [r1+176], m7
mova [r1+192], m7
mova [r1+208], m7
mova [r1+224], m7
mova [r1+240], m7
%endif ; ARCH_X86_64 %endif ; ARCH_X86_64
add rsp, pad add rsp, pad
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment