Commit a52ffc3f authored by Ronald S. Bultje

Move static inline function to a macro, so that constant propagation in
inline asm works for gcc-3.x also (hopefully). Should fix gcc-3.x FATE
breakage after r25254.

Originally committed as revision 25262 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent fc7c40c2
......@@ -63,123 +63,119 @@ void ff_h264_idct_add8_sse2 (uint8_t **dest, const int *block_offset, DCTEL
/***********************************/
/* deblocking */
/*
 * One pass of the MMX2 deblocking-strength computation.
 *
 * For b_idx = 0, step, 2*step, ... while b_idx < edges, the loop builds one
 * group of four strength values and stores it as four 16-bit words (8 bytes)
 * at byte offset 32*dir + b_idx from bS.
 *
 * Parameters as used by the code below:
 *   bS       - destination; written with a single movq per iteration.
 *   nnz      - 4 bytes at nnz+12+b_idx are OR-ed with 4 bytes at
 *              nnz+12+d_idx+b_idx.
 *   ref      - reference indices; list 0 is read at +12, list 1 at +52
 *              (12 + 40, i.e. the second row of int8_t ref[2][40]).
 *   mv       - motion vectors; list 0 is read at +48/+56, list 1 at
 *              +208/+216 (48 + 160, the second row of int16_t mv[2][40][2]);
 *              b_idx is scaled by 4 in these addresses.
 *   bidir    - non-zero selects the two-list comparison, zero the
 *              single-list comparison.
 *   edges    - exclusive upper bound for b_idx.
 *   step     - increment of b_idx.
 *   mask_mv  - shifted left by 3, then AND-ed with b_idx; the ref/mv
 *              comparison only runs when that AND is zero.
 *   dir      - selects the 32-byte half of bS (offset 32*dir).
 *   d_idx    - element offset from the current block "b" to the neighbour
 *              "bn" it is compared against.
 *   mask_dir - when zero, mm0 is cleared at the top of every iteration.
 *
 * dir and d_idx are fed to "i" (immediate) asm constraints, so they must be
 * compile-time constants once this function has been inlined.
 *
 * Register contract: mm5, mm6 and mm7 are only read here, never written.
 * NOTE(review): presumably the caller preloads them (the inline comments
 * suggest mm5/mm6 encode the mv limit) - confirm against the caller.
 * None of the asm statements declares an MMX clobber or output; values are
 * handed from one statement to the next purely through mm0/mm1.
 * NOTE(review): b_idx looks like it is expected to advance in multiples of
 * 8 (one bS row of four int16_t) - confirm against the caller.
 */
static av_always_inline void h264_loop_filter_strength_iteration_mmx2(int16_t bS[2][4][4], uint8_t nnz[40],
int8_t ref[2][40], int16_t mv[2][40][2],
int bidir, int edges, int step,
int mask_mv, int dir, const int d_idx,
const uint64_t mask_dir)
{
x86_reg b_idx;
// bring mask_mv to the same scale as b_idx before it is AND-ed with it
mask_mv <<= 3;
for( b_idx=0; b_idx<edges; b_idx+=step ) {
// with mask_dir == 0 the ref/mv result starts from zero on every iteration;
// otherwise mm0 keeps whatever the previous statement/iteration left in it
if (!mask_dir)
__asm__ volatile(
"pxor %%mm0, %%mm0 \n\t"
::
);
// ref/mv comparison is skipped when (mask_mv & b_idx) is non-zero
if(!(mask_mv & b_idx)) {
if(bidir) {
// two reference lists: compare refs and mvs both straight (L0/L0, L1/L1)
// and crossed (L0/L1, L1/L0); 0x44 duplicates the low dword, 0x4E swaps
// the two dwords of an MMX register
__asm__ volatile(
"movd %a3(%0,%2), %%mm2 \n"
"punpckldq %a4(%0,%2), %%mm2 \n" // { ref0[bn], ref1[bn] }
"pshufw $0x44, 12(%0,%2), %%mm0 \n" // { ref0[b], ref0[b] }
"pshufw $0x44, 52(%0,%2), %%mm1 \n" // { ref1[b], ref1[b] }
"pshufw $0x4E, %%mm2, %%mm3 \n"
"psubb %%mm2, %%mm0 \n" // { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] }
"psubb %%mm3, %%mm1 \n" // { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] }
"por %%mm1, %%mm0 \n"
"movq %a5(%1,%2,4), %%mm1 \n"
"movq %a6(%1,%2,4), %%mm2 \n"
"movq %%mm1, %%mm3 \n"
"movq %%mm2, %%mm4 \n"
"psubw 48(%1,%2,4), %%mm1 \n"
"psubw 56(%1,%2,4), %%mm2 \n"
"psubw 208(%1,%2,4), %%mm3 \n"
"psubw 216(%1,%2,4), %%mm4 \n"
"packsswb %%mm2, %%mm1 \n"
"packsswb %%mm4, %%mm3 \n"
"paddb %%mm6, %%mm1 \n"
"paddb %%mm6, %%mm3 \n"
"psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
"psubusb %%mm5, %%mm3 \n"
"packsswb %%mm3, %%mm1 \n"
"por %%mm1, %%mm0 \n"
"movq %a7(%1,%2,4), %%mm1 \n"
"movq %a8(%1,%2,4), %%mm2 \n"
"movq %%mm1, %%mm3 \n"
"movq %%mm2, %%mm4 \n"
"psubw 48(%1,%2,4), %%mm1 \n"
"psubw 56(%1,%2,4), %%mm2 \n"
"psubw 208(%1,%2,4), %%mm3 \n"
"psubw 216(%1,%2,4), %%mm4 \n"
"packsswb %%mm2, %%mm1 \n"
"packsswb %%mm4, %%mm3 \n"
"paddb %%mm6, %%mm1 \n"
"paddb %%mm6, %%mm3 \n"
"psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
"psubusb %%mm5, %%mm3 \n"
"packsswb %%mm3, %%mm1 \n"
"pshufw $0x4E, %%mm1, %%mm1 \n"
"por %%mm1, %%mm0 \n"
"pshufw $0x4E, %%mm0, %%mm1 \n"
"pminub %%mm1, %%mm0 \n"
::"r"(ref),
"r"(mv),
"r"(b_idx),
"i"(d_idx+12),
"i"(d_idx+52),
"i"(d_idx*4+48),
"i"(d_idx*4+56),
"i"(d_idx*4+208),
"i"(d_idx*4+216)
);
} else {
// single reference list: one ref difference and one mv difference,
// OR-ed together into mm0
__asm__ volatile(
"movd 12(%0,%2), %%mm0 \n"
"psubb %a3(%0,%2), %%mm0 \n" // ref[b] != ref[bn]
"movq 48(%1,%2,4), %%mm1 \n"
"movq 56(%1,%2,4), %%mm2 \n"
"psubw %a4(%1,%2,4), %%mm1 \n"
"psubw %a5(%1,%2,4), %%mm2 \n"
"packsswb %%mm2, %%mm1 \n"
"paddb %%mm6, %%mm1 \n"
"psubusb %%mm5, %%mm1 \n" // abs(mv[b] - mv[bn]) >= limit
"packsswb %%mm1, %%mm1 \n"
"por %%mm1, %%mm0 \n"
::"r"(ref),
"r"(mv),
"r"(b_idx),
"i"(d_idx+12),
"i"(d_idx*4+48),
"i"(d_idx*4+56)
);
}
}
// always executed: mm1 = nnz of the current block OR nnz of the neighbour
__asm__ volatile(
"movd 12(%0,%1), %%mm1 \n"
"por %a2(%0,%1), %%mm1 \n" // nnz[b] || nnz[bn]
::"r"(nnz),
"r"(b_idx),
"i"(d_idx+12)
);
// clamp both partial results with pminub against mm7, double the nnz
// result, take the byte-wise maximum of the two, widen the low four bytes
// to 16-bit words and store them into bS
__asm__ volatile(
"pminub %%mm7, %%mm1 \n"
"pminub %%mm7, %%mm0 \n"
"psllw $1, %%mm1 \n"
"pxor %%mm2, %%mm2 \n"
"pmaxub %%mm0, %%mm1 \n"
"punpcklbw %%mm2, %%mm1 \n"
"movq %%mm1, %a1(%0,%2) \n"
::"r"(bS),
"i"(32*dir),
"r"(b_idx)
:"memory"
);
}
}
/*
 * One pass of the MMX2 deblocking-strength computation, written as a macro
 * so that dir and d_idx reach the "i" (immediate) asm constraints as literal
 * constants even on compilers that do not propagate constants through an
 * inlined function.
 *
 * For b_idx = 0, step, 2*step, ... while b_idx < edges, one group of four
 * strength values is computed and stored as four 16-bit words (8 bytes) at
 * byte offset 32*dir + b_idx from bS.
 *
 * Arguments:
 *   bS       - destination (int16_t [2][4][4] in the caller's signature).
 *   nz       - non-zero-coefficient flags; 4 bytes at nz+12+b_idx are OR-ed
 *              with 4 bytes at nz+12+d_idx+b_idx.
 *   ref      - reference indices; list 0 is read at +12, list 1 at +52.
 *   mv       - motion vectors; list 0 at +48/+56, list 1 at +208/+216,
 *              with b_idx scaled by 4.
 *   bidir    - non-zero selects the two-list comparison.
 *   edges    - exclusive upper bound for b_idx.
 *   step     - increment of b_idx.
 *   mask_mv  - the ref/mv comparison only runs when ((mask_mv << 3) & b_idx)
 *              is zero. The argument is read once and is NOT modified.
 *   dir      - selects the 32-byte half of bS; must be an integer constant.
 *   d_idx    - element offset from the current block to the neighbour it is
 *              compared against; must be an integer constant.
 *   mask_dir - when zero, mm0 is cleared at the top of every iteration.
 *
 * Register contract: mm5, mm6 and mm7 are only read here, never written.
 * NOTE(review): presumably the caller preloads them before expanding the
 * macro - confirm against the caller.
 * None of the asm statements declares an MMX clobber or output; values are
 * handed from one statement to the next purely through mm0/mm1.
 *
 * The expansion declares the locals b_idx and mask_mv_x8; arguments must not
 * refer to caller variables with those names.
 */
#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir, edges, step, mask_mv, dir, d_idx, mask_dir) \
do { \
    x86_reg b_idx; \
    /* local copy: keeps the by-value semantics of a function argument */ \
    const int mask_mv_x8 = (mask_mv) << 3; \
    for( b_idx=0; b_idx<(edges); b_idx+=(step) ) { \
        /* with mask_dir == 0 the ref/mv result starts from zero each time */ \
        if (!(mask_dir)) { \
            __asm__ volatile( \
                "pxor %%mm0, %%mm0 \n\t" \
                :: \
            ); \
        } \
        /* ref/mv comparison is skipped when the masked index is non-zero */ \
        if(!(mask_mv_x8 & b_idx)) { \
            if(bidir) { \
                /* two lists: straight and crossed ref/mv comparisons; */ \
                /* 0x44 duplicates the low dword, 0x4E swaps the dwords */ \
                __asm__ volatile( \
                    "movd %a3(%0,%2), %%mm2 \n" \
                    "punpckldq %a4(%0,%2), %%mm2 \n" /* { ref0[bn], ref1[bn] } */ \
                    "pshufw $0x44, 12(%0,%2), %%mm0 \n" /* { ref0[b], ref0[b] } */ \
                    "pshufw $0x44, 52(%0,%2), %%mm1 \n" /* { ref1[b], ref1[b] } */ \
                    "pshufw $0x4E, %%mm2, %%mm3 \n" \
                    "psubb %%mm2, %%mm0 \n" /* { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } */ \
                    "psubb %%mm3, %%mm1 \n" /* { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } */ \
 \
                    "por %%mm1, %%mm0 \n" \
                    "movq %a5(%1,%2,4), %%mm1 \n" \
                    "movq %a6(%1,%2,4), %%mm2 \n" \
                    "movq %%mm1, %%mm3 \n" \
                    "movq %%mm2, %%mm4 \n" \
                    "psubw 48(%1,%2,4), %%mm1 \n" \
                    "psubw 56(%1,%2,4), %%mm2 \n" \
                    "psubw 208(%1,%2,4), %%mm3 \n" \
                    "psubw 216(%1,%2,4), %%mm4 \n" \
                    "packsswb %%mm2, %%mm1 \n" \
                    "packsswb %%mm4, %%mm3 \n" \
                    "paddb %%mm6, %%mm1 \n" \
                    "paddb %%mm6, %%mm3 \n" \
                    "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                    "psubusb %%mm5, %%mm3 \n" \
                    "packsswb %%mm3, %%mm1 \n" \
 \
                    "por %%mm1, %%mm0 \n" \
                    "movq %a7(%1,%2,4), %%mm1 \n" \
                    "movq %a8(%1,%2,4), %%mm2 \n" \
                    "movq %%mm1, %%mm3 \n" \
                    "movq %%mm2, %%mm4 \n" \
                    "psubw 48(%1,%2,4), %%mm1 \n" \
                    "psubw 56(%1,%2,4), %%mm2 \n" \
                    "psubw 208(%1,%2,4), %%mm3 \n" \
                    "psubw 216(%1,%2,4), %%mm4 \n" \
                    "packsswb %%mm2, %%mm1 \n" \
                    "packsswb %%mm4, %%mm3 \n" \
                    "paddb %%mm6, %%mm1 \n" \
                    "paddb %%mm6, %%mm3 \n" \
                    "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                    "psubusb %%mm5, %%mm3 \n" \
                    "packsswb %%mm3, %%mm1 \n" \
 \
                    "pshufw $0x4E, %%mm1, %%mm1 \n" \
                    "por %%mm1, %%mm0 \n" \
                    "pshufw $0x4E, %%mm0, %%mm1 \n" \
                    "pminub %%mm1, %%mm0 \n" \
                    ::"r"(ref), \
                      "r"(mv), \
                      "r"(b_idx), \
                      "i"((d_idx)+12), \
                      "i"((d_idx)+52), \
                      "i"((d_idx)*4+48), \
                      "i"((d_idx)*4+56), \
                      "i"((d_idx)*4+208), \
                      "i"((d_idx)*4+216) \
                ); \
            } else { \
                /* single list: one ref and one mv difference, OR-ed into mm0 */ \
                __asm__ volatile( \
                    "movd 12(%0,%2), %%mm0 \n" \
                    "psubb %a3(%0,%2), %%mm0 \n" /* ref[b] != ref[bn] */ \
                    "movq 48(%1,%2,4), %%mm1 \n" \
                    "movq 56(%1,%2,4), %%mm2 \n" \
                    "psubw %a4(%1,%2,4), %%mm1 \n" \
                    "psubw %a5(%1,%2,4), %%mm2 \n" \
                    "packsswb %%mm2, %%mm1 \n" \
                    "paddb %%mm6, %%mm1 \n" \
                    "psubusb %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                    "packsswb %%mm1, %%mm1 \n" \
                    "por %%mm1, %%mm0 \n" \
                    ::"r"(ref), \
                      "r"(mv), \
                      "r"(b_idx), \
                      "i"((d_idx)+12), \
                      "i"((d_idx)*4+48), \
                      "i"((d_idx)*4+56) \
                ); \
            } \
        } \
        /* always executed: mm1 = nz of current block OR nz of neighbour; */ \
        /* uses the nz macro argument, not a name from the caller's scope */ \
        __asm__ volatile( \
            "movd 12(%0,%1), %%mm1 \n" \
            "por %a2(%0,%1), %%mm1 \n" /* nnz[b] || nnz[bn] */ \
            ::"r"(nz), \
              "r"(b_idx), \
              "i"((d_idx)+12) \
        ); \
        /* clamp both results against mm7, double the nz result, take the */ \
        /* byte-wise maximum, widen to 16-bit words and store into bS */ \
        __asm__ volatile( \
            "pminub %%mm7, %%mm1 \n" \
            "pminub %%mm7, %%mm0 \n" \
            "psllw $1, %%mm1 \n" \
            "pxor %%mm2, %%mm2 \n" \
            "pmaxub %%mm0, %%mm1 \n" \
            "punpcklbw %%mm2, %%mm1 \n" \
            "movq %%mm1, %a1(%0,%2) \n" \
            ::"r"(bS), \
              "i"(32*(dir)), \
              "r"(b_idx) \
            :"memory" \
        ); \
    } \
} while (0)
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment