Commit 5adf43e4 authored by Loren Merritt

cosmetics: remove code duplication in hadamard8_diff_mmx

Originally committed as revision 8946 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent c5a60225
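
Note: the diff below folds the two near-identical functions hadamard8_diff_mmx and hadamard8_diff_mmx2 into one function-template macro, HADAMARD8_DIFF_MMX(cpu). The CPU-specific pieces (MMABS, MMABS_SUM, HSUM) are swapped via #define/#undef between the two instantiations, and ## pastes the flavour name into the function name. The following standalone C sketch shows the same pattern; it is illustrative only, not part of the commit, and all names in it are made up.

    #include <stdio.h>

    #define ABS_PLAIN(x)      ((x) < 0 ? -(x) : (x))
    /* Branchless variant; assumes arithmetic right shift of negative ints,
     * which holds on the x86 targets this file cares about. */
    #define ABS_BRANCHLESS(x) (((x) ^ ((x) >> 31)) - ((x) >> 31))

    /* Function template: one body, instantiated once per "cpu" flavour.
     * ABS is redefined between instantiations, the way MMABS/MMABS_SUM/HSUM
     * are redefined in the real code. */
    #define SUM_ABS_TEMPLATE(cpu)                   \
    static int sum_abs_##cpu(const int *v, int n) { \
        int sum = 0;                                \
        for (int i = 0; i < n; i++)                 \
            sum += ABS(v[i]);                       \
        return sum;                                 \
    }

    #define ABS(x) ABS_PLAIN(x)
    SUM_ABS_TEMPLATE(plain)
    #undef ABS

    #define ABS(x) ABS_BRANCHLESS(x)
    SUM_ABS_TEMPLATE(branchless)
    #undef ABS

    int main(void) {
        static const int v[4] = { 3, -7, 0, -2 };
        printf("%d %d\n", sum_abs_plain(v, 4), sum_abs_branchless(v, 4)); /* 12 12 */
        return 0;
    }
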
@@ -1519,28 +1519,23 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
         LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
         LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
 
-#define MMABS(a,z)\
+#define MMABS_MMX(a,z)\
     "pxor " #z ", " #z " \n\t"\
     "pcmpgtw " #a ", " #z " \n\t"\
     "pxor " #z ", " #a " \n\t"\
     "psubw " #z ", " #a " \n\t"
 
-#define MMABS_SUM(a,z, sum)\
-    "pxor " #z ", " #z " \n\t"\
-    "pcmpgtw " #a ", " #z " \n\t"\
-    "pxor " #z ", " #a " \n\t"\
-    "psubw " #z ", " #a " \n\t"\
-    "paddusw " #a ", " #sum " \n\t"
-
 #define MMABS_MMX2(a,z)\
     "pxor " #z ", " #z " \n\t"\
     "psubw " #a ", " #z " \n\t"\
     "pmaxsw " #z ", " #a " \n\t"
 
+#define MMABS_SUM_MMX(a,z, sum)\
+    MMABS_MMX(a,z)\
+    "paddusw " #a ", " #sum " \n\t"
+
 #define MMABS_SUM_MMX2(a,z, sum)\
-    "pxor " #z ", " #z " \n\t"\
-    "psubw " #a ", " #z " \n\t"\
-    "pmaxsw " #z ", " #a " \n\t"\
+    MMABS_MMX2(a,z)\
     "paddusw " #a ", " #sum " \n\t"
 
 #define LOAD4(o, a, b, c, d)\
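
For reference, the two MMABS flavours above compute a per-word absolute value without branches: MMABS_MMX builds an all-ones mask for negative words with pcmpgtw and applies abs(a) = (a ^ mask) - mask, while MMABS_MMX2 computes max(a, -a) with pmaxsw; the MMABS_SUM_* forms then accumulate the result with the saturating unsigned add paddusw. A scalar C sketch of the two identities, illustrative only and not part of the commit:

    #include <assert.h>
    #include <stdint.h>

    /* MMABS_MMX per 16-bit word: mask = (0 > a) ? -1 : 0  (pxor + pcmpgtw),
     * then a = (a ^ mask) - mask                          (pxor + psubw). */
    static int16_t abs_mmx_style(int16_t a)
    {
        int16_t mask = (int16_t)(0 > a ? -1 : 0);
        return (int16_t)((a ^ mask) - mask);
    }

    /* MMABS_MMX2 per 16-bit word: a = max(a, 0 - a)  (pxor + psubw + pmaxsw). */
    static int16_t abs_mmx2_style(int16_t a)
    {
        int16_t neg = (int16_t)(0 - a);
        return a > neg ? a : neg;
    }

    int main(void)
    {
        /* -32768 is excluded: its absolute value does not fit in 16 bits,
         * and the MMX code wraps there as well. */
        for (int v = -32767; v <= 32767; v++) {
            int expect = v < 0 ? -v : v;
            assert(abs_mmx_style((int16_t)v)  == expect);
            assert(abs_mmx2_style((int16_t)v) == expect);
        }
        return 0;
    }
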
@@ -1555,178 +1550,117 @@ static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t
     "movq "#c", "#o"+32(%1) \n\t"\
     "movq "#d", "#o"+48(%1) \n\t"\
 
-static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
-    DECLARE_ALIGNED_8(uint64_t, temp[16]);
-    int sum=0;
-
-    assert(h==8);
-
-    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
-
-    asm volatile(
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-
-        "movq %%mm7, 112(%1) \n\t"
-
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
-        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
-
-        "movq 112(%1), %%mm7 \n\t"
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
-
-        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-
-        "movq %%mm7, 120(%1) \n\t"
-
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
-        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
-
-        "movq 120(%1), %%mm7 \n\t"
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        "movq %%mm7, %%mm5 \n\t"//FIXME remove
-        "movq %%mm6, %%mm7 \n\t"
-        "movq %%mm0, %%mm6 \n\t"
-//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
-
-        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
-//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-        "movq %%mm7, 64(%1) \n\t"
-        MMABS(%%mm0, %%mm7)
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        MMABS_SUM(%%mm2, %%mm7, %%mm0)
-        MMABS_SUM(%%mm3, %%mm7, %%mm0)
-        MMABS_SUM(%%mm4, %%mm7, %%mm0)
-        MMABS_SUM(%%mm5, %%mm7, %%mm0)
-        MMABS_SUM(%%mm6, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1 \n\t"
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        "movq %%mm0, 64(%1) \n\t"
-
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-        "movq %%mm7, (%1) \n\t"
-        MMABS(%%mm0, %%mm7)
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        MMABS_SUM(%%mm2, %%mm7, %%mm0)
-        MMABS_SUM(%%mm3, %%mm7, %%mm0)
-        MMABS_SUM(%%mm4, %%mm7, %%mm0)
-        MMABS_SUM(%%mm5, %%mm7, %%mm0)
-        MMABS_SUM(%%mm6, %%mm7, %%mm0)
-        "movq (%1), %%mm1 \n\t"
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1 \n\t"
-        MMABS_SUM(%%mm1, %%mm7, %%mm0)
-
-        "movq %%mm0, %%mm1 \n\t"
-        "psrlq $32, %%mm0 \n\t"
-        "paddusw %%mm1, %%mm0 \n\t"
-        "movq %%mm0, %%mm1 \n\t"
-        "psrlq $16, %%mm0 \n\t"
-        "paddusw %%mm1, %%mm0 \n\t"
-        "movd %%mm0, %0 \n\t"
-
-        : "=r" (sum)
-        : "r"(temp)
-    );
-    return sum&0xFFFF;
-}
-
-static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
-    DECLARE_ALIGNED_8(uint64_t, temp[16]);
-    int sum=0;
-
-    assert(h==8);
-
-    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
-
-    asm volatile(
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-
-        "movq %%mm7, 112(%1) \n\t"
-
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
-        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
-
-        "movq 112(%1), %%mm7 \n\t"
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
-
-        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-
-        "movq %%mm7, 120(%1) \n\t"
-
-        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
-        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
-
-        "movq 120(%1), %%mm7 \n\t"
-        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
-        "movq %%mm7, %%mm5 \n\t"//FIXME remove
-        "movq %%mm6, %%mm7 \n\t"
-        "movq %%mm0, %%mm6 \n\t"
-//        STORE4(72, %%mm4, %%mm7, %%mm0, %%mm6) //FIXME remove
-
-        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
-//        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-        "movq %%mm7, 64(%1) \n\t"
-        MMABS_MMX2(%%mm0, %%mm7)
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1 \n\t"
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        "movq %%mm0, 64(%1) \n\t"
-
-        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
-        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
-
-        HADAMARD48
-        "movq %%mm7, (%1) \n\t"
-        MMABS_MMX2(%%mm0, %%mm7)
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
-        MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
-        "movq (%1), %%mm1 \n\t"
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-        "movq 64(%1), %%mm1 \n\t"
-        MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
-
-        "pshufw $0x0E, %%mm0, %%mm1 \n\t"
-        "paddusw %%mm1, %%mm0 \n\t"
-        "pshufw $0x01, %%mm0, %%mm1 \n\t"
-        "paddusw %%mm1, %%mm0 \n\t"
-        "movd %%mm0, %0 \n\t"
-
-        : "=r" (sum)
-        : "r"(temp)
-    );
-    return sum&0xFFFF;
-}
+#define HSUM_MMX(a, t, dst)\
+    "movq "#a", "#t" \n\t"\
+    "psrlq $32, "#a" \n\t"\
+    "paddusw "#t", "#a" \n\t"\
+    "movq "#a", "#t" \n\t"\
+    "psrlq $16, "#a" \n\t"\
+    "paddusw "#t", "#a" \n\t"\
+    "movd "#a", "#dst" \n\t"\
+
+#define HSUM_MMX2(a, t, dst)\
+    "pshufw $0x0E, "#a", "#t" \n\t"\
+    "paddusw "#t", "#a" \n\t"\
+    "pshufw $0x01, "#a", "#t" \n\t"\
+    "paddusw "#t", "#a" \n\t"\
+    "movd "#a", "#dst" \n\t"\
+
+#define HADAMARD8_DIFF_MMX(cpu) \
+static int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){\
+    DECLARE_ALIGNED_8(uint64_t, temp[16]);\
+    int sum=0;\
+\
+    assert(h==8);\
+\
+    diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);\
+\
+    asm volatile(\
+        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
+        LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)\
+\
+        HADAMARD48\
+\
+        "movq %%mm7, 112(%1) \n\t"\
+\
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
+        STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)\
+\
+        "movq 112(%1), %%mm7 \n\t"\
+        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
+        STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)\
+\
+        LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)\
+        LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)\
+\
+        HADAMARD48\
+\
+        "movq %%mm7, 120(%1) \n\t"\
+\
+        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)\
+        STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)\
+\
+        "movq 120(%1), %%mm7 \n\t"\
+        TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)\
+        "movq %%mm7, %%mm5 \n\t"/*FIXME remove*/\
+        "movq %%mm6, %%mm7 \n\t"\
+        "movq %%mm0, %%mm6 \n\t"\
+\
+        LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)\
+\
+        HADAMARD48\
+        "movq %%mm7, 64(%1) \n\t"\
+        MMABS(%%mm0, %%mm7)\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm3, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm5, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
+        "movq 64(%1), %%mm1 \n\t"\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+        "movq %%mm0, 64(%1) \n\t"\
+\
+        LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)\
+        LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)\
+\
+        HADAMARD48\
+        "movq %%mm7, (%1) \n\t"\
+        MMABS(%%mm0, %%mm7)\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm2, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm3, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm4, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm5, %%mm7, %%mm0)\
+        MMABS_SUM(%%mm6, %%mm7, %%mm0)\
+        "movq (%1), %%mm1 \n\t"\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+        "movq 64(%1), %%mm1 \n\t"\
+        MMABS_SUM(%%mm1, %%mm7, %%mm0)\
+\
+        HSUM(%%mm0, %%mm1, %0)\
+\
+        : "=r" (sum)\
+        : "r"(temp)\
+    );\
+    return sum&0xFFFF;\
+}
+
+#define MMABS(a,z) MMABS_MMX(a,z)
+#define MMABS_SUM(a,z,sum) MMABS_SUM_MMX(a,z,sum)
+#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
+HADAMARD8_DIFF_MMX(mmx)
+#undef MMABS
+#undef MMABS_SUM
+#undef HSUM
+
+#define MMABS(a,z) MMABS_MMX2(a,z)
+#define MMABS_SUM(a,z,sum) MMABS_SUM_MMX2(a,z,sum)
+#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
+HADAMARD8_DIFF_MMX(mmx2)
+#undef MMABS
+#undef MMABS_SUM
+#undef HSUM
 
 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
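
For reference, the HSUM_MMX/HSUM_MMX2 macros added above reduce the four 16-bit partial sums held in one MMX register to a single value: HSUM_MMX folds the register onto itself with psrlq $32 and psrlq $16 plus paddusw, and HSUM_MMX2 performs the same folds with pshufw. A scalar sketch of that folding on a uint64_t carrying four 16-bit lanes, illustrative only and not part of the commit (plain wrapping adds stand in for the saturating paddusw, and cross-lane carries are ignored, which is fine for small example values):

    #include <stdint.h>
    #include <stdio.h>

    /* Fold four 16-bit lanes packed into a uint64_t down to one total:
     * add the high 32 bits onto the low 32 bits, then the next 16 bits
     * onto the low 16 bits, and keep the low word (cf. sum&0xFFFF). */
    static unsigned hsum_4x16(uint64_t v)
    {
        v += v >> 32;                   /* lanes 2,3 folded onto lanes 0,1 */
        v += v >> 16;                   /* lane 1 folded onto lane 0 */
        return (unsigned)(v & 0xFFFF);
    }

    int main(void)
    {
        /* lanes 1, 2, 3, 4 -> 10 */
        uint64_t v = (uint64_t)1 | ((uint64_t)2 << 16) |
                     ((uint64_t)3 << 32) | ((uint64_t)4 << 48);
        printf("%u\n", hsum_4x16(v));
        return 0;
    }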