Commit ce611a27 authored by Michael Niedermayer

Change rounding of the horizontal DWT to match the vertical one.

This allows some simplifications and optimizations and should
not have any effect on quality.

Originally committed as revision 10172 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 7506d47a
...@@ -111,8 +111,7 @@ void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){ ...@@ -111,8 +111,7 @@ void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
i = 0; i = 0;
asm volatile( asm volatile(
"pcmpeqd %%xmm7, %%xmm7 \n\t" "pslld $1, %%xmm7 \n\t"
"psrad $29, %%xmm7 \n\t"
::); ::);
for(; i<w_l-7; i+=8){ for(; i<w_l-7; i+=8){
asm volatile( asm volatile(
...@@ -157,25 +156,21 @@ void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){ ...@@ -157,25 +156,21 @@ void ff_snow_horizontal_compose97i_sse2(DWTELEM *b, int width){
"movdqu 20(%1), %%xmm6 \n\t" "movdqu 20(%1), %%xmm6 \n\t"
"paddd (%1), %%xmm2 \n\t" "paddd (%1), %%xmm2 \n\t"
"paddd 16(%1), %%xmm6 \n\t" "paddd 16(%1), %%xmm6 \n\t"
"movdqa %%xmm2, %%xmm0 \n\t" "movdqu (%0), %%xmm0 \n\t"
"movdqa %%xmm6, %%xmm4 \n\t" "movdqu 16(%0), %%xmm4 \n\t"
"pslld $2, %%xmm2 \n\t" "paddd %%xmm2, %%xmm0 \n\t"
"pslld $2, %%xmm6 \n\t" "paddd %%xmm6, %%xmm4 \n\t"
"psubd %%xmm2, %%xmm0 \n\t" "psrad $1, %%xmm2 \n\t"
"psubd %%xmm6, %%xmm4 \n\t" "psrad $1, %%xmm6 \n\t"
"psrad $1, %%xmm0 \n\t" "paddd %%xmm0, %%xmm2 \n\t"
"psrad $1, %%xmm4 \n\t" "paddd %%xmm4, %%xmm6 \n\t"
"movdqu (%0), %%xmm2 \n\t"
"movdqu 16(%0), %%xmm6 \n\t"
"psubd %%xmm0, %%xmm2 \n\t"
"psubd %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm2, (%2) \n\t" "movdqa %%xmm2, (%2) \n\t"
"movdqa %%xmm6, 16(%2) \n\t" "movdqa %%xmm6, 16(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory" : "memory"
); );
} }
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS); snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
} }
{ {
...@@ -291,10 +286,9 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){ ...@@ -291,10 +286,9 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
DWTELEM * const ref = b+w2 - 1; DWTELEM * const ref = b+w2 - 1;
i = 1; i = 1;
b[0] = b[0] + (((2 * ref[1] + W_BO-1) + 4 * b[0]) >> W_BS); b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
asm volatile( asm volatile(
"pcmpeqd %%mm7, %%mm7 \n\t" "pslld $1, %%mm7 \n\t"
"psrld $29, %%mm7 \n\t"
::); ::);
for(; i<w_l-3; i+=4){ for(; i<w_l-3; i+=4){
asm volatile( asm volatile(
...@@ -333,16 +327,12 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){ ...@@ -333,16 +327,12 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
"movq 12(%1), %%mm6 \n\t" "movq 12(%1), %%mm6 \n\t"
"paddd (%1), %%mm2 \n\t" "paddd (%1), %%mm2 \n\t"
"paddd 8(%1), %%mm6 \n\t" "paddd 8(%1), %%mm6 \n\t"
"pxor %%mm0, %%mm0 \n\t" //note: the 2 xor could be avoided if we would flip the rounding direction
"pxor %%mm4, %%mm4 \n\t"
"psubd %%mm2, %%mm0 \n\t"
"psubd %%mm6, %%mm4 \n\t"
"psrad $1, %%mm0 \n\t"
"psrad $1, %%mm4 \n\t"
"psubd %%mm0, %%mm2 \n\t"
"psubd %%mm4, %%mm6 \n\t"
"movq (%0), %%mm0 \n\t" "movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t" "movq 8(%0), %%mm4 \n\t"
"paddd %%mm2, %%mm0 \n\t"
"paddd %%mm6, %%mm4 \n\t"
"psrad $1, %%mm2 \n\t"
"psrad $1, %%mm6 \n\t"
"paddd %%mm0, %%mm2 \n\t" "paddd %%mm0, %%mm2 \n\t"
"paddd %%mm4, %%mm6 \n\t" "paddd %%mm4, %%mm6 \n\t"
"movq %%mm2, (%2) \n\t" "movq %%mm2, (%2) \n\t"
...@@ -351,7 +341,7 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){ ...@@ -351,7 +341,7 @@ void ff_snow_horizontal_compose97i_mmx(DWTELEM *b, int width){
: "memory" : "memory"
); );
} }
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO, W_AS); snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
} }
{ {
......
...@@ -775,7 +775,7 @@ static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int ...@@ -775,7 +775,7 @@ static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int
int i; int i;
assert(shift == 4); assert(shift == 4);
#define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): (16*4*(src) + 4*(ref) + 8 + (5<<27))/(5*16) - (1<<23)) #define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): -((-16*4*(src) + 4*(ref) + add + 5 + (5<<27))/(5*16) - (1<<23)))
if(mirror_left){ if(mirror_left){
dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse); dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
dst += dst_step; dst += dst_step;
...@@ -1113,8 +1113,8 @@ static void horizontal_decompose97i(DWTELEM *b, int width){ ...@@ -1113,8 +1113,8 @@ static void horizontal_decompose97i(DWTELEM *b, int width){
DWTELEM temp[width]; DWTELEM temp[width];
const int w2= (width+1)>>1; const int w2= (width+1)>>1;
lift (temp+w2, b +1, b , 1, 2, 2, width, -W_AM, W_AO, W_AS, 1, 0); lift (temp+w2, b +1, b , 1, 2, 2, width, W_AM, W_AO, W_AS, 1, 1);
liftS(temp , b , temp+w2, 1, 2, 1, width, -W_BM, W_BO, W_BS, 0, 0); liftS(temp , b , temp+w2, 1, 2, 1, width, W_BM, W_BO, W_BS, 0, 0);
lift5(b +w2, temp+w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 0); lift5(b +w2, temp+w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 0);
lift (b , temp , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 0); lift (b , temp , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 0);
} }
...@@ -1150,7 +1150,7 @@ static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int w ...@@ -1150,7 +1150,7 @@ static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int w
#ifdef liftS #ifdef liftS
b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS; b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
#else #else
b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + 8*5 + (5<<27)) / (5*16) - (1<<23); b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23);
#endif #endif
} }
} }
...@@ -1344,8 +1344,8 @@ void ff_snow_horizontal_compose97i(DWTELEM *b, int width){ ...@@ -1344,8 +1344,8 @@ void ff_snow_horizontal_compose97i(DWTELEM *b, int width){
lift (temp , b , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 1); lift (temp , b , b +w2, 1, 1, 1, width, W_DM, W_DO, W_DS, 0, 1);
lift5(temp+w2, b +w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 1); lift5(temp+w2, b +w2, temp , 1, 1, 1, width, W_CM, W_CO, W_CS, 1, 1);
liftS(b , temp , temp+w2, 2, 1, 1, width, W_BM, W_BO-1, W_BS, 0, 1); liftS(b , temp , temp+w2, 2, 1, 1, width, W_BM, W_BO, W_BS, 0, 1);
lift (b+1 , temp+w2, b , 2, 1, 2, width, -W_AM, W_AO, W_AS, 1, 1); lift (b+1 , temp+w2, b , 2, 1, 2, width, W_AM, W_AO, W_AS, 1, 0);
} }
static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){ static void vertical_compose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
......
...@@ -165,11 +165,11 @@ static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, DWTELE ...@@ -165,11 +165,11 @@ static av_always_inline void snow_horizontal_compose_lift_lead_out(int i, DWTELE
static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){ static av_always_inline void snow_horizontal_compose_liftS_lead_out(int i, DWTELEM * dst, DWTELEM * src, DWTELEM * ref, int width, int w){
for(; i<w; i++){ for(; i<w; i++){
dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO-1 + 4 * src[i]) >> W_BS); dst[i] = src[i] + ((ref[i] + ref[(i+1)]+W_BO + 4 * src[i]) >> W_BS);
} }
if(width&1){ if(width&1){
dst[w] = src[w] + ((2 * ref[w] + W_BO-1 + 4 * src[w]) >> W_BS); dst[w] = src[w] + ((2 * ref[w] + W_BO + 4 * src[w]) >> W_BS);
} }
} }
......
...@@ -141,9 +141,9 @@ f8f51fa737add17f7fecaefa118b57ed *./tests/data/a-ffv1.avi ...@@ -141,9 +141,9 @@ f8f51fa737add17f7fecaefa118b57ed *./tests/data/a-ffv1.avi
2654678 ./tests/data/a-ffv1.avi 2654678 ./tests/data/a-ffv1.avi
799d3db687f6cdd7a837ec156efc171f *./tests/data/out.yuv 799d3db687f6cdd7a837ec156efc171f *./tests/data/out.yuv
stddev: 0.00 PSNR:99.99 bytes:7602176 stddev: 0.00 PSNR:99.99 bytes:7602176
9078723c943de5d79490f54b99e6ea9e *./tests/data/a-snow.avi 958d649d09b7361d5f00b5b3fcccbcd2 *./tests/data/a-snow.avi
156656 ./tests/data/a-snow.avi 156606 ./tests/data/a-snow.avi
f2932084b52e2ede167c9ba21eae0656 *./tests/data/out.yuv b19cb7f9134f922326028c6bb44e96de *./tests/data/out.yuv
stddev: 23.14 PSNR:20.83 bytes:7602176 stddev: 23.14 PSNR:20.83 bytes:7602176
ba999e86070aa971376e7f317a022c37 *./tests/data/a-snow53.avi ba999e86070aa971376e7f317a022c37 *./tests/data/a-snow53.avi
3519486 ./tests/data/a-snow53.avi 3519486 ./tests/data/a-snow53.avi
......
...@@ -141,9 +141,9 @@ d72b0960e162d4998b9acbabb07e99ab *./tests/data/a-ffv1.avi ...@@ -141,9 +141,9 @@ d72b0960e162d4998b9acbabb07e99ab *./tests/data/a-ffv1.avi
3525804 ./tests/data/a-ffv1.avi 3525804 ./tests/data/a-ffv1.avi
dde5895817ad9d219f79a52d0bdfb001 *./tests/data/out.yuv dde5895817ad9d219f79a52d0bdfb001 *./tests/data/out.yuv
stddev: 0.00 PSNR:99.99 bytes:7602176 stddev: 0.00 PSNR:99.99 bytes:7602176
40a6e938ac2bd92ee12cd57925e86454 *./tests/data/a-snow.avi 2cfa1bdb443d04a890208a83fd239461 *./tests/data/a-snow.avi
68758 ./tests/data/a-snow.avi 68872 ./tests/data/a-snow.avi
1e356854142898c7c4aab4bfedadf235 *./tests/data/out.yuv 64a0495b7ab53509d3b791465262795c *./tests/data/out.yuv
stddev: 10.86 PSNR:27.40 bytes:7602176 stddev: 10.86 PSNR:27.40 bytes:7602176
3d0da6aeec9b80c6ee0ff4b747bdd0f0 *./tests/data/a-snow53.avi 3d0da6aeec9b80c6ee0ff4b747bdd0f0 *./tests/data/a-snow53.avi
2721980 ./tests/data/a-snow53.avi 2721980 ./tests/data/a-snow53.avi
......
...@@ -2046,51 +2046,51 @@ ret: 0 st:-1 ts:-0.645825 flags:1 ...@@ -2046,51 +2046,51 @@ ret: 0 st:-1 ts:-0.645825 flags:1
ret: 0 st: 0 dts:0.040000 pts:0.040000 pos:9610 size:1075 flags:0 ret: 0 st: 0 dts:0.040000 pts:0.040000 pos:9610 size:1075 flags:0
---------------- ----------------
tests/data/a-snow.avi tests/data/a-snow.avi
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
ret: 0 st:-1 ts:-1.000000 flags:0 ret: 0 st:-1 ts:-1.000000 flags:0
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
ret: 0 st:-1 ts:1.894167 flags:1 ret: 0 st:-1 ts:1.894167 flags:1
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1 ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1
ret: 0 st: 0 ts:0.800000 flags:0 ret: 0 st: 0 ts:0.800000 flags:0
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1 ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1
ret:-1 st: 0 ts:-0.320000 flags:1 ret:-1 st: 0 ts:-0.320000 flags:1
ret:-1 st:-1 ts:2.576668 flags:0 ret:-1 st:-1 ts:2.576668 flags:0
ret: 0 st:-1 ts:1.470835 flags:1 ret: 0 st:-1 ts:1.470835 flags:1
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1 ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1
ret: 0 st: 0 ts:0.360000 flags:0 ret: 0 st: 0 ts:0.360000 flags:0
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1 ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1
ret:-1 st: 0 ts:-0.760000 flags:1 ret:-1 st: 0 ts:-0.760000 flags:1
ret:-1 st:-1 ts:2.153336 flags:0 ret:-1 st:-1 ts:2.153336 flags:0
ret: 0 st:-1 ts:1.047503 flags:1 ret: 0 st:-1 ts:1.047503 flags:1
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1 ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1
ret: 0 st: 0 ts:-0.040000 flags:0 ret: 0 st: 0 ts:-0.040000 flags:0
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
ret: 0 st: 0 ts:2.840000 flags:1 ret: 0 st: 0 ts:2.840000 flags:1
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1 ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1
ret: 0 st:-1 ts:1.730004 flags:0 ret: 0 st:-1 ts:1.730004 flags:0
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1 ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1
ret: 0 st:-1 ts:0.624171 flags:1 ret: 0 st:-1 ts:0.624171 flags:1
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1 ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1
ret: 0 st: 0 ts:-0.480000 flags:0 ret: 0 st: 0 ts:-0.480000 flags:0
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
ret: 0 st: 0 ts:2.400000 flags:1 ret: 0 st: 0 ts:2.400000 flags:1
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1 ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1
ret: 0 st:-1 ts:1.306672 flags:0 ret: 0 st:-1 ts:1.306672 flags:0
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1 ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1
ret: 0 st:-1 ts:0.200839 flags:1 ret: 0 st:-1 ts:0.200839 flags:1
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
ret: 0 st: 0 ts:-0.920000 flags:0 ret: 0 st: 0 ts:-0.920000 flags:0
ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2986 flags:1 ret: 0 st: 0 dts:0.000000 pts:0.000000 pos:5660 size:2987 flags:1
ret: 0 st: 0 ts:2.000000 flags:1 ret: 0 st: 0 ts:2.000000 flags:1
ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63240 size:3635 flags:1 ret: 0 st: 0 dts:1.920000 pts:1.920000 pos:63350 size:3635 flags:1
ret: 0 st:-1 ts:0.883340 flags:0 ret: 0 st:-1 ts:0.883340 flags:0
ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31726 size:3478 flags:1 ret: 0 st: 0 dts:0.960000 pts:0.960000 pos:31690 size:3478 flags:1
ret:-1 st:-1 ts:-0.222493 flags:1 ret:-1 st:-1 ts:-0.222493 flags:1
ret:-1 st: 0 ts:2.680000 flags:0 ret:-1 st: 0 ts:2.680000 flags:0
ret: 0 st: 0 ts:1.560000 flags:1 ret: 0 st: 0 ts:1.560000 flags:1
ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46794 size:3663 flags:1 ret: 0 st: 0 dts:1.440000 pts:1.440000 pos:46908 size:3663 flags:1
ret: 0 st:-1 ts:0.460008 flags:0 ret: 0 st:-1 ts:0.460008 flags:0
ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:18006 size:3229 flags:1 ret: 0 st: 0 dts:0.480000 pts:0.480000 pos:17990 size:3229 flags:1
ret:-1 st:-1 ts:-0.645825 flags:1 ret:-1 st:-1 ts:-0.645825 flags:1
---------------- ----------------
tests/data/a-snow53.avi tests/data/a-snow53.avi
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment