Commit 2494bdd9 authored by Loren Merritt's avatar Loren Merritt

gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5...

gcc 2.95 and 3.4.x on x86 32bit without fomit-frame-pointer can't even find 5 registers for asm input.
0.5% slower vorbis.

Originally committed as revision 5964 to svn://svn.ffmpeg.org/ffmpeg/trunk
parent 83318919
...@@ -154,20 +154,23 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, ...@@ -154,20 +154,23 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
in1 = input; in1 = input;
in2 = input + n2 - 1; in2 = input + n2 - 1;
for(k = 0; k < n4; k++) { for(k = 0; k < n4; k++) {
// FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
asm volatile( asm volatile(
"movd %1, %%mm0 \n\t" "movd %0, %%mm0 \n\t"
"movd %3, %%mm1 \n\t" "movd %2, %%mm1 \n\t"
"punpckldq %2, %%mm0 \n\t" "punpckldq %1, %%mm0 \n\t"
"punpckldq %4, %%mm1 \n\t" "punpckldq %3, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t" "movq %%mm0, %%mm2 \n\t"
"pfmul %%mm1, %%mm0 \n\t" "pfmul %%mm1, %%mm0 \n\t"
"pswapd %%mm1, %%mm1 \n\t" "pswapd %%mm1, %%mm1 \n\t"
"pfmul %%mm1, %%mm2 \n\t" "pfmul %%mm1, %%mm2 \n\t"
"pfpnacc %%mm2, %%mm0 \n\t" "pfpnacc %%mm2, %%mm0 \n\t"
::"m"(in2[-2*k]), "m"(in1[2*k]),
"m"(tcos[k]), "m"(tsin[k])
);
asm volatile(
"movq %%mm0, %0 \n\t" "movq %%mm0, %0 \n\t"
:"=m"(z[revtab[k]]) :"=m"(z[revtab[k]])
:"m"(in2[-2*k]), "m"(in1[2*k]),
"m"(tcos[k]), "m"(tsin[k])
); );
} }
...@@ -190,11 +193,15 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, ...@@ -190,11 +193,15 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
); );
} }
z += n8;
asm volatile("movd %0, %%mm7" ::"r"(1<<31)); asm volatile("movd %0, %%mm7" ::"r"(1<<31));
for(k = 0; k < n8; k++) { for(k = 0; k < n8; k++) {
asm volatile( asm volatile(
"movq %4, %%mm0 \n\t" "movq %0, %%mm0 \n\t"
"pswapd %5, %%mm1 \n\t" "pswapd %1, %%mm1 \n\t"
::"m"(z[k]), "m"(z[-1-k])
);
asm volatile(
"movq %%mm0, %%mm2 \n\t" "movq %%mm0, %%mm2 \n\t"
"pxor %%mm7, %%mm2 \n\t" "pxor %%mm7, %%mm2 \n\t"
"punpckldq %%mm1, %%mm2 \n\t" "punpckldq %%mm1, %%mm2 \n\t"
...@@ -209,8 +216,7 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, ...@@ -209,8 +216,7 @@ void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output,
"movq %%mm3, %3 \n\t" // { z[n8-1-k].im, -z[n8+k].re } "movq %%mm3, %3 \n\t" // { z[n8-1-k].im, -z[n8+k].re }
:"=m"(output[2*k]), "=m"(output[n2-2-2*k]), :"=m"(output[2*k]), "=m"(output[n2-2-2*k]),
"=m"(output[n2+2*k]), "=m"(output[n-2-2*k]) "=m"(output[n2+2*k]), "=m"(output[n-2-2*k])
:"m"(z[n8+k]), "m"(z[n8-1-k]) ::"memory"
:"memory"
); );
} }
asm volatile("emms"); asm volatile("emms");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment