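Reorganize and simplify the VP3 IDCT code: replace the separate vp3_idct_put/vp3_idct_add hooks with a single vp3_idct that writes the transformed samples to a DCTELEM buffer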

Originally committed as revision 3071 to svn://svn.ffmpeg.org/ffmpeg/trunk

reorganize and simplify the VP3 IDCT stuff
Originally committed as revision 3071 to svn://svn.ffmpeg.org/ffmpeg/trunk
116824d0 · Mike Melanson · 4ea4b274
Commit 116824d0 authored Apr 26, 2004 by Mike Melanson
7 changed files
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -3126,8 +3126,7 @@ void dsputil_init(DSPContext* c, AVCodecContext *avctx)

    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
-    c->vp3_idct_put = vp3_idct_put_c;
-    c->vp3_idct_add = vp3_idct_add_c;
+    c->vp3_idct = vp3_idct_c;

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;

--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -62,23 +62,16 @@ extern uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];

 /* VP3 DSP functions */
 void vp3_dsp_init_c(void);
-void vp3_idct_put_c(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride);
-void vp3_idct_add_c(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride);
+void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, DCTELEM *output_data);

 void vp3_dsp_init_mmx(void);
-void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride);
-void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride);
+void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, DCTELEM *output_data);

 void vp3_dsp_init_sse2(void);
-void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride);
-void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride);
-
+void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, DCTELEM *output_data);

 /* minimum alignment rules ;)
 if u notice errors in the align stuff, need more alignment for some asm code for some cpu
@@ -318,32 +311,16 @@ typedef struct DSPContext {

    /** 
     * This function is responsible for taking a block of zigzag'd,
-     * quantized DCT coefficients, reconstructing the original block of
-     * samples, and placing it into the output.
-     * @param input_data 64 zigzag'd, quantized DCT coefficients
-     * @param dequant_matrix 64 zigzag'd quantizer coefficients
-     * @param coeff_count index of the last coefficient
-     * @param dest the final output location where the transformed samples
-     * are to be placed
-     * @param stride the width in 8-bit samples of a line on this plane
-     */
-    void (*vp3_idct_put)(int16_t *input_data, int16_t *dequant_matrix,
-        int coeff_count, uint8_t *dest, int stride);
-
-    /** 
-     * This function is responsible for taking a block of zigzag'd,
-     * quantized DCT coefficients, reconstructing the original block of
-     * samples, and adding the transformed samples to an existing block of
-     * samples in the output.
+     * quantized DCT coefficients and reconstructing the original block of
+     * samples.
     * @param input_data 64 zigzag'd, quantized DCT coefficients
     * @param dequant_matrix 64 zigzag'd quantizer coefficients
     * @param coeff_count index of the last coefficient
-     * @param dest the final output location where the transformed samples
-     * are to be placed
-     * @param stride the width in 8-bit samples of a line on this plane
+     * @param output_samples space for 64 DCTELEMs where the transformed
+     * samples will be stored
     */
-    void (*vp3_idct_add)(int16_t *input_data, int16_t *dequant_matrix,
-        int coeff_count, uint8_t *dest, int stride);
+    void (*vp3_idct)(int16_t *input_data, int16_t *dequant_matrix,
+        int coeff_count, DCTELEM *output_samples);

 } DSPContext;


--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -2149,14 +2149,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
        /* VP3 optimized DSP functions */
        if (mm_flags & MM_SSE2) {
            c->vp3_dsp_init = vp3_dsp_init_sse2;
-            c->vp3_idct_put = vp3_idct_put_sse2;
-            c->vp3_idct_add = vp3_idct_add_sse2;
+            c->vp3_idct = vp3_idct_sse2;
        } else {
            c->vp3_dsp_init = vp3_dsp_init_mmx;
-            c->vp3_idct_put = vp3_idct_put_mmx;
-            c->vp3_idct_add = vp3_idct_add_mmx;
+            c->vp3_idct = vp3_idct_mmx;
        }
-        
+
 #ifdef CONFIG_ENCODERS
        c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;

--- a/libavcodec/i386/vp3dsp_mmx.c
+++ b/libavcodec/i386/vp3dsp_mmx.c
@@ -279,8 +279,8 @@ void vp3_dsp_init_mmx(void)
    idct_constants[46] = idct_constants[47] = IdctAdjustBeforeShift;
 }

-static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
-     int16_t *output_data)
+void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, int16_t *output_data)
 {
    /* eax = quantized input
     * ebx = dequantizer matrix
@@ -563,79 +563,3 @@ static void vp3_idct_mmx(int16_t *input_data, int16_t *dequant_matrix,
 #undef J

 }
-
-void vp3_idct_put_mmx(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-    uint8_t vector128[8] = { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
-
-    vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    movq_m2r(*vector128, mm0);
-    for (i = 0; i < 8; i++) {
-#if 1
-        for (j = 0; j < 8; j++) {
-            if (*op < -128)
-                *dest = 0;
-            else if (*op > 127)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(*op + 128);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-#else
-/* prototype optimization */
-        pxor_r2r(mm1, mm1);
-        packsswb_m2r(*(op + 4), mm1);
-        movq_r2r(mm1, mm2);
-        psrlq_i2r(32, mm2);
-        packsswb_m2r(*(op + 0), mm1);
-        op += 8;
-        por_r2r(mm2, mm1);
-        paddb_r2r(mm0, mm1);
-        movq_r2m(mm1, *dest);
-        dest += stride;
-#endif
-    }
-
-    /* be a good MMX citizen */
-    emms();
-}
-
-void vp3_idct_add_mmx(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-    int16_t sample;
-
-    vp3_idct_mmx(input_data, dequant_matrix, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            sample = *dest + *op;
-            if (sample < 0)
-                *dest = 0;
-            else if (sample > 255)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(sample & 0xFF);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-    }
-
-    /* be a good MMX citizen */
-    emms();
-}
--- a/libavcodec/i386/vp3dsp_sse2.c
+++ b/libavcodec/i386/vp3dsp_sse2.c
@@ -799,11 +799,12 @@ static unsigned short __align16 SSE2_idct_data[7 * 8] =
 void vp3_dsp_init_sse2(void)
 {
    /* nop */
+av_log(NULL, AV_LOG_INFO, "Hey! SSE2!\n");
 }


-static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
-     int16_t *output_data)
+void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, int16_t *output_data)
 {
    unsigned char *input_bytes = (unsigned char *)input_data;
    unsigned char *dequant_matrix_bytes = (unsigned char *)dequant_matrix;
@@ -832,59 +833,3 @@ static void vp3_idct_sse2(int16_t *input_data, int16_t *dequant_matrix,
        
    SSE2_Column_IDCT();
 }
-
-
-void vp3_idct_put_sse2(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-
-    vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            if (*op < -128)
-                *dest = 0;
-            else if (*op > 127)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(*op + 128);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-    }
-}
-
-
-void vp3_idct_add_sse2(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-    int16_t sample;
-
-    vp3_idct_sse2(input_data, dequant_matrix, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            sample = *dest + *op;
-            if (sample < 0)
-                *dest = 0;
-            else if (sample > 255)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(sample & 0xFF);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-    }
-}
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -2051,6 +2051,7 @@ static void render_fragments(Vp3DecodeContext *s,
    int m, n;
    int i = first_fragment;
    int16_t *dequantizer;
+    DCTELEM __align16 output_samples[64];
    unsigned char *output_plane;
    unsigned char *last_plane;
    unsigned char *golden_plane;
@@ -2060,6 +2061,10 @@ static void render_fragments(Vp3DecodeContext *s,
    int motion_halfpel_index;
    uint8_t *motion_source;

+    int16_t *op;
+    uint8_t *dest;
+    int j, k;
+
    debug_vp3("  vp3: rendering final fragments for %s\n",
        (plane == 0) ? "Y plane" : (plane == 1) ? "U plane" : "V plane");

@@ -2176,16 +2181,29 @@ av_log(s->avctx, AV_LOG_ERROR, " help! got beefy vector! (%X, %X)\n", motion_x,
                    s->all_fragments[i].coeffs[0], dequantizer[0]);

                /* invert DCT and place (or add) in final output */
+                s->dsp.vp3_idct(s->all_fragments[i].coeffs,
+                    dequantizer,
+                    s->all_fragments[i].coeff_count,
+                    output_samples);
                if (s->all_fragments[i].coding_method == MODE_INTRA) {
-                    s->dsp.vp3_idct_put(s->all_fragments[i].coeffs, 
-                        dequantizer,
-                        s->all_fragments[i].coeff_count,
-                        output_plane + s->all_fragments[i].first_pixel,
-                        stride);
+                    /* this really needs to be optimized sooner or later */
+                    op = output_samples;
+                    dest = output_plane + s->all_fragments[i].first_pixel;
+                    for (j = 0; j < 8; j++) {
+                        for (k = 0; k < 8; k++) {
+                            if (*op < -128)
+                                *dest = 0;
+                            else if (*op > 127)
+                                *dest = 255;
+                            else
+                                *dest = (uint8_t)(*op + 128);
+                            op++;
+                            dest++;
+                        }
+                        dest += (stride - 8);
+                    }
                } else {
-                    s->dsp.vp3_idct_add(s->all_fragments[i].coeffs, 
-                        dequantizer,
-                        s->all_fragments[i].coeff_count,
+                    s->dsp.add_pixels_clamped(output_samples,
                        output_plane + s->all_fragments[i].first_pixel,
                        stride);
                }

--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -40,8 +40,10 @@ void vp3_dsp_init_c(void)
    /* nop */
 }

-static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data)
+void vp3_idct_c(int16_t *input_data, int16_t *dequant_matrix,
+    int coeff_count, int16_t *output_data)
 {
+    int32_t dequantized_data[64];
    int32_t *ip = dequantized_data;
    int16_t *op = output_data;

@@ -49,7 +51,13 @@ static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data)
    int32_t _Ed, _Gd, _Add, _Bdd, _Fd, _Hd;
    int32_t t1, t2;

-    int i;
+    int i, j;
+
+    /* de-zigzag and dequantize */
+    for (i = 0; i < coeff_count; i++) {
+        j = dezigzag_index[i];
+        dequantized_data[j] = dequant_matrix[i] * input_data[i];
+    }

    /* Inverse DCT on the rows now */
    for (i = 0; i < 8; i++) {
@@ -248,71 +256,3 @@ static void vp3_idct_c(int32_t *dequantized_data, int16_t *output_data)
        op++;
    }
 }
-
-void vp3_idct_put_c(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int32_t dequantized_data[64];
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-
-    /* de-zigzag and dequantize */
-    for (i = 0; i < coeff_count; i++) {
-        j = dezigzag_index[i];
-        dequantized_data[j] = dequant_matrix[i] * input_data[i];
-    }
-
-    vp3_idct_c(dequantized_data, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            if (*op < -128)
-                *dest = 0;
-            else if (*op > 127)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(*op + 128);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-    }
-}
-
-void vp3_idct_add_c(int16_t *input_data, int16_t *dequant_matrix,
-    int coeff_count, uint8_t *dest, int stride)
-{
-    int32_t dequantized_data[64];
-    int16_t transformed_data[64];
-    int16_t *op;
-    int i, j;
-    int16_t sample;
-
-    /* de-zigzag and dequantize */
-    for (i = 0; i < coeff_count; i++) {
-        j = dezigzag_index[i];
-        dequantized_data[j] = dequant_matrix[i] * input_data[i];
-    }
-
-    vp3_idct_c(dequantized_data, transformed_data);
-
-    /* place in final output */
-    op = transformed_data;
-    for (i = 0; i < 8; i++) {
-        for (j = 0; j < 8; j++) {
-            sample = *dest + *op;
-            if (sample < 0)
-                *dest = 0;
-            else if (sample > 255)
-                *dest = 255;
-            else
-                *dest = (uint8_t)(sample & 0xFF);
-            op++;
-            dest++;
-        }
-        dest += (stride - 8);
-    }
-}