/*
 * MPEG video MMX templates
 *
 * Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/*
 * Inline-asm building blocks used by dct_quantize below. Each macro expands
 * to a string literal that is pasted into an asm() statement; the arguments
 * are register names written in asm-operand form (e.g. %%mm3).
 *
 * All three macros are #undef'd first so that a second definition of this
 * template does not collide with an earlier one.
 *
 *   SPREADW(a)  - replicate the lowest 16-bit word of a into all four words.
 *   PMAXW(a,b)  - per-word maximum, result in b
 *                 (MMX2: signed pmaxsw; plain MMX: unsigned, built from a
 *                 saturating subtract followed by an add).
 *   PMAX(a,b)   - horizontal maximum of the four words of a, result in the
 *                 lowest word of a; b is used as scratch.
 */
#undef SPREADW
#undef PMAXW
#undef PMAX
#ifdef HAVE_MMX2
#define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t"
#define PMAXW(a,b) "pmaxsw " #a ", " #b "     \n\t"
#define PMAX(a,b) \
            "pshufw $0x0E," #a ", " #b "        \n\t"\
            PMAXW(b, a)\
            "pshufw $0x01," #a ", " #b "        \n\t"\
            PMAXW(b, a)
#else
#define SPREADW(a) \
        "punpcklwd " #a ", " #a " \n\t"\
        "punpcklwd " #a ", " #a " \n\t"
#define PMAXW(a,b) \
        "psubusw " #a ", " #b " \n\t"\
        "paddw " #a ", " #b "   \n\t"
#define PMAX(a,b)  \
            "movq " #a ", " #b "                \n\t"\
            "psrlq $32, " #a "                  \n\t"\
            PMAXW(b, a)\
            "movq " #a ", " #b "                \n\t"\
            "psrlq $16, " #a "                  \n\t"\
            PMAXW(b, a)
#endif

/**
 * Forward-transform and quantize one 8x8 block with MMX/MMX2 code.
 *
 * Steps, in order:
 *  1. run the forward DCT on block (RENAMEl(ff_fdct)) and, when
 *     s->dct_error_sum is set, the denoiser s->denoise_dct;
 *  2. for intra blocks, quantize the DC coefficient separately and clear
 *     block[0] so it does not take part in the overflow check;
 *  3. quantize all 64 coefficients into temp_block while zeroing block,
 *     tracking the largest inv_zigzag_direct16 entry that belongs to a
 *     non-zero output;
 *  4. copy the non-zero part of temp_block back into block, applying the
 *     coefficient permutation selected by s->dsp.idct_permutation_type.
 *
 * @param s        encoder context; supplies DC scales, quantizer matrices,
 *                 max_qcoeff, output format and IDCT permutation type
 * @param block    64 coefficients, transformed and quantized in place
 * @param n        block number; values below 4 select s->y_dc_scale,
 *                 others s->c_dc_scale (intra only)
 * @param qscale   index into s->q_intra_matrix16 / s->q_inter_matrix16
 * @param overflow receives a non-zero value when the OR-accumulated absolute
 *                 levels exceed s->max_qcoeff (the OR makes this check
 *                 conservative), zero otherwise
 * @return last_non_zero_p1 - 1; this is -1 for an inter block whose
 *         coefficients all quantize to zero.
 *         NOTE(review): presumably the scan index of the last non-zero
 *         coefficient; that depends on the contents of inv_zigzag_direct16,
 *         which is defined outside this block.
 */
static int RENAME(dct_quantize)(MpegEncContext *s,
                            DCTELEM *block, int n,
                            int qscale, int *overflow)
{
    long last_non_zero_p1;
    int level=0, q; // level is only set on the intra path; =0 silences gcc
    const uint16_t *qmat, *bias;
    DECLARE_ALIGNED_8(int16_t, temp_block[64]);

    // The movq stores below want an 8-byte aligned buffer; verify that the
    // compiler honoured the alignment request. uintptr_t avoids truncating
    // the pointer on 64-bit targets.
    assert((7&(uintptr_t)(&temp_block[0])) == 0);

    RENAMEl(ff_fdct) (block); // always the fdct matching this template

    if(s->dct_error_sum)
        s->denoise_dct(s, block);

    if (s->mb_intra) {
        int dummy;
        if (n < 4)
            q = s->y_dc_scale;
        else
            q = s->c_dc_scale;
        /* note: block[0] is assumed to be positive */
        if (!s->h263_aic) {
            // level = high 32 bits of ((block[0]>>2) + q) * inverse[q<<1].
            // This replaces a 16-bit division of the same numerator by
            // (q<<1). NOTE(review): inverse[] is defined elsewhere and is
            // presumably a table of fixed-point reciprocals - confirm.
            asm volatile (
                "mul %%ecx                \n\t"
                : "=d" (level), "=a"(dummy)
                : "a" ((block[0]>>2) + q), "c" (inverse[q<<1])
            );
        } else
            /* For AIC we skip quant/dequant of INTRADC */
            level = (block[0] + 4)>>3;

        block[0]=0; // avoid fake overflow
        last_non_zero_p1 = 1;
        bias = s->q_intra_matrix16[qscale][1];
        qmat = s->q_intra_matrix16[qscale][0];
    } else {
        last_non_zero_p1 = 0;
        bias = s->q_inter_matrix16[qscale][1];
        qmat = s->q_inter_matrix16[qscale][0];
    }

    /*
     * Both quantizer loops below share the same layout:
     *  - every array operand is passed as base+64 elements (128 bytes) and
     *    REG_a counts from -128 up to 0 in steps of 8 bytes, i.e. four
     *    16-bit coefficients per iteration, 16 iterations in total;
     *  - mm3 holds the running maximum of the inv_zigzag_direct16 entries of
     *    non-zero outputs, mm4 the OR of all absolute quantized levels,
     *    mm7 is zero;
     *  - the loop stores into temp_block and block through plain register
     *    operands, hence the "memory" clobber.
     * The overflow check lives in a second asm statement that relies on mm4
     * still holding the value left by the first one; no MMX register is
     * declared as clobbered, so nothing may be inserted between the two.
     */
    if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
        // H.263/H.261 without mpeg_quant: one multiplier and one bias, loaded
        // once from the start of qmat/bias, are used for every coefficient.
        asm volatile(
            "movd %%"REG_a", %%mm3              \n\t" // last_non_zero_p1
            SPREADW(%%mm3)
            "pxor %%mm7, %%mm7                  \n\t" // 0
            "pxor %%mm4, %%mm4                  \n\t" // 0
            "movq (%2), %%mm5                   \n\t" // qmat[0]
            "pxor %%mm6, %%mm6                  \n\t"
            "psubw (%3), %%mm6                  \n\t" // -bias[0]
            "mov $-128, %%"REG_a"               \n\t"
            ".balign 16                         \n\t"
            "1:                                 \n\t"
            "pxor %%mm1, %%mm1                  \n\t" // 0
            "movq (%1, %%"REG_a"), %%mm0        \n\t" // block[i]
            "pcmpgtw %%mm0, %%mm1               \n\t" // block[i] < 0 ? 0xFFFF : 0x0000
            "pxor %%mm1, %%mm0                  \n\t"
            "psubw %%mm1, %%mm0                 \n\t" // ABS(block[i])
            "psubusw %%mm6, %%mm0               \n\t" // ABS(block[i]) + bias[0], saturated at 0
            "pmulhw %%mm5, %%mm0                \n\t" // ((ABS(block[i]) + bias[0])*qmat[0])>>16
            "por %%mm0, %%mm4                   \n\t" // accumulate for the overflow check
            "pxor %%mm1, %%mm0                  \n\t"
            "psubw %%mm1, %%mm0                 \n\t" // out = level with the sign of block[i] restored
            "movq %%mm0, (%5, %%"REG_a")        \n\t" // temp_block[i] = out
            "pcmpeqw %%mm7, %%mm0               \n\t" // out==0 ? 0xFFFF : 0x0000
            "movq (%4, %%"REG_a"), %%mm1        \n\t" // inv_zigzag_direct16[i]
            "movq %%mm7, (%1, %%"REG_a")        \n\t" // block[i] = 0
            "pandn %%mm1, %%mm0                 \n\t" // keep the entry only where out != 0
            PMAXW(%%mm0, %%mm3)
            "add $8, %%"REG_a"                  \n\t"
            " js 1b                             \n\t"
            PMAX(%%mm3, %%mm0)
            "movd %%mm3, %%"REG_a"              \n\t"
            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
            : "+a" (last_non_zero_p1)
            : "r" (block+64), "r" (qmat), "r" (bias),
              "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
            : "memory"
        );
        // note the asm is split cuz gcc doesnt like that many operands ...
        asm volatile(
            "movd %1, %%mm1                     \n\t" // max_qcoeff
            SPREADW(%%mm1)
            "psubusw %%mm1, %%mm4               \n\t" // non-zero only where the level exceeds max_qcoeff
            "packuswb %%mm4, %%mm4              \n\t"
            "movd %%mm4, %0                     \n\t" // *overflow
        : "=g" (*overflow)
        : "g" (s->max_qcoeff)
        );
    }else{ // every other format, or mpeg_quant set: per-coefficient qmat[i] and bias[i]
        asm volatile(
            "movd %%"REG_a", %%mm3              \n\t" // last_non_zero_p1
            SPREADW(%%mm3)
            "pxor %%mm7, %%mm7                  \n\t" // 0
            "pxor %%mm4, %%mm4                  \n\t" // 0
            "mov $-128, %%"REG_a"               \n\t"
            ".balign 16                         \n\t"
            "1:                                 \n\t"
            "pxor %%mm1, %%mm1                  \n\t" // 0
            "movq (%1, %%"REG_a"), %%mm0        \n\t" // block[i]
            "pcmpgtw %%mm0, %%mm1               \n\t" // block[i] < 0 ? 0xFFFF : 0x0000
            "pxor %%mm1, %%mm0                  \n\t"
            "psubw %%mm1, %%mm0                 \n\t" // ABS(block[i])
            "movq (%3, %%"REG_a"), %%mm6        \n\t" // bias[i]
            "paddusw %%mm6, %%mm0               \n\t" // ABS(block[i]) + bias[i], unsigned saturation
            "movq (%2, %%"REG_a"), %%mm5        \n\t" // qmat[i]
            "pmulhw %%mm5, %%mm0                \n\t" // ((ABS(block[i]) + bias[i])*qmat[i])>>16
            "por %%mm0, %%mm4                   \n\t" // accumulate for the overflow check
            "pxor %%mm1, %%mm0                  \n\t"
            "psubw %%mm1, %%mm0                 \n\t" // out = level with the sign of block[i] restored
            "movq %%mm0, (%5, %%"REG_a")        \n\t" // temp_block[i] = out
            "pcmpeqw %%mm7, %%mm0               \n\t" // out==0 ? 0xFFFF : 0x0000
            "movq (%4, %%"REG_a"), %%mm1        \n\t" // inv_zigzag_direct16[i]
            "movq %%mm7, (%1, %%"REG_a")        \n\t" // block[i] = 0
            "pandn %%mm1, %%mm0                 \n\t" // keep the entry only where out != 0
            PMAXW(%%mm0, %%mm3)
            "add $8, %%"REG_a"                  \n\t"
            " js 1b                             \n\t"
            PMAX(%%mm3, %%mm0)
            "movd %%mm3, %%"REG_a"              \n\t"
            "movzb %%al, %%"REG_a"              \n\t" // last_non_zero_p1
            : "+a" (last_non_zero_p1)
            : "r" (block+64), "r" (qmat+64), "r" (bias+64),
              "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
            : "memory"
        );
        // note the asm is split cuz gcc doesnt like that many operands ...
        asm volatile(
            "movd %1, %%mm1                     \n\t" // max_qcoeff
            SPREADW(%%mm1)
            "psubusw %%mm1, %%mm4               \n\t" // non-zero only where the level exceeds max_qcoeff
            "packuswb %%mm4, %%mm4              \n\t"
            "movd %%mm4, %0                     \n\t" // *overflow
        : "=g" (*overflow)
        : "g" (s->max_qcoeff)
        );
    }

    // The loop zeroed all of block; restore the DC coefficient first.
    if(s->mb_intra) block[0]= level;
    else            block[0]= temp_block[0];

    /*
     * Copy the quantized coefficients back from temp_block, permuted for the
     * IDCT in use. This is an unrolled version of
     *
     *     for(i=0; i<last_non_zero_p1; i++)
     *     {
     *        int j= zigzag_direct_noperm[i];
     *        block[block_permute_op(j)]= temp_block[j];
     *     }
     *
     * Coefficients are copied in groups, and copying stops at the first
     * group boundary that is >= last_non_zero_p1. Everything not copied is
     * already zero in block.
     */
    if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
        if(last_non_zero_p1 <= 1) goto end;
        block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
        block[0x20] = temp_block[0x10];
        if(last_non_zero_p1 <= 4) goto end;
        block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
        block[0x09] = temp_block[0x03];
        if(last_non_zero_p1 <= 7) goto end;
        block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
        block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
        if(last_non_zero_p1 <= 11) goto end;
        block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
        block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
        block[0x0C] = temp_block[0x05];
        if(last_non_zero_p1 <= 16) goto end;
        block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
        block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
        block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
        block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
        if(last_non_zero_p1 <= 24) goto end;
        block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
        block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
        block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
        block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
        if(last_non_zero_p1 <= 32) goto end;
        block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
        block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
        block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
        block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
        if(last_non_zero_p1 <= 40) goto end;
        block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
        block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
        block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
        block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
        if(last_non_zero_p1 <= 48) goto end;
        block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
        block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
        block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
        block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
        if(last_non_zero_p1 <= 56) goto end;
        block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
        block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
        block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
        block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
    }else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
        if(last_non_zero_p1 <= 1) goto end;
        block[0x04] = temp_block[0x01];
        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
        if(last_non_zero_p1 <= 4) goto end;
        block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
        block[0x05] = temp_block[0x03];
        if(last_non_zero_p1 <= 7) goto end;
        block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
        if(last_non_zero_p1 <= 11) goto end;
        block[0x1C] = temp_block[0x19];
        block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
        block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
        if(last_non_zero_p1 <= 16) goto end;
        block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
        block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
        block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
        if(last_non_zero_p1 <= 24) goto end;
        block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
        block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
        block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
        block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
        if(last_non_zero_p1 <= 32) goto end;
        block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
        block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
        block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
        block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
        if(last_non_zero_p1 <= 40) goto end;
        block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
        block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
        block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
        if(last_non_zero_p1 <= 48) goto end;
        block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
        block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
        block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
        block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
        if(last_non_zero_p1 <= 56) goto end;
        block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
        block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
        block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
    }else{
        // Any other permutation type: coefficients keep their positions.
        if(last_non_zero_p1 <= 1) goto end;
        block[0x01] = temp_block[0x01];
        block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
        if(last_non_zero_p1 <= 4) goto end;
        block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
        block[0x03] = temp_block[0x03];
        if(last_non_zero_p1 <= 7) goto end;
        block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
        block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
        if(last_non_zero_p1 <= 11) goto end;
        block[0x19] = temp_block[0x19];
        block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
        block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
        if(last_non_zero_p1 <= 16) goto end;
        block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
        block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
        block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
        block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
        if(last_non_zero_p1 <= 24) goto end;
        block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
        block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
        block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
        block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
        if(last_non_zero_p1 <= 32) goto end;
        block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
        block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
        block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
        block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
        if(last_non_zero_p1 <= 40) goto end;
        block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
        block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
        block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
        block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
        if(last_non_zero_p1 <= 48) goto end;
        block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
        block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
        block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
        block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
        if(last_non_zero_p1 <= 56) goto end;
        block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
        block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
        block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
        block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
    }
    end:

    return last_non_zero_p1 - 1;
}