dnxhdenc.c 35 KB
Newer Older
1 2 3
/*
 * VC3/DNxHD encoder
 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
4
 * Copyright (c) 2011 MirriAd Ltd
5 6
 *
 * VC-3 encoder funded by the British Broadcasting Corporation
7
 * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG
#define RC_VARIANCE 1 // use variance or ssd for fast rc

29
#include "libavutil/opt.h"
30 31 32
#include "avcodec.h"
#include "dsputil.h"
#include "mpegvideo.h"
33
#include "mpegvideo_common.h"
34
#include "dnxhdenc.h"
35

36
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
37
#define DNX10BIT_QMAT_SHIFT 18 // The largest value that will not lead to overflow for 10bit samples.
38 39

static const AVOption options[]={
40
    {"nitris_compat", "encode with Avid Nitris compatibility", offsetof(DNXHDEncContext, nitris_compat), AV_OPT_TYPE_INT, {.dbl = 0}, 0, 1, VE},
41 42 43 44
{NULL}
};
static const AVClass class = { "dnxhd", av_default_item_name, options, LIBAVUTIL_VERSION_INT };

45 46
#define LAMBDA_FRAC_BITS 10

47
static void dnxhd_8bit_get_pixels_8x4_sym(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
48 49 50 51 52 53 54 55 56 57
{
    int i;
    for (i = 0; i < 4; i++) {
        block[0] = pixels[0]; block[1] = pixels[1];
        block[2] = pixels[2]; block[3] = pixels[3];
        block[4] = pixels[4]; block[5] = pixels[5];
        block[6] = pixels[6]; block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
58 59 60 61
    memcpy(block,      block -  8, sizeof(*block) * 8);
    memcpy(block +  8, block - 16, sizeof(*block) * 8);
    memcpy(block + 16, block - 24, sizeof(*block) * 8);
    memcpy(block + 24, block - 32, sizeof(*block) * 8);
62 63
}

64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
static av_always_inline void dnxhd_10bit_get_pixels_8x4_sym(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    block += 32;

    for (i = 0; i < 4; i++) {
        memcpy(block + i     * 8, pixels + i * line_size, 8 * sizeof(*block));
        memcpy(block - (i+1) * 8, pixels + i * line_size, 8 * sizeof(*block));
    }
}

static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, DCTELEM *block,
                                    int n, int qscale, int *overflow)
{
    const uint8_t *scantable= ctx->intra_scantable.scantable;
80
    const int *qmat = n<4 ? ctx->q_intra_matrix[qscale] : ctx->q_chroma_intra_matrix[qscale];
81
    int last_non_zero = 0;
82
    int i;
83 84 85 86 87 88

    ctx->dsp.fdct(block);

    // Divide by 4 with rounding, to compensate scaling of DCT coefficients
    block[0] = (block[0] + 2) >> 2;

89
    for (i = 1; i < 64; ++i) {
90 91 92 93 94 95 96 97 98 99 100 101
        int j = scantable[i];
        int sign = block[j] >> 31;
        int level = (block[j] ^ sign) - sign;
        level = level * qmat[j] >> DNX10BIT_QMAT_SHIFT;
        block[j] = (level ^ sign) - sign;
        if (level)
            last_non_zero = i;
    }

    return last_non_zero;
}

102 103
static int dnxhd_init_vlc(DNXHDEncContext *ctx)
{
104 105 106
    int i, j, level, run;
    int max_level = 1<<(ctx->cid_table->bit_depth+2);

107
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_codes, max_level*4*sizeof(*ctx->vlc_codes), fail);
108 109 110
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_bits,  max_level*4*sizeof(*ctx->vlc_bits) , fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_codes, 63*2,                                fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_bits,  63,                                  fail);
111

112 113
    ctx->vlc_codes += max_level*2;
    ctx->vlc_bits  += max_level*2;
114 115 116 117 118 119 120 121 122 123 124
    for (level = -max_level; level < max_level; level++) {
        for (run = 0; run < 2; run++) {
            int index = (level<<1)|run;
            int sign, offset = 0, alevel = level;

            MASK_ABS(sign, alevel);
            if (alevel > 64) {
                offset = (alevel-1)>>6;
                alevel -= offset<<6;
            }
            for (j = 0; j < 257; j++) {
125
                if (ctx->cid_table->ac_level[j] >> 1 == alevel &&
126 127
                    (!offset || (ctx->cid_table->ac_flags[j] & 1) && offset) &&
                    (!run    || (ctx->cid_table->ac_flags[j] & 2) && run)) {
128
                    assert(!ctx->vlc_codes[index]);
129
                    if (alevel) {
130 131
                        ctx->vlc_codes[index] = (ctx->cid_table->ac_codes[j]<<1)|(sign&1);
                        ctx->vlc_bits [index] = ctx->cid_table->ac_bits[j]+1;
132
                    } else {
133 134
                        ctx->vlc_codes[index] = ctx->cid_table->ac_codes[j];
                        ctx->vlc_bits [index] = ctx->cid_table->ac_bits [j];
135 136 137 138 139 140
                    }
                    break;
                }
            }
            assert(!alevel || j < 257);
            if (offset) {
141 142
                ctx->vlc_codes[index] = (ctx->vlc_codes[index]<<ctx->cid_table->index_bits)|offset;
                ctx->vlc_bits [index]+= ctx->cid_table->index_bits;
143 144
            }
        }
145 146 147 148
    }
    for (i = 0; i < 62; i++) {
        int run = ctx->cid_table->run[i];
        assert(run < 63);
149 150
        ctx->run_codes[run] = ctx->cid_table->run_codes[i];
        ctx->run_bits [run] = ctx->cid_table->run_bits[i];
151 152 153 154 155 156 157 158 159 160 161
    }
    return 0;
 fail:
    return -1;
}

static int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
{
    // init first elem to 1 to avoid div by 0 in convert_matrix
    uint16_t weight_matrix[64] = {1,}; // convert_matrix needs uint16_t*
    int qscale, i;
162 163
    const uint8_t *luma_weight_table   = ctx->cid_table->luma_weight;
    const uint8_t *chroma_weight_table = ctx->cid_table->chroma_weight;
164

165 166
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int),      fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int),      fail);
167 168
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
169

170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207
    if (ctx->cid_table->bit_depth == 8) {
        for (i = 1; i < 64; i++) {
            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
            weight_matrix[j] = ctx->cid_table->luma_weight[i];
        }
        ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix,
                          ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
        for (i = 1; i < 64; i++) {
            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
            weight_matrix[j] = ctx->cid_table->chroma_weight[i];
        }
        ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_c, ctx->qmatrix_c16, weight_matrix,
                          ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);

        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
            for (i = 0; i < 64; i++) {
                ctx->qmatrix_l  [qscale]   [i] <<= 2; ctx->qmatrix_c  [qscale]   [i] <<= 2;
                ctx->qmatrix_l16[qscale][0][i] <<= 2; ctx->qmatrix_l16[qscale][1][i] <<= 2;
                ctx->qmatrix_c16[qscale][0][i] <<= 2; ctx->qmatrix_c16[qscale][1][i] <<= 2;
            }
        }
    } else {
        // 10-bit
        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
            for (i = 1; i < 64; i++) {
                int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];

                // The quantization formula from the VC-3 standard is:
                // quantized = sign(block[i]) * floor(abs(block[i]/s) * p / (qscale * weight_table[i]))
                // Where p is 32 for 8-bit samples and 8 for 10-bit ones.
                // The s factor compensates scaling of DCT coefficients done by the DCT routines,
                // and therefore is not present in standard.  It's 8 for 8-bit samples and 4 for 10-bit ones.
                // We want values of ctx->qtmatrix_l and ctx->qtmatrix_r to be:
                // ((1 << DNX10BIT_QMAT_SHIFT) * (p / s)) / (qscale * weight_table[i])
                // For 10-bit samples, p / s == 2
                ctx->qmatrix_l[qscale][j] = (1 << (DNX10BIT_QMAT_SHIFT + 1)) / (qscale * luma_weight_table[i]);
                ctx->qmatrix_c[qscale][j] = (1 << (DNX10BIT_QMAT_SHIFT + 1)) / (qscale * chroma_weight_table[i]);
            }
208 209
        }
    }
210

211 212 213 214 215
    ctx->m.q_chroma_intra_matrix16 = ctx->qmatrix_c16;
    ctx->m.q_chroma_intra_matrix   = ctx->qmatrix_c;
    ctx->m.q_intra_matrix16        = ctx->qmatrix_l16;
    ctx->m.q_intra_matrix          = ctx->qmatrix_l;

216 217 218 219 220 221 222
    return 0;
 fail:
    return -1;
}

static int dnxhd_init_rc(DNXHDEncContext *ctx)
{
223
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_rc, 8160*ctx->m.avctx->qmax*sizeof(RCEntry), fail);
224
    if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD)
225
        FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_cmp, ctx->m.mb_num*sizeof(RCCMPEntry), fail);
226

227
    ctx->frame_bits = (ctx->cid_table->coding_unit_size - 640 - 4 - ctx->min_padding) * 8;
228 229 230 231 232 233 234 235 236 237
    ctx->qscale = 1;
    ctx->lambda = 2<<LAMBDA_FRAC_BITS; // qscale 2
    return 0;
 fail:
    return -1;
}

static int dnxhd_encode_init(AVCodecContext *avctx)
{
    DNXHDEncContext *ctx = avctx->priv_data;
238 239 240 241 242 243 244 245 246 247 248 249 250
    int i, index, bit_depth;

    switch (avctx->pix_fmt) {
    case PIX_FMT_YUV422P:
        bit_depth = 8;
        break;
    case PIX_FMT_YUV422P10:
        bit_depth = 10;
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "pixel format is incompatible with DNxHD\n");
        return -1;
    }
251

252 253
    ctx->cid = ff_dnxhd_find_cid(avctx, bit_depth);
    if (!ctx->cid) {
254 255 256
        av_log(avctx, AV_LOG_ERROR, "video parameters incompatible with DNxHD\n");
        return -1;
    }
257
    av_log(avctx, AV_LOG_DEBUG, "cid %d\n", ctx->cid);
258 259 260 261 262 263 264 265

    index = ff_dnxhd_get_cid_table(ctx->cid);
    ctx->cid_table = &ff_dnxhd_cid_table[index];

    ctx->m.avctx = avctx;
    ctx->m.mb_intra = 1;
    ctx->m.h263_aic = 1;

266
    avctx->bits_per_raw_sample = ctx->cid_table->bit_depth;
267

268 269
    dsputil_init(&ctx->m.dsp, avctx);
    ff_dct_common_init(&ctx->m);
270 271 272 273 274 275 276 277 278 279 280 281
    if (!ctx->m.dct_quantize)
        ctx->m.dct_quantize = dct_quantize_c;

    if (ctx->cid_table->bit_depth == 10) {
       ctx->m.dct_quantize = dnxhd_10bit_dct_quantize;
       ctx->get_pixels_8x4_sym = dnxhd_10bit_get_pixels_8x4_sym;
       ctx->block_width_l2 = 4;
    } else {
       ctx->get_pixels_8x4_sym = dnxhd_8bit_get_pixels_8x4_sym;
       ctx->block_width_l2 = 3;
    }

282
#if HAVE_MMX
283 284
    ff_dnxhd_init_mmx(ctx);
#endif
285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300

    ctx->m.mb_height = (avctx->height + 15) / 16;
    ctx->m.mb_width  = (avctx->width  + 15) / 16;

    if (avctx->flags & CODEC_FLAG_INTERLACED_DCT) {
        ctx->interlaced = 1;
        ctx->m.mb_height /= 2;
    }

    ctx->m.mb_num = ctx->m.mb_height * ctx->m.mb_width;

    if (avctx->intra_quant_bias != FF_DEFAULT_QUANT_BIAS)
        ctx->m.intra_quant_bias = avctx->intra_quant_bias;
    if (dnxhd_init_qmat(ctx, ctx->m.intra_quant_bias, 0) < 0) // XXX tune lbias/cbias
        return -1;

301 302 303 304
    // Avid Nitris hardware decoder requires a minimum amount of padding in the coding unit payload
    if (ctx->nitris_compat)
        ctx->min_padding = 1600;

305 306 307 308 309
    if (dnxhd_init_vlc(ctx) < 0)
        return -1;
    if (dnxhd_init_rc(ctx) < 0)
        return -1;

310
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail);
311
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_offs, ctx->m.mb_height*sizeof(uint32_t), fail);
312
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits,    ctx->m.mb_num   *sizeof(uint16_t), fail);
313
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale,  ctx->m.mb_num   *sizeof(uint8_t),  fail);
314 315

    ctx->frame.key_frame = 1;
316
    ctx->frame.pict_type = AV_PICTURE_TYPE_I;
317 318
    ctx->m.avctx->coded_frame = &ctx->frame;

319
    if (avctx->thread_count > MAX_THREADS) {
320 321 322 323 324 325 326 327 328 329 330
        av_log(avctx, AV_LOG_ERROR, "too many threads\n");
        return -1;
    }

    ctx->thread[0] = ctx;
    for (i = 1; i < avctx->thread_count; i++) {
        ctx->thread[i] =  av_malloc(sizeof(DNXHDEncContext));
        memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext));
    }

    return 0;
331
 fail: //for FF_ALLOCZ_OR_GOTO
332 333 334 335 336 337 338 339
    return -1;
}

static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    const uint8_t header_prefix[5] = { 0x00,0x00,0x02,0x80,0x01 };

340 341
    memset(buf, 0, 640);

342 343 344 345
    memcpy(buf, header_prefix, 5);
    buf[5] = ctx->interlaced ? ctx->cur_field+2 : 0x01;
    buf[6] = 0x80; // crc flag off
    buf[7] = 0xa0; // reserved
346
    AV_WB16(buf + 0x18, avctx->height>>ctx->interlaced); // ALPF
347
    AV_WB16(buf + 0x1a, avctx->width);  // SPL
348
    AV_WB16(buf + 0x1d, avctx->height>>ctx->interlaced); // NAL
349

350
    buf[0x21] = ctx->cid_table->bit_depth == 10 ? 0x58 : 0x38;
351
    buf[0x22] = 0x88 + (ctx->interlaced<<2);
352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
    AV_WB32(buf + 0x28, ctx->cid); // CID
    buf[0x2c] = ctx->interlaced ? 0 : 0x80;

    buf[0x5f] = 0x01; // UDL

    buf[0x167] = 0x02; // reserved
    AV_WB16(buf + 0x16a, ctx->m.mb_height * 4 + 4); // MSIPS
    buf[0x16d] = ctx->m.mb_height; // Ns
    buf[0x16f] = 0x10; // reserved

    ctx->msip = buf + 0x170;
    return 0;
}

static av_always_inline void dnxhd_encode_dc(DNXHDEncContext *ctx, int diff)
{
    int nbits;
    if (diff < 0) {
        nbits = av_log2_16bit(-2*diff);
        diff--;
    } else {
        nbits = av_log2_16bit(2*diff);
    }
    put_bits(&ctx->m.pb, ctx->cid_table->dc_bits[nbits] + nbits,
             (ctx->cid_table->dc_codes[nbits]<<nbits) + (diff & ((1 << nbits) - 1)));
}

static av_always_inline void dnxhd_encode_block(DNXHDEncContext *ctx, DCTELEM *block, int last_index, int n)
{
    int last_non_zero = 0;
    int slevel, i, j;

    dnxhd_encode_dc(ctx, block[0] - ctx->m.last_dc[n]);
    ctx->m.last_dc[n] = block[0];

    for (i = 1; i <= last_index; i++) {
        j = ctx->m.intra_scantable.permutated[i];
        slevel = block[j];
        if (slevel) {
            int run_level = i - last_non_zero - 1;
392
            int rlevel = (slevel<<1)|!!run_level;
393
            put_bits(&ctx->m.pb, ctx->vlc_bits[rlevel], ctx->vlc_codes[rlevel]);
394
            if (run_level)
395
                put_bits(&ctx->m.pb, ctx->run_bits[run_level], ctx->run_codes[run_level]);
396 397 398
            last_non_zero = i;
        }
    }
399
    put_bits(&ctx->m.pb, ctx->vlc_bits[0], ctx->vlc_codes[0]); // EOB
400 401 402 403
}

static av_always_inline void dnxhd_unquantize_c(DNXHDEncContext *ctx, DCTELEM *block, int n, int qscale, int last_index)
{
Baptiste Coudurier's avatar
Baptiste Coudurier committed
404
    const uint8_t *weight_matrix;
405 406 407
    int level;
    int i;

Baptiste Coudurier's avatar
Baptiste Coudurier committed
408
    weight_matrix = (n&2) ? ctx->cid_table->chroma_weight : ctx->cid_table->luma_weight;
409 410 411 412 413 414

    for (i = 1; i <= last_index; i++) {
        int j = ctx->m.intra_scantable.permutated[i];
        level = block[j];
        if (level) {
            if (level < 0) {
Baptiste Coudurier's avatar
Baptiste Coudurier committed
415
                level = (1-2*level) * qscale * weight_matrix[i];
416 417 418 419 420 421 422 423 424
                if (ctx->cid_table->bit_depth == 10) {
                    if (weight_matrix[i] != 8)
                        level += 8;
                    level >>= 4;
                } else {
                    if (weight_matrix[i] != 32)
                        level += 32;
                    level >>= 6;
                }
425 426
                level = -level;
            } else {
Baptiste Coudurier's avatar
Baptiste Coudurier committed
427
                level = (2*level+1) * qscale * weight_matrix[i];
428 429 430 431 432 433 434 435 436
                if (ctx->cid_table->bit_depth == 10) {
                    if (weight_matrix[i] != 8)
                        level += 8;
                    level >>= 4;
                } else {
                    if (weight_matrix[i] != 32)
                        level += 32;
                    level >>= 6;
                }
437 438 439 440 441 442 443 444 445 446 447
            }
            block[j] = level;
        }
    }
}

static av_always_inline int dnxhd_ssd_block(DCTELEM *qblock, DCTELEM *block)
{
    int score = 0;
    int i;
    for (i = 0; i < 64; i++)
448
        score += (block[i] - qblock[i]) * (block[i] - qblock[i]);
449 450 451 452 453 454 455 456 457 458 459 460 461
    return score;
}

static av_always_inline int dnxhd_calc_ac_bits(DNXHDEncContext *ctx, DCTELEM *block, int last_index)
{
    int last_non_zero = 0;
    int bits = 0;
    int i, j, level;
    for (i = 1; i <= last_index; i++) {
        j = ctx->m.intra_scantable.permutated[i];
        level = block[j];
        if (level) {
            int run_level = i - last_non_zero - 1;
462
            bits += ctx->vlc_bits[(level<<1)|!!run_level]+ctx->run_bits[run_level];
463 464 465 466 467 468 469 470
            last_non_zero = i;
        }
    }
    return bits;
}

static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
{
471 472 473 474 475
    const int bs = ctx->block_width_l2;
    const int bw = 1 << bs;
    const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize)   + (mb_x << bs+1);
    const uint8_t *ptr_u = ctx->thread[0]->src[1] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
    const uint8_t *ptr_v = ctx->thread[0]->src[2] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
476 477
    DSPContext *dsp = &ctx->m.dsp;

478 479 480 481
    dsp->get_pixels(ctx->blocks[0], ptr_y,      ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[1], ptr_y + bw, ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[2], ptr_u,      ctx->m.uvlinesize);
    dsp->get_pixels(ctx->blocks[3], ptr_v,      ctx->m.uvlinesize);
482

483
    if (mb_y+1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
484
        if (ctx->interlaced) {
485 486 487 488
            ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset,      ctx->m.linesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,     ctx->m.uvlinesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,     ctx->m.uvlinesize);
Baptiste Coudurier's avatar
Baptiste Coudurier committed
489
        } else {
490 491 492 493
            dsp->clear_block(ctx->blocks[4]);
            dsp->clear_block(ctx->blocks[5]);
            dsp->clear_block(ctx->blocks[6]);
            dsp->clear_block(ctx->blocks[7]);
Baptiste Coudurier's avatar
Baptiste Coudurier committed
494
        }
495
    } else {
496 497 498 499
        dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset,      ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,     ctx->m.uvlinesize);
        dsp->get_pixels(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,     ctx->m.uvlinesize);
500 501 502 503 504
    }
}

static av_always_inline int dnxhd_switch_matrix(DNXHDEncContext *ctx, int i)
{
505
    const static uint8_t component[8]={0,0,1,2,0,0,1,2};
506
    return component[i];
507 508
}

509
static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
510
{
511 512 513
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    int qscale = ctx->qscale;
514
    LOCAL_ALIGNED_16(DCTELEM, block, [64]);
515
    ctx = ctx->thread[threadnr];
516

517 518
    ctx->m.last_dc[0] =
    ctx->m.last_dc[1] =
519
    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
520 521 522 523 524 525 526 527 528 529 530 531 532 533 534

    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb = mb_y * ctx->m.mb_width + mb_x;
        int ssd     = 0;
        int ac_bits = 0;
        int dc_bits = 0;
        int i;

        dnxhd_get_blocks(ctx, mb_x, mb_y);

        for (i = 0; i < 8; i++) {
            DCTELEM *src_block = ctx->blocks[i];
            int overflow, nbits, diff, last_index;
            int n = dnxhd_switch_matrix(ctx, i);

535
            memcpy(block, src_block, 64*sizeof(*block));
536
            last_index = ctx->m.dct_quantize(&ctx->m, block, 4&(2*i), qscale, &overflow);
537 538 539 540 541
            ac_bits += dnxhd_calc_ac_bits(ctx, block, last_index);

            diff = block[0] - ctx->m.last_dc[n];
            if (diff < 0) nbits = av_log2_16bit(-2*diff);
            else          nbits = av_log2_16bit( 2*diff);
542 543

            assert(nbits < ctx->cid_table->bit_depth + 4);
544 545 546 547 548 549 550 551
            dc_bits += ctx->cid_table->dc_bits[nbits] + nbits;

            ctx->m.last_dc[n] = block[0];

            if (avctx->mb_decision == FF_MB_DECISION_RD || !RC_VARIANCE) {
                dnxhd_unquantize_c(ctx, block, i, qscale, last_index);
                ctx->m.dsp.idct(block);
                ssd += dnxhd_ssd_block(block, src_block);
552 553
            }
        }
554 555 556
        ctx->mb_rc[qscale][mb].ssd = ssd;
        ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0];
    }
557 558 559
    return 0;
}

560
static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
561
{
562 563 564 565
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    ctx = ctx->thread[threadnr];
    init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr], ctx->slice_size[jobnr]);
566

567 568
    ctx->m.last_dc[0] =
    ctx->m.last_dc[1] =
569
    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
570 571 572 573 574 575 576 577 578 579 580 581 582
    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb = mb_y * ctx->m.mb_width + mb_x;
        int qscale = ctx->mb_qscale[mb];
        int i;

        put_bits(&ctx->m.pb, 12, qscale<<1);

        dnxhd_get_blocks(ctx, mb_x, mb_y);

        for (i = 0; i < 8; i++) {
            DCTELEM *block = ctx->blocks[i];
            int last_index, overflow;
            int n = dnxhd_switch_matrix(ctx, i);
583
            last_index = ctx->m.dct_quantize(&ctx->m, block, 4&(2*i), qscale, &overflow);
584 585 586
            //START_TIMER;
            dnxhd_encode_block(ctx, block, last_index, n);
            //STOP_TIMER("encode_block");
587
        }
588 589 590
    }
    if (put_bits_count(&ctx->m.pb)&31)
        put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0);
591 592 593 594
    flush_put_bits(&ctx->m.pb);
    return 0;
}

595
static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx)
596 597
{
    int mb_y, mb_x;
598 599 600 601
    int offset = 0;
    for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) {
        int thread_size;
        ctx->slice_offs[mb_y] = offset;
602 603 604 605 606 607 608 609
        ctx->slice_size[mb_y] = 0;
        for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
            unsigned mb = mb_y * ctx->m.mb_width + mb_x;
            ctx->slice_size[mb_y] += ctx->mb_bits[mb];
        }
        ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31;
        ctx->slice_size[mb_y] >>= 3;
        thread_size = ctx->slice_size[mb_y];
610 611 612 613
        offset += thread_size;
    }
}

614
static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
615
{
616 617 618
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    ctx = ctx->thread[threadnr];
619 620 621 622 623
    if (ctx->cid_table->bit_depth == 8) {
        uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize);
        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x, pix += 16) {
            unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
            int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
624
            int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)sum*sum)>>8)+128)>>8;
625 626 627 628 629 630 631 632 633 634 635
            ctx->mb_cmp[mb].value = varc;
            ctx->mb_cmp[mb].mb = mb;
        }
    } else { // 10-bit
        int const linesize = ctx->m.linesize >> 1;
        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x) {
            uint16_t *pix = (uint16_t*)ctx->thread[0]->src[0] + ((mb_y << 4) * linesize) + (mb_x << 4);
            unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
            int sum = 0;
            int sqsum = 0;
            int mean, sqmean;
636
            int i, j;
637
            // Macroblocks are 16x16 pixels, unlike DCT blocks which are 8x8.
638 639
            for (i = 0; i < 16; ++i) {
                for (j = 0; j < 16; ++j) {
640 641 642 643 644 645 646 647 648 649 650 651 652
                    // Turn 16-bit pixels into 10-bit ones.
                    int const sample = (unsigned)pix[j] >> 6;
                    sum += sample;
                    sqsum += sample * sample;
                    // 2^10 * 2^10 * 16 * 16 = 2^28, which is less than INT_MAX
                }
                pix += linesize;
            }
            mean = sum >> 8; // 16*16 == 2^8
            sqmean = sqsum >> 8;
            ctx->mb_cmp[mb].value = sqmean - mean * mean;
            ctx->mb_cmp[mb].mb = mb;
        }
653
    }
654 655 656 657 658
    return 0;
}

static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx)
{
659 660
    int lambda, up_step, down_step;
    int last_lower = INT_MAX, last_higher = 0;
661 662 663 664
    int x, y, q;

    for (q = 1; q < avctx->qmax; q++) {
        ctx->qscale = q;
665
        avctx->execute2(avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
666
    }
667
    up_step = down_step = 2<<LAMBDA_FRAC_BITS;
668 669 670 671 672
    lambda = ctx->lambda;

    for (;;) {
        int bits = 0;
        int end = 0;
673
        if (lambda == last_higher) {
674
            lambda++;
675
            end = 1; // need to set final qscales/bits
676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696
        }
        for (y = 0; y < ctx->m.mb_height; y++) {
            for (x = 0; x < ctx->m.mb_width; x++) {
                unsigned min = UINT_MAX;
                int qscale = 1;
                int mb = y*ctx->m.mb_width+x;
                for (q = 1; q < avctx->qmax; q++) {
                    unsigned score = ctx->mb_rc[q][mb].bits*lambda+(ctx->mb_rc[q][mb].ssd<<LAMBDA_FRAC_BITS);
                    if (score < min) {
                        min = score;
                        qscale = q;
                    }
                }
                bits += ctx->mb_rc[qscale][mb].bits;
                ctx->mb_qscale[mb] = qscale;
                ctx->mb_bits[mb] = ctx->mb_rc[qscale][mb].bits;
            }
            bits = (bits+31)&~31; // padding
            if (bits > ctx->frame_bits)
                break;
        }
697
        //av_dlog(ctx->m.avctx, "lambda %d, up %u, down %u, bits %d, frame %d\n",
698
        //        lambda, last_higher, last_lower, bits, ctx->frame_bits);
699 700 701 702 703 704
        if (end) {
            if (bits > ctx->frame_bits)
                return -1;
            break;
        }
        if (bits < ctx->frame_bits) {
705 706 707 708 709 710 711 712 713 714
            last_lower = FFMIN(lambda, last_lower);
            if (last_higher != 0)
                lambda = (lambda+last_higher)>>1;
            else
                lambda -= down_step;
            down_step *= 5; // XXX tune ?
            up_step = 1<<LAMBDA_FRAC_BITS;
            lambda = FFMAX(1, lambda);
            if (lambda == last_lower)
                break;
715
        } else {
716 717 718
            last_higher = FFMAX(lambda, last_higher);
            if (last_lower != INT_MAX)
                lambda = (lambda+last_lower)>>1;
719 720
            else if ((int64_t)lambda + up_step > INT_MAX)
                return -1;
721 722
            else
                lambda += up_step;
723
            up_step = FFMIN((int64_t)up_step*5, INT_MAX);
724
            down_step = 1<<LAMBDA_FRAC_BITS;
725 726
        }
    }
727
    //av_dlog(ctx->m.avctx, "out lambda %d\n", lambda);
728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746
    ctx->lambda = lambda;
    return 0;
}

static int dnxhd_find_qscale(DNXHDEncContext *ctx)
{
    int bits = 0;
    int up_step = 1;
    int down_step = 1;
    int last_higher = 0;
    int last_lower = INT_MAX;
    int qscale;
    int x, y;

    qscale = ctx->qscale;
    for (;;) {
        bits = 0;
        ctx->qscale = qscale;
        // XXX avoid recalculating bits
747
        ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
748 749 750 751 752 753 754
        for (y = 0; y < ctx->m.mb_height; y++) {
            for (x = 0; x < ctx->m.mb_width; x++)
                bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits;
            bits = (bits+31)&~31; // padding
            if (bits > ctx->frame_bits)
                break;
        }
755
        //av_dlog(ctx->m.avctx, "%d, qscale %d, bits %d, frame %d, higher %d, lower %d\n",
756 757 758
        //        ctx->m.avctx->frame_number, qscale, bits, ctx->frame_bits, last_higher, last_lower);
        if (bits < ctx->frame_bits) {
            if (qscale == 1)
759
                return 1;
760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784
            if (last_higher == qscale - 1) {
                qscale = last_higher;
                break;
            }
            last_lower = FFMIN(qscale, last_lower);
            if (last_higher != 0)
                qscale = (qscale+last_higher)>>1;
            else
                qscale -= down_step++;
            if (qscale < 1)
                qscale = 1;
            up_step = 1;
        } else {
            if (last_lower == qscale + 1)
                break;
            last_higher = FFMAX(qscale, last_higher);
            if (last_lower != INT_MAX)
                qscale = (qscale+last_lower)>>1;
            else
                qscale += up_step++;
            down_step = 1;
            if (qscale >= ctx->m.avctx->qmax)
                return -1;
        }
    }
785
    //av_dlog(ctx->m.avctx, "out qscale %d\n", qscale);
786 787 788 789
    ctx->qscale = qscale;
    return 0;
}

790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832
#define BUCKET_BITS 8
#define RADIX_PASSES 4
#define NBUCKETS (1 << BUCKET_BITS)

static inline int get_bucket(int value, int shift)
{
    value >>= shift;
    value &= NBUCKETS - 1;
    return NBUCKETS - 1 - value;
}

static void radix_count(const RCCMPEntry *data, int size, int buckets[RADIX_PASSES][NBUCKETS])
{
    int i, j;
    memset(buckets, 0, sizeof(buckets[0][0]) * RADIX_PASSES * NBUCKETS);
    for (i = 0; i < size; i++) {
        int v = data[i].value;
        for (j = 0; j < RADIX_PASSES; j++) {
            buckets[j][get_bucket(v, 0)]++;
            v >>= BUCKET_BITS;
        }
        assert(!v);
    }
    for (j = 0; j < RADIX_PASSES; j++) {
        int offset = size;
        for (i = NBUCKETS - 1; i >= 0; i--)
            buckets[j][i] = offset -= buckets[j][i];
        assert(!buckets[j][0]);
    }
}

static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data, int size, int buckets[NBUCKETS], int pass)
{
    int shift = pass * BUCKET_BITS;
    int i;
    for (i = 0; i < size; i++) {
        int v = get_bucket(data[i].value, shift);
        int pos = buckets[v]++;
        dst[pos] = data[i];
    }
}

static void radix_sort(RCCMPEntry *data, int size)
833
{
834 835 836 837 838 839 840 841 842 843
    int buckets[RADIX_PASSES][NBUCKETS];
    RCCMPEntry *tmp = av_malloc(sizeof(*tmp) * size);
    radix_count(data, size, buckets);
    radix_sort_pass(tmp, data, size, buckets[0], 0);
    radix_sort_pass(data, tmp, size, buckets[1], 1);
    if (buckets[2][NBUCKETS - 1] || buckets[3][NBUCKETS - 1]) {
        radix_sort_pass(tmp, data, size, buckets[2], 2);
        radix_sort_pass(data, tmp, size, buckets[3], 3);
    }
    av_free(tmp);
844 845
}

846
static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx)
847 848
{
    int max_bits = 0;
849 850
    int ret, x, y;
    if ((ret = dnxhd_find_qscale(ctx)) < 0)
851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
        return -1;
    for (y = 0; y < ctx->m.mb_height; y++) {
        for (x = 0; x < ctx->m.mb_width; x++) {
            int mb = y*ctx->m.mb_width+x;
            int delta_bits;
            ctx->mb_qscale[mb] = ctx->qscale;
            ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale][mb].bits;
            max_bits += ctx->mb_rc[ctx->qscale][mb].bits;
            if (!RC_VARIANCE) {
                delta_bits = ctx->mb_rc[ctx->qscale][mb].bits-ctx->mb_rc[ctx->qscale+1][mb].bits;
                ctx->mb_cmp[mb].mb = mb;
                ctx->mb_cmp[mb].value = delta_bits ?
                    ((ctx->mb_rc[ctx->qscale][mb].ssd-ctx->mb_rc[ctx->qscale+1][mb].ssd)*100)/delta_bits
                    : INT_MIN; //avoid increasing qscale
            }
        }
        max_bits += 31; //worst padding
    }
869
    if (!ret) {
870
        if (RC_VARIANCE)
871
            avctx->execute2(avctx, dnxhd_mb_var_thread, NULL, NULL, ctx->m.mb_height);
872
        radix_sort(ctx->mb_cmp, ctx->m.mb_num);
873 874 875 876 877 878 879 880 881 882
        for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) {
            int mb = ctx->mb_cmp[x].mb;
            max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits;
            ctx->mb_qscale[mb] = ctx->qscale+1;
            ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale+1][mb].bits;
        }
    }
    return 0;
}

Michael Niedermayer's avatar
Michael Niedermayer committed
883
static void dnxhd_load_picture(DNXHDEncContext *ctx, const AVFrame *frame)
884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902
{
    int i;

    for (i = 0; i < 3; i++) {
        ctx->frame.data[i]     = frame->data[i];
        ctx->frame.linesize[i] = frame->linesize[i];
    }

    for (i = 0; i < ctx->m.avctx->thread_count; i++) {
        ctx->thread[i]->m.linesize    = ctx->frame.linesize[0]<<ctx->interlaced;
        ctx->thread[i]->m.uvlinesize  = ctx->frame.linesize[1]<<ctx->interlaced;
        ctx->thread[i]->dct_y_offset  = ctx->m.linesize  *8;
        ctx->thread[i]->dct_uv_offset = ctx->m.uvlinesize*8;
    }

    ctx->frame.interlaced_frame = frame->interlaced_frame;
    ctx->cur_field = frame->interlaced_frame && !frame->top_field_first;
}

903
static int dnxhd_encode_picture(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data)
904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927
{
    DNXHDEncContext *ctx = avctx->priv_data;
    int first_field = 1;
    int offset, i, ret;

    if (buf_size < ctx->cid_table->frame_size) {
        av_log(avctx, AV_LOG_ERROR, "output buffer is too small to compress picture\n");
        return -1;
    }

    dnxhd_load_picture(ctx, data);

 encode_coding_unit:
    for (i = 0; i < 3; i++) {
        ctx->src[i] = ctx->frame.data[i];
        if (ctx->interlaced && ctx->cur_field)
            ctx->src[i] += ctx->frame.linesize[i];
    }

    dnxhd_write_header(avctx, buf);

    if (avctx->mb_decision == FF_MB_DECISION_RD)
        ret = dnxhd_encode_rdo(avctx, ctx);
    else
928
        ret = dnxhd_encode_fast(avctx, ctx);
929
    if (ret < 0) {
930 931
        av_log(avctx, AV_LOG_ERROR,
               "picture could not fit ratecontrol constraints, increase qmax\n");
932 933 934
        return -1;
    }

935
    dnxhd_setup_threads_slices(ctx);
936 937 938 939 940 941 942 943

    offset = 0;
    for (i = 0; i < ctx->m.mb_height; i++) {
        AV_WB32(ctx->msip + i * 4, offset);
        offset += ctx->slice_size[i];
        assert(!(ctx->slice_size[i] & 3));
    }

944
    avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
945

946 947 948
    assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
    memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640);

949 950 951 952 953 954 955 956 957 958
    AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF

    if (ctx->interlaced && first_field) {
        first_field     = 0;
        ctx->cur_field ^= 1;
        buf      += ctx->cid_table->coding_unit_size;
        buf_size -= ctx->cid_table->coding_unit_size;
        goto encode_coding_unit;
    }

959 960
    ctx->frame.quality = ctx->qscale*FF_QP2LAMBDA;

961 962 963 964 965 966
    return ctx->cid_table->frame_size;
}

static int dnxhd_encode_end(AVCodecContext *avctx)
{
    DNXHDEncContext *ctx = avctx->priv_data;
967
    int max_level = 1<<(ctx->cid_table->bit_depth+2);
968 969
    int i;

970 971 972 973
    av_free(ctx->vlc_codes-max_level*2);
    av_free(ctx->vlc_bits -max_level*2);
    av_freep(&ctx->run_codes);
    av_freep(&ctx->run_bits);
974 975 976 977 978 979

    av_freep(&ctx->mb_bits);
    av_freep(&ctx->mb_qscale);
    av_freep(&ctx->mb_rc);
    av_freep(&ctx->mb_cmp);
    av_freep(&ctx->slice_size);
980
    av_freep(&ctx->slice_offs);
981 982 983 984 985 986 987 988 989 990 991 992

    av_freep(&ctx->qmatrix_c);
    av_freep(&ctx->qmatrix_l);
    av_freep(&ctx->qmatrix_c16);
    av_freep(&ctx->qmatrix_l16);

    for (i = 1; i < avctx->thread_count; i++)
        av_freep(&ctx->thread[i]);

    return 0;
}

993
AVCodec ff_dnxhd_encoder = {
994 995 996 997 998 999 1000
    .name           = "dnxhd",
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = CODEC_ID_DNXHD,
    .priv_data_size = sizeof(DNXHDEncContext),
    .init           = dnxhd_encode_init,
    .encode         = dnxhd_encode_picture,
    .close          = dnxhd_encode_end,
1001
    .capabilities = CODEC_CAP_SLICE_THREADS,
1002
    .pix_fmts = (const enum PixelFormat[]){PIX_FMT_YUV422P, PIX_FMT_YUV422P10, PIX_FMT_NONE},
1003
    .long_name = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
1004
    .priv_class = &class,
1005
};