dnxhdenc.c 35.3 KB
Newer Older
1 2 3
/*
 * VC3/DNxHD encoder
 * Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
4
 * Copyright (c) 2011 MirriAd Ltd
5 6
 *
 * VC-3 encoder funded by the British Broadcasting Corporation
7
 * 10 bit support added by MirriAd Ltd, Joseph Artsimovich <joseph@mirriad.com>
8
 *
9
 * This file is part of Libav.
10
 *
11
 * Libav is free software; you can redistribute it and/or
12 13 14 15
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
16
 * Libav is distributed in the hope that it will be useful,
17 18 19 20 21
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
22
 * License along with Libav; if not, write to the Free Software
23 24 25 26 27 28
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

//#define DEBUG
#define RC_VARIANCE 1 // use variance or ssd for fast rc

29
#include "libavutil/opt.h"
30 31
#include "avcodec.h"
#include "dsputil.h"
32
#include "internal.h"
33
#include "mpegvideo.h"
34
#include "mpegvideo_common.h"
35
#include "dnxhdenc.h"
36

37
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
38
#define DNX10BIT_QMAT_SHIFT 18 // The largest value that will not lead to overflow for 10bit samples.
39 40

static const AVOption options[]={
41
    {"nitris_compat", "encode with Avid Nitris compatibility", offsetof(DNXHDEncContext, nitris_compat), AV_OPT_TYPE_INT, {.dbl = 0}, 0, 1, VE},
42 43 44 45
{NULL}
};
static const AVClass class = { "dnxhd", av_default_item_name, options, LIBAVUTIL_VERSION_INT };

46 47
#define LAMBDA_FRAC_BITS 10

48
static void dnxhd_8bit_get_pixels_8x4_sym(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
49 50 51 52 53 54 55 56 57 58
{
    int i;
    for (i = 0; i < 4; i++) {
        block[0] = pixels[0]; block[1] = pixels[1];
        block[2] = pixels[2]; block[3] = pixels[3];
        block[4] = pixels[4]; block[5] = pixels[5];
        block[6] = pixels[6]; block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
59 60 61 62
    memcpy(block,      block -  8, sizeof(*block) * 8);
    memcpy(block +  8, block - 16, sizeof(*block) * 8);
    memcpy(block + 16, block - 24, sizeof(*block) * 8);
    memcpy(block + 24, block - 32, sizeof(*block) * 8);
63 64
}

65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
static av_always_inline void dnxhd_10bit_get_pixels_8x4_sym(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    block += 32;

    for (i = 0; i < 4; i++) {
        memcpy(block + i     * 8, pixels + i * line_size, 8 * sizeof(*block));
        memcpy(block - (i+1) * 8, pixels + i * line_size, 8 * sizeof(*block));
    }
}

static int dnxhd_10bit_dct_quantize(MpegEncContext *ctx, DCTELEM *block,
                                    int n, int qscale, int *overflow)
{
    const uint8_t *scantable= ctx->intra_scantable.scantable;
    const int *qmat = ctx->q_intra_matrix[qscale];
    int last_non_zero = 0;
83
    int i;
84 85 86 87 88 89

    ctx->dsp.fdct(block);

    // Divide by 4 with rounding, to compensate scaling of DCT coefficients
    block[0] = (block[0] + 2) >> 2;

90
    for (i = 1; i < 64; ++i) {
91 92 93 94 95 96 97 98 99 100 101 102
        int j = scantable[i];
        int sign = block[j] >> 31;
        int level = (block[j] ^ sign) - sign;
        level = level * qmat[j] >> DNX10BIT_QMAT_SHIFT;
        block[j] = (level ^ sign) - sign;
        if (level)
            last_non_zero = i;
    }

    return last_non_zero;
}

103 104
static int dnxhd_init_vlc(DNXHDEncContext *ctx)
{
105 106 107
    int i, j, level, run;
    int max_level = 1<<(ctx->cid_table->bit_depth+2);

108
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_codes, max_level*4*sizeof(*ctx->vlc_codes), fail);
109 110 111
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->vlc_bits,  max_level*4*sizeof(*ctx->vlc_bits) , fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_codes, 63*2,                                fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->run_bits,  63,                                  fail);
112

113 114
    ctx->vlc_codes += max_level*2;
    ctx->vlc_bits  += max_level*2;
115 116 117 118 119 120 121 122 123 124 125 126 127 128
    for (level = -max_level; level < max_level; level++) {
        for (run = 0; run < 2; run++) {
            int index = (level<<1)|run;
            int sign, offset = 0, alevel = level;

            MASK_ABS(sign, alevel);
            if (alevel > 64) {
                offset = (alevel-1)>>6;
                alevel -= offset<<6;
            }
            for (j = 0; j < 257; j++) {
                if (ctx->cid_table->ac_level[j] == alevel &&
                    (!offset || (ctx->cid_table->ac_index_flag[j] && offset)) &&
                    (!run    || (ctx->cid_table->ac_run_flag  [j] && run))) {
129
                    assert(!ctx->vlc_codes[index]);
130
                    if (alevel) {
131 132
                        ctx->vlc_codes[index] = (ctx->cid_table->ac_codes[j]<<1)|(sign&1);
                        ctx->vlc_bits [index] = ctx->cid_table->ac_bits[j]+1;
133
                    } else {
134 135
                        ctx->vlc_codes[index] = ctx->cid_table->ac_codes[j];
                        ctx->vlc_bits [index] = ctx->cid_table->ac_bits [j];
136 137 138 139 140 141
                    }
                    break;
                }
            }
            assert(!alevel || j < 257);
            if (offset) {
142 143
                ctx->vlc_codes[index] = (ctx->vlc_codes[index]<<ctx->cid_table->index_bits)|offset;
                ctx->vlc_bits [index]+= ctx->cid_table->index_bits;
144 145
            }
        }
146 147 148 149
    }
    for (i = 0; i < 62; i++) {
        int run = ctx->cid_table->run[i];
        assert(run < 63);
150 151
        ctx->run_codes[run] = ctx->cid_table->run_codes[i];
        ctx->run_bits [run] = ctx->cid_table->run_bits[i];
152 153 154 155 156 157 158 159 160 161 162
    }
    return 0;
 fail:
    return -1;
}

static int dnxhd_init_qmat(DNXHDEncContext *ctx, int lbias, int cbias)
{
    // init first elem to 1 to avoid div by 0 in convert_matrix
    uint16_t weight_matrix[64] = {1,}; // convert_matrix needs uint16_t*
    int qscale, i;
163 164
    const uint8_t *luma_weight_table   = ctx->cid_table->luma_weight;
    const uint8_t *chroma_weight_table = ctx->cid_table->chroma_weight;
165

166 167
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int),      fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c,   (ctx->m.avctx->qmax+1) * 64 *     sizeof(int),      fail);
168 169
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_l16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->qmatrix_c16, (ctx->m.avctx->qmax+1) * 64 * 2 * sizeof(uint16_t), fail);
170

171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
    if (ctx->cid_table->bit_depth == 8) {
        for (i = 1; i < 64; i++) {
            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
            weight_matrix[j] = ctx->cid_table->luma_weight[i];
        }
        ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_l, ctx->qmatrix_l16, weight_matrix,
                          ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);
        for (i = 1; i < 64; i++) {
            int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];
            weight_matrix[j] = ctx->cid_table->chroma_weight[i];
        }
        ff_convert_matrix(&ctx->m.dsp, ctx->qmatrix_c, ctx->qmatrix_c16, weight_matrix,
                          ctx->m.intra_quant_bias, 1, ctx->m.avctx->qmax, 1);

        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
            for (i = 0; i < 64; i++) {
                ctx->qmatrix_l  [qscale]   [i] <<= 2; ctx->qmatrix_c  [qscale]   [i] <<= 2;
                ctx->qmatrix_l16[qscale][0][i] <<= 2; ctx->qmatrix_l16[qscale][1][i] <<= 2;
                ctx->qmatrix_c16[qscale][0][i] <<= 2; ctx->qmatrix_c16[qscale][1][i] <<= 2;
            }
        }
    } else {
        // 10-bit
        for (qscale = 1; qscale <= ctx->m.avctx->qmax; qscale++) {
            for (i = 1; i < 64; i++) {
                int j = ctx->m.dsp.idct_permutation[ff_zigzag_direct[i]];

                // The quantization formula from the VC-3 standard is:
                // quantized = sign(block[i]) * floor(abs(block[i]/s) * p / (qscale * weight_table[i]))
                // Where p is 32 for 8-bit samples and 8 for 10-bit ones.
                // The s factor compensates scaling of DCT coefficients done by the DCT routines,
                // and therefore is not present in standard.  It's 8 for 8-bit samples and 4 for 10-bit ones.
                // We want values of ctx->qtmatrix_l and ctx->qtmatrix_r to be:
                // ((1 << DNX10BIT_QMAT_SHIFT) * (p / s)) / (qscale * weight_table[i])
                // For 10-bit samples, p / s == 2
                ctx->qmatrix_l[qscale][j] = (1 << (DNX10BIT_QMAT_SHIFT + 1)) / (qscale * luma_weight_table[i]);
                ctx->qmatrix_c[qscale][j] = (1 << (DNX10BIT_QMAT_SHIFT + 1)) / (qscale * chroma_weight_table[i]);
            }
209 210
        }
    }
211

212 213 214 215 216 217 218
    return 0;
 fail:
    return -1;
}

static int dnxhd_init_rc(DNXHDEncContext *ctx)
{
219
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_rc, 8160*ctx->m.avctx->qmax*sizeof(RCEntry), fail);
220
    if (ctx->m.avctx->mb_decision != FF_MB_DECISION_RD)
221
        FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_cmp, ctx->m.mb_num*sizeof(RCCMPEntry), fail);
222

223
    ctx->frame_bits = (ctx->cid_table->coding_unit_size - 640 - 4 - ctx->min_padding) * 8;
224 225 226 227 228 229 230 231 232 233
    ctx->qscale = 1;
    ctx->lambda = 2<<LAMBDA_FRAC_BITS; // qscale 2
    return 0;
 fail:
    return -1;
}

static int dnxhd_encode_init(AVCodecContext *avctx)
{
    DNXHDEncContext *ctx = avctx->priv_data;
234 235 236 237 238 239 240 241 242 243 244 245 246
    int i, index, bit_depth;

    switch (avctx->pix_fmt) {
    case PIX_FMT_YUV422P:
        bit_depth = 8;
        break;
    case PIX_FMT_YUV422P10:
        bit_depth = 10;
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "pixel format is incompatible with DNxHD\n");
        return -1;
    }
247

248 249
    ctx->cid = ff_dnxhd_find_cid(avctx, bit_depth);
    if (!ctx->cid) {
250 251 252
        av_log(avctx, AV_LOG_ERROR, "video parameters incompatible with DNxHD\n");
        return -1;
    }
253
    av_log(avctx, AV_LOG_DEBUG, "cid %d\n", ctx->cid);
254 255 256 257 258 259 260 261

    index = ff_dnxhd_get_cid_table(ctx->cid);
    ctx->cid_table = &ff_dnxhd_cid_table[index];

    ctx->m.avctx = avctx;
    ctx->m.mb_intra = 1;
    ctx->m.h263_aic = 1;

262
    avctx->bits_per_raw_sample = ctx->cid_table->bit_depth;
263

264
    ff_dsputil_init(&ctx->m.dsp, avctx);
265
    ff_dct_common_init(&ctx->m);
266
    if (!ctx->m.dct_quantize)
267
        ctx->m.dct_quantize = ff_dct_quantize_c;
268 269 270 271 272 273 274 275 276 277

    if (ctx->cid_table->bit_depth == 10) {
       ctx->m.dct_quantize = dnxhd_10bit_dct_quantize;
       ctx->get_pixels_8x4_sym = dnxhd_10bit_get_pixels_8x4_sym;
       ctx->block_width_l2 = 4;
    } else {
       ctx->get_pixels_8x4_sym = dnxhd_8bit_get_pixels_8x4_sym;
       ctx->block_width_l2 = 3;
    }

278
#if HAVE_MMX
279 280
    ff_dnxhd_init_mmx(ctx);
#endif
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296

    ctx->m.mb_height = (avctx->height + 15) / 16;
    ctx->m.mb_width  = (avctx->width  + 15) / 16;

    if (avctx->flags & CODEC_FLAG_INTERLACED_DCT) {
        ctx->interlaced = 1;
        ctx->m.mb_height /= 2;
    }

    ctx->m.mb_num = ctx->m.mb_height * ctx->m.mb_width;

    if (avctx->intra_quant_bias != FF_DEFAULT_QUANT_BIAS)
        ctx->m.intra_quant_bias = avctx->intra_quant_bias;
    if (dnxhd_init_qmat(ctx, ctx->m.intra_quant_bias, 0) < 0) // XXX tune lbias/cbias
        return -1;

297 298 299 300
    // Avid Nitris hardware decoder requires a minimum amount of padding in the coding unit payload
    if (ctx->nitris_compat)
        ctx->min_padding = 1600;

301 302 303 304 305
    if (dnxhd_init_vlc(ctx) < 0)
        return -1;
    if (dnxhd_init_rc(ctx) < 0)
        return -1;

306
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_size, ctx->m.mb_height*sizeof(uint32_t), fail);
307
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->slice_offs, ctx->m.mb_height*sizeof(uint32_t), fail);
308
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_bits,    ctx->m.mb_num   *sizeof(uint16_t), fail);
309
    FF_ALLOCZ_OR_GOTO(ctx->m.avctx, ctx->mb_qscale,  ctx->m.mb_num   *sizeof(uint8_t),  fail);
310 311

    ctx->frame.key_frame = 1;
312
    ctx->frame.pict_type = AV_PICTURE_TYPE_I;
313 314
    ctx->m.avctx->coded_frame = &ctx->frame;

315
    if (avctx->thread_count > MAX_THREADS) {
316 317 318 319 320 321 322 323 324 325 326
        av_log(avctx, AV_LOG_ERROR, "too many threads\n");
        return -1;
    }

    ctx->thread[0] = ctx;
    for (i = 1; i < avctx->thread_count; i++) {
        ctx->thread[i] =  av_malloc(sizeof(DNXHDEncContext));
        memcpy(ctx->thread[i], ctx, sizeof(DNXHDEncContext));
    }

    return 0;
327
 fail: //for FF_ALLOCZ_OR_GOTO
328 329 330 331 332 333 334 335
    return -1;
}

static int dnxhd_write_header(AVCodecContext *avctx, uint8_t *buf)
{
    DNXHDEncContext *ctx = avctx->priv_data;
    const uint8_t header_prefix[5] = { 0x00,0x00,0x02,0x80,0x01 };

336 337
    memset(buf, 0, 640);

338 339 340 341
    memcpy(buf, header_prefix, 5);
    buf[5] = ctx->interlaced ? ctx->cur_field+2 : 0x01;
    buf[6] = 0x80; // crc flag off
    buf[7] = 0xa0; // reserved
342
    AV_WB16(buf + 0x18, avctx->height>>ctx->interlaced); // ALPF
343
    AV_WB16(buf + 0x1a, avctx->width);  // SPL
344
    AV_WB16(buf + 0x1d, avctx->height>>ctx->interlaced); // NAL
345

346
    buf[0x21] = ctx->cid_table->bit_depth == 10 ? 0x58 : 0x38;
347
    buf[0x22] = 0x88 + (ctx->interlaced<<2);
348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387
    AV_WB32(buf + 0x28, ctx->cid); // CID
    buf[0x2c] = ctx->interlaced ? 0 : 0x80;

    buf[0x5f] = 0x01; // UDL

    buf[0x167] = 0x02; // reserved
    AV_WB16(buf + 0x16a, ctx->m.mb_height * 4 + 4); // MSIPS
    buf[0x16d] = ctx->m.mb_height; // Ns
    buf[0x16f] = 0x10; // reserved

    ctx->msip = buf + 0x170;
    return 0;
}

static av_always_inline void dnxhd_encode_dc(DNXHDEncContext *ctx, int diff)
{
    int nbits;
    if (diff < 0) {
        nbits = av_log2_16bit(-2*diff);
        diff--;
    } else {
        nbits = av_log2_16bit(2*diff);
    }
    put_bits(&ctx->m.pb, ctx->cid_table->dc_bits[nbits] + nbits,
             (ctx->cid_table->dc_codes[nbits]<<nbits) + (diff & ((1 << nbits) - 1)));
}

static av_always_inline void dnxhd_encode_block(DNXHDEncContext *ctx, DCTELEM *block, int last_index, int n)
{
    int last_non_zero = 0;
    int slevel, i, j;

    dnxhd_encode_dc(ctx, block[0] - ctx->m.last_dc[n]);
    ctx->m.last_dc[n] = block[0];

    for (i = 1; i <= last_index; i++) {
        j = ctx->m.intra_scantable.permutated[i];
        slevel = block[j];
        if (slevel) {
            int run_level = i - last_non_zero - 1;
388
            int rlevel = (slevel<<1)|!!run_level;
389
            put_bits(&ctx->m.pb, ctx->vlc_bits[rlevel], ctx->vlc_codes[rlevel]);
390
            if (run_level)
391
                put_bits(&ctx->m.pb, ctx->run_bits[run_level], ctx->run_codes[run_level]);
392 393 394
            last_non_zero = i;
        }
    }
395
    put_bits(&ctx->m.pb, ctx->vlc_bits[0], ctx->vlc_codes[0]); // EOB
396 397 398 399
}

static av_always_inline void dnxhd_unquantize_c(DNXHDEncContext *ctx, DCTELEM *block, int n, int qscale, int last_index)
{
Baptiste Coudurier's avatar
Baptiste Coudurier committed
400
    const uint8_t *weight_matrix;
401 402 403
    int level;
    int i;

Baptiste Coudurier's avatar
Baptiste Coudurier committed
404
    weight_matrix = (n&2) ? ctx->cid_table->chroma_weight : ctx->cid_table->luma_weight;
405 406 407 408 409 410

    for (i = 1; i <= last_index; i++) {
        int j = ctx->m.intra_scantable.permutated[i];
        level = block[j];
        if (level) {
            if (level < 0) {
Baptiste Coudurier's avatar
Baptiste Coudurier committed
411
                level = (1-2*level) * qscale * weight_matrix[i];
412 413 414 415 416 417 418 419 420
                if (ctx->cid_table->bit_depth == 10) {
                    if (weight_matrix[i] != 8)
                        level += 8;
                    level >>= 4;
                } else {
                    if (weight_matrix[i] != 32)
                        level += 32;
                    level >>= 6;
                }
421 422
                level = -level;
            } else {
Baptiste Coudurier's avatar
Baptiste Coudurier committed
423
                level = (2*level+1) * qscale * weight_matrix[i];
424 425 426 427 428 429 430 431 432
                if (ctx->cid_table->bit_depth == 10) {
                    if (weight_matrix[i] != 8)
                        level += 8;
                    level >>= 4;
                } else {
                    if (weight_matrix[i] != 32)
                        level += 32;
                    level >>= 6;
                }
433 434 435 436 437 438 439 440 441 442 443
            }
            block[j] = level;
        }
    }
}

static av_always_inline int dnxhd_ssd_block(DCTELEM *qblock, DCTELEM *block)
{
    int score = 0;
    int i;
    for (i = 0; i < 64; i++)
444
        score += (block[i] - qblock[i]) * (block[i] - qblock[i]);
445 446 447 448 449 450 451 452 453 454 455 456 457
    return score;
}

static av_always_inline int dnxhd_calc_ac_bits(DNXHDEncContext *ctx, DCTELEM *block, int last_index)
{
    int last_non_zero = 0;
    int bits = 0;
    int i, j, level;
    for (i = 1; i <= last_index; i++) {
        j = ctx->m.intra_scantable.permutated[i];
        level = block[j];
        if (level) {
            int run_level = i - last_non_zero - 1;
458
            bits += ctx->vlc_bits[(level<<1)|!!run_level]+ctx->run_bits[run_level];
459 460 461 462 463 464 465 466
            last_non_zero = i;
        }
    }
    return bits;
}

static av_always_inline void dnxhd_get_blocks(DNXHDEncContext *ctx, int mb_x, int mb_y)
{
467 468 469 470 471
    const int bs = ctx->block_width_l2;
    const int bw = 1 << bs;
    const uint8_t *ptr_y = ctx->thread[0]->src[0] + ((mb_y << 4) * ctx->m.linesize)   + (mb_x << bs+1);
    const uint8_t *ptr_u = ctx->thread[0]->src[1] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
    const uint8_t *ptr_v = ctx->thread[0]->src[2] + ((mb_y << 4) * ctx->m.uvlinesize) + (mb_x << bs);
472 473
    DSPContext *dsp = &ctx->m.dsp;

474 475 476 477
    dsp->get_pixels(ctx->blocks[0], ptr_y,      ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[1], ptr_y + bw, ctx->m.linesize);
    dsp->get_pixels(ctx->blocks[2], ptr_u,      ctx->m.uvlinesize);
    dsp->get_pixels(ctx->blocks[3], ptr_v,      ctx->m.uvlinesize);
478

479
    if (mb_y+1 == ctx->m.mb_height && ctx->m.avctx->height == 1080) {
480
        if (ctx->interlaced) {
481 482 483 484
            ctx->get_pixels_8x4_sym(ctx->blocks[4], ptr_y + ctx->dct_y_offset,      ctx->m.linesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[5], ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,     ctx->m.uvlinesize);
            ctx->get_pixels_8x4_sym(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,     ctx->m.uvlinesize);
Baptiste Coudurier's avatar
Baptiste Coudurier committed
485
        } else {
486 487 488 489
            dsp->clear_block(ctx->blocks[4]);
            dsp->clear_block(ctx->blocks[5]);
            dsp->clear_block(ctx->blocks[6]);
            dsp->clear_block(ctx->blocks[7]);
Baptiste Coudurier's avatar
Baptiste Coudurier committed
490
        }
491
    } else {
492 493 494 495
        dsp->get_pixels(ctx->blocks[4], ptr_y + ctx->dct_y_offset,      ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[5], ptr_y + ctx->dct_y_offset + bw, ctx->m.linesize);
        dsp->get_pixels(ctx->blocks[6], ptr_u + ctx->dct_uv_offset,     ctx->m.uvlinesize);
        dsp->get_pixels(ctx->blocks[7], ptr_v + ctx->dct_uv_offset,     ctx->m.uvlinesize);
496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511
    }
}

static av_always_inline int dnxhd_switch_matrix(DNXHDEncContext *ctx, int i)
{
    if (i&2) {
        ctx->m.q_intra_matrix16 = ctx->qmatrix_c16;
        ctx->m.q_intra_matrix   = ctx->qmatrix_c;
        return 1 + (i&1);
    } else {
        ctx->m.q_intra_matrix16 = ctx->qmatrix_l16;
        ctx->m.q_intra_matrix   = ctx->qmatrix_l;
        return 0;
    }
}

512
static int dnxhd_calc_bits_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
513
{
514 515 516
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    int qscale = ctx->qscale;
517
    LOCAL_ALIGNED_16(DCTELEM, block, [64]);
518
    ctx = ctx->thread[threadnr];
519

520 521
    ctx->m.last_dc[0] =
    ctx->m.last_dc[1] =
522
    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537

    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb = mb_y * ctx->m.mb_width + mb_x;
        int ssd     = 0;
        int ac_bits = 0;
        int dc_bits = 0;
        int i;

        dnxhd_get_blocks(ctx, mb_x, mb_y);

        for (i = 0; i < 8; i++) {
            DCTELEM *src_block = ctx->blocks[i];
            int overflow, nbits, diff, last_index;
            int n = dnxhd_switch_matrix(ctx, i);

538
            memcpy(block, src_block, 64*sizeof(*block));
539
            last_index = ctx->m.dct_quantize(&ctx->m, block, i, qscale, &overflow);
540 541 542 543 544
            ac_bits += dnxhd_calc_ac_bits(ctx, block, last_index);

            diff = block[0] - ctx->m.last_dc[n];
            if (diff < 0) nbits = av_log2_16bit(-2*diff);
            else          nbits = av_log2_16bit( 2*diff);
545 546

            assert(nbits < ctx->cid_table->bit_depth + 4);
547 548 549 550 551 552 553 554
            dc_bits += ctx->cid_table->dc_bits[nbits] + nbits;

            ctx->m.last_dc[n] = block[0];

            if (avctx->mb_decision == FF_MB_DECISION_RD || !RC_VARIANCE) {
                dnxhd_unquantize_c(ctx, block, i, qscale, last_index);
                ctx->m.dsp.idct(block);
                ssd += dnxhd_ssd_block(block, src_block);
555 556
            }
        }
557 558 559
        ctx->mb_rc[qscale][mb].ssd = ssd;
        ctx->mb_rc[qscale][mb].bits = ac_bits+dc_bits+12+8*ctx->vlc_bits[0];
    }
560 561 562
    return 0;
}

563
static int dnxhd_encode_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
564
{
565 566 567 568
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    ctx = ctx->thread[threadnr];
    init_put_bits(&ctx->m.pb, (uint8_t *)arg + 640 + ctx->slice_offs[jobnr], ctx->slice_size[jobnr]);
569

570 571
    ctx->m.last_dc[0] =
    ctx->m.last_dc[1] =
572
    ctx->m.last_dc[2] = 1 << (ctx->cid_table->bit_depth + 2);
573 574 575 576 577 578 579 580 581 582 583
    for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
        unsigned mb = mb_y * ctx->m.mb_width + mb_x;
        int qscale = ctx->mb_qscale[mb];
        int i;

        put_bits(&ctx->m.pb, 12, qscale<<1);

        dnxhd_get_blocks(ctx, mb_x, mb_y);

        for (i = 0; i < 8; i++) {
            DCTELEM *block = ctx->blocks[i];
584 585 586
            int overflow, n = dnxhd_switch_matrix(ctx, i);
            int last_index = ctx->m.dct_quantize(&ctx->m, block, i,
                                                 qscale, &overflow);
587 588 589
            //START_TIMER;
            dnxhd_encode_block(ctx, block, last_index, n);
            //STOP_TIMER("encode_block");
590
        }
591 592 593
    }
    if (put_bits_count(&ctx->m.pb)&31)
        put_bits(&ctx->m.pb, 32-(put_bits_count(&ctx->m.pb)&31), 0);
594 595 596 597
    flush_put_bits(&ctx->m.pb);
    return 0;
}

598
static void dnxhd_setup_threads_slices(DNXHDEncContext *ctx)
599 600
{
    int mb_y, mb_x;
601 602 603 604
    int offset = 0;
    for (mb_y = 0; mb_y < ctx->m.mb_height; mb_y++) {
        int thread_size;
        ctx->slice_offs[mb_y] = offset;
605 606 607 608 609 610 611 612
        ctx->slice_size[mb_y] = 0;
        for (mb_x = 0; mb_x < ctx->m.mb_width; mb_x++) {
            unsigned mb = mb_y * ctx->m.mb_width + mb_x;
            ctx->slice_size[mb_y] += ctx->mb_bits[mb];
        }
        ctx->slice_size[mb_y] = (ctx->slice_size[mb_y]+31)&~31;
        ctx->slice_size[mb_y] >>= 3;
        thread_size = ctx->slice_size[mb_y];
613 614 615 616
        offset += thread_size;
    }
}

617
static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, int jobnr, int threadnr)
618
{
619 620 621
    DNXHDEncContext *ctx = avctx->priv_data;
    int mb_y = jobnr, mb_x;
    ctx = ctx->thread[threadnr];
622 623 624 625 626
    if (ctx->cid_table->bit_depth == 8) {
        uint8_t *pix = ctx->thread[0]->src[0] + ((mb_y<<4) * ctx->m.linesize);
        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x, pix += 16) {
            unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
            int sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize);
627
            int varc = (ctx->m.dsp.pix_norm1(pix, ctx->m.linesize) - (((unsigned)sum*sum)>>8)+128)>>8;
628 629 630 631 632 633 634 635 636 637 638
            ctx->mb_cmp[mb].value = varc;
            ctx->mb_cmp[mb].mb = mb;
        }
    } else { // 10-bit
        int const linesize = ctx->m.linesize >> 1;
        for (mb_x = 0; mb_x < ctx->m.mb_width; ++mb_x) {
            uint16_t *pix = (uint16_t*)ctx->thread[0]->src[0] + ((mb_y << 4) * linesize) + (mb_x << 4);
            unsigned mb  = mb_y * ctx->m.mb_width + mb_x;
            int sum = 0;
            int sqsum = 0;
            int mean, sqmean;
639
            int i, j;
640
            // Macroblocks are 16x16 pixels, unlike DCT blocks which are 8x8.
641 642
            for (i = 0; i < 16; ++i) {
                for (j = 0; j < 16; ++j) {
643 644 645 646 647 648 649 650 651 652 653 654 655
                    // Turn 16-bit pixels into 10-bit ones.
                    int const sample = (unsigned)pix[j] >> 6;
                    sum += sample;
                    sqsum += sample * sample;
                    // 2^10 * 2^10 * 16 * 16 = 2^28, which is less than INT_MAX
                }
                pix += linesize;
            }
            mean = sum >> 8; // 16*16 == 2^8
            sqmean = sqsum >> 8;
            ctx->mb_cmp[mb].value = sqmean - mean * mean;
            ctx->mb_cmp[mb].mb = mb;
        }
656
    }
657 658 659 660 661
    return 0;
}

static int dnxhd_encode_rdo(AVCodecContext *avctx, DNXHDEncContext *ctx)
{
662 663
    int lambda, up_step, down_step;
    int last_lower = INT_MAX, last_higher = 0;
664 665 666 667
    int x, y, q;

    for (q = 1; q < avctx->qmax; q++) {
        ctx->qscale = q;
668
        avctx->execute2(avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
669
    }
670
    up_step = down_step = 2<<LAMBDA_FRAC_BITS;
671 672 673 674 675
    lambda = ctx->lambda;

    for (;;) {
        int bits = 0;
        int end = 0;
676
        if (lambda == last_higher) {
677
            lambda++;
678
            end = 1; // need to set final qscales/bits
679 680 681 682 683 684 685
        }
        for (y = 0; y < ctx->m.mb_height; y++) {
            for (x = 0; x < ctx->m.mb_width; x++) {
                unsigned min = UINT_MAX;
                int qscale = 1;
                int mb = y*ctx->m.mb_width+x;
                for (q = 1; q < avctx->qmax; q++) {
686 687
                    unsigned score = ctx->mb_rc[q][mb].bits*lambda+
                        ((unsigned)ctx->mb_rc[q][mb].ssd<<LAMBDA_FRAC_BITS);
688 689 690 691 692 693 694 695 696 697 698 699 700
                    if (score < min) {
                        min = score;
                        qscale = q;
                    }
                }
                bits += ctx->mb_rc[qscale][mb].bits;
                ctx->mb_qscale[mb] = qscale;
                ctx->mb_bits[mb] = ctx->mb_rc[qscale][mb].bits;
            }
            bits = (bits+31)&~31; // padding
            if (bits > ctx->frame_bits)
                break;
        }
701
        //av_dlog(ctx->m.avctx, "lambda %d, up %u, down %u, bits %d, frame %d\n",
702
        //        lambda, last_higher, last_lower, bits, ctx->frame_bits);
703 704 705 706 707 708
        if (end) {
            if (bits > ctx->frame_bits)
                return -1;
            break;
        }
        if (bits < ctx->frame_bits) {
709 710 711 712 713
            last_lower = FFMIN(lambda, last_lower);
            if (last_higher != 0)
                lambda = (lambda+last_higher)>>1;
            else
                lambda -= down_step;
714
            down_step = FFMIN((int64_t)down_step*5, INT_MAX);
715 716 717 718
            up_step = 1<<LAMBDA_FRAC_BITS;
            lambda = FFMAX(1, lambda);
            if (lambda == last_lower)
                break;
719
        } else {
720 721 722
            last_higher = FFMAX(lambda, last_higher);
            if (last_lower != INT_MAX)
                lambda = (lambda+last_lower)>>1;
723 724
            else if ((int64_t)lambda + up_step > INT_MAX)
                return -1;
725 726
            else
                lambda += up_step;
727
            up_step = FFMIN((int64_t)up_step*5, INT_MAX);
728
            down_step = 1<<LAMBDA_FRAC_BITS;
729 730
        }
    }
731
    //av_dlog(ctx->m.avctx, "out lambda %d\n", lambda);
732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750
    ctx->lambda = lambda;
    return 0;
}

static int dnxhd_find_qscale(DNXHDEncContext *ctx)
{
    int bits = 0;
    int up_step = 1;
    int down_step = 1;
    int last_higher = 0;
    int last_lower = INT_MAX;
    int qscale;
    int x, y;

    qscale = ctx->qscale;
    for (;;) {
        bits = 0;
        ctx->qscale = qscale;
        // XXX avoid recalculating bits
751
        ctx->m.avctx->execute2(ctx->m.avctx, dnxhd_calc_bits_thread, NULL, NULL, ctx->m.mb_height);
752 753 754 755 756 757 758
        for (y = 0; y < ctx->m.mb_height; y++) {
            for (x = 0; x < ctx->m.mb_width; x++)
                bits += ctx->mb_rc[qscale][y*ctx->m.mb_width+x].bits;
            bits = (bits+31)&~31; // padding
            if (bits > ctx->frame_bits)
                break;
        }
759
        //av_dlog(ctx->m.avctx, "%d, qscale %d, bits %d, frame %d, higher %d, lower %d\n",
760 761 762
        //        ctx->m.avctx->frame_number, qscale, bits, ctx->frame_bits, last_higher, last_lower);
        if (bits < ctx->frame_bits) {
            if (qscale == 1)
763
                return 1;
764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788
            if (last_higher == qscale - 1) {
                qscale = last_higher;
                break;
            }
            last_lower = FFMIN(qscale, last_lower);
            if (last_higher != 0)
                qscale = (qscale+last_higher)>>1;
            else
                qscale -= down_step++;
            if (qscale < 1)
                qscale = 1;
            up_step = 1;
        } else {
            if (last_lower == qscale + 1)
                break;
            last_higher = FFMAX(qscale, last_higher);
            if (last_lower != INT_MAX)
                qscale = (qscale+last_lower)>>1;
            else
                qscale += up_step++;
            down_step = 1;
            if (qscale >= ctx->m.avctx->qmax)
                return -1;
        }
    }
789
    //av_dlog(ctx->m.avctx, "out qscale %d\n", qscale);
790 791 792 793
    ctx->qscale = qscale;
    return 0;
}

794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836
#define BUCKET_BITS 8
#define RADIX_PASSES 4
#define NBUCKETS (1 << BUCKET_BITS)

static inline int get_bucket(int value, int shift)
{
    value >>= shift;
    value &= NBUCKETS - 1;
    return NBUCKETS - 1 - value;
}

static void radix_count(const RCCMPEntry *data, int size, int buckets[RADIX_PASSES][NBUCKETS])
{
    int i, j;
    memset(buckets, 0, sizeof(buckets[0][0]) * RADIX_PASSES * NBUCKETS);
    for (i = 0; i < size; i++) {
        int v = data[i].value;
        for (j = 0; j < RADIX_PASSES; j++) {
            buckets[j][get_bucket(v, 0)]++;
            v >>= BUCKET_BITS;
        }
        assert(!v);
    }
    for (j = 0; j < RADIX_PASSES; j++) {
        int offset = size;
        for (i = NBUCKETS - 1; i >= 0; i--)
            buckets[j][i] = offset -= buckets[j][i];
        assert(!buckets[j][0]);
    }
}

static void radix_sort_pass(RCCMPEntry *dst, const RCCMPEntry *data, int size, int buckets[NBUCKETS], int pass)
{
    int shift = pass * BUCKET_BITS;
    int i;
    for (i = 0; i < size; i++) {
        int v = get_bucket(data[i].value, shift);
        int pos = buckets[v]++;
        dst[pos] = data[i];
    }
}

static void radix_sort(RCCMPEntry *data, int size)
837
{
838 839 840 841 842 843 844 845 846 847
    int buckets[RADIX_PASSES][NBUCKETS];
    RCCMPEntry *tmp = av_malloc(sizeof(*tmp) * size);
    radix_count(data, size, buckets);
    radix_sort_pass(tmp, data, size, buckets[0], 0);
    radix_sort_pass(data, tmp, size, buckets[1], 1);
    if (buckets[2][NBUCKETS - 1] || buckets[3][NBUCKETS - 1]) {
        radix_sort_pass(tmp, data, size, buckets[2], 2);
        radix_sort_pass(data, tmp, size, buckets[3], 3);
    }
    av_free(tmp);
848 849
}

850
static int dnxhd_encode_fast(AVCodecContext *avctx, DNXHDEncContext *ctx)
851 852
{
    int max_bits = 0;
853 854
    int ret, x, y;
    if ((ret = dnxhd_find_qscale(ctx)) < 0)
855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872
        return -1;
    for (y = 0; y < ctx->m.mb_height; y++) {
        for (x = 0; x < ctx->m.mb_width; x++) {
            int mb = y*ctx->m.mb_width+x;
            int delta_bits;
            ctx->mb_qscale[mb] = ctx->qscale;
            ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale][mb].bits;
            max_bits += ctx->mb_rc[ctx->qscale][mb].bits;
            if (!RC_VARIANCE) {
                delta_bits = ctx->mb_rc[ctx->qscale][mb].bits-ctx->mb_rc[ctx->qscale+1][mb].bits;
                ctx->mb_cmp[mb].mb = mb;
                ctx->mb_cmp[mb].value = delta_bits ?
                    ((ctx->mb_rc[ctx->qscale][mb].ssd-ctx->mb_rc[ctx->qscale+1][mb].ssd)*100)/delta_bits
                    : INT_MIN; //avoid increasing qscale
            }
        }
        max_bits += 31; //worst padding
    }
873
    if (!ret) {
874
        if (RC_VARIANCE)
875
            avctx->execute2(avctx, dnxhd_mb_var_thread, NULL, NULL, ctx->m.mb_height);
876
        radix_sort(ctx->mb_cmp, ctx->m.mb_num);
877 878 879 880 881 882 883 884 885 886
        for (x = 0; x < ctx->m.mb_num && max_bits > ctx->frame_bits; x++) {
            int mb = ctx->mb_cmp[x].mb;
            max_bits -= ctx->mb_rc[ctx->qscale][mb].bits - ctx->mb_rc[ctx->qscale+1][mb].bits;
            ctx->mb_qscale[mb] = ctx->qscale+1;
            ctx->mb_bits[mb] = ctx->mb_rc[ctx->qscale+1][mb].bits;
        }
    }
    return 0;
}

Michael Niedermayer's avatar
Michael Niedermayer committed
887
static void dnxhd_load_picture(DNXHDEncContext *ctx, const AVFrame *frame)
888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906
{
    int i;

    for (i = 0; i < 3; i++) {
        ctx->frame.data[i]     = frame->data[i];
        ctx->frame.linesize[i] = frame->linesize[i];
    }

    for (i = 0; i < ctx->m.avctx->thread_count; i++) {
        ctx->thread[i]->m.linesize    = ctx->frame.linesize[0]<<ctx->interlaced;
        ctx->thread[i]->m.uvlinesize  = ctx->frame.linesize[1]<<ctx->interlaced;
        ctx->thread[i]->dct_y_offset  = ctx->m.linesize  *8;
        ctx->thread[i]->dct_uv_offset = ctx->m.uvlinesize*8;
    }

    ctx->frame.interlaced_frame = frame->interlaced_frame;
    ctx->cur_field = frame->interlaced_frame && !frame->top_field_first;
}

907 908
static int dnxhd_encode_picture(AVCodecContext *avctx, AVPacket *pkt,
                                const AVFrame *frame, int *got_packet)
909 910 911 912
{
    DNXHDEncContext *ctx = avctx->priv_data;
    int first_field = 1;
    int offset, i, ret;
913
    uint8_t *buf;
914

915
    if ((ret = ff_alloc_packet(pkt, ctx->cid_table->frame_size)) < 0) {
916
        av_log(avctx, AV_LOG_ERROR, "output buffer is too small to compress picture\n");
917
        return ret;
918
    }
919
    buf = pkt->data;
920

921
    dnxhd_load_picture(ctx, frame);
922 923 924 925 926 927 928 929 930 931 932 933 934

 encode_coding_unit:
    for (i = 0; i < 3; i++) {
        ctx->src[i] = ctx->frame.data[i];
        if (ctx->interlaced && ctx->cur_field)
            ctx->src[i] += ctx->frame.linesize[i];
    }

    dnxhd_write_header(avctx, buf);

    if (avctx->mb_decision == FF_MB_DECISION_RD)
        ret = dnxhd_encode_rdo(avctx, ctx);
    else
935
        ret = dnxhd_encode_fast(avctx, ctx);
936
    if (ret < 0) {
937 938
        av_log(avctx, AV_LOG_ERROR,
               "picture could not fit ratecontrol constraints, increase qmax\n");
939 940 941
        return -1;
    }

942
    dnxhd_setup_threads_slices(ctx);
943 944 945 946 947 948 949 950

    offset = 0;
    for (i = 0; i < ctx->m.mb_height; i++) {
        AV_WB32(ctx->msip + i * 4, offset);
        offset += ctx->slice_size[i];
        assert(!(ctx->slice_size[i] & 3));
    }

951
    avctx->execute2(avctx, dnxhd_encode_thread, buf, NULL, ctx->m.mb_height);
952

953 954 955
    assert(640 + offset + 4 <= ctx->cid_table->coding_unit_size);
    memset(buf + 640 + offset, 0, ctx->cid_table->coding_unit_size - 4 - offset - 640);

956 957 958 959 960 961 962 963 964
    AV_WB32(buf + ctx->cid_table->coding_unit_size - 4, 0x600DC0DE); // EOF

    if (ctx->interlaced && first_field) {
        first_field     = 0;
        ctx->cur_field ^= 1;
        buf      += ctx->cid_table->coding_unit_size;
        goto encode_coding_unit;
    }

965 966
    ctx->frame.quality = ctx->qscale*FF_QP2LAMBDA;

967 968 969
    pkt->flags |= AV_PKT_FLAG_KEY;
    *got_packet = 1;
    return 0;
970 971 972 973 974
}

static int dnxhd_encode_end(AVCodecContext *avctx)
{
    DNXHDEncContext *ctx = avctx->priv_data;
975
    int max_level = 1<<(ctx->cid_table->bit_depth+2);
976 977
    int i;

978 979 980 981
    av_free(ctx->vlc_codes-max_level*2);
    av_free(ctx->vlc_bits -max_level*2);
    av_freep(&ctx->run_codes);
    av_freep(&ctx->run_bits);
982 983 984 985 986 987

    av_freep(&ctx->mb_bits);
    av_freep(&ctx->mb_qscale);
    av_freep(&ctx->mb_rc);
    av_freep(&ctx->mb_cmp);
    av_freep(&ctx->slice_size);
988
    av_freep(&ctx->slice_offs);
989 990 991 992 993 994 995 996 997 998 999 1000

    av_freep(&ctx->qmatrix_c);
    av_freep(&ctx->qmatrix_l);
    av_freep(&ctx->qmatrix_c16);
    av_freep(&ctx->qmatrix_l16);

    for (i = 1; i < avctx->thread_count; i++)
        av_freep(&ctx->thread[i]);

    return 0;
}

1001
AVCodec ff_dnxhd_encoder = {
1002 1003 1004 1005 1006
    .name           = "dnxhd",
    .type           = AVMEDIA_TYPE_VIDEO,
    .id             = CODEC_ID_DNXHD,
    .priv_data_size = sizeof(DNXHDEncContext),
    .init           = dnxhd_encode_init,
1007
    .encode2        = dnxhd_encode_picture,
1008
    .close          = dnxhd_encode_end,
1009 1010 1011 1012 1013 1014
    .capabilities   = CODEC_CAP_SLICE_THREADS,
    .pix_fmts       = (const enum PixelFormat[]){ PIX_FMT_YUV422P,
                                                  PIX_FMT_YUV422P10,
                                                  PIX_FMT_NONE },
    .long_name      = NULL_IF_CONFIG_SMALL("VC3/DNxHD"),
    .priv_class     = &class,
1015
};