/*
 * H.26L/H.264/AVC/JVT/14496-10/... decoder
 * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * H.264 / AVC / MPEG-4 part10 macroblock decoding
 */

#include <stdint.h>

#include "config.h"

#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
#include "avcodec.h"
#include "h264dec.h"
#include "h264_ps.h"
#include "qpeldsp.h"
#include "thread.h"

39
static inline int get_lowest_part_list_y(H264SliceContext *sl,
40
                                         int n, int height, int y_offset, int list)
41
{
42
    int raw_my             = sl->mv_cache[list][scan8[n]][1];
43 44 45 46
    int filter_height_down = (raw_my & 3) ? 3 : 0;
    int full_my            = (raw_my >> 2) + y_offset;
    int bottom             = full_my + filter_height_down + height;

47 48 49
    av_assert2(height >= 0);

    return FFMAX(0, bottom);
50 51
}

52
static inline void get_lowest_part_y(const H264Context *h, H264SliceContext *sl,
53
                                     int16_t refs[2][48], int n,
54 55 56 57 58
                                     int height, int y_offset, int list0,
                                     int list1, int *nrefs)
{
    int my;

59
    y_offset += 16 * (sl->mb_y >> MB_FIELD(sl));
60 61

    if (list0) {
62
        int ref_n = sl->ref_cache[0][scan8[n]];
63
        H264Ref *ref = &sl->ref_list[0][ref_n];
64 65 66 67

        // Error resilience puts the current picture in the ref list.
        // Don't try to wait on these as it will cause a deadlock.
        // Fields can wait on each other, though.
68
        if (ref->parent->tf.progress->data != h->cur_pic.tf.progress->data ||
69
            (ref->reference & 3) != h->picture_structure) {
70
            my = get_lowest_part_list_y(sl, n, height, y_offset, 0);
71 72 73 74 75 76 77
            if (refs[0][ref_n] < 0)
                nrefs[0] += 1;
            refs[0][ref_n] = FFMAX(refs[0][ref_n], my);
        }
    }

    if (list1) {
78
        int ref_n    = sl->ref_cache[1][scan8[n]];
79
        H264Ref *ref = &sl->ref_list[1][ref_n];
80

81
        if (ref->parent->tf.progress->data != h->cur_pic.tf.progress->data ||
82
            (ref->reference & 3) != h->picture_structure) {
83
            my = get_lowest_part_list_y(sl, n, height, y_offset, 1);
84 85 86 87 88 89 90 91 92 93
            if (refs[1][ref_n] < 0)
                nrefs[1] += 1;
            refs[1][ref_n] = FFMAX(refs[1][ref_n], my);
        }
    }
}

/**
 * Wait until all reference frames are available for MC operations.
 *
94
 * @param h the H.264 context
95
 */
96
static void await_references(const H264Context *h, H264SliceContext *sl)
97
{
98
    const int mb_xy   = sl->mb_xy;
99
    const int mb_type = h->cur_pic.mb_type[mb_xy];
100
    int16_t refs[2][48];
101 102 103 104 105 106
    int nrefs[2] = { 0 };
    int ref, list;

    memset(refs, -1, sizeof(refs));

    if (IS_16X16(mb_type)) {
107
        get_lowest_part_y(h, sl, refs, 0, 16, 0,
108 109
                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
    } else if (IS_16X8(mb_type)) {
110
        get_lowest_part_y(h, sl, refs, 0, 8, 0,
111
                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
112
        get_lowest_part_y(h, sl, refs, 8, 8, 8,
113 114
                          IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
    } else if (IS_8X16(mb_type)) {
115
        get_lowest_part_y(h, sl, refs, 0, 16, 0,
116
                          IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1), nrefs);
117
        get_lowest_part_y(h, sl, refs, 4, 16, 0,
118 119 120 121
                          IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1), nrefs);
    } else {
        int i;

122
        av_assert2(IS_8X8(mb_type));
123 124

        for (i = 0; i < 4; i++) {
125
            const int sub_mb_type = sl->sub_mb_type[i];
126 127 128 129
            const int n           = 4 * i;
            int y_offset          = (i & 2) << 2;

            if (IS_SUB_8X8(sub_mb_type)) {
130
                get_lowest_part_y(h, sl, refs, n, 8, y_offset,
131 132 133 134
                                  IS_DIR(sub_mb_type, 0, 0),
                                  IS_DIR(sub_mb_type, 0, 1),
                                  nrefs);
            } else if (IS_SUB_8X4(sub_mb_type)) {
135
                get_lowest_part_y(h, sl, refs, n, 4, y_offset,
136 137 138
                                  IS_DIR(sub_mb_type, 0, 0),
                                  IS_DIR(sub_mb_type, 0, 1),
                                  nrefs);
139
                get_lowest_part_y(h, sl, refs, n + 2, 4, y_offset + 4,
140 141 142 143
                                  IS_DIR(sub_mb_type, 0, 0),
                                  IS_DIR(sub_mb_type, 0, 1),
                                  nrefs);
            } else if (IS_SUB_4X8(sub_mb_type)) {
144
                get_lowest_part_y(h, sl, refs, n, 8, y_offset,
145 146 147
                                  IS_DIR(sub_mb_type, 0, 0),
                                  IS_DIR(sub_mb_type, 0, 1),
                                  nrefs);
148
                get_lowest_part_y(h, sl, refs, n + 1, 8, y_offset,
149 150 151 152 153
                                  IS_DIR(sub_mb_type, 0, 0),
                                  IS_DIR(sub_mb_type, 0, 1),
                                  nrefs);
            } else {
                int j;
154
                av_assert2(IS_SUB_4X4(sub_mb_type));
155 156
                for (j = 0; j < 4; j++) {
                    int sub_y_offset = y_offset + 2 * (j & 2);
157
                    get_lowest_part_y(h, sl, refs, n + j, 4, sub_y_offset,
158 159 160 161 162 163 164 165
                                      IS_DIR(sub_mb_type, 0, 0),
                                      IS_DIR(sub_mb_type, 0, 1),
                                      nrefs);
                }
            }
        }
    }

166
    for (list = sl->list_count - 1; list >= 0; list--)
167 168 169
        for (ref = 0; ref < 48 && nrefs[list]; ref++) {
            int row = refs[list][ref];
            if (row >= 0) {
170
                H264Ref *ref_pic  = &sl->ref_list[list][ref];
171
                int ref_field         = ref_pic->reference - 1;
172
                int ref_field_picture = ref_pic->parent->field_picture;
173 174
                int pic_height        = 16 * h->mb_height >> ref_field_picture;

175
                row <<= MB_MBAFF(sl);
176 177 178
                nrefs[list]--;

                if (!FIELD_PICTURE(h) && ref_field_picture) { // frame referencing two fields
179
                    av_assert2((ref_pic->parent->reference & 3) == 3);
180
                    ff_thread_await_progress(&ref_pic->parent->tf,
181 182 183
                                             FFMIN((row >> 1) - !(row & 1),
                                                   pic_height - 1),
                                             1);
184
                    ff_thread_await_progress(&ref_pic->parent->tf,
185 186 187
                                             FFMIN((row >> 1), pic_height - 1),
                                             0);
                } else if (FIELD_PICTURE(h) && !ref_field_picture) { // field referencing one field of a frame
188
                    ff_thread_await_progress(&ref_pic->parent->tf,
189 190 191 192
                                             FFMIN(row * 2 + ref_field,
                                                   pic_height - 1),
                                             0);
                } else if (FIELD_PICTURE(h)) {
193
                    ff_thread_await_progress(&ref_pic->parent->tf,
194 195 196
                                             FFMIN(row, pic_height - 1),
                                             ref_field);
                } else {
197
                    ff_thread_await_progress(&ref_pic->parent->tf,
198 199 200 201 202 203 204
                                             FFMIN(row, pic_height - 1),
                                             0);
                }
            }
        }
}

205
static av_always_inline void mc_dir_part(const H264Context *h, H264SliceContext *sl,
206
                                         H264Ref *pic,
207 208 209 210 211
                                         int n, int square, int height,
                                         int delta, int list,
                                         uint8_t *dest_y, uint8_t *dest_cb,
                                         uint8_t *dest_cr,
                                         int src_x_offset, int src_y_offset,
212
                                         const qpel_mc_func *qpix_op,
213 214 215
                                         h264_chroma_mc_func chroma_op,
                                         int pixel_shift, int chroma_idc)
{
216 217
    const int mx      = sl->mv_cache[list][scan8[n]][0] + src_x_offset * 8;
    int my            = sl->mv_cache[list][scan8[n]][1] + src_y_offset * 8;
218
    const int luma_xy = (mx & 3) + ((my & 3) << 2);
219
    ptrdiff_t offset  = (mx >> 2) * (1 << pixel_shift) + (my >> 2) * sl->mb_linesize;
220
    uint8_t *src_y    = pic->data[0] + offset;
221 222 223 224 225 226 227
    uint8_t *src_cb, *src_cr;
    int extra_width  = 0;
    int extra_height = 0;
    int emu = 0;
    const int full_mx    = mx >> 2;
    const int full_my    = my >> 2;
    const int pic_width  = 16 * h->mb_width;
228
    const int pic_height = 16 * h->mb_height >> MB_FIELD(sl);
229 230 231 232 233 234 235 236 237 238 239
    int ysh;

    if (mx & 7)
        extra_width -= 3;
    if (my & 7)
        extra_height -= 3;

    if (full_mx                <          0 - extra_width  ||
        full_my                <          0 - extra_height ||
        full_mx + 16 /*FIXME*/ > pic_width  + extra_width  ||
        full_my + 16 /*FIXME*/ > pic_height + extra_height) {
240
        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
241 242
                                 src_y - (2 << pixel_shift) - 2 * sl->mb_linesize,
                                 sl->mb_linesize, sl->mb_linesize,
243 244
                                 16 + 5, 16 + 5 /*FIXME*/, full_mx - 2,
                                 full_my - 2, pic_width, pic_height);
245
        src_y = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
246 247 248
        emu   = 1;
    }

249
    qpix_op[luma_xy](dest_y, src_y, sl->mb_linesize); // FIXME try variable height perhaps?
250
    if (!square)
251
        qpix_op[luma_xy](dest_y + delta, src_y + delta, sl->mb_linesize);
252

253
    if (CONFIG_GRAY && h->flags & AV_CODEC_FLAG_GRAY)
254 255 256
        return;

    if (chroma_idc == 3 /* yuv444 */) {
257
        src_cb = pic->data[1] + offset;
258
        if (emu) {
259
            h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
260 261
                                     src_cb - (2 << pixel_shift) - 2 * sl->mb_linesize,
                                     sl->mb_linesize, sl->mb_linesize,
262 263 264
                                     16 + 5, 16 + 5 /*FIXME*/,
                                     full_mx - 2, full_my - 2,
                                     pic_width, pic_height);
265
            src_cb = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
266
        }
267
        qpix_op[luma_xy](dest_cb, src_cb, sl->mb_linesize); // FIXME try variable height perhaps?
268
        if (!square)
269
            qpix_op[luma_xy](dest_cb + delta, src_cb + delta, sl->mb_linesize);
270

271
        src_cr = pic->data[2] + offset;
272
        if (emu) {
273
            h->vdsp.emulated_edge_mc(sl->edge_emu_buffer,
274 275
                                     src_cr - (2 << pixel_shift) - 2 * sl->mb_linesize,
                                     sl->mb_linesize, sl->mb_linesize,
276 277 278
                                     16 + 5, 16 + 5 /*FIXME*/,
                                     full_mx - 2, full_my - 2,
                                     pic_width, pic_height);
279
            src_cr = sl->edge_emu_buffer + (2 << pixel_shift) + 2 * sl->mb_linesize;
280
        }
281
        qpix_op[luma_xy](dest_cr, src_cr, sl->mb_linesize); // FIXME try variable height perhaps?
282
        if (!square)
283
            qpix_op[luma_xy](dest_cr + delta, src_cr + delta, sl->mb_linesize);
284 285 286 287
        return;
    }

    ysh = 3 - (chroma_idc == 2 /* yuv422 */);
288
    if (chroma_idc == 1 /* yuv420 */ && MB_FIELD(sl)) {
289
        // chroma offset when predicting from a field of opposite parity
290
        my  += 2 * ((sl->mb_y & 1) - (pic->reference - 1));
291 292 293
        emu |= (my >> 3) < 0 || (my >> 3) + 8 >= (pic_height >> 1);
    }

294
    src_cb = pic->data[1] + ((mx >> 3) * (1 << pixel_shift)) +
295
             (my >> ysh) * sl->mb_uvlinesize;
296
    src_cr = pic->data[2] + ((mx >> 3) * (1 << pixel_shift)) +
297
             (my >> ysh) * sl->mb_uvlinesize;
298 299

    if (emu) {
300
        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cb,
301
                                 sl->mb_uvlinesize, sl->mb_uvlinesize,
302 303
                                 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
                                 pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
304
        src_cb = sl->edge_emu_buffer;
305
    }
306
    chroma_op(dest_cb, src_cb, sl->mb_uvlinesize,
307
              height >> (chroma_idc == 1 /* yuv420 */),
308
              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
309 310

    if (emu) {
311
        h->vdsp.emulated_edge_mc(sl->edge_emu_buffer, src_cr,
312
                                 sl->mb_uvlinesize, sl->mb_uvlinesize,
313 314
                                 9, 8 * chroma_idc + 1, (mx >> 3), (my >> ysh),
                                 pic_width >> 1, pic_height >> (chroma_idc == 1 /* yuv420 */));
315
        src_cr = sl->edge_emu_buffer;
316
    }
317
    chroma_op(dest_cr, src_cr, sl->mb_uvlinesize, height >> (chroma_idc == 1 /* yuv420 */),
318
              mx & 7, ((unsigned)my << (chroma_idc == 2 /* yuv422 */)) & 7);
319 320
}

321
static av_always_inline void mc_part_std(const H264Context *h, H264SliceContext *sl,
322
                                         int n, int square,
323 324 325 326
                                         int height, int delta,
                                         uint8_t *dest_y, uint8_t *dest_cb,
                                         uint8_t *dest_cr,
                                         int x_offset, int y_offset,
327
                                         const qpel_mc_func *qpix_put,
328
                                         h264_chroma_mc_func chroma_put,
329
                                         const qpel_mc_func *qpix_avg,
330 331 332 333
                                         h264_chroma_mc_func chroma_avg,
                                         int list0, int list1,
                                         int pixel_shift, int chroma_idc)
{
334
    const qpel_mc_func *qpix_op   = qpix_put;
335 336
    h264_chroma_mc_func chroma_op = chroma_put;

337
    dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
338
    if (chroma_idc == 3 /* yuv444 */) {
339 340
        dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
        dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
341
    } else if (chroma_idc == 2 /* yuv422 */) {
342 343
        dest_cb += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
        dest_cr += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
344
    } else { /* yuv420 */
345 346
        dest_cb += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
        dest_cr += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
347
    }
348
    x_offset += 8 * sl->mb_x;
349
    y_offset += 8 * (sl->mb_y >> MB_FIELD(sl));
350 351

    if (list0) {
352
        H264Ref *ref = &sl->ref_list[0][sl->ref_cache[0][scan8[n]]];
353
        mc_dir_part(h, sl, ref, n, square, height, delta, 0,
354 355 356 357 358 359 360 361
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_op, chroma_op, pixel_shift, chroma_idc);

        qpix_op   = qpix_avg;
        chroma_op = chroma_avg;
    }

    if (list1) {
362
        H264Ref *ref = &sl->ref_list[1][sl->ref_cache[1][scan8[n]]];
363
        mc_dir_part(h, sl, ref, n, square, height, delta, 1,
364 365 366 367 368
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_op, chroma_op, pixel_shift, chroma_idc);
    }
}

369
static av_always_inline void mc_part_weighted(const H264Context *h, H264SliceContext *sl,
370
                                              int n, int square,
371 372 373 374
                                              int height, int delta,
                                              uint8_t *dest_y, uint8_t *dest_cb,
                                              uint8_t *dest_cr,
                                              int x_offset, int y_offset,
375
                                              const qpel_mc_func *qpix_put,
376 377 378 379 380 381 382 383 384 385
                                              h264_chroma_mc_func chroma_put,
                                              h264_weight_func luma_weight_op,
                                              h264_weight_func chroma_weight_op,
                                              h264_biweight_func luma_weight_avg,
                                              h264_biweight_func chroma_weight_avg,
                                              int list0, int list1,
                                              int pixel_shift, int chroma_idc)
{
    int chroma_height;

386
    dest_y += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
387 388 389 390
    if (chroma_idc == 3 /* yuv444 */) {
        chroma_height     = height;
        chroma_weight_avg = luma_weight_avg;
        chroma_weight_op  = luma_weight_op;
391 392
        dest_cb += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
        dest_cr += (2 * x_offset << pixel_shift) + 2 * y_offset * sl->mb_linesize;
393 394
    } else if (chroma_idc == 2 /* yuv422 */) {
        chroma_height = height;
395 396
        dest_cb      += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
        dest_cr      += (x_offset << pixel_shift) + 2 * y_offset * sl->mb_uvlinesize;
397 398
    } else { /* yuv420 */
        chroma_height = height >> 1;
399 400
        dest_cb      += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
        dest_cr      += (x_offset << pixel_shift) + y_offset * sl->mb_uvlinesize;
401
    }
402
    x_offset += 8 * sl->mb_x;
403
    y_offset += 8 * (sl->mb_y >> MB_FIELD(sl));
404 405 406 407

    if (list0 && list1) {
        /* don't optimize for luma-only case, since B-frames usually
         * use implicit weights => chroma too. */
408 409 410
        uint8_t *tmp_cb = sl->bipred_scratchpad;
        uint8_t *tmp_cr = sl->bipred_scratchpad + (16 << pixel_shift);
        uint8_t *tmp_y  = sl->bipred_scratchpad + 16 * sl->mb_uvlinesize;
411 412
        int refn0       = sl->ref_cache[0][scan8[n]];
        int refn1       = sl->ref_cache[1][scan8[n]];
413

414
        mc_dir_part(h, sl, &sl->ref_list[0][refn0], n, square, height, delta, 0,
415 416 417
                    dest_y, dest_cb, dest_cr,
                    x_offset, y_offset, qpix_put, chroma_put,
                    pixel_shift, chroma_idc);
418
        mc_dir_part(h, sl, &sl->ref_list[1][refn1], n, square, height, delta, 1,
419 420 421 422
                    tmp_y, tmp_cb, tmp_cr,
                    x_offset, y_offset, qpix_put, chroma_put,
                    pixel_shift, chroma_idc);

423 424
        if (sl->pwt.use_weight == 2) {
            int weight0 = sl->pwt.implicit_weight[refn0][refn1][sl->mb_y & 1];
425
            int weight1 = 64 - weight0;
426
            luma_weight_avg(dest_y, tmp_y, sl->mb_linesize,
427
                            height, 5, weight0, weight1, 0);
428
            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
429
                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize,
430
                                  chroma_height, 5, weight0, weight1, 0);
431
                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize,
432 433
                                  chroma_height, 5, weight0, weight1, 0);
            }
434
        } else {
435
            luma_weight_avg(dest_y, tmp_y, sl->mb_linesize, height,
436 437 438 439 440
                            sl->pwt.luma_log2_weight_denom,
                            sl->pwt.luma_weight[refn0][0][0],
                            sl->pwt.luma_weight[refn1][1][0],
                            sl->pwt.luma_weight[refn0][0][1] +
                            sl->pwt.luma_weight[refn1][1][1]);
441
            if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
442
                chroma_weight_avg(dest_cb, tmp_cb, sl->mb_uvlinesize, chroma_height,
443 444 445 446 447
                                  sl->pwt.chroma_log2_weight_denom,
                                  sl->pwt.chroma_weight[refn0][0][0][0],
                                  sl->pwt.chroma_weight[refn1][1][0][0],
                                  sl->pwt.chroma_weight[refn0][0][0][1] +
                                  sl->pwt.chroma_weight[refn1][1][0][1]);
448
                chroma_weight_avg(dest_cr, tmp_cr, sl->mb_uvlinesize, chroma_height,
449 450 451 452 453
                                  sl->pwt.chroma_log2_weight_denom,
                                  sl->pwt.chroma_weight[refn0][0][1][0],
                                  sl->pwt.chroma_weight[refn1][1][1][0],
                                  sl->pwt.chroma_weight[refn0][0][1][1] +
                                  sl->pwt.chroma_weight[refn1][1][1][1]);
454
            }
455 456 457
        }
    } else {
        int list     = list1 ? 1 : 0;
458
        int refn     = sl->ref_cache[list][scan8[n]];
459
        H264Ref *ref = &sl->ref_list[list][refn];
460
        mc_dir_part(h, sl, ref, n, square, height, delta, list,
461 462 463
                    dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put, chroma_put, pixel_shift, chroma_idc);

464
        luma_weight_op(dest_y, sl->mb_linesize, height,
465 466 467
                       sl->pwt.luma_log2_weight_denom,
                       sl->pwt.luma_weight[refn][list][0],
                       sl->pwt.luma_weight[refn][list][1]);
468
        if (!CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
469
            if (sl->pwt.use_weight_chroma) {
470
                chroma_weight_op(dest_cb, sl->mb_uvlinesize, chroma_height,
471 472 473
                                 sl->pwt.chroma_log2_weight_denom,
                                 sl->pwt.chroma_weight[refn][list][0][0],
                                 sl->pwt.chroma_weight[refn][list][0][1]);
474
                chroma_weight_op(dest_cr, sl->mb_uvlinesize, chroma_height,
475 476 477
                                 sl->pwt.chroma_log2_weight_denom,
                                 sl->pwt.chroma_weight[refn][list][1][0],
                                 sl->pwt.chroma_weight[refn][list][1][1]);
478
            }
479 480 481 482
        }
    }
}

483
static av_always_inline void prefetch_motion(const H264Context *h, H264SliceContext *sl,
484 485
                                             int list, int pixel_shift,
                                             int chroma_idc)
486 487 488
{
    /* fetch pixels for estimated mv 4 macroblocks ahead
     * optimized for 64byte cache lines */
489
    const int refn = sl->ref_cache[list][scan8[0]];
490
    if (refn >= 0) {
491 492
        const int mx  = (sl->mv_cache[list][scan8[0]][0] >> 2) + 16 * sl->mb_x + 8;
        const int my  = (sl->mv_cache[list][scan8[0]][1] >> 2) + 16 * sl->mb_y;
493
        uint8_t **src = sl->ref_list[list][refn].data;
494
        int off       =  mx * (1<< pixel_shift) +
495
                        (my + (sl->mb_x & 3) * 4) * sl->mb_linesize +
496
                        (64 << pixel_shift);
497
        h->vdsp.prefetch(src[0] + off, sl->linesize, 4);
498
        if (chroma_idc == 3 /* yuv444 */) {
499 500
            h->vdsp.prefetch(src[1] + off, sl->linesize, 4);
            h->vdsp.prefetch(src[2] + off, sl->linesize, 4);
501
        } else {
502
            off= ((mx>>1)+64) * (1<<pixel_shift) + ((my>>1) + (sl->mb_x&7))*sl->uvlinesize;
503 504 505 506 507
            h->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
        }
    }
}

508
static av_always_inline void xchg_mb_border(const H264Context *h, H264SliceContext *sl,
509
                                            uint8_t *src_y,
510 511 512 513 514 515 516 517 518 519 520 521
                                            uint8_t *src_cb, uint8_t *src_cr,
                                            int linesize, int uvlinesize,
                                            int xchg, int chroma444,
                                            int simple, int pixel_shift)
{
    int deblock_topleft;
    int deblock_top;
    int top_idx = 1;
    uint8_t *top_border_m1;
    uint8_t *top_border;

    if (!simple && FRAME_MBAFF(h)) {
522
        if (sl->mb_y & 1) {
523
            if (!MB_MBAFF(sl))
524 525
                return;
        } else {
526
            top_idx = MB_MBAFF(sl) ? 0 : 1;
527 528 529
        }
    }

530
    if (sl->deblocking_filter == 2) {
531
        deblock_topleft = h->slice_table[sl->mb_xy - 1 - h->mb_stride] == sl->slice_num;
532
        deblock_top     = sl->top_type;
533
    } else {
534
        deblock_topleft = (sl->mb_x > 0);
535
        deblock_top     = (sl->mb_y > !!MB_FIELD(sl));
536 537 538 539 540 541
    }

    src_y  -= linesize   + 1 + pixel_shift;
    src_cb -= uvlinesize + 1 + pixel_shift;
    src_cr -= uvlinesize + 1 + pixel_shift;

542 543
    top_border_m1 = sl->top_borders[top_idx][sl->mb_x - 1];
    top_border    = sl->top_borders[top_idx][sl->mb_x];
544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564

#define XCHG(a, b, xchg)                        \
    if (pixel_shift) {                          \
        if (xchg) {                             \
            AV_SWAP64(b + 0, a + 0);            \
            AV_SWAP64(b + 8, a + 8);            \
        } else {                                \
            AV_COPY128(b, a);                   \
        }                                       \
    } else if (xchg)                            \
        AV_SWAP64(b, a);                        \
    else                                        \
        AV_COPY64(b, a);

    if (deblock_top) {
        if (deblock_topleft) {
            XCHG(top_border_m1 + (8 << pixel_shift),
                 src_y - (7 << pixel_shift), 1);
        }
        XCHG(top_border + (0 << pixel_shift), src_y + (1 << pixel_shift), xchg);
        XCHG(top_border + (8 << pixel_shift), src_y + (9 << pixel_shift), 1);
565
        if (sl->mb_x + 1 < h->mb_width) {
566
            XCHG(sl->top_borders[top_idx][sl->mb_x + 1],
567 568
                 src_y + (17 << pixel_shift), 1);
        }
569
        if (simple || !CONFIG_GRAY || !(h->flags & AV_CODEC_FLAG_GRAY)) {
570
            if (chroma444) {
571 572 573 574 575 576 577 578
                if (deblock_topleft) {
                    XCHG(top_border_m1 + (24 << pixel_shift), src_cb - (7 << pixel_shift), 1);
                    XCHG(top_border_m1 + (40 << pixel_shift), src_cr - (7 << pixel_shift), 1);
                }
                XCHG(top_border + (16 << pixel_shift), src_cb + (1 << pixel_shift), xchg);
                XCHG(top_border + (24 << pixel_shift), src_cb + (9 << pixel_shift), 1);
                XCHG(top_border + (32 << pixel_shift), src_cr + (1 << pixel_shift), xchg);
                XCHG(top_border + (40 << pixel_shift), src_cr + (9 << pixel_shift), 1);
579
                if (sl->mb_x + 1 < h->mb_width) {
580 581
                    XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (16 << pixel_shift), src_cb + (17 << pixel_shift), 1);
                    XCHG(sl->top_borders[top_idx][sl->mb_x + 1] + (32 << pixel_shift), src_cr + (17 << pixel_shift), 1);
582
                }
583
            } else {
584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612
                if (deblock_topleft) {
                    XCHG(top_border_m1 + (16 << pixel_shift), src_cb - (7 << pixel_shift), 1);
                    XCHG(top_border_m1 + (24 << pixel_shift), src_cr - (7 << pixel_shift), 1);
                }
                XCHG(top_border + (16 << pixel_shift), src_cb + 1 + pixel_shift, 1);
                XCHG(top_border + (24 << pixel_shift), src_cr + 1 + pixel_shift, 1);
            }
        }
    }
}

/**
 * Read one coefficient from a coefficient buffer.
 *
 * @param mb             coefficient buffer
 * @param high_bit_depth if non-zero the buffer is accessed as 32-bit
 *                       elements, otherwise as 16-bit elements
 * @param index          element index, in units of the selected width
 * @return the value read with AV_RN32A / AV_RN16A, converted to int
 */
static av_always_inline int dctcoef_get(int16_t *mb, int high_bit_depth,
                                        int index)
{
    if (!high_bit_depth)
        return AV_RN16A(mb + index);

    return AV_RN32A((int32_t *)mb + index);
}

/**
 * Store one coefficient into a coefficient buffer.
 *
 * @param mb             coefficient buffer
 * @param high_bit_depth if non-zero the buffer is accessed as 32-bit
 *                       elements, otherwise as 16-bit elements
 * @param index          element index, in units of the selected width
 * @param value          value written with AV_WN32A / AV_WN16A
 */
static av_always_inline void dctcoef_set(int16_t *mb, int high_bit_depth,
                                         int index, int value)
{
    if (!high_bit_depth) {
        AV_WN16A(mb + index, value);
        return;
    }

    AV_WN32A((int32_t *)mb + index, value);
}

613
static av_always_inline void hl_decode_mb_predict_luma(const H264Context *h,
614
                                                       H264SliceContext *sl,
615
                                                       int mb_type, int simple,
616 617
                                                       int transform_bypass,
                                                       int pixel_shift,
618
                                                       const int *block_offset,
619 620 621 622 623 624
                                                       int linesize,
                                                       uint8_t *dest_y, int p)
{
    void (*idct_add)(uint8_t *dst, int16_t *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, int16_t *block, int stride);
    int i;
625
    int qscale = p == 0 ? sl->qscale : sl->chroma_qp[p - 1];
626 627 628 629 630 631 632 633 634 635 636 637
    block_offset += 16 * p;
    if (IS_INTRA4x4(mb_type)) {
        if (IS_8x8DCT(mb_type)) {
            if (transform_bypass) {
                idct_dc_add =
                idct_add    = h->h264dsp.h264_add_pixels8_clear;
            } else {
                idct_dc_add = h->h264dsp.h264_idct8_dc_add;
                idct_add    = h->h264dsp.h264_idct8_add;
            }
            for (i = 0; i < 16; i += 4) {
                uint8_t *const ptr = dest_y + block_offset[i];
638
                const int dir      = sl->intra4x4_pred_mode_cache[scan8[i]];
639
                if (transform_bypass && h->ps.sps->profile_idc == 244 && dir <= 1) {
640
                    if (h->sei.unregistered.x264_build < 151U) {
641
                        h->hpc.pred8x8l_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
642
                    } else
643
                        h->hpc.pred8x8l_filter_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift),
644 645
                                                        (sl-> topleft_samples_available << i) & 0x8000,
                                                        (sl->topright_samples_available << i) & 0x4000, linesize);
646
                } else {
647
                    const int nnz = sl->non_zero_count_cache[scan8[i + p * 16]];
648 649
                    h->hpc.pred8x8l[dir](ptr, (sl->topleft_samples_available << i) & 0x8000,
                                         (sl->topright_samples_available << i) & 0x4000, linesize);
650
                    if (nnz) {
651 652
                        if (nnz == 1 && dctcoef_get(sl->mb, pixel_shift, i * 16 + p * 256))
                            idct_dc_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
653
                        else
654
                            idct_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
655 656 657 658 659 660 661 662 663 664 665 666 667
                    }
                }
            }
        } else {
            if (transform_bypass) {
                idct_dc_add  =
                idct_add     = h->h264dsp.h264_add_pixels4_clear;
            } else {
                idct_dc_add = h->h264dsp.h264_idct_dc_add;
                idct_add    = h->h264dsp.h264_idct_add;
            }
            for (i = 0; i < 16; i++) {
                uint8_t *const ptr = dest_y + block_offset[i];
668
                const int dir      = sl->intra4x4_pred_mode_cache[scan8[i]];
669

670
                if (transform_bypass && h->ps.sps->profile_idc == 244 && dir <= 1) {
671
                    h->hpc.pred4x4_add[dir](ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
672 673 674 675 676
                } else {
                    uint8_t *topright;
                    int nnz, tr;
                    uint64_t tr_high;
                    if (dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED) {
677
                        const int topright_avail = (sl->topright_samples_available << i) & 0x8000;
678
                        av_assert2(sl->mb_y || linesize <= block_offset[i]);
679 680 681 682 683 684 685 686 687 688 689 690 691 692
                        if (!topright_avail) {
                            if (pixel_shift) {
                                tr_high  = ((uint16_t *)ptr)[3 - linesize / 2] * 0x0001000100010001ULL;
                                topright = (uint8_t *)&tr_high;
                            } else {
                                tr       = ptr[3 - linesize] * 0x01010101u;
                                topright = (uint8_t *)&tr;
                            }
                        } else
                            topright = ptr + (4 << pixel_shift) - linesize;
                    } else
                        topright = NULL;

                    h->hpc.pred4x4[dir](ptr, topright, linesize);
693
                    nnz = sl->non_zero_count_cache[scan8[i + p * 16]];
694
                    if (nnz) {
695 696 697 698
                        if (nnz == 1 && dctcoef_get(sl->mb, pixel_shift, i * 16 + p * 256))
                            idct_dc_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
                        else
                            idct_add(ptr, sl->mb + (i * 16 + p * 256 << pixel_shift), linesize);
699 700 701 702 703
                    }
                }
            }
        }
    } else {
704
        h->hpc.pred16x16[sl->intra16x16_pred_mode](dest_y, linesize);
705 706 707 708
        if (sl->non_zero_count_cache[scan8[LUMA_DC_BLOCK_INDEX + p]]) {
            if (!transform_bypass)
                h->h264dsp.h264_luma_dc_dequant_idct(sl->mb + (p * 256 << pixel_shift),
                                                     sl->mb_luma_dc[p],
709
                                                     h->ps.pps->dequant4_coeff[p][qscale][0]);
710 711 712 713 714 715 716 717 718 719 720 721
            else {
                static const uint8_t dc_mapping[16] = {
                     0 * 16,  1 * 16,  4 * 16,  5 * 16,
                     2 * 16,  3 * 16,  6 * 16,  7 * 16,
                     8 * 16,  9 * 16, 12 * 16, 13 * 16,
                    10 * 16, 11 * 16, 14 * 16, 15 * 16
                };
                for (i = 0; i < 16; i++)
                    dctcoef_set(sl->mb + (p * 256 << pixel_shift),
                                pixel_shift, dc_mapping[i],
                                dctcoef_get(sl->mb_luma_dc[p],
                                            pixel_shift, i));
722
            }
723
        }
724 725 726
    }
}

727
static av_always_inline void hl_decode_mb_idct_luma(const H264Context *h, H264SliceContext *sl,
728
                                                    int mb_type, int simple,
729 730
                                                    int transform_bypass,
                                                    int pixel_shift,
731
                                                    const int *block_offset,
732 733 734 735 736 737 738
                                                    int linesize,
                                                    uint8_t *dest_y, int p)
{
    void (*idct_add)(uint8_t *dst, int16_t *block, int stride);
    int i;
    block_offset += 16 * p;
    if (!IS_INTRA4x4(mb_type)) {
739 740
        if (IS_INTRA16x16(mb_type)) {
            if (transform_bypass) {
741
                if (h->ps.sps->profile_idc == 244 &&
742 743 744 745 746
                    (sl->intra16x16_pred_mode == VERT_PRED8x8 ||
                     sl->intra16x16_pred_mode == HOR_PRED8x8)) {
                    h->hpc.pred16x16_add[sl->intra16x16_pred_mode](dest_y, block_offset,
                                                                   sl->mb + (p * 256 << pixel_shift),
                                                                   linesize);
747
                } else {
748 749 750 751 752 753
                    for (i = 0; i < 16; i++)
                        if (sl->non_zero_count_cache[scan8[i + p * 16]] ||
                            dctcoef_get(sl->mb, pixel_shift, i * 16 + p * 256))
                            h->h264dsp.h264_add_pixels4_clear(dest_y + block_offset[i],
                                                              sl->mb + (i * 16 + p * 256 << pixel_shift),
                                                              linesize);
754
                }
755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781
            } else {
                h->h264dsp.h264_idct_add16intra(dest_y, block_offset,
                                                sl->mb + (p * 256 << pixel_shift),
                                                linesize,
                                                sl->non_zero_count_cache + p * 5 * 8);
            }
        } else if (sl->cbp & 15) {
            if (transform_bypass) {
                const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                idct_add = IS_8x8DCT(mb_type) ? h->h264dsp.h264_add_pixels8_clear
                    : h->h264dsp.h264_add_pixels4_clear;
                for (i = 0; i < 16; i += di)
                    if (sl->non_zero_count_cache[scan8[i + p * 16]])
                        idct_add(dest_y + block_offset[i],
                                 sl->mb + (i * 16 + p * 256 << pixel_shift),
                                 linesize);
            } else {
                if (IS_8x8DCT(mb_type))
                    h->h264dsp.h264_idct8_add4(dest_y, block_offset,
                                               sl->mb + (p * 256 << pixel_shift),
                                               linesize,
                                               sl->non_zero_count_cache + p * 5 * 8);
                else
                    h->h264dsp.h264_idct_add16(dest_y, block_offset,
                                               sl->mb + (p * 256 << pixel_shift),
                                               linesize,
                                               sl->non_zero_count_cache + p * 5 * 8);
782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798
            }
        }
    }
}

/*
 * h264_mb_template.c is included three times, each time with a different
 * BITS / SIMPLE configuration. The order matters because each step only
 * redefines one macro and inherits the other from the previous step.
 * NOTE(review): the template presumably generates the hl_decode_mb_*
 * variants called by ff_h264_hl_decode_mb() from these macros - confirm
 * against the template file.
 */

/* First instantiation: BITS = 8, SIMPLE = 1. */
#define BITS   8
#define SIMPLE 1
#include "h264_mb_template.c"

/* Second instantiation: BITS = 16, SIMPLE still 1. */
#undef  BITS
#define BITS   16
#include "h264_mb_template.c"

/* Third instantiation: SIMPLE = 0, BITS still 16. */
#undef  SIMPLE
#define SIMPLE 0
#include "h264_mb_template.c"

799
void ff_h264_hl_decode_mb(const H264Context *h, H264SliceContext *sl)
800
{
801
    const int mb_xy   = sl->mb_xy;
802
    const int mb_type = h->cur_pic.mb_type[mb_xy];
803
    int is_complex    = CONFIG_SMALL || sl->is_complex ||
804
                        IS_INTRA_PCM(mb_type) || sl->qscale == 0;
805 806 807

    if (CHROMA444(h)) {
        if (is_complex || h->pixel_shift)
808
            hl_decode_mb_444_complex(h, sl);
809
        else
810
            hl_decode_mb_444_simple_8(h, sl);
811
    } else if (is_complex) {
812
        hl_decode_mb_complex(h, sl);
813
    } else if (h->pixel_shift) {
814
        hl_decode_mb_simple_16(h, sl);
815
    } else
816
        hl_decode_mb_simple_8(h, sl);
817
}