/*
 * Copyright (C) 2002-2012 Michael Niedermayer
 * Copyright (C) 2012 Ronald S. Bultje
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
23
#include "libavutil/attributes.h"
24
#include "libavutil/avassert.h"
25 26 27 28
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
29
#include "libavutil/x86/cpu.h"
30 31 32
#include "libavcodec/videodsp.h"

#if HAVE_YASM
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
/*
 * Function types of the external vertical edge-extension routines.
 *
 * emu_edge_vfix_func is the type of the ff_emu_edge_vfixN_* entry points.
 * emu_edge_vvar_func takes the same arguments plus a trailing w and is the
 * type of the ff_emu_edge_vvar_* entry points.
 *
 * emulated_edge_mc() calls them with dst already advanced by start_x and
 * passes its own start_y, end_y and block_h as start_y, end_y and bh; the
 * variable-width form additionally receives the copy width as w.
 */
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
                                const uint8_t *src, x86_reg src_stride,
                                x86_reg start_y, x86_reg end_y, x86_reg bh);
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
                                const uint8_t *src, x86_reg src_stride,
                                x86_reg start_y, x86_reg end_y, x86_reg bh,
                                x86_reg w);

/*
 * MMX fixed-width vertical routines, one per name suffix 1..22.
 * They are only declared here; the definitions live outside this file
 * (NOTE(review): presumably in the accompanying x86 assembly - confirm).
 */
extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
/*
 * Fixed-width vertical routines, MMX only (32-bit x86 builds).
 * Entry i holds the routine whose name carries the suffix i + 1;
 * emulated_edge_mc() picks entry (w - 1) for copy widths w of at most 22.
 */
static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
    ff_emu_edge_vfix16_mmx, ff_emu_edge_vfix17_mmx, ff_emu_edge_vfix18_mmx,
    ff_emu_edge_vfix19_mmx, ff_emu_edge_vfix20_mmx, ff_emu_edge_vfix21_mmx,
    ff_emu_edge_vfix22_mmx
};
#endif /* ARCH_X86_32 */
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
83
static emu_edge_vfix_func * const vfixtbl_sse[22] = {
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
    ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
    ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
    ff_emu_edge_vfix22_sse
};
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;

/*
 * Function types of the external horizontal edge-extension routines.
 *
 * emu_edge_hfix_func is the type of the ff_emu_edge_hfixN_* entry points.
 * emu_edge_hvar_func has an additional n_words argument and is the type of
 * the ff_emu_edge_hvar_* entry points; emulated_edge_mc() passes
 * (number of columns to fill + 1) >> 1 as n_words.
 * Both operate on dst only; no source pointer is passed.
 */
typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
                                x86_reg start_x, x86_reg bh);
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
                                x86_reg start_x, x86_reg n_words, x86_reg bh);

/*
 * MMX fixed-width horizontal routines; only the even name suffixes 2..22
 * are declared. The definitions live outside this file
 * (NOTE(review): presumably in the accompanying x86 assembly - confirm).
 */
extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
/*
 * Fixed-width horizontal routines, MMX only (32-bit x86 builds).
 * Entry i holds the routine with name suffix 2 * (i + 1);
 * emulated_edge_mc() picks entry (n - 1) >> 1 when n columns (at most 22)
 * have to be filled, so each entry serves an odd/even pair of counts.
 */
static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
    ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
};
#endif /* ARCH_X86_32 */
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
120 121 122 123
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
124
static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
125 126 127 128
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
    ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
129
};
130
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
#if HAVE_AVX2_EXTERNAL
/* AVX2 fixed-width horizontal routines; declared only for the even name
 * suffixes 8..22. */
extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
/*
 * Fixed-width horizontal routines for the AVX2 wrapper.
 * Entry i holds the routine with name suffix 2 * (i + 1): entries 0..2 are
 * the MMX routines, entries 3..10 are the AVX2 ones.
 */
static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
    ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
    ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
};
/* Variable-width AVX2 horizontal routine. */
extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
#endif
148

149 150 151
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                              ptrdiff_t dst_stride,
                                              ptrdiff_t src_stride,
152 153 154
                                              x86_reg block_w, x86_reg block_h,
                                              x86_reg src_x, x86_reg src_y,
                                              x86_reg w, x86_reg h,
155
                                              emu_edge_vfix_func * const *vfix_tbl,
156
                                              emu_edge_vvar_func *v_extend_var,
157
                                              emu_edge_hfix_func * const *hfix_tbl,
158
                                              emu_edge_hvar_func *h_extend_var)
159
{
160
    x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
161

162
    if (!w || !h)
163
        return;
164 165

    if (src_y >= h) {
166
        src -= src_y*src_stride;
167
        src_y_add = h - 1;
168 169
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
170
        src -= src_y*src_stride;
171
        src_y_add = 1 - block_h;
172 173 174 175 176 177 178 179 180 181 182 183 184 185
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h-src_y);
    end_x   = FFMIN(block_w, w-src_x);
186 187
    av_assert2(start_x < end_x && block_w > 0);
    av_assert2(start_y < end_y && block_h > 0);
188 189

    // fill in the to-be-copied part plus all above/below
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
    src += (src_y_add + start_y) * src_stride + start_x;
    w = end_x - start_x;
    if (w <= 22) {
        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                        start_y, end_y, block_h);
    } else {
        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                     start_y, end_y, block_h, w);
    }

    // fill left
    if (start_x) {
        if (start_x <= 22) {
            hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
        } else {
            h_extend_var(dst, dst_stride,
                         start_x, (start_x + 1) >> 1, block_h);
        }
    }

    // fill right
    p = block_w - end_x;
    if (p) {
        if (p <= 22) {
            hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
                                   -!(p & 1), block_h);
        } else {
            h_extend_var(dst + end_x - (p & 1), dst_stride,
                         -!(p & 1), (p + 1) >> 1, block_h);
        }
    }
221 222 223 224
}

#if ARCH_X86_32
/*
 * MMX wrapper (32-bit x86 builds only): forwards all arguments to
 * emulated_edge_mc() with the MMX vertical and horizontal routines.
 * Kept out of line (av_noinline) so that the always-inline common body is
 * instantiated once per wrapper.
 */
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
                                             ptrdiff_t buf_stride,
                                             ptrdiff_t src_stride,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
                     hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}

/*
 * SSE wrapper (32-bit x86 builds only): SSE vertical routines combined with
 * the MMX horizontal routines.
 */
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
                                             ptrdiff_t buf_stride,
                                             ptrdiff_t src_stride,
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                     hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
#endif /* ARCH_X86_32 */
246

247 248 249
static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
250
                                              int block_w, int block_h,
251 252
                                              int src_x, int src_y, int w,
                                              int h)
253
{
254 255
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
256
                     hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
257
}
258 259 260 261 262 263 264 265 266 267 268 269 270 271

#if HAVE_AVX2_EXTERNAL
/*
 * AVX2 wrapper: hands every argument through to emulated_edge_mc(), pairing
 * the SSE vertical routines with the AVX2 horizontal routines.
 */
static av_noinline void
emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
                      ptrdiff_t buf_stride, ptrdiff_t src_stride,
                      int block_w, int block_h,
                      int src_x, int src_y,
                      int w, int h)
{
    emulated_edge_mc(buf, src,
                     buf_stride, src_stride,
                     block_w, block_h,
                     src_x, src_y,
                     w, h,
                     vfixtbl_sse,  &ff_emu_edge_vvar_sse,
                     hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
}
#endif /* HAVE_AVX2_EXTERNAL */
272 273 274 275 276
#endif /* HAVE_YASM */

/*
 * Prefetch routines installed as ctx->prefetch by ff_videodsp_init_x86().
 * Only prototypes are given here; the definitions live outside this file.
 */
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);

277
/**
 * Install the x86 implementations into a VideoDSPContext.
 *
 * The CPU flags are queried once and the checks run in a fixed order; every
 * matching check overwrites the pointer set by an earlier one, so the last
 * matching check decides. emulated_edge_mc is only replaced when bpc is at
 * most 8, while prefetch is replaced regardless of bpc. The MMX, 3DNow! and
 * SSE variants are considered on 32-bit x86 builds only.
 *
 * Without HAVE_YASM the function leaves ctx untouched.
 *
 * @param ctx  context whose emulated_edge_mc / prefetch pointers may be set
 * @param bpc  compared against 8 to gate the emulated_edge_mc overrides;
 *             NOTE(review): presumably bits per component - confirm
 */
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
#if HAVE_YASM
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_mmx;
    }
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
        ctx->prefetch = ff_prefetch_3dnow;
    }
#endif /* ARCH_X86_32 */
    /* checked after 3DNow!, so MMXEXT wins when both are available */
    if (EXTERNAL_MMXEXT(cpu_flags)) {
        ctx->prefetch = ff_prefetch_mmxext;
    }
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_sse;
    }
#endif /* ARCH_X86_32 */
    if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_sse2;
    }
#if HAVE_AVX2_EXTERNAL
    if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_avx2;
    }
#endif /* HAVE_AVX2_EXTERNAL */
#endif /* HAVE_YASM */
}