videodsp_init.c 12.9 KB
Newer Older
1
/*
2
 * Copyright (C) 2002-2012 Michael Niedermayer
3 4
 * Copyright (C) 2012 Ronald S. Bultje
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8 9 10 11
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13 14 15 16 17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19 20 21 22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
23
#include "libavutil/attributes.h"
24
#include "libavutil/avassert.h"
25 26 27 28
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
29
#include "libavutil/x86/cpu.h"
30 31 32
#include "libavcodec/videodsp.h"

#if HAVE_YASM
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
                                const uint8_t *src, x86_reg src_stride,
                                x86_reg start_y, x86_reg end_y, x86_reg bh);
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
                                const uint8_t *src, x86_reg src_stride,
                                x86_reg start_y, x86_reg end_y, x86_reg bh,
                                x86_reg w);

extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
64
static emu_edge_vfix_func * const vfixtbl_mmx[22] = {
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82
    &ff_emu_edge_vfix1_mmx,  &ff_emu_edge_vfix2_mmx,  &ff_emu_edge_vfix3_mmx,
    &ff_emu_edge_vfix4_mmx,  &ff_emu_edge_vfix5_mmx,  &ff_emu_edge_vfix6_mmx,
    &ff_emu_edge_vfix7_mmx,  &ff_emu_edge_vfix8_mmx,  &ff_emu_edge_vfix9_mmx,
    &ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
    &ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
    &ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
    &ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
    &ff_emu_edge_vfix22_mmx
};
#endif
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
83
static emu_edge_vfix_func * const vfixtbl_sse[22] = {
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111
    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
    ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
    ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
    ff_emu_edge_vfix22_sse
};
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;

typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
                                x86_reg start_x, x86_reg bh);
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
                                x86_reg start_x, x86_reg n_words, x86_reg bh);

extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
112
static emu_edge_hfix_func * const hfixtbl_mmx[11] = {
113 114 115 116 117 118 119
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
    ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
};
#endif
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
120 121 122 123
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
124
static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
125 126 127 128
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
    ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
129
};
130
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
#if HAVE_AVX2_EXTERNAL
extern emu_edge_hfix_func ff_emu_edge_hfix8_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix10_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix12_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix14_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix16_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
    ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
    ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_avx2;
#endif
148

149 150 151
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                              ptrdiff_t dst_stride,
                                              ptrdiff_t src_stride,
152 153 154
                                              x86_reg block_w, x86_reg block_h,
                                              x86_reg src_x, x86_reg src_y,
                                              x86_reg w, x86_reg h,
155
                                              emu_edge_vfix_func * const *vfix_tbl,
156
                                              emu_edge_vvar_func *v_extend_var,
157
                                              emu_edge_hfix_func * const *hfix_tbl,
158
                                              emu_edge_hvar_func *h_extend_var)
159
{
160
    x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
161

162
    if (!w || !h)
163
        return;
164

165 166
    av_assert2(block_w <= FFABS(dst_stride));

167
    if (src_y >= h) {
168
        src -= src_y*src_stride;
169
        src_y_add = h - 1;
170 171
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
172
        src -= src_y*src_stride;
173
        src_y_add = 1 - block_h;
174 175 176 177 178 179 180 181 182 183 184 185 186 187
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h-src_y);
    end_x   = FFMIN(block_w, w-src_x);
188 189
    av_assert2(start_x < end_x && block_w > 0);
    av_assert2(start_y < end_y && block_h > 0);
190 191

    // fill in the to-be-copied part plus all above/below
192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
    src += (src_y_add + start_y) * src_stride + start_x;
    w = end_x - start_x;
    if (w <= 22) {
        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                        start_y, end_y, block_h);
    } else {
        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                     start_y, end_y, block_h, w);
    }

    // fill left
    if (start_x) {
        if (start_x <= 22) {
            hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
        } else {
            h_extend_var(dst, dst_stride,
                         start_x, (start_x + 1) >> 1, block_h);
        }
    }

    // fill right
    p = block_w - end_x;
    if (p) {
        if (p <= 22) {
            hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
                                   -!(p & 1), block_h);
        } else {
            h_extend_var(dst + end_x - (p & 1), dst_stride,
                         -!(p & 1), (p + 1) >> 1, block_h);
        }
    }
223 224 225 226
}

#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
227 228
                                             ptrdiff_t buf_stride,
                                             ptrdiff_t src_stride,
229 230 231
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
232
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
233 234
                     src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
                     hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
235 236
}

237
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
238 239
                                             ptrdiff_t buf_stride,
                                             ptrdiff_t src_stride,
240 241 242
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
243
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
244
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
245
                     hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
246
}
247
#endif
248

249 250 251
static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
252
                                              int block_w, int block_h,
253 254
                                              int src_x, int src_y, int w,
                                              int h)
255
{
256 257
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
258
                     hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
259
}
260 261 262 263 264 265 266 267 268 269 270 271 272 273

#if HAVE_AVX2_EXTERNAL
static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
                                              int block_w, int block_h,
                                              int src_x, int src_y, int w,
                                              int h)
{
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                     hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
}
#endif /* HAVE_AVX2_EXTERNAL */
274 275 276 277 278
#endif /* HAVE_YASM */

void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);

279
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
280 281
{
#if HAVE_YASM
282
    int cpu_flags = av_get_cpu_flags();
283 284

#if ARCH_X86_32
285
    if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
286 287
        ctx->emulated_edge_mc = emulated_edge_mc_mmx;
    }
288
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
289 290 291
        ctx->prefetch = ff_prefetch_3dnow;
    }
#endif /* ARCH_X86_32 */
292
    if (EXTERNAL_MMXEXT(cpu_flags)) {
293 294
        ctx->prefetch = ff_prefetch_mmxext;
    }
295
#if ARCH_X86_32
296
    if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
297 298
        ctx->emulated_edge_mc = emulated_edge_mc_sse;
    }
299 300 301 302
#endif /* ARCH_X86_32 */
    if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_sse2;
    }
303 304 305 306 307
#if HAVE_AVX2_EXTERNAL
    if (EXTERNAL_AVX2(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_avx2;
    }
#endif
308 309
#endif /* HAVE_YASM */
}