videodsp_init.c 11.2 KB
Newer Older
1
/*
2
 * Copyright (C) 2002-2012 Michael Niedermayer
3 4
 * Copyright (C) 2012 Ronald S. Bultje
 *
5
 * This file is part of FFmpeg.
6
 *
7
 * FFmpeg is free software; you can redistribute it and/or
8 9 10 11
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
12
 * FFmpeg is distributed in the hope that it will be useful,
13 14 15 16 17
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
18
 * License along with FFmpeg; if not, write to the Free Software
19 20 21 22
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
23
#include "libavutil/attributes.h"
24
#include "libavutil/avassert.h"
25 26 27 28
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
29
#include "libavutil/x86/cpu.h"
30 31 32
#include "libavcodec/videodsp.h"

#if HAVE_YASM
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
                                const uint8_t *src, x86_reg src_stride,
                                x86_reg start_y, x86_reg end_y, x86_reg bh);
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
                                const uint8_t *src, x86_reg src_stride,
                                x86_reg start_y, x86_reg end_y, x86_reg bh,
                                x86_reg w);

extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
static emu_edge_vfix_func *vfixtbl_mmx[22] = {
    &ff_emu_edge_vfix1_mmx,  &ff_emu_edge_vfix2_mmx,  &ff_emu_edge_vfix3_mmx,
    &ff_emu_edge_vfix4_mmx,  &ff_emu_edge_vfix5_mmx,  &ff_emu_edge_vfix6_mmx,
    &ff_emu_edge_vfix7_mmx,  &ff_emu_edge_vfix8_mmx,  &ff_emu_edge_vfix9_mmx,
    &ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
    &ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
    &ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
    &ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
    &ff_emu_edge_vfix22_mmx
};
#endif
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
static emu_edge_vfix_func *vfixtbl_sse[22] = {
    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
    ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
    ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
    ff_emu_edge_vfix22_sse
};
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;

typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
                                x86_reg start_x, x86_reg bh);
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
                                x86_reg start_x, x86_reg n_words, x86_reg bh);

extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
static emu_edge_hfix_func *hfixtbl_mmx[11] = {
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
    ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
};
#endif
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
120 121 122 123 124
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
static emu_edge_hfix_func *hfixtbl_sse2[11] = {
125 126
    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
127 128
    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
    ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
129
};
130
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
131

132 133 134
static av_always_inline void emulated_edge_mc(uint8_t *dst, const uint8_t *src,
                                              ptrdiff_t dst_stride,
                                              ptrdiff_t src_stride,
135 136 137 138 139 140 141
                                              x86_reg block_w, x86_reg block_h,
                                              x86_reg src_x, x86_reg src_y,
                                              x86_reg w, x86_reg h,
                                              emu_edge_vfix_func **vfix_tbl,
                                              emu_edge_vvar_func *v_extend_var,
                                              emu_edge_hfix_func **hfix_tbl,
                                              emu_edge_hvar_func *h_extend_var)
142
{
143
    x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
144

145
    if (!w || !h)
146
        return;
147 148

    if (src_y >= h) {
149
        src -= src_y*src_stride;
150
        src_y_add = h - 1;
151 152
        src_y     = h - 1;
    } else if (src_y <= -block_h) {
153
        src -= src_y*src_stride;
154
        src_y_add = 1 - block_h;
155 156 157 158 159 160 161 162 163 164 165 166 167 168
        src_y     = 1 - block_h;
    }
    if (src_x >= w) {
        src   += w - 1 - src_x;
        src_x  = w - 1;
    } else if (src_x <= -block_w) {
        src   += 1 - block_w - src_x;
        src_x  = 1 - block_w;
    }

    start_y = FFMAX(0, -src_y);
    start_x = FFMAX(0, -src_x);
    end_y   = FFMIN(block_h, h-src_y);
    end_x   = FFMIN(block_w, w-src_x);
169 170
    av_assert2(start_x < end_x && block_w > 0);
    av_assert2(start_y < end_y && block_h > 0);
171 172

    // fill in the to-be-copied part plus all above/below
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
    src += (src_y_add + start_y) * src_stride + start_x;
    w = end_x - start_x;
    if (w <= 22) {
        vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
                        start_y, end_y, block_h);
    } else {
        v_extend_var(dst + start_x, dst_stride, src, src_stride,
                     start_y, end_y, block_h, w);
    }

    // fill left
    if (start_x) {
        if (start_x <= 22) {
            hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
        } else {
            h_extend_var(dst, dst_stride,
                         start_x, (start_x + 1) >> 1, block_h);
        }
    }

    // fill right
    p = block_w - end_x;
    if (p) {
        if (p <= 22) {
            hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
                                   -!(p & 1), block_h);
        } else {
            h_extend_var(dst + end_x - (p & 1), dst_stride,
                         -!(p & 1), (p + 1) >> 1, block_h);
        }
    }
204 205 206 207
}

#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
208 209
                                             ptrdiff_t buf_stride,
                                             ptrdiff_t src_stride,
210 211 212
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
213
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
214 215
                     src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
                     hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
216 217
}

218
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
219 220
                                             ptrdiff_t buf_stride,
                                             ptrdiff_t src_stride,
221 222 223
                                             int block_w, int block_h,
                                             int src_x, int src_y, int w, int h)
{
224
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
225 226 227
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
                     hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
228
#endif
229

230 231 232
static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
                                              ptrdiff_t buf_stride,
                                              ptrdiff_t src_stride,
233
                                              int block_w, int block_h,
234 235
                                              int src_x, int src_y, int w,
                                              int h)
236
{
237 238
    emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
239
                     hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
240 241 242 243 244 245
}
#endif /* HAVE_YASM */

void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);

246
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
247 248
{
#if HAVE_YASM
249
    int cpu_flags = av_get_cpu_flags();
250 251

#if ARCH_X86_32
252
    if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
253 254
        ctx->emulated_edge_mc = emulated_edge_mc_mmx;
    }
255
    if (EXTERNAL_AMD3DNOW(cpu_flags)) {
256 257 258
        ctx->prefetch = ff_prefetch_3dnow;
    }
#endif /* ARCH_X86_32 */
259
    if (EXTERNAL_MMXEXT(cpu_flags)) {
260 261
        ctx->prefetch = ff_prefetch_mmxext;
    }
262
#if ARCH_X86_32
263
    if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
264 265
        ctx->emulated_edge_mc = emulated_edge_mc_sse;
    }
266 267 268 269
#endif /* ARCH_X86_32 */
    if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
        ctx->emulated_edge_mc = emulated_edge_mc_sse2;
    }
270 271
#endif /* HAVE_YASM */
}