/*
 * DSP utils
 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.h
 * DSP utils.
 * Note: many functions in here may use MMX, which trashes the FPU state; it is
 * absolutely necessary to call emms_c() between DSP and float/double code.
 */

#ifndef FFMPEG_DSPUTIL_H
#define FFMPEG_DSPUTIL_H

#include "avcodec.h"

//#define DEBUG

/* dct code */
/* Element type of the coefficient blocks taken by the (I)DCT prototypes in this header. */
typedef short DCTELEM;
/* NOTE(review): presumably the forward-wavelet element type; not referenced in the visible part of this header -- confirm. */
typedef int DWTELEM;
/* Element type taken by the snow wavelet compose hooks in DSPContext. */
typedef short IDWTELEM;

/* DCT variants; each takes a pointer to the DCTELEM data it works on. */
void fdct_ifast (DCTELEM *data);
void fdct_ifast248 (DCTELEM *data);
void ff_jpeg_fdct_islow (DCTELEM *data);
void ff_fdct248_islow (DCTELEM *data);

void j_rev_dct (DCTELEM *data);
void j_rev_dct4 (DCTELEM *data);
void j_rev_dct2 (DCTELEM *data);
void j_rev_dct1 (DCTELEM *data);
void ff_wmv2_idct_c(DCTELEM *data);

void ff_fdct_mmx(DCTELEM *block);
void ff_fdct_mmx2(DCTELEM *block);
void ff_fdct_sse2(DCTELEM *block);

/* H.264 IDCT C implementations, argument order (dst, block, stride). */
void ff_h264_idct8_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct8_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
void ff_h264_idct_dc_add_c(uint8_t *dst, DCTELEM *block, int stride);
/* Beware: the lowres variants take (dst, stride, block), unlike the ones above. */
void ff_h264_lowres_idct_add_c(uint8_t *dst, int stride, DCTELEM *block);
void ff_h264_lowres_idct_put_c(uint8_t *dst, int stride, DCTELEM *block);

/* C implementations matching the float vector / conversion hooks of the same
 * names (without the _c suffix) in DSPContext. */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1,
                              const float *src2, int src3, int blocksize, int step);
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1,
                             const float *win, float add_bias, int len);
void ff_float_to_int16_c(int16_t *dst, const float *src, long len);
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels);

/* encoding scans */
extern const uint8_t ff_alternate_horizontal_scan[64];
extern const uint8_t ff_alternate_vertical_scan[64];
extern const uint8_t ff_zigzag_direct[64];
extern const uint8_t ff_zigzag248_direct[64];

/* pixel operations */
/* ff_cropTbl is sized 256 + 2 * MAX_NEG_CROP entries. */
#define MAX_NEG_CROP 1024

/* temporary */
extern uint32_t ff_squareTbl[512];
extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];

/* VP3 DSP functions */
void ff_vp3_idct_c(DCTELEM *block/* align 16*/);
void ff_vp3_idct_put_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);

/* 1/2^n downscaling functions from imgconvert.c */
void ff_img_copy_plane(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);

/* C implementation with the same parameter list as the DSPContext gmc hook. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);

/* minimum alignment rules ;)
If you notice errors in the align stuff, need more alignment for some ASM code
for some CPU or need to use a function with less aligned data then send a mail
to the ffmpeg-devel mailing list, ...

!warning These alignments might not match reality, (missing attribute((align))
stuff somewhere possible).
I (Michael) did not check them, these are just the alignments which I think
could be reached easily ...

!future video codecs might need functions with less strict alignment
*/

/*
void get_pixels_c(DCTELEM *block, const uint8_t *pixels, int line_size);
void diff_pixels_c(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride);
void put_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
void add_pixels_clamped_c(const DCTELEM *block, uint8_t *pixels, int line_size);
void clear_blocks_c(DCTELEM *blocks);
*/

/* add and put pixel (decoding) */
// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4
typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int h);
/* like op_pixels_func, but with an explicit width argument */
typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
typedef void (*h264_chroma_mc_func)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x, int y);
typedef void (*h264_weight_func)(uint8_t *block, int stride, int log2_denom, int weight, int offset);
typedef void (*h264_biweight_func)(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset);

/*
 * Declare the put / put_no_rnd / avg prototypes for one "old" qpel function.
 * For a given name this expands to ff_put_<name>, ff_put_no_rnd_<name> and
 * ff_avg_<name>, all with the qpel_mc_func parameter list (dst, src, stride).
 */
#define DEF_OLD_QPEL(name)\
void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_avg_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);

DEF_OLD_QPEL(qpel16_mc11_old_c)
DEF_OLD_QPEL(qpel16_mc31_old_c)
DEF_OLD_QPEL(qpel16_mc12_old_c)
DEF_OLD_QPEL(qpel16_mc32_old_c)
DEF_OLD_QPEL(qpel16_mc13_old_c)
DEF_OLD_QPEL(qpel16_mc33_old_c)
DEF_OLD_QPEL(qpel8_mc11_old_c)
DEF_OLD_QPEL(qpel8_mc31_old_c)
DEF_OLD_QPEL(qpel8_mc12_old_c)
DEF_OLD_QPEL(qpel8_mc32_old_c)
DEF_OLD_QPEL(qpel8_mc13_old_c)
DEF_OLD_QPEL(qpel8_mc33_old_c)

/*
 * Define a static op_pixels_func-shaped function `a` that calls `b` twice:
 * once on (block, pixels) and once on the same pointers advanced by `n`.
 * `n` is parenthesized so that an expression argument keeps its meaning.
 */
#define CALL_2X_PIXELS(a, b, n)\
static void a(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    b(block    , pixels    , line_size, h);\
    b(block+(n), pixels+(n), line_size, h);\
}

/* motion estimation */
// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2
// although currently h<4 is not used as functions with width <8 are neither used nor implemented
typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;


// for snow slices
/* Opaque here: only pointers to slice_buffer are used in this header. */
typedef struct slice_buffer_s slice_buffer;

/**
 * Scantable.
 * Holds a pointer to a source scan order plus two 64-entry byte tables
 * (permutated, raster_end). On ARCH_POWERPC builds a 16-byte-aligned
 * inverse table is added for dct_quantize_altivec.
 */
typedef struct ScanTable{
    const uint8_t *scantable;
    uint8_t permutated[64];
    uint8_t raster_end[64];
#ifdef ARCH_POWERPC
                /** Used by dct_quantize_altivec to find last-non-zero */
    DECLARE_ALIGNED(16, uint8_t, inverse[64]);
#endif
} ScanTable;

/* NOTE(review): the first (unnamed) uint8_t * argument is presumably an IDCT
 * permutation table such as DSPContext.idct_permutation -- confirm at the definition. */
void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);

void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize,
                         int block_w, int block_h,
                         int src_x, int src_y, int w, int h);

Michael Niedermayer's avatar
Michael Niedermayer committed
181 182 183
/**
 * DSPContext.
 */
184 185
typedef struct DSPContext {
    /* pixel ops : interface with DCT */
186 187 188
    void (*get_pixels)(DCTELEM *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
    void (*diff_pixels)(DCTELEM *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
    void (*put_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
189
    void (*put_signed_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
190
    void (*add_pixels_clamped)(const DCTELEM *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
Loren Merritt's avatar
Loren Merritt committed
191 192
    void (*add_pixels8)(uint8_t *pixels, DCTELEM *block, int line_size);
    void (*add_pixels4)(uint8_t *pixels, DCTELEM *block, int line_size);
193
    int (*sum_abs_dctelem)(DCTELEM *block/*align 16*/);
Michael Niedermayer's avatar
Michael Niedermayer committed
194 195 196
    /**
     * translational global motion compensation.
     */
197
    void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
Michael Niedermayer's avatar
Michael Niedermayer committed
198 199 200
    /**
     * global motion compensation.
     */
201
    void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
202
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
203
    void (*clear_blocks)(DCTELEM *blocks/*align 16*/);
204 205
    int (*pix_sum)(uint8_t * pix, int line_size);
    int (*pix_norm1)(uint8_t * pix, int line_size);
206
// 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4
207

208 209 210 211 212 213 214 215 216
    me_cmp_func sad[5]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[5];
    me_cmp_func hadamard8_diff[5];
    me_cmp_func dct_sad[5];
    me_cmp_func quant_psnr[5];
    me_cmp_func bit[5];
    me_cmp_func rd[5];
    me_cmp_func vsad[5];
    me_cmp_func vsse[5];
217
    me_cmp_func nsse[5];
218 219
    me_cmp_func w53[5];
    me_cmp_func w97[5];
220
    me_cmp_func dct_max[5];
221
    me_cmp_func dct264_sad[5];
Michael Niedermayer's avatar
Michael Niedermayer committed
222

223 224 225 226
    me_cmp_func me_pre_cmp[5];
    me_cmp_func me_cmp[5];
    me_cmp_func me_sub_cmp[5];
    me_cmp_func mb_cmp[5];
227
    me_cmp_func ildct_cmp[5]; //only width 16 used
228
    me_cmp_func frame_skip_cmp[5]; //only width 8 used
229

230 231
    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
                             int size);
232

Michael Niedermayer's avatar
Michael Niedermayer committed
233 234
    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
Luca Barbato's avatar
Luca Barbato committed
235
     * this is an array[4][4] of motion compensation functions for 4
236
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
237
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
238 239 240 241 242
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
243
    op_pixels_func put_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
244 245 246

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
247
     * This is an array[4][4] of motion compensation functions for 4
248
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
249
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
250 251 252 253 254
     * @param block destination into which the result is averaged (a+b+1)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
255
    op_pixels_func avg_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
256 257 258

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
Luca Barbato's avatar
Luca Barbato committed
259
     * this is an array[2][4] of motion compensation functions for 2
Michael Niedermayer's avatar
Michael Niedermayer committed
260
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
261
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
262 263 264 265 266
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
Michael Niedermayer's avatar
Michael Niedermayer committed
267
    op_pixels_func put_no_rnd_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
268 269 270

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
Luca Barbato's avatar
Luca Barbato committed
271
     * this is an array[2][4] of motion compensation functions for 2
Michael Niedermayer's avatar
Michael Niedermayer committed
272
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
273
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
274 275 276 277 278
     * @param block destination into which the result is averaged (a+b)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
Michael Niedermayer's avatar
Michael Niedermayer committed
279
    op_pixels_func avg_no_rnd_pixels_tab[4][4];
280

281
    void (*put_no_rnd_pixels_l2[2])(uint8_t *block/*align width (8 or 16)*/, const uint8_t *a/*align 1*/, const uint8_t *b/*align 1*/, int line_size, int h);
282

283 284
    /**
     * Thirdpel motion compensation with rounding (a+b+1)>>1.
Luca Barbato's avatar
Luca Barbato committed
285 286
     * this is an array[12] of motion compensation functions for the 9 thirdpe
     * positions<br>
287 288 289 290 291 292 293
     * *pixels_tab[ xthirdpel + 4*ythirdpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
294 295
    tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?

296 297 298 299
    qpel_mc_func put_qpel_pixels_tab[2][16];
    qpel_mc_func avg_qpel_pixels_tab[2][16];
    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
    qpel_mc_func avg_no_rnd_qpel_pixels_tab[2][16];
Michael Niedermayer's avatar
Michael Niedermayer committed
300
    qpel_mc_func put_mspel_pixels_tab[8];
301

302
    /**
Luca Barbato's avatar
Luca Barbato committed
303
     * h264 Chroma MC
304 305
     */
    h264_chroma_mc_func put_h264_chroma_pixels_tab[3];
306 307
    /* This is really one func used in VC-1 decoding */
    h264_chroma_mc_func put_no_rnd_h264_chroma_pixels_tab[3];
308
    h264_chroma_mc_func avg_h264_chroma_pixels_tab[3];
309

310 311
    qpel_mc_func put_h264_qpel_pixels_tab[4][16];
    qpel_mc_func avg_h264_qpel_pixels_tab[4][16];
312

313 314 315
    qpel_mc_func put_2tap_qpel_pixels_tab[4][16];
    qpel_mc_func avg_2tap_qpel_pixels_tab[4][16];

316 317
    h264_weight_func weight_h264_pixels_tab[10];
    h264_biweight_func biweight_h264_pixels_tab[10];
318

319 320 321 322 323 324 325 326 327
    /* AVS specific */
    qpel_mc_func put_cavs_qpel_pixels_tab[2][16];
    qpel_mc_func avg_cavs_qpel_pixels_tab[2][16];
    void (*cavs_filter_lv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_lh)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_cv)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_filter_ch)(uint8_t *pix, int stride, int alpha, int beta, int tc, int bs1, int bs2);
    void (*cavs_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);

328
    me_cmp_func pix_abs[2][4];
329

Michael Niedermayer's avatar
Michael Niedermayer committed
330 331
    /* huffyuv specific */
    void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
332
    void (*add_bytes_l2)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 16*/, int w);
Michael Niedermayer's avatar
Michael Niedermayer committed
333
    void (*diff_bytes)(uint8_t *dst/*align 16*/, uint8_t *src1/*align 16*/, uint8_t *src2/*align 1*/,int w);
334 335 336 337 338
    /**
     * subtract huffyuv's variant of median prediction
     * note, this might read from src1[-1], src2[-1]
     */
    void (*sub_hfyu_median_prediction)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top);
339 340
    /* this might write to dst[w] */
    void (*add_png_paeth_prediction)(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
341
    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
342

343 344 345 346 347 348
    void (*h264_v_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_h_loop_filter_luma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_v_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_h_loop_filter_chroma)(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0);
    void (*h264_v_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta);
    void (*h264_h_loop_filter_chroma_intra)(uint8_t *pix, int stride, int alpha, int beta);
349 350
    // h264_loop_filter_strength: simd only. the C version is inlined in h264.c
    void (*h264_loop_filter_strength)(int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
351
                                      int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field);
352

Michael Niedermayer's avatar
Michael Niedermayer committed
353 354 355
    void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);

356
    void (*h261_loop_filter)(uint8_t *src, int stride);
357

358 359 360
    void (*x8_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*x8_h_loop_filter)(uint8_t *src, int stride, int qscale);

361
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
362
    void (*vorbis_inverse_coupling)(float *mag, float *ang, int blocksize);
Loren Merritt's avatar
Loren Merritt committed
363
    void (*ac3_downmix)(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
364 365
    /* no alignment needed */
    void (*flac_compute_autocorr)(const int32_t *data, int len, int lag, double *autoc);
366
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
Loren Merritt's avatar
Loren Merritt committed
367
    void (*vector_fmul)(float *dst, const float *src, int len);
368 369 370
    void (*vector_fmul_reverse)(float *dst, const float *src0, const float *src1, int len);
    /* assume len is a multiple of 8, and src arrays are 16-byte aligned */
    void (*vector_fmul_add_add)(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step);
371 372
    /* assume len is a multiple of 4, and arrays are 16-byte aligned */
    void (*vector_fmul_window)(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len);
Loren Merritt's avatar
Loren Merritt committed
373 374
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
    void (*int32_to_float_fmul_scalar)(float *dst, const int *src, float mul, int len);
375 376

    /* C version: convert floats from the range [384.0,386.0] to ints in [-32768,32767]
377
     * simd versions: convert floats from [-32768.0,32767.0] without rescaling and arrays are 16byte aligned */
Michael Niedermayer's avatar
Michael Niedermayer committed
378
    void (*float_to_int16)(int16_t *dst, const float *src, long len);
379
    void (*float_to_int16_interleave)(int16_t *dst, const float **src, long len, int channels);
380

381 382
    /* (I)DCT */
    void (*fdct)(DCTELEM *block/* align 16*/);
383
    void (*fdct248)(DCTELEM *block/* align 16*/);
384

385 386
    /* IDCT really*/
    void (*idct)(DCTELEM *block/* align 16*/);
387

Michael Niedermayer's avatar
Michael Niedermayer committed
388
    /**
Michael Niedermayer's avatar
Michael Niedermayer committed
389
     * block -> idct -> clip to unsigned 8 bit -> dest.
Michael Niedermayer's avatar
Michael Niedermayer committed
390
     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
Panagiotis Issaris's avatar
Panagiotis Issaris committed
391
     * @param line_size size in bytes of a horizontal line of dest
Michael Niedermayer's avatar
Michael Niedermayer committed
392
     */
393
    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
394

Michael Niedermayer's avatar
Michael Niedermayer committed
395 396
    /**
     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
Panagiotis Issaris's avatar
Panagiotis Issaris committed
397
     * @param line_size size in bytes of a horizontal line of dest
Michael Niedermayer's avatar
Michael Niedermayer committed
398
     */
399
    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*align 16*/);
400

Michael Niedermayer's avatar
Michael Niedermayer committed
401
    /**
Michael Niedermayer's avatar
Michael Niedermayer committed
402
     * idct input permutation.
403 404 405 406
     * several optimized IDCTs need a permutated input (relative to the normal order of the reference
     * IDCT)
     * this permutation must be performed before the idct_put/add, note, normally this can be merged
     * with the zigzag/alternate scan<br>
Michael Niedermayer's avatar
Michael Niedermayer committed
407 408 409 410 411 412
     * an example to avoid confusion:
     * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
     * - (x -> referece dct -> reference idct -> x)
     * - (x -> referece dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
     * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
     */
413 414 415 416 417 418
    uint8_t idct_permutation[64];
    int idct_permutation_type;
#define FF_NO_IDCT_PERM 1
#define FF_LIBMPEG2_IDCT_PERM 2
#define FF_SIMPLE_IDCT_PERM 3
#define FF_TRANSPOSE_IDCT_PERM 4
419
#define FF_PARTTRANS_IDCT_PERM 5
420
#define FF_SSE2_IDCT_PERM 6
421

422 423 424 425
    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
#define BASIS_SHIFT 16
#define RECON_SHIFT 6
426

427
    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w);
428
#define EDGE_WIDTH 16
429

430
    /* h264 functions */
431
    void (*h264_idct_add)(uint8_t *dst, DCTELEM *block, int stride);
432
    void (*h264_idct8_add)(uint8_t *dst, DCTELEM *block, int stride);
433 434
    void (*h264_idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*h264_idct8_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
435
    void (*h264_dct)(DCTELEM block[4][4]);
436 437

    /* snow wavelet */
438 439
    void (*vertical_compose97i)(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width);
    void (*horizontal_compose97i)(IDWTELEM *b, int width);
440
    void (*inner_add_yblock)(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8);
441 442

    void (*prefetch)(void *mem, int stride, int h);
443 444

    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
445 446 447

    /* vc1 functions */
    void (*vc1_inv_trans_8x8)(DCTELEM *b);
448 449 450
    void (*vc1_inv_trans_8x4)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_4x8)(uint8_t *dest, int line_size, DCTELEM *block);
    void (*vc1_inv_trans_4x4)(uint8_t *dest, int line_size, DCTELEM *block);
451 452
    void (*vc1_v_overlap)(uint8_t* src, int stride);
    void (*vc1_h_overlap)(uint8_t* src, int stride);
453 454 455 456
    /* put 8x8 block with bicubic interpolation and quarterpel precision
     * last argument is actually round value instead of height
     */
    op_pixels_func put_vc1_mspel_pixels_tab[16];
457 458

    /* intrax8 functions */
459 460
    void (*x8_spatial_compensation[12])(uint8_t *src , uint8_t *dst, int linesize);
    void (*x8_setup_spatial_compensation)(uint8_t *src, uint8_t *dst, int linesize,
461 462
           int * range, int * sum,  int edges);

463 464 465
    /* ape functions */
    /**
     * Add contents of the second vector to the first one.
466
     * @param len length of vectors, should be multiple of 16
467 468 469 470
     */
    void (*add_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
    /**
     * Add contents of the second vector to the first one.
471
     * @param len length of vectors, should be multiple of 16
472 473 474 475
     */
    void (*sub_int16)(int16_t *v1/*align 16*/, int16_t *v2, int len);
    /**
     * Calculate scalar product of two vectors.
476
     * @param len length of vectors, should be multiple of 16
477 478 479
     * @param shift number of bits to discard from product
     */
    int32_t (*scalarproduct_int16)(int16_t *v1, int16_t *v2/*align 16*/, int len, int shift);
480 481
} DSPContext;

Måns Rullgård's avatar
Måns Rullgård committed
482
void dsputil_static_init(void);
483
void dsputil_init(DSPContext* p, AVCodecContext *avctx);
Fabrice Bellard's avatar
Fabrice Bellard committed
484

485 486
int ff_check_alignment(void);

487 488 489 490
/**
 * permute block according to permuatation.
 * @param last last non zero element in scantable order
 */
491
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last);
492

493 494
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);

/* Replicate the byte value c into all four bytes of a 32-bit word. */
#define         BYTE_VEC32(c)   ((c)*0x01010101UL)

/**
 * Average four packed bytes at once, rounding up: each byte of the result is
 * (a_byte + b_byte + 1) >> 1.
 * Uses (a | b) - ((a ^ b) >> 1); bit 0 of every byte of the XOR is masked off
 * first so the shift cannot carry a bit into the neighbouring byte.
 */
static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

/**
 * Average four packed bytes at once, rounding down: each byte of the result
 * is (a_byte + b_byte) >> 1.
 * Uses (a & b) + ((a ^ b) >> 1) with the same per-byte masking as rnd_avg32().
 */
static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

507 508 509 510 511 512 513 514 515 516 517 518
static inline int get_penalty_factor(int lambda, int lambda2, int type){
    switch(type&0xFF){
    default:
    case FF_CMP_SAD:
        return lambda>>FF_LAMBDA_SHIFT;
    case FF_CMP_DCT:
        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
    case FF_CMP_W53:
        return (4*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_W97:
        return (2*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_SATD:
519
    case FF_CMP_DCT264:
520 521 522 523 524 525 526 527 528 529 530
        return (2*lambda)>>FF_LAMBDA_SHIFT;
    case FF_CMP_RD:
    case FF_CMP_PSNR:
    case FF_CMP_SSE:
    case FF_CMP_NSSE:
        return lambda2>>FF_LAMBDA_SHIFT;
    case FF_CMP_BIT:
        return 1;
    }
}

/**
 * Empty mmx state.
 * this must be called between any dsp function and float/double code.
 * for example sin(); dsp->idct_put(); emms_c(); cos()
 *
 * This default definition expands to nothing; the HAVE_MMX section further
 * down undefines it and installs the real one.
 */
#define emms_c()

/* should be defined by architectures supporting
   one or more MultiMedia extension */
int mm_support(void);

/* Per-architecture / per-library DSPContext initializers. */
void dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mlib(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mmi(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);

/* Shorthand for a 16-byte-aligned declaration. */
#define DECLARE_ALIGNED_16(t, v) DECLARE_ALIGNED(16, t, v)

Fabrice Bellard's avatar
Fabrice Bellard committed
554
#if defined(HAVE_MMX)
Fabrice Bellard's avatar
Fabrice Bellard committed
555

556
#undef emms_c
557

Fabrice Bellard's avatar
Fabrice Bellard committed
558 559 560 561 562
#define MM_MMX    0x0001 /* standard MMX */
#define MM_3DNOW  0x0004 /* AMD 3DNOW */
#define MM_MMXEXT 0x0002 /* SSE integer functions or AMD MMX ext */
#define MM_SSE    0x0008 /* SSE functions */
#define MM_SSE2   0x0010 /* PIV SSE2 functions */
563
#define MM_3DNOWEXT  0x0020 /* AMD 3DNowExt */
564
#define MM_SSE3   0x0040 /* Prescott SSE3 functions */
565
#define MM_SSSE3  0x0080 /* Conroe SSSE3 functions */
Fabrice Bellard's avatar
Fabrice Bellard committed
566

Måns Rullgård's avatar
Måns Rullgård committed
567 568
extern int mm_flags;

569 570
void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
571
void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size);
Fabrice Bellard's avatar
Fabrice Bellard committed
572 573 574

/**
 * Execute the x86 EMMS instruction.
 *
 * The "memory" clobber keeps the compiler from moving memory accesses across
 * the instruction. The __asm__ spelling is used instead of plain asm so that
 * this header also compiles in strict ISO modes (e.g. -std=c99), where asm
 * is not a keyword.
 */
static inline void emms(void)
{
    __asm__ volatile ("emms;":::"memory");
}

Michael Niedermayer's avatar
Michael Niedermayer committed
578

579 580
/*
 * Call emms(), but only when MM_MMX is set in mm_flags.
 * The body is wrapped in do { } while (0) so that "emms_c();" is exactly one
 * statement and stays correct as the body of an unbraced if/else.
 */
#define emms_c() \
do {\
    if (mm_flags & MM_MMX)\
        emms();\
} while (0)

585
void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
586

Fabrice Bellard's avatar
Fabrice Bellard committed
587 588
#elif defined(ARCH_ARMV4L)

589 590
#define MM_IWMMXT    0x0100 /* XScale IWMMXT */

/* Multimedia capability flags; the object itself is defined elsewhere. */
extern int mm_flags;

593 594
#elif defined(ARCH_POWERPC)

595 596
#define MM_ALTIVEC    0x0001 /* standard AltiVec */

/* Multimedia capability flags; the object itself is defined elsewhere. */
extern int mm_flags;

/* On this architecture the "8" variant also requests an alignment of 16. */
#define DECLARE_ALIGNED_8(t, v) DECLARE_ALIGNED(16, t, v)
#define STRIDE_ALIGN 16
601

602 603
#elif defined(HAVE_MMI)

604
/* With MMI the "8" variant requests an alignment of 16, and strides follow. */
#define DECLARE_ALIGNED_8(t, v) DECLARE_ALIGNED(16, t, v)
#define STRIDE_ALIGN 16
606

607 608 609 610 611
#else

/* No multimedia backend selected: mm_flags and mm_support() are both the
   constant 0, so tests against MM_* bits are always false. */
#define mm_flags 0
#define mm_support() 0

612
#endif
Fabrice Bellard's avatar
Fabrice Bellard committed
613

614 615 616
/* Fallback for targets that did not provide their own definition:
   forward to DECLARE_ALIGNED with an alignment argument of 8. */
#if !defined(DECLARE_ALIGNED_8)
#   define DECLARE_ALIGNED_8(t, v) DECLARE_ALIGNED(8, t, v)
#endif /* !defined(DECLARE_ALIGNED_8) */
Fabrice Bellard's avatar
Fabrice Bellard committed
617

618 619
/* Default stride alignment for targets that did not define their own. */
#ifndef STRIDE_ALIGN
#   define STRIDE_ALIGN 8
#endif

622
/* PSNR */
623
/**
 * PSNR helper taking three original and three coded image plane pointers.
 *
 * NOTE(review): presumably measures the PSNR of coded_image against
 * orig_image and reports it through avctx -- confirm against the definition.
 * NOTE(review): orig_linesize is a per-plane array while coded_linesize is a
 * single int; confirm that this asymmetry is intended.
 */
void get_psnr(uint8_t *orig_image[3], uint8_t *coded_image[3],
              int orig_linesize[3], int coded_linesize,
              AVCodecContext *avctx);
626 627 628 629 630 631 632

/* FFT computation */

/* NOTE: integer code may be added later, so always use the FFTSample type
   instead of spelling out the underlying float type. */
typedef float FFTSample;

633 634
struct MDCTContext;

635 636 637 638 639 640 641 642 643 644
/** One complex value made of two FFTSample components. */
typedef struct FFTComplex {
    FFTSample re; /* real part */
    FFTSample im; /* imaginary part */
} FFTComplex;

typedef struct FFTContext {
    int nbits;
    int inverse;
    uint16_t *revtab;
    FFTComplex *exptab;
    FFTComplex *exptab1; /* only used by SSE code */
Loren Merritt's avatar
Loren Merritt committed
645 646
    FFTComplex *tmp_buf;
    void (*fft_permute)(struct FFTContext *s, FFTComplex *z);
647
    void (*fft_calc)(struct FFTContext *s, FFTComplex *z);
Loren Merritt's avatar
Loren Merritt committed
648 649
    void (*imdct_calc)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
    void (*imdct_half)(struct MDCTContext *s, FFTSample *output, const FFTSample *input);
650 651
} FFTContext;

652
/*
 * FFT setup and the available permute/calc implementations. The permute and
 * calc functions share the signatures of FFTContext.fft_permute and
 * FFTContext.fft_calc.
 * NOTE(review): the _c/_sse/_3dn/_3dn2/_altivec suffixes presumably denote
 * the portable C, SSE, 3DNow!, extended 3DNow! and AltiVec versions, chosen
 * by ff_fft_init() -- confirm against the FFT implementation.
 */
int ff_fft_init(FFTContext *s, int nbits, int inverse);
void ff_fft_permute_c(FFTContext *s, FFTComplex *z);
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_c(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z);
void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z);
660

Loren Merritt's avatar
Loren Merritt committed
661 662 663 664
/**
 * Invoke the permutation routine stored in @p s on the buffer @p z.
 * @param s FFT context whose fft_permute pointer is called
 * @param z complex buffer handed through unchanged
 */
static inline void ff_fft_permute(FFTContext *s, FFTComplex *z)
{
    (*s->fft_permute)(s, z);
}
665
/**
 * Invoke the FFT routine stored in @p s on the buffer @p z.
 * @param s FFT context whose fft_calc pointer is called
 * @param z complex buffer handed through unchanged
 */
static inline void ff_fft_calc(FFTContext *s, FFTComplex *z)
{
    (*s->fft_calc)(s, z);
}
669
void ff_fft_end(FFTContext *s);
670 671 672 673 674 675 676 677 678 679 680 681

/* MDCT computation */

/** State of one MDCT instance. */
typedef struct MDCTContext {
    int n;  /* size of MDCT (i.e. number of input data * 2) */
    int nbits; /* n = 2^nbits */
    /* pre/post rotation tables */
    FFTSample *tcos;
    FFTSample *tsin;
    /* embedded FFT state; ff_imdct_calc() and ff_imdct_half() dispatch
       through its imdct_calc / imdct_half pointers */
    FFTContext fft;
} MDCTContext;

682 683 684 685 686 687 688 689 690
/**
 * Invoke the imdct_calc routine stored in the FFT context embedded in @p s.
 * @param s      MDCT context, passed on as the routine's first argument
 * @param output destination samples, passed through unchanged
 * @param input  source samples, passed through unchanged
 */
static inline void ff_imdct_calc(MDCTContext *s, FFTSample *output, const FFTSample *input)
{
    (*s->fft.imdct_calc)(s, output, input);
}
/**
 * Invoke the imdct_half routine stored in the FFT context embedded in @p s.
 * @param s      MDCT context, passed on as the routine's first argument
 * @param output destination samples, passed through unchanged
 * @param input  source samples, passed through unchanged
 */
static inline void ff_imdct_half(MDCTContext *s, FFTSample *output, const FFTSample *input)
{
    (*s->fft.imdct_half)(s, output, input);
}

691 692 693
/**
 * Generate a Kaiser-Bessel Derived Window.
 * @param   window  pointer to half window
 * @param   alpha   determines window shape
 * @param   n       size of half window
 */
void ff_kbd_window_init(float *window, float alpha, int n);
698

699 700 701 702 703 704 705
/**
 * Generate a sine window.
 * @param   window  pointer to half window
 *                  (presumably must have room for n floats -- TODO confirm)
 * @param   n       size of half window
 */
void ff_sine_window_init(float *window, int n);

Fabrice Bellard's avatar
Fabrice Bellard committed
706
/*
 * MDCT setup/teardown, the forward MDCT, and the available inverse-MDCT
 * implementations. The ff_imdct_calc_* and ff_imdct_half_* functions share
 * the signatures of FFTContext.imdct_calc and FFTContext.imdct_half.
 * NOTE(review): the _c/_3dn/_3dn2/_sse suffixes presumably denote the
 * portable C, 3DNow!, extended 3DNow! and SSE versions -- confirm against
 * the MDCT implementation.
 */
int ff_mdct_init(MDCTContext *s, int nbits, int inverse);
void ff_imdct_calc_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_c(MDCTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dn(MDCTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dn2(MDCTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(MDCTContext *s, FFTSample *output, const FFTSample *input);
void ff_mdct_calc(MDCTContext *s, FFTSample *out, const FFTSample *input);
void ff_mdct_end(MDCTContext *s);
717

718
/*
 * Define a static function name16 built from name8: name8 is evaluated once
 * at (dst, src) and once at (dst+8, src+8) with the same stride and h, and
 * the two results are added.
 */
#define WRAPPER8_16(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    return name8(s, dst           , src           , stride, h)\
          +name8(s, dst+8         , src+8         , stride, h);\
}

724
/*
 * Define a static function name16 built from name8, always calling name8
 * with a height of 8. The upper pair at (dst, src) and (dst+8, src+8) is
 * always scored; the pair eight rows further down is scored only when
 * h == 16. Any other h therefore yields the score of the upper pair alone.
 */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}

738 739 740 741 742 743

/**
 * Copy h rows from src to dst, one AV_RN16/AV_WN16 transfer per row
 * (2 bytes, assuming the usual 16-bit meaning of those macros, which are
 * defined outside this header). Nothing is copied when h <= 0.
 * @param dst       destination, advanced by dstStride after each row
 * @param src       source, advanced by srcStride after each row
 * @param dstStride distance between destination rows
 * @param srcStride distance between source rows
 * @param h         number of rows
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        AV_WN16(dst, AV_RN16(src));
        dst += dstStride;
        src += srcStride;
    }
}

/**
 * Copy h rows from src to dst, one AV_RN32/AV_WN32 transfer per row
 * (4 bytes, assuming the usual 32-bit meaning of those macros, which are
 * defined outside this header). Nothing is copied when h <= 0.
 * @param dst       destination, advanced by dstStride after each row
 * @param src       source, advanced by srcStride after each row
 * @param dstStride distance between destination rows
 * @param srcStride distance between source rows
 * @param h         number of rows
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        AV_WN32(dst, AV_RN32(src));
        dst += dstStride;
        src += srcStride;
    }
}

/**
 * Copy h rows from src to dst, two AV_RN32/AV_WN32 transfers per row at
 * offsets 0 and 4 (8 bytes, assuming the usual 32-bit meaning of those
 * macros, which are defined outside this header). Nothing is copied when
 * h <= 0.
 * @param dst       destination, advanced by dstStride after each row
 * @param src       source, advanced by srcStride after each row
 * @param dstStride distance between destination rows
 * @param srcStride distance between source rows
 * @param h         number of rows
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        AV_WN32(dst    , AV_RN32(src    ));
        AV_WN32(dst + 4, AV_RN32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}

/**
 * Copy h rows from src to dst: two AV_RN32/AV_WN32 transfers at offsets 0
 * and 4 plus the single byte at offset 8 (9 bytes per row, assuming the
 * usual 32-bit meaning of those macros, which are defined outside this
 * header). Nothing is copied when h <= 0.
 * @param dst       destination, advanced by dstStride after each row
 * @param src       source, advanced by srcStride after each row
 * @param dstStride distance between destination rows
 * @param srcStride distance between source rows
 * @param h         number of rows
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        AV_WN32(dst    , AV_RN32(src    ));
        AV_WN32(dst + 4, AV_RN32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}

/**
 * Copy h rows from src to dst, four AV_RN32/AV_WN32 transfers per row at
 * offsets 0, 4, 8 and 12 (16 bytes, assuming the usual 32-bit meaning of
 * those macros, which are defined outside this header). Nothing is copied
 * when h <= 0.
 * @param dst       destination, advanced by dstStride after each row
 * @param src       source, advanced by srcStride after each row
 * @param dstStride distance between destination rows
 * @param srcStride distance between source rows
 * @param h         number of rows
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        AV_WN32(dst     , AV_RN32(src     ));
        AV_WN32(dst +  4, AV_RN32(src +  4));
        AV_WN32(dst +  8, AV_RN32(src +  8));
        AV_WN32(dst + 12, AV_RN32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}

/**
 * Copy h rows from src to dst: four AV_RN32/AV_WN32 transfers at offsets
 * 0, 4, 8 and 12 plus the single byte at offset 16 (17 bytes per row,
 * assuming the usual 32-bit meaning of those macros, which are defined
 * outside this header). Nothing is copied when h <= 0.
 * @param dst       destination, advanced by dstStride after each row
 * @param src       source, advanced by srcStride after each row
 * @param dstStride distance between destination rows
 * @param srcStride distance between source rows
 * @param h         number of rows
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        AV_WN32(dst     , AV_RN32(src     ));
        AV_WN32(dst +  4, AV_RN32(src +  4));
        AV_WN32(dst +  8, AV_RN32(src +  8));
        AV_WN32(dst + 12, AV_RN32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}

815
#endif /* FFMPEG_DSPUTIL_H */