/*
 * DSP utils
 * Copyright (c) 2000, 2001, 2002 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
Michael Niedermayer's avatar
Michael Niedermayer committed
22 23

/**
 * @file
 * DSP utils.
 * Note: many functions in here may use MMX, which trashes the FPU state; it is
 * absolutely necessary to call emms_c() between DSP and float/double code.
 */

30 31
#ifndef AVCODEC_DSPUTIL_H
#define AVCODEC_DSPUTIL_H
Fabrice Bellard's avatar
Fabrice Bellard committed
32

33
#include "libavutil/intreadwrite.h"
34
#include "avcodec.h"
Fabrice Bellard's avatar
Fabrice Bellard committed
35

Michael Niedermayer's avatar
Michael Niedermayer committed
36

Michael Niedermayer's avatar
Michael Niedermayer committed
37
//#define DEBUG
Fabrice Bellard's avatar
Fabrice Bellard committed
38 39
/* dct code */

Diego Biurrun's avatar
Diego Biurrun committed
40 41 42 43 44 45
/*
 * Forward-DCT entry points. Each takes a pointer to a block of 16-bit
 * values and returns nothing.
 * NOTE(review): the block size and in-place operation are assumed from the
 * naming only -- confirm against the implementations.
 */
void ff_fdct_ifast(int16_t *data);
void ff_fdct_ifast248(int16_t *data);
void ff_jpeg_fdct_islow_8(int16_t *data);
void ff_jpeg_fdct_islow_10(int16_t *data);
void ff_fdct248_islow_8(int16_t *data);
void ff_fdct248_islow_10(int16_t *data);

/*
 * Inverse-DCT ("rev_dct") entry points; the numeric suffix presumably
 * selects a reduced transform size -- TODO confirm.
 */
void ff_j_rev_dct(int16_t *data);
void ff_j_rev_dct4(int16_t *data);
void ff_j_rev_dct2(int16_t *data);
void ff_j_rev_dct1(int16_t *data);

/* Forward DCT variants named after x86 instruction-set extensions. */
void ff_fdct_mmx(int16_t *block);
void ff_fdct_mmxext(int16_t *block);
void ff_fdct_sse2(int16_t *block);
Fabrice Bellard's avatar
Fabrice Bellard committed
55

56
/*
 * Declare the C H.264 IDCT / dequantization prototypes for one bit depth.
 * The depth token is pasted into every function name, e.g. H264_IDCT(8)
 * declares ff_h264_idct_add_8_c(), ff_h264_idct8_add_8_c(), and so on.
 */
#define H264_IDCT(depth) \
void ff_h264_idct8_add_ ## depth ## _c(uint8_t *dst, int16_t *block, int stride);\
void ff_h264_idct_add_ ## depth ## _c(uint8_t *dst, int16_t *block, int stride);\
void ff_h264_idct8_dc_add_ ## depth ## _c(uint8_t *dst, int16_t *block, int stride);\
void ff_h264_idct_dc_add_ ## depth ## _c(uint8_t *dst, int16_t *block, int stride);\
void ff_h264_idct_add16_ ## depth ## _c(uint8_t *dst, const int *blockoffset, int16_t *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct_add16intra_ ## depth ## _c(uint8_t *dst, const int *blockoffset, int16_t *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct8_add4_ ## depth ## _c(uint8_t *dst, const int *blockoffset, int16_t *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct_add8_422_ ## depth ## _c(uint8_t **dest, const int *blockoffset, int16_t *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_idct_add8_ ## depth ## _c(uint8_t **dest, const int *blockoffset, int16_t *block, int stride, const uint8_t nnzc[6*8]);\
void ff_h264_luma_dc_dequant_idct_ ## depth ## _c(int16_t *output, int16_t *input, int qmul);\
void ff_h264_chroma422_dc_dequant_idct_ ## depth ## _c(int16_t *block, int qmul);\
void ff_h264_chroma_dc_dequant_idct_ ## depth ## _c(int16_t *block, int qmul);

/* Instantiate the prototypes for bit depths 8, 9, 10, 12 and 14. */
H264_IDCT( 8)
H264_IDCT( 9)
H264_IDCT(10)
H264_IDCT(12)
H264_IDCT(14)
75

Diego Biurrun's avatar
Diego Biurrun committed
76 77
/*
 * SVQ3 C helpers.
 * NOTE(review): going by the names, the first dequantizes and inverse
 * transforms luma DC coefficients from input into output, and the second
 * adds an inverse-transformed block to dst; the meaning of qp and dc is
 * defined by the implementation -- confirm there.
 */
void ff_svq3_luma_dc_dequant_idct_c(int16_t *output, int16_t *input, int qp);
void ff_svq3_add_idct_c(uint8_t *dst, int16_t *block, int stride, int qp, int dc);
78

79
/* encoding scans */
80 81 82
/* Read-only scan-order tables, 64 byte entries each; defined elsewhere. */
extern const uint8_t ff_alternate_horizontal_scan[64];
extern const uint8_t ff_alternate_vertical_scan[64];
extern const uint8_t ff_zigzag_direct[64];
extern const uint8_t ff_zigzag248_direct[64];
84

Fabrice Bellard's avatar
Fabrice Bellard committed
85
/* pixel operations */
86
/* Extra entries reserved on top of the 256 base entries of ff_cropTbl,
 * counted twice in its size (presumably one margin per side -- confirm). */
#define MAX_NEG_CROP 1024

/* temporary */
extern uint32_t ff_squareTbl[512];
extern uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP];
Fabrice Bellard's avatar
Fabrice Bellard committed
91

92 93 94 95 96 97 98 99 100
/*
 * Declare the put/avg 8x8 and 16x16 pixel prototypes for one bit depth;
 * the depth token is pasted into each function name.
 */
#define PUTAVG_PIXELS(depth)\
void ff_put_pixels8x8_ ## depth ## _c(uint8_t *dst, uint8_t *src, int stride);\
void ff_avg_pixels8x8_ ## depth ## _c(uint8_t *dst, uint8_t *src, int stride);\
void ff_put_pixels16x16_ ## depth ## _c(uint8_t *dst, uint8_t *src, int stride);\
void ff_avg_pixels16x16_ ## depth ## _c(uint8_t *dst, uint8_t *src, int stride);

PUTAVG_PIXELS( 8)
PUTAVG_PIXELS( 9)
PUTAVG_PIXELS(10)
PUTAVG_PIXELS(12)
PUTAVG_PIXELS(14)

/* Unsuffixed names alias the 8-bit variants. */
#define ff_put_pixels8x8_c ff_put_pixels8x8_8_c
#define ff_avg_pixels8x8_c ff_avg_pixels8x8_8_c
#define ff_put_pixels16x16_c ff_put_pixels16x16_8_c
#define ff_avg_pixels16x16_c ff_avg_pixels16x16_8_c
108

109 110 111 112 113 114
/* RV40 functions: put/avg variants for the mc33 position, 16- and 8-wide.
 * All share the (dst, src, stride) signature of qpel_mc_func. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride);
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride);
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride);

115 116 117 118
/* 1/2^n downscaling functions from imgconvert.c.
 * dst_wrap/src_wrap are presumably the line strides of dst and src, and
 * width/height the output dimensions -- TODO confirm in imgconvert.c. */
void ff_shrink22(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink44(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
void ff_shrink88(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
119 120 121

/* C global motion compensation routine; its parameter list matches the
 * DSPContext.gmc function pointer. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
122

Michael Niedermayer's avatar
Michael Niedermayer committed
123
/* minimum alignment rules ;)
If you notice errors in the align stuff, need more alignment for some ASM code
for some CPU or need to use a function with less aligned data then send a mail
to the ffmpeg-devel mailing list, ...

!warning These alignments might not match reality, (missing attribute((align))
stuff somewhere possible).
I (Michael) did not check them, these are just the alignments which I think
could be reached easily ...

!future video codecs might need functions with less strict alignment
*/

136
/*
void get_pixels_c(int16_t *block, const uint8_t *pixels, int line_size);
void diff_pixels_c(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride);
void put_pixels_clamped_c(const int16_t *block, uint8_t *pixels, int line_size);
void add_pixels_clamped_c(const int16_t *block, uint8_t *pixels, int line_size);
void clear_blocks_c(int16_t *blocks);
*/
Fabrice Bellard's avatar
Fabrice Bellard committed
143 144

/* add and put pixel (decoding) */
Michael Niedermayer's avatar
Michael Niedermayer committed
145
// blocksizes for op_pixels_func are 8x4,8x8 16x8 16x16
//h for op_pixels_func is limited to {width/2, width} but never larger than 16 and never smaller than 4
/* Pixel operation on a block of h rows; line_size is a ptrdiff_t stride. */
typedef void (*op_pixels_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, ptrdiff_t line_size, int h);
/* Thirdpel motion compensation callback; takes an explicit width w as well as h. */
typedef void (*tpel_mc_func)(uint8_t *block/*align width (8 or 16)*/, const uint8_t *pixels/*align 1*/, int line_size, int w, int h);
/* Quarterpel motion compensation callback; block size is implied by the function. */
typedef void (*qpel_mc_func)(uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);
Michael Niedermayer's avatar
Michael Niedermayer committed
150

Kostya Shishkov's avatar
Kostya Shishkov committed
151 152
/* Block fill callback taking a destination block, a byte value, a line size
 * and a row count h (presumably writes value over h rows -- confirm). */
typedef void (*op_fill_func)(uint8_t *block/*align width (8 or 16)*/, uint8_t value, int line_size, int h);

Michael Niedermayer's avatar
Michael Niedermayer committed
153
/*
 * Declare the put / put_no_rnd / avg prototypes for one "old" qpel routine.
 * name is pasted after the ff_put_, ff_put_no_rnd_ and ff_avg_ prefixes.
 */
#define DEF_OLD_QPEL(name)\
void ff_put_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_put_no_rnd_ ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);\
void ff_avg_        ## name (uint8_t *dst/*align width (8 or 16)*/, uint8_t *src/*align 1*/, int stride);

DEF_OLD_QPEL(qpel16_mc11_old_c)
DEF_OLD_QPEL(qpel16_mc31_old_c)
DEF_OLD_QPEL(qpel16_mc12_old_c)
DEF_OLD_QPEL(qpel16_mc32_old_c)
DEF_OLD_QPEL(qpel16_mc13_old_c)
DEF_OLD_QPEL(qpel16_mc33_old_c)
DEF_OLD_QPEL(qpel8_mc11_old_c)
DEF_OLD_QPEL(qpel8_mc31_old_c)
DEF_OLD_QPEL(qpel8_mc12_old_c)
DEF_OLD_QPEL(qpel8_mc32_old_c)
DEF_OLD_QPEL(qpel8_mc13_old_c)
DEF_OLD_QPEL(qpel8_mc33_old_c)
Michael Niedermayer's avatar
Michael Niedermayer committed
170 171

/*
 * Define a static function a() with the op_pixels_func signature that calls
 * b() twice: once at the given block/pixels, once at both advanced by n
 * bytes. n is parenthesized so that expression arguments expand safely.
 */
#define CALL_2X_PIXELS(a, b, n)\
static void a(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){\
    b(block    , pixels    , line_size, h);\
    b(block+(n), pixels+(n), line_size, h);\
}
Michael Niedermayer's avatar
Michael Niedermayer committed
176

Fabrice Bellard's avatar
Fabrice Bellard committed
177
/* motion estimation */
178
// h is limited to {width/2, width, 2*width} but never larger than 16 and never smaller than 2
// although currently h<4 is not used as functions with width <8 are neither used nor implemented
/* Block comparison callback; returns an int score. The context is passed
 * as void * (an MpegEncContext per the inline comment). */
typedef int (*me_cmp_func)(void /*MpegEncContext*/ *s, uint8_t *blk1/*align width (8 or 16)*/, uint8_t *blk2/*align 1*/, int line_size, int h)/* __attribute__ ((const))*/;
Michael Niedermayer's avatar
Michael Niedermayer committed
181

182 183 184 185 186 187 188 189 190 191
/**
 * Scantable.
 * Bundles a pointer to a source scan table with two 64-entry byte tables.
 * NOTE(review): the tables are presumably filled in by ff_init_scantable();
 * their exact contents are not visible in this header.
 */
typedef struct ScanTable{
    const uint8_t *scantable;   /* source scan table (not owned here, as far as this header shows) */
    uint8_t permutated[64];     /* presumably scantable after the IDCT permutation -- confirm */
    uint8_t raster_end[64];     /* semantics defined by ff_init_scantable() -- confirm */
} ScanTable;

/* Initialize st from src_scantable. The unnamed first argument is presumably
 * the IDCT permutation table -- TODO confirm against the definition. */
void ff_init_scantable(uint8_t *, ScanTable *st, const uint8_t *src_scantable);
192 193
/* Fill idct_permutation according to idct_permutation_type (presumably one
 * of the FF_*_IDCT_PERM constants defined in this header -- confirm). */
void ff_init_scantable_permutation(uint8_t *idct_permutation,
                                   int idct_permutation_type);
194

195
/*
 * Declare ff_emulated_edge_mc_<depth>(); the depth token is pasted into the
 * function name. Instantiated below for 8 and 16.
 */
#define EMULATED_EDGE(depth) \
void ff_emulated_edge_mc_ ## depth (uint8_t *buf, const uint8_t *src, ptrdiff_t linesize,\
                         int block_w, int block_h,\
                         int src_x, int src_y, int w, int h);

EMULATED_EDGE(8)
EMULATED_EDGE(16)
202

Michael Niedermayer's avatar
Michael Niedermayer committed
203 204 205
/**
 * DSPContext.
 */
206
typedef struct DSPContext {
207 208 209 210 211
    /**
     * Size of DCT coefficients.
     */
    int dct_bits;

212
    /* pixel ops : interface with DCT */
Diego Biurrun's avatar
Diego Biurrun committed
213 214 215 216 217 218 219 220
    void (*get_pixels)(int16_t *block/*align 16*/, const uint8_t *pixels/*align 8*/, int line_size);
    void (*diff_pixels)(int16_t *block/*align 16*/, const uint8_t *s1/*align 8*/, const uint8_t *s2/*align 8*/, int stride);
    void (*put_pixels_clamped)(const int16_t *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*put_signed_pixels_clamped)(const int16_t *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*add_pixels_clamped)(const int16_t *block/*align 16*/, uint8_t *pixels/*align 8*/, int line_size);
    void (*add_pixels8)(uint8_t *pixels, int16_t *block, int line_size);
    void (*add_pixels4)(uint8_t *pixels, int16_t *block, int line_size);
    int (*sum_abs_dctelem)(int16_t *block/*align 16*/);
Michael Niedermayer's avatar
Michael Niedermayer committed
221 222 223
    /**
     * translational global motion compensation.
     */
224
    void (*gmc1)(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int srcStride, int h, int x16, int y16, int rounder);
Michael Niedermayer's avatar
Michael Niedermayer committed
225 226 227
    /**
     * global motion compensation.
     */
228
    void (*gmc )(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int ox, int oy,
229
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height);
Diego Biurrun's avatar
Diego Biurrun committed
230 231
    void (*clear_block)(int16_t *block/*align 16*/);
    void (*clear_blocks)(int16_t *blocks/*align 16*/);
232 233
    int (*pix_sum)(uint8_t * pix, int line_size);
    int (*pix_norm1)(uint8_t * pix, int line_size);
234
// 16x16 8x8 4x4 2x2 16x8 8x4 4x2 8x16 4x8 2x4
235

236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256
    me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */
    me_cmp_func sse[6];
    me_cmp_func hadamard8_diff[6];
    me_cmp_func dct_sad[6];
    me_cmp_func quant_psnr[6];
    me_cmp_func bit[6];
    me_cmp_func rd[6];
    me_cmp_func vsad[6];
    me_cmp_func vsse[6];
    me_cmp_func nsse[6];
    me_cmp_func w53[6];
    me_cmp_func w97[6];
    me_cmp_func dct_max[6];
    me_cmp_func dct264_sad[6];

    me_cmp_func me_pre_cmp[6];
    me_cmp_func me_cmp[6];
    me_cmp_func me_sub_cmp[6];
    me_cmp_func mb_cmp[6];
    me_cmp_func ildct_cmp[6]; //only width 16 used
    me_cmp_func frame_skip_cmp[6]; //only width 8 used
257

258 259
    int (*ssd_int8_vs_int16)(const int8_t *pix1, const int16_t *pix2,
                             int size);
260

Michael Niedermayer's avatar
Michael Niedermayer committed
261 262
    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
Luca Barbato's avatar
Luca Barbato committed
263
     * this is an array[4][4] of motion compensation functions for 4
264
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
265
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
266 267 268 269 270
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
271
    op_pixels_func put_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
272 273 274

    /**
     * Halfpel motion compensation with rounding (a+b+1)>>1.
275
     * This is an array[4][4] of motion compensation functions for 4
276
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
277
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
278 279 280 281 282
     * @param block destination into which the result is averaged (a+b+1)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
283
    op_pixels_func avg_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
284 285 286

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
Luca Barbato's avatar
Luca Barbato committed
287
     * this is an array[2][4] of motion compensation functions for 2
Michael Niedermayer's avatar
Michael Niedermayer committed
288
     * horizontal blocksizes (8,16) and the 4 halfpel positions<br>
289
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
290 291 292 293 294
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
295
    op_pixels_func put_no_rnd_pixels_tab[4][4];
Michael Niedermayer's avatar
Michael Niedermayer committed
296 297 298

    /**
     * Halfpel motion compensation with no rounding (a+b)>>1.
299 300 301
     * this is an array[4] of motion compensation functions for 1
     * horizontal blocksize (16) and the 4 halfpel positions<br>
     * *pixels_tab[0][ xhalfpel + 2*yhalfpel ]
Michael Niedermayer's avatar
Michael Niedermayer committed
302 303 304 305 306
     * @param block destination into which the result is averaged (a+b)>>1
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
307
    op_pixels_func avg_no_rnd_pixels_tab[4];
308

309 310
    /**
     * Thirdpel motion compensation with rounding (a+b+1)>>1.
Luca Barbato's avatar
Luca Barbato committed
311 312
     * this is an array[12] of motion compensation functions for the 9 thirdpe
     * positions<br>
313 314 315 316 317 318 319
     * *pixels_tab[ xthirdpel + 4*ythirdpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height
     */
    tpel_mc_func put_tpel_pixels_tab[11]; //FIXME individual func ptr per width?
320 321
    tpel_mc_func avg_tpel_pixels_tab[11]; //FIXME individual func ptr per width?

322 323 324
    qpel_mc_func put_qpel_pixels_tab[2][16];
    qpel_mc_func avg_qpel_pixels_tab[2][16];
    qpel_mc_func put_no_rnd_qpel_pixels_tab[2][16];
Michael Niedermayer's avatar
Michael Niedermayer committed
325
    qpel_mc_func put_mspel_pixels_tab[8];
326

327
    me_cmp_func pix_abs[2][4];
328

Michael Niedermayer's avatar
Michael Niedermayer committed
329 330
    /* huffyuv specific */
    void (*add_bytes)(uint8_t *dst/*align 16*/, uint8_t *src/*align 16*/, int w);
331
    void (*diff_bytes)(uint8_t *dst/*align 16*/, const uint8_t *src1/*align 16*/, const uint8_t *src2/*align 1*/,int w);
332 333 334 335
    /**
     * subtract huffyuv's variant of median prediction
     * note, this might read from src1[-1], src2[-1]
     */
336 337
    void (*sub_hfyu_median_prediction)(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top);
    void (*add_hfyu_median_prediction)(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
338
    int  (*add_hfyu_left_prediction)(uint8_t *dst, const uint8_t *src, int w, int left);
339
    void (*add_hfyu_left_prediction_bgr32)(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha);
340
    /* this might write to dst[w] */
341
    void (*bswap_buf)(uint32_t *dst, const uint32_t *src, int w);
342
    void (*bswap16_buf)(uint16_t *dst, const uint16_t *src, int len);
343

Michael Niedermayer's avatar
Michael Niedermayer committed
344 345 346
    void (*h263_v_loop_filter)(uint8_t *src, int stride, int qscale);
    void (*h263_h_loop_filter)(uint8_t *src, int stride, int qscale);

347
    void (*h261_loop_filter)(uint8_t *src, int stride);
348

Loren Merritt's avatar
Loren Merritt committed
349
    /* assume len is a multiple of 8, and arrays are 16-byte aligned */
350
    void (*vector_clipf)(float *dst /* align 16 */, const float *src /* align 16 */, float min, float max, int len /* align 16 */);
351

352
    /* (I)DCT */
Diego Biurrun's avatar
Diego Biurrun committed
353 354
    void (*fdct)(int16_t *block/* align 16*/);
    void (*fdct248)(int16_t *block/* align 16*/);
355

356
    /* IDCT really*/
Diego Biurrun's avatar
Diego Biurrun committed
357
    void (*idct)(int16_t *block/* align 16*/);
358

Michael Niedermayer's avatar
Michael Niedermayer committed
359
    /**
Michael Niedermayer's avatar
Michael Niedermayer committed
360
     * block -> idct -> clip to unsigned 8 bit -> dest.
Michael Niedermayer's avatar
Michael Niedermayer committed
361
     * (-1392, 0, 0, ...) -> idct -> (-174, -174, ...) -> put -> (0, 0, ...)
Panagiotis Issaris's avatar
Panagiotis Issaris committed
362
     * @param line_size size in bytes of a horizontal line of dest
Michael Niedermayer's avatar
Michael Niedermayer committed
363
     */
Diego Biurrun's avatar
Diego Biurrun committed
364
    void (*idct_put)(uint8_t *dest/*align 8*/, int line_size, int16_t *block/*align 16*/);
365

Michael Niedermayer's avatar
Michael Niedermayer committed
366 367
    /**
     * block -> idct -> add dest -> clip to unsigned 8 bit -> dest.
Panagiotis Issaris's avatar
Panagiotis Issaris committed
368
     * @param line_size size in bytes of a horizontal line of dest
Michael Niedermayer's avatar
Michael Niedermayer committed
369
     */
Diego Biurrun's avatar
Diego Biurrun committed
370
    void (*idct_add)(uint8_t *dest/*align 8*/, int line_size, int16_t *block/*align 16*/);
371

Michael Niedermayer's avatar
Michael Niedermayer committed
372
    /**
Michael Niedermayer's avatar
Michael Niedermayer committed
373
     * idct input permutation.
374 375 376 377
     * several optimized IDCTs need a permutated input (relative to the normal order of the reference
     * IDCT)
     * this permutation must be performed before the idct_put/add, note, normally this can be merged
     * with the zigzag/alternate scan<br>
Michael Niedermayer's avatar
Michael Niedermayer committed
378 379
     * an example to avoid confusion:
     * - (->decode coeffs -> zigzag reorder -> dequant -> reference idct ->...)
380 381
     * - (x -> reference dct -> reference idct -> x)
     * - (x -> reference dct -> simple_mmx_perm = idct_permutation -> simple_idct_mmx -> x)
Michael Niedermayer's avatar
Michael Niedermayer committed
382 383
     * - (->decode coeffs -> zigzag reorder -> simple_mmx_perm -> dequant -> simple_idct_mmx ->...)
     */
384 385 386 387 388 389
    uint8_t idct_permutation[64];
    int idct_permutation_type;
#define FF_NO_IDCT_PERM 1
#define FF_LIBMPEG2_IDCT_PERM 2
#define FF_SIMPLE_IDCT_PERM 3
#define FF_TRANSPOSE_IDCT_PERM 4
390
#define FF_PARTTRANS_IDCT_PERM 5
391
#define FF_SSE2_IDCT_PERM 6
392

393 394 395 396
    int (*try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale);
    void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale);
#define BASIS_SHIFT 16
#define RECON_SHIFT 6
397

398
    void (*draw_edges)(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides);
399
#define EDGE_WIDTH 16
400 401
#define EDGE_TOP    1
#define EDGE_BOTTOM 2
402

403
    void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, int src_wrap, int width, int height);
404

405 406
    /**
     * Calculate scalar product of two vectors.
407
     * @param len length of vectors, should be multiple of 16
408
     */
409
    int32_t (*scalarproduct_int16)(const int16_t *v1, const int16_t *v2/*align 16*/, int len);
410 411 412 413 414 415
    /* ape functions */
    /**
     * Calculate scalar product of v1 and v2,
     * and v1[i] += v3[i] * mul
     * @param len length of vectors, should be multiple of 16
     */
416
    int32_t (*scalarproduct_and_madd_int16)(int16_t *v1/*align 16*/, const int16_t *v2, const int16_t *v3, int len, int mul);
Kostya Shishkov's avatar
Kostya Shishkov committed
417

418 419 420 421 422 423 424 425 426 427 428 429 430 431
    /**
     * Apply symmetric window in 16-bit fixed-point.
     * @param output destination array
     *               constraints: 16-byte aligned
     * @param input  source array
     *               constraints: 16-byte aligned
     * @param window window array
     *               constraints: 16-byte aligned, at least len/2 elements
     * @param len    full window length
     *               constraints: multiple of ? greater than zero
     */
    void (*apply_window_int16)(int16_t *output, const int16_t *input,
                               const int16_t *window, unsigned int len);

432 433 434 435 436 437 438
    /**
     * Clip each element in an array of int32_t to a given minimum and maximum value.
     * @param dst  destination array
     *             constraints: 16-byte aligned
     * @param src  source array
     *             constraints: 16-byte aligned
     * @param min  minimum value
439
     *             constraints: must be in the range [-(1 << 24), 1 << 24]
440
     * @param max  maximum value
441
     *             constraints: must be in the range [-(1 << 24), 1 << 24]
442 443 444 445 446 447
     * @param len  number of elements in the array
     *             constraints: multiple of 32 greater than zero
     */
    void (*vector_clip_int32)(int32_t *dst, const int32_t *src, int32_t min,
                              int32_t max, unsigned int len);

Kostya Shishkov's avatar
Kostya Shishkov committed
448
    op_fill_func fill_block_tab[2];
449 450
} DSPContext;

451 452
/* One-time static setup, taking no context (presumably table init -- confirm). */
void ff_dsputil_static_init(void);
/* Initialize the DSPContext p for the given codec context. */
void ff_dsputil_init(DSPContext* p, AVCodecContext *avctx);
453
/* Deprecated entry point with the same parameter list as ff_dsputil_init(). */
attribute_deprecated void dsputil_init(DSPContext* c, AVCodecContext *avctx);
Fabrice Bellard's avatar
Fabrice Bellard committed
454

455 456
/* Alignment self-check; the meaning of the int return value is defined by
 * the implementation (not visible in this header). */
int ff_check_alignment(void);

457 458 459 460
/**
 * Permute block according to permutation.
 * @param block       coefficients to permute
 * @param permutation permutation table to apply
 * @param scantable   scan order that the last index refers to
 * @param last        last non zero element in scantable order
 */
void ff_block_permute(int16_t *block, uint8_t *permutation, const uint8_t *scantable, int last);
462

463 464
/* Fill the cmp function array for the given comparison type, presumably
 * from the matching me_cmp_func tables in c (type looks like an FF_CMP_*
 * value -- confirm against the definition). */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type);

465
/* Replicate the 8-bit value c into each of the four bytes of a 32-bit word. */
#define         BYTE_VEC32(c)   ((c)*0x01010101UL)
/* Replicate c into each of the four 16-bit lanes of a 64-bit word.
 * Note: despite the "BYTE" in the name, the multiplier has one set bit per
 * 16 bits, so the lanes are 16 bits wide, not 8. */
#define         BYTE_VEC64(c)   ((c)*0x0001000100010001UL)

/**
 * Per-byte average of a and b, rounding up: every 8-bit lane of the result
 * is (a_lane + b_lane + 1) >> 1. The low bit of each lane of a^b is masked
 * off before the shift so that no bit crosses a lane boundary.
 */
static inline uint32_t rnd_avg32(uint32_t a, uint32_t b)
{
    return (a | b) - (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

/**
 * Per-byte average of a and b, rounding down: every 8-bit lane of the
 * result is (a_lane + b_lane) >> 1.
 */
static inline uint32_t no_rnd_avg32(uint32_t a, uint32_t b)
{
    return (a & b) + (((a ^ b) & ~BYTE_VEC32(0x01)) >> 1);
}

/**
 * Rounding-up average like rnd_avg32(), but on four 16-bit lanes of a
 * 64-bit word (lane width follows from BYTE_VEC64).
 */
static inline uint64_t rnd_avg64(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & ~BYTE_VEC64(0x01)) >> 1);
}

/**
 * Rounding-down average like no_rnd_avg32(), but on four 16-bit lanes of a
 * 64-bit word.
 */
static inline uint64_t no_rnd_avg64(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & ~BYTE_VEC64(0x01)) >> 1);
}

488 489 490 491 492 493 494 495 496 497 498 499
static inline int get_penalty_factor(int lambda, int lambda2, int type){
    switch(type&0xFF){
    default:
    case FF_CMP_SAD:
        return lambda>>FF_LAMBDA_SHIFT;
    case FF_CMP_DCT:
        return (3*lambda)>>(FF_LAMBDA_SHIFT+1);
    case FF_CMP_W53:
        return (4*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_W97:
        return (2*lambda)>>(FF_LAMBDA_SHIFT);
    case FF_CMP_SATD:
500
    case FF_CMP_DCT264:
501 502 503 504 505 506 507 508 509 510 511
        return (2*lambda)>>FF_LAMBDA_SHIFT;
    case FF_CMP_RD:
    case FF_CMP_PSNR:
    case FF_CMP_SSE:
    case FF_CMP_NSSE:
        return lambda2>>FF_LAMBDA_SHIFT;
    case FF_CMP_BIT:
        return 1;
    }
}

512 513 514 515 516 517 518
/* Per-architecture DSPContext initializers; all take the context to fill
 * and the codec context (presumably installing optimized function
 * pointers -- confirm in the arch-specific sources). */
void ff_dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_arm(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_bfin(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_ppc(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_sh4(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_vis(DSPContext* c, AVCodecContext *avctx);
519

520
/* DWT-related DSPContext setup (presumably the w53/w97 comparison
 * functions -- confirm); takes no codec context. */
void ff_dsputil_init_dwt(DSPContext *c);
521

522
/* Stride alignment in bytes: 16 on ARM with NEON, on PPC and wherever MMX
 * is available, 8 otherwise. */
#if (ARCH_ARM && HAVE_NEON) || ARCH_PPC || HAVE_MMX
#   define STRIDE_ALIGN 16
#else
#   define STRIDE_ALIGN 8
#endif

528 529 530 531
// Some broken preprocessors need a second expansion
// to be forced to tokenize __VA_ARGS__
#define E(x) x

/*
 * Declare an aligned local "v" by over-allocating a byte buffer la_<v> by
 * `a` bytes and rounding its address up with FFALIGN.
 *   a: alignment, t: element type, v: variable name,
 *   s: first array subscript, o: optional further subscripts.
 */
#define LOCAL_ALIGNED_A(a, t, v, s, o, ...)             \
    uint8_t la_##v[sizeof(t s o) + (a)];                \
    t (*v) o = (void *)FFALIGN((uintptr_t)la_##v, a)

/*
 * Same interface, but relies on DECLARE_ALIGNED to align the backing
 * array la_<v> directly.
 */
#define LOCAL_ALIGNED_D(a, t, v, s, o, ...)             \
    DECLARE_ALIGNED(a, t, la_##v) s o;                  \
    t (*v) o = la_##v

/* Generic form; the trailing ",," supplies empty s/o when they are omitted. */
#define LOCAL_ALIGNED(a, t, v, ...) E(LOCAL_ALIGNED_A(a, t, v, __VA_ARGS__,,))

/* Use the direct declaration when HAVE_LOCAL_ALIGNED_8 is set, otherwise
 * fall back to manual alignment. */
#if HAVE_LOCAL_ALIGNED_8
#   define LOCAL_ALIGNED_8(t, v, ...) E(LOCAL_ALIGNED_D(8, t, v, __VA_ARGS__,,))
#else
#   define LOCAL_ALIGNED_8(t, v, ...) LOCAL_ALIGNED(8, t, v, __VA_ARGS__)
#endif

/* Same selection for 16-byte alignment, keyed on HAVE_LOCAL_ALIGNED_16. */
#if HAVE_LOCAL_ALIGNED_16
#   define LOCAL_ALIGNED_16(t, v, ...) E(LOCAL_ALIGNED_D(16, t, v, __VA_ARGS__,,))
#else
#   define LOCAL_ALIGNED_16(t, v, ...) LOCAL_ALIGNED(16, t, v, __VA_ARGS__)
#endif
#endif

554
/*
 * Define a static me_cmp_func-style function name16() on top of the 8-wide
 * comparison name8(): it sums name8() over the left and right 8-pixel
 * halves of the top 8 rows and, when h == 16, also over the two halves of
 * the next 8 rows. name8() is always called with a height of 8.
 */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}

568

569
/**
 * Copy h rows from src to dst, one AV_COPY16U per row (presumably an
 * unaligned 2-byte copy from libavutil/intreadwrite.h -- confirm there).
 * After each row dst advances by dstStride and src by srcStride.
 * No rows are copied when h <= 0.
 */
static inline void copy_block2(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_COPY16U(dst, src);
        dst+=dstStride;
        src+=srcStride;
    }
}

580
/**
 * Copy h rows from src to dst, one AV_COPY32U per row (presumably an
 * unaligned 4-byte copy from libavutil/intreadwrite.h -- confirm there).
 * After each row dst advances by dstStride and src by srcStride.
 * No rows are copied when h <= 0.
 */
static inline void copy_block4(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_COPY32U(dst, src);
        dst+=dstStride;
        src+=srcStride;
    }
}

591
/**
 * Copy h rows from src to dst, one AV_COPY64U per row (presumably an
 * unaligned 8-byte copy from libavutil/intreadwrite.h -- confirm there).
 * After each row dst advances by dstStride and src by srcStride.
 * No rows are copied when h <= 0.
 */
static inline void copy_block8(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_COPY64U(dst, src);
        dst+=dstStride;
        src+=srcStride;
    }
}

602
/**
 * Copy h rows from src to dst: one AV_COPY64U (presumably an unaligned
 * 8-byte copy -- confirm in libavutil/intreadwrite.h) plus the byte at
 * offset 8 per row. After each row dst advances by dstStride and src by
 * srcStride. No rows are copied when h <= 0.
 */
static inline void copy_block9(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_COPY64U(dst, src);
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}

614
/**
 * Copy h rows from src to dst, one AV_COPY128U per row (presumably an
 * unaligned 16-byte copy from libavutil/intreadwrite.h -- confirm there).
 * After each row dst advances by dstStride and src by srcStride.
 * No rows are copied when h <= 0.
 */
static inline void copy_block16(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_COPY128U(dst, src);
        dst+=dstStride;
        src+=srcStride;
    }
}

625
/**
 * Copy h rows from src to dst: one AV_COPY128U (presumably an unaligned
 * 16-byte copy -- confirm in libavutil/intreadwrite.h) plus the byte at
 * offset 16 per row. After each row dst advances by dstStride and src by
 * srcStride. No rows are copied when h <= 0.
 */
static inline void copy_block17(uint8_t *dst, const uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        AV_COPY128U(dst, src);
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}

637
#endif /* AVCODEC_DSPUTIL_H */