Commit 6df42f98 authored by Michael Niedermayer

Merge remote-tracking branch 'qatar/master'

* qatar/master:
  SBR DSP: fix SSE code to not use SSE2 instructions.
  cpu: initialize mask to -1, so that by default, optimizations are used.
  error_resilience: initialize s->block_index[].
  svq3: protect against negative quantizers.
  Don't use ff_cropTbl[] for IDCT.
  swscale: make filterPos 32bit.
  FATE: add CPUFLAGS variable, mapping to -cpuflags avconv option.
  avconv: add -cpuflags option for setting supported cpuflags.
  cpu: add av_set_cpu_flags_mask().
  libx264: Allow overriding the sliced threads option
  avconv: fix counting encoded video size.

Conflicts:
	doc/APIchanges
	doc/fate.texi
	doc/ffmpeg.texi
	ffmpeg.c
	libavcodec/h264idct_template.c
	libavcodec/svq3.c
	libavutil/avutil.h
	libavutil/cpu.c
	libavutil/cpu.h
	libswscale/swscale.c
	tests/Makefile
	tests/fate-run.sh
	tests/regression-funcs.sh
Merged-by: Michael Niedermayer <michaelni@gmx.at>
parents 57986c50 b5161908
...@@ -340,10 +340,7 @@ void parse_options(void *optctx, int argc, char **argv, const OptionDef *options ...@@ -340,10 +340,7 @@ void parse_options(void *optctx, int argc, char **argv, const OptionDef *options
} }
} }
/* int locate_option(int argc, char **argv, const OptionDef *options,
* Return index of option opt in argv or 0 if not found.
*/
static int locate_option(int argc, char **argv, const OptionDef *options,
const char *optname) const char *optname)
{ {
const OptionDef *po; const OptionDef *po;
...@@ -537,13 +534,54 @@ int opt_max_alloc(const char *opt, const char *arg) ...@@ -537,13 +534,54 @@ int opt_max_alloc(const char *opt, const char *arg)
int opt_cpuflags(const char *opt, const char *arg) int opt_cpuflags(const char *opt, const char *arg)
{ {
char *tail; #define CPUFLAG_MMX2 (AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMX2)
long flags = strtol(arg, &tail, 10); #define CPUFLAG_3DNOW (AV_CPU_FLAG_3DNOW | AV_CPU_FLAG_MMX)
#define CPUFLAG_3DNOWEXT (AV_CPU_FLAG_3DNOWEXT | CPUFLAG_3DNOW)
#define CPUFLAG_SSE (AV_CPU_FLAG_SSE | CPUFLAG_MMX2)
#define CPUFLAG_SSE2 (AV_CPU_FLAG_SSE2 | CPUFLAG_SSE)
#define CPUFLAG_SSE2SLOW (AV_CPU_FLAG_SSE2SLOW | CPUFLAG_SSE2)
#define CPUFLAG_SSE3 (AV_CPU_FLAG_SSE3 | CPUFLAG_SSE2)
#define CPUFLAG_SSE3SLOW (AV_CPU_FLAG_SSE3SLOW | CPUFLAG_SSE3)
#define CPUFLAG_SSSE3 (AV_CPU_FLAG_SSSE3 | CPUFLAG_SSE3)
#define CPUFLAG_SSE4 (AV_CPU_FLAG_SSE4 | CPUFLAG_SSSE3)
#define CPUFLAG_SSE42 (AV_CPU_FLAG_SSE42 | CPUFLAG_SSE4)
#define CPUFLAG_AVX (AV_CPU_FLAG_AVX | CPUFLAG_SSE42)
#define CPUFLAG_XOP (AV_CPU_FLAG_XOP | CPUFLAG_AVX)
#define CPUFLAG_FMA4 (AV_CPU_FLAG_FMA4 | CPUFLAG_AVX)
static const AVOption cpuflags_opts[] = {
{ "flags" , NULL, 0, AV_OPT_TYPE_FLAGS, { 0 }, INT64_MIN, INT64_MAX, .unit = "flags" },
{ "altivec" , NULL, 0, AV_OPT_TYPE_CONST, { AV_CPU_FLAG_ALTIVEC }, .unit = "flags" },
{ "mmx" , NULL, 0, AV_OPT_TYPE_CONST, { AV_CPU_FLAG_MMX }, .unit = "flags" },
{ "mmx2" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_MMX2 }, .unit = "flags" },
{ "sse" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE }, .unit = "flags" },
{ "sse2" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE2 }, .unit = "flags" },
{ "sse2slow", NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE2SLOW }, .unit = "flags" },
{ "sse3" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE3 }, .unit = "flags" },
{ "sse3slow", NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE3SLOW }, .unit = "flags" },
{ "ssse3" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSSE3 }, .unit = "flags" },
{ "atom" , NULL, 0, AV_OPT_TYPE_CONST, { AV_CPU_FLAG_ATOM }, .unit = "flags" },
{ "sse4.1" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE4 }, .unit = "flags" },
{ "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_SSE42 }, .unit = "flags" },
{ "avx" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_AVX }, .unit = "flags" },
{ "xop" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_XOP }, .unit = "flags" },
{ "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_FMA4 }, .unit = "flags" },
{ "3dnow" , NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_3DNOW }, .unit = "flags" },
{ "3dnowext", NULL, 0, AV_OPT_TYPE_CONST, { CPUFLAG_3DNOWEXT }, .unit = "flags" },
{ NULL },
};
static const AVClass class = {
.class_name = "cpuflags",
.item_name = av_default_item_name,
.option = cpuflags_opts,
.version = LIBAVUTIL_VERSION_INT,
};
int flags = av_get_cpu_flags();
int ret;
const AVClass *pclass = &class;
if ((ret = av_opt_eval_flags(&pclass, &cpuflags_opts[0], arg, &flags)) < 0)
return ret;
if (*tail) {
av_log(NULL, AV_LOG_FATAL, "Invalid cpuflags \"%s\".\n", arg);
exit_program(1);
}
av_force_cpu_flags(flags); av_force_cpu_flags(flags);
return 0; return 0;
} }
......
...@@ -206,6 +206,12 @@ int parse_option(void *optctx, const char *opt, const char *arg, ...@@ -206,6 +206,12 @@ int parse_option(void *optctx, const char *opt, const char *arg,
*/ */
void parse_loglevel(int argc, char **argv, const OptionDef *options); void parse_loglevel(int argc, char **argv, const OptionDef *options);
/**
* Return index of option opt in argv or 0 if not found.
*/
int locate_option(int argc, char **argv, const OptionDef *options,
const char *optname);
/** /**
* Check if the given stream matches a stream specifier. * Check if the given stream matches a stream specifier.
* *
......
...@@ -16,4 +16,4 @@ ...@@ -16,4 +16,4 @@
{ "debug", HAS_ARG, {(void*)opt_codec_debug}, "set debug flags", "flags" }, { "debug", HAS_ARG, {(void*)opt_codec_debug}, "set debug flags", "flags" },
{ "report", 0, {(void*)opt_report}, "generate a report" }, { "report", 0, {(void*)opt_report}, "generate a report" },
{ "max_alloc", HAS_ARG, {(void*)opt_max_alloc}, "set maximum size of a single allocated block", "bytes" }, { "max_alloc", HAS_ARG, {(void*)opt_max_alloc}, "set maximum size of a single allocated block", "bytes" },
{ "cpuflags", HAS_ARG, {(void*)opt_cpuflags}, "force specific cpu flags", "flags" }, { "cpuflags", HAS_ARG | OPT_EXPERT, {(void*)opt_cpuflags}, "force specific cpu flags", "flags" },
...@@ -134,6 +134,10 @@ It also implies @code{-loglevel verbose}. ...@@ -134,6 +134,10 @@ It also implies @code{-loglevel verbose}.
Note: setting the environment variable @code{FFREPORT} to any value has the Note: setting the environment variable @code{FFREPORT} to any value has the
same effect. same effect.
@item -cpuflags flags (@emph{global})
Allows setting and clearing CPU flags. This option is intended
for testing. Do not use it unless you know what you are doing.
@end table @end table
@section AVOptions @section AVOptions
......
...@@ -166,9 +166,11 @@ the synchronisation of the samples directory. ...@@ -166,9 +166,11 @@ the synchronisation of the samples directory.
@item THREADS @item THREADS
Specify how many threads to use while running regression tests, it is Specify how many threads to use while running regression tests, it is
quite useful to detect thread-related regressions. quite useful to detect thread-related regressions.
@item CPUFLAGS
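Specify the CPU flags to use while running the tests; the value is
passed on through the @code{-cpuflags} command-line option.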
@end table @end table
Example: Example:
@example @example
make V=1 SAMPLES=/var/fate/samples THREADS=2 fate make V=1 SAMPLES=/var/fate/samples THREADS=2 CPUFLAGS=mmx fate
@end example @end example
...@@ -4963,6 +4963,13 @@ static int opt_deinterlace(const char *opt, const char *arg) ...@@ -4963,6 +4963,13 @@ static int opt_deinterlace(const char *opt, const char *arg)
return 0; return 0;
} }
/**
 * Look for a "cpuflags" option on the command line and apply it right away.
 *
 * locate_option() yields the argv index of the option, or 0 when it is
 * absent. If the option is present and is followed by an argument, that
 * argument is handed to opt_cpuflags(). The return value of opt_cpuflags()
 * is not inspected here.
 *
 * NOTE(review): presumably this early pass exists so the requested CPU flags
 * are in effect before the regular option parsing runs -- confirm with caller.
 *
 * @param argc    number of command-line arguments
 * @param argv    command-line argument vector
 * @param options option table passed through to locate_option()
 */
static void parse_cpuflags(int argc, char **argv, const OptionDef *options)
{
    const int pos = locate_option(argc, argv, options, "cpuflags");
    const char *value;

    /* Option not given at all: nothing to do. */
    if (!pos)
        return;

    /* The flags string is the argument that follows the option name. */
    value = argv[pos + 1];
    if (value)
        opt_cpuflags("cpuflags", value);
}
#define OFFSET(x) offsetof(OptionsContext, x) #define OFFSET(x) offsetof(OptionsContext, x)
static const OptionDef options[] = { static const OptionDef options[] = {
/* main options */ /* main options */
...@@ -5136,6 +5143,8 @@ int main(int argc, char **argv) ...@@ -5136,6 +5143,8 @@ int main(int argc, char **argv)
term_init(); term_init();
parse_cpuflags(argc, argv, options);
/* parse options */ /* parse options */
parse_options(&o, argc, argv, options, opt_output_file); parse_options(&o, argc, argv, options, opt_output_file);
......
...@@ -367,18 +367,17 @@ void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, ...@@ -367,18 +367,17 @@ void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
int line_size) int line_size)
{ {
int i; int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */ /* read the pixels */
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
pixels[0] = cm[block[0]]; pixels[0] = av_clip_uint8(block[0]);
pixels[1] = cm[block[1]]; pixels[1] = av_clip_uint8(block[1]);
pixels[2] = cm[block[2]]; pixels[2] = av_clip_uint8(block[2]);
pixels[3] = cm[block[3]]; pixels[3] = av_clip_uint8(block[3]);
pixels[4] = cm[block[4]]; pixels[4] = av_clip_uint8(block[4]);
pixels[5] = cm[block[5]]; pixels[5] = av_clip_uint8(block[5]);
pixels[6] = cm[block[6]]; pixels[6] = av_clip_uint8(block[6]);
pixels[7] = cm[block[7]]; pixels[7] = av_clip_uint8(block[7]);
pixels += line_size; pixels += line_size;
block += 8; block += 8;
...@@ -389,14 +388,13 @@ static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels ...@@ -389,14 +388,13 @@ static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels
int line_size) int line_size)
{ {
int i; int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */ /* read the pixels */
for(i=0;i<4;i++) { for(i=0;i<4;i++) {
pixels[0] = cm[block[0]]; pixels[0] = av_clip_uint8(block[0]);
pixels[1] = cm[block[1]]; pixels[1] = av_clip_uint8(block[1]);
pixels[2] = cm[block[2]]; pixels[2] = av_clip_uint8(block[2]);
pixels[3] = cm[block[3]]; pixels[3] = av_clip_uint8(block[3]);
pixels += line_size; pixels += line_size;
block += 8; block += 8;
...@@ -407,12 +405,11 @@ static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels ...@@ -407,12 +405,11 @@ static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels
int line_size) int line_size)
{ {
int i; int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */ /* read the pixels */
for(i=0;i<2;i++) { for(i=0;i<2;i++) {
pixels[0] = cm[block[0]]; pixels[0] = av_clip_uint8(block[0]);
pixels[1] = cm[block[1]]; pixels[1] = av_clip_uint8(block[1]);
pixels += line_size; pixels += line_size;
block += 8; block += 8;
...@@ -444,18 +441,17 @@ void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, ...@@ -444,18 +441,17 @@ void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
int line_size) int line_size)
{ {
int i; int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */ /* read the pixels */
for(i=0;i<8;i++) { for(i=0;i<8;i++) {
pixels[0] = cm[pixels[0] + block[0]]; pixels[0] = av_clip_uint8(pixels[0] + block[0]);
pixels[1] = cm[pixels[1] + block[1]]; pixels[1] = av_clip_uint8(pixels[1] + block[1]);
pixels[2] = cm[pixels[2] + block[2]]; pixels[2] = av_clip_uint8(pixels[2] + block[2]);
pixels[3] = cm[pixels[3] + block[3]]; pixels[3] = av_clip_uint8(pixels[3] + block[3]);
pixels[4] = cm[pixels[4] + block[4]]; pixels[4] = av_clip_uint8(pixels[4] + block[4]);
pixels[5] = cm[pixels[5] + block[5]]; pixels[5] = av_clip_uint8(pixels[5] + block[5]);
pixels[6] = cm[pixels[6] + block[6]]; pixels[6] = av_clip_uint8(pixels[6] + block[6]);
pixels[7] = cm[pixels[7] + block[7]]; pixels[7] = av_clip_uint8(pixels[7] + block[7]);
pixels += line_size; pixels += line_size;
block += 8; block += 8;
} }
...@@ -465,14 +461,13 @@ static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels ...@@ -465,14 +461,13 @@ static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels
int line_size) int line_size)
{ {
int i; int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */ /* read the pixels */
for(i=0;i<4;i++) { for(i=0;i<4;i++) {
pixels[0] = cm[pixels[0] + block[0]]; pixels[0] = av_clip_uint8(pixels[0] + block[0]);
pixels[1] = cm[pixels[1] + block[1]]; pixels[1] = av_clip_uint8(pixels[1] + block[1]);
pixels[2] = cm[pixels[2] + block[2]]; pixels[2] = av_clip_uint8(pixels[2] + block[2]);
pixels[3] = cm[pixels[3] + block[3]]; pixels[3] = av_clip_uint8(pixels[3] + block[3]);
pixels += line_size; pixels += line_size;
block += 8; block += 8;
} }
...@@ -482,12 +477,11 @@ static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels ...@@ -482,12 +477,11 @@ static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels
int line_size) int line_size)
{ {
int i; int i;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */ /* read the pixels */
for(i=0;i<2;i++) { for(i=0;i<2;i++) {
pixels[0] = cm[pixels[0] + block[0]]; pixels[0] = av_clip_uint8(pixels[0] + block[0]);
pixels[1] = cm[pixels[1] + block[1]]; pixels[1] = av_clip_uint8(pixels[1] + block[1]);
pixels += line_size; pixels += line_size;
block += 8; block += 8;
} }
...@@ -2779,15 +2773,11 @@ static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block) ...@@ -2779,15 +2773,11 @@ static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block) static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{ {
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; dest[0] = av_clip_uint8((block[0] + 4)>>3);
dest[0] = cm[(block[0] + 4)>>3];
} }
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block) static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{ {
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
} }
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
......
...@@ -440,9 +440,14 @@ static void guess_mv(MpegEncContext *s) ...@@ -440,9 +440,14 @@ static void guess_mv(MpegEncContext *s)
if ((!(s->avctx->error_concealment&FF_EC_GUESS_MVS)) || if ((!(s->avctx->error_concealment&FF_EC_GUESS_MVS)) ||
num_avail <= mb_width / 2) { num_avail <= mb_width / 2) {
for (mb_y = 0; mb_y < s->mb_height; mb_y++) { for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
s->mb_x = 0;
s->mb_y = mb_y;
ff_init_block_index(s);
for (mb_x = 0; mb_x < s->mb_width; mb_x++) { for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
const int mb_xy = mb_x + mb_y * s->mb_stride; const int mb_xy = mb_x + mb_y * s->mb_stride;
ff_update_block_index(s);
if (IS_INTRA(s->current_picture.f.mb_type[mb_xy])) if (IS_INTRA(s->current_picture.f.mb_type[mb_xy]))
continue; continue;
if (!(s->error_status_table[mb_xy] & ER_MV_ERROR)) if (!(s->error_status_table[mb_xy] & ER_MV_ERROR))
...@@ -477,6 +482,9 @@ static void guess_mv(MpegEncContext *s) ...@@ -477,6 +482,9 @@ static void guess_mv(MpegEncContext *s)
changed = 0; changed = 0;
for (mb_y = 0; mb_y < s->mb_height; mb_y++) { for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
s->mb_x = 0;
s->mb_y = mb_y;
ff_init_block_index(s);
for (mb_x = 0; mb_x < s->mb_width; mb_x++) { for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
const int mb_xy = mb_x + mb_y * s->mb_stride; const int mb_xy = mb_x + mb_y * s->mb_stride;
int mv_predictor[8][2] = { { 0 } }; int mv_predictor[8][2] = { { 0 } };
...@@ -488,6 +496,8 @@ static void guess_mv(MpegEncContext *s) ...@@ -488,6 +496,8 @@ static void guess_mv(MpegEncContext *s)
const int mot_index = (mb_x + mb_y * mot_stride) * mot_step; const int mot_index = (mb_x + mb_y * mot_stride) * mot_step;
int prev_x, prev_y, prev_ref; int prev_x, prev_y, prev_ref;
ff_update_block_index(s);
if ((mb_x ^ mb_y ^ pass) & 1) if ((mb_x ^ mb_y ^ pass) & 1)
continue; continue;
...@@ -1098,11 +1108,16 @@ void ff_er_frame_end(MpegEncContext *s) ...@@ -1098,11 +1108,16 @@ void ff_er_frame_end(MpegEncContext *s)
/* handle inter blocks with damaged AC */ /* handle inter blocks with damaged AC */
for (mb_y = 0; mb_y < s->mb_height; mb_y++) { for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
s->mb_x = 0;
s->mb_y = mb_y;
ff_init_block_index(s);
for (mb_x = 0; mb_x < s->mb_width; mb_x++) { for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
const int mb_xy = mb_x + mb_y * s->mb_stride; const int mb_xy = mb_x + mb_y * s->mb_stride;
const int mb_type = s->current_picture.f.mb_type[mb_xy]; const int mb_type = s->current_picture.f.mb_type[mb_xy];
int dir = !s->last_picture.f.data[0]; int dir = !s->last_picture.f.data[0];
ff_update_block_index(s);
error = s->error_status_table[mb_xy]; error = s->error_status_table[mb_xy];
if (IS_INTRA(mb_type)) if (IS_INTRA(mb_type))
...@@ -1140,11 +1155,16 @@ void ff_er_frame_end(MpegEncContext *s) ...@@ -1140,11 +1155,16 @@ void ff_er_frame_end(MpegEncContext *s)
/* guess MVs */ /* guess MVs */
if (s->pict_type == AV_PICTURE_TYPE_B) { if (s->pict_type == AV_PICTURE_TYPE_B) {
for (mb_y = 0; mb_y < s->mb_height; mb_y++) { for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
s->mb_x = 0;
s->mb_y = mb_y;
ff_init_block_index(s);
for (mb_x = 0; mb_x < s->mb_width; mb_x++) { for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
int xy = mb_x * 2 + mb_y * 2 * s->b8_stride; int xy = mb_x * 2 + mb_y * 2 * s->b8_stride;
const int mb_xy = mb_x + mb_y * s->mb_stride; const int mb_xy = mb_x + mb_y * s->mb_stride;
const int mb_type = s->current_picture.f.mb_type[mb_xy]; const int mb_type = s->current_picture.f.mb_type[mb_xy];
ff_update_block_index(s);
error = s->error_status_table[mb_xy]; error = s->error_status_table[mb_xy];
if (IS_INTRA(mb_type)) if (IS_INTRA(mb_type))
......
...@@ -49,7 +49,6 @@ static const uint8_t scan8[16*3]={ ...@@ -49,7 +49,6 @@ static const uint8_t scan8[16*3]={
void FUNCC(ff_h264_idct_add)(uint8_t *_dst, DCTELEM *_block, int stride) void FUNCC(ff_h264_idct_add)(uint8_t *_dst, DCTELEM *_block, int stride)
{ {
int i; int i;
INIT_CLIP
pixel *dst = (pixel*)_dst; pixel *dst = (pixel*)_dst;
dctcoef *block = (dctcoef*)_block; dctcoef *block = (dctcoef*)_block;
stride >>= sizeof(pixel)-1; stride >>= sizeof(pixel)-1;
...@@ -74,16 +73,15 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, DCTELEM *_block, int stride) ...@@ -74,16 +73,15 @@ void FUNCC(ff_h264_idct_add)(uint8_t *_dst, DCTELEM *_block, int stride)
const int z2= (block[1 + 4*i]>>1) - block[3 + 4*i]; const int z2= (block[1 + 4*i]>>1) - block[3 + 4*i];
const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1); const int z3= block[1 + 4*i] + (block[3 + 4*i]>>1);
dst[i + 0*stride]= CLIP(dst[i + 0*stride] + ((z0 + z3) >> 6)); dst[i + 0*stride]= av_clip_pixel(dst[i + 0*stride] + ((z0 + z3) >> 6));
dst[i + 1*stride]= CLIP(dst[i + 1*stride] + ((z1 + z2) >> 6)); dst[i + 1*stride]= av_clip_pixel(dst[i + 1*stride] + ((z1 + z2) >> 6));
dst[i + 2*stride]= CLIP(dst[i + 2*stride] + ((z1 - z2) >> 6)); dst[i + 2*stride]= av_clip_pixel(dst[i + 2*stride] + ((z1 - z2) >> 6));
dst[i + 3*stride]= CLIP(dst[i + 3*stride] + ((z0 - z3) >> 6)); dst[i + 3*stride]= av_clip_pixel(dst[i + 3*stride] + ((z0 - z3) >> 6));
} }
} }
void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, DCTELEM *_block, int stride){ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, DCTELEM *_block, int stride){
int i; int i;
INIT_CLIP
pixel *dst = (pixel*)_dst; pixel *dst = (pixel*)_dst;
dctcoef *block = (dctcoef*)_block; dctcoef *block = (dctcoef*)_block;
stride >>= sizeof(pixel)-1; stride >>= sizeof(pixel)-1;
...@@ -143,14 +141,14 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, DCTELEM *_block, int stride){ ...@@ -143,14 +141,14 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, DCTELEM *_block, int stride){
const int b5 = (a3>>2) - a5; const int b5 = (a3>>2) - a5;
const int b7 = a7 - (a1>>2); const int b7 = a7 - (a1>>2);
dst[i + 0*stride] = CLIP( dst[i + 0*stride] + ((b0 + b7) >> 6) ); dst[i + 0*stride] = av_clip_pixel( dst[i + 0*stride] + ((b0 + b7) >> 6) );
dst[i + 1*stride] = CLIP( dst[i + 1*stride] + ((b2 + b5) >> 6) ); dst[i + 1*stride] = av_clip_pixel( dst[i + 1*stride] + ((b2 + b5) >> 6) );
dst[i + 2*stride] = CLIP( dst[i + 2*stride] + ((b4 + b3) >> 6) ); dst[i + 2*stride] = av_clip_pixel( dst[i + 2*stride] + ((b4 + b3) >> 6) );
dst[i + 3*stride] = CLIP( dst[i + 3*stride] + ((b6 + b1) >> 6) ); dst[i + 3*stride] = av_clip_pixel( dst[i + 3*stride] + ((b6 + b1) >> 6) );
dst[i + 4*stride] = CLIP( dst[i + 4*stride] + ((b6 - b1) >> 6) ); dst[i + 4*stride] = av_clip_pixel( dst[i + 4*stride] + ((b6 - b1) >> 6) );
dst[i + 5*stride] = CLIP( dst[i + 5*stride] + ((b4 - b3) >> 6) ); dst[i + 5*stride] = av_clip_pixel( dst[i + 5*stride] + ((b4 - b3) >> 6) );
dst[i + 6*stride] = CLIP( dst[i + 6*stride] + ((b2 - b5) >> 6) ); dst[i + 6*stride] = av_clip_pixel( dst[i + 6*stride] + ((b2 - b5) >> 6) );
dst[i + 7*stride] = CLIP( dst[i + 7*stride] + ((b0 - b7) >> 6) ); dst[i + 7*stride] = av_clip_pixel( dst[i + 7*stride] + ((b0 - b7) >> 6) );
} }
} }
...@@ -158,13 +156,12 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, DCTELEM *_block, int stride){ ...@@ -158,13 +156,12 @@ void FUNCC(ff_h264_idct8_add)(uint8_t *_dst, DCTELEM *_block, int stride){
void FUNCC(ff_h264_idct_dc_add)(uint8_t *p_dst, DCTELEM *block, int stride){ void FUNCC(ff_h264_idct_dc_add)(uint8_t *p_dst, DCTELEM *block, int stride){
int i, j; int i, j;
int dc = (((dctcoef*)block)[0] + 32) >> 6; int dc = (((dctcoef*)block)[0] + 32) >> 6;
INIT_CLIP
pixel *dst = (pixel*)p_dst; pixel *dst = (pixel*)p_dst;
stride >>= sizeof(pixel)-1; stride >>= sizeof(pixel)-1;
for( j = 0; j < 4; j++ ) for( j = 0; j < 4; j++ )
{ {
for( i = 0; i < 4; i++ ) for( i = 0; i < 4; i++ )
dst[i] = CLIP( dst[i] + dc ); dst[i] = av_clip_pixel( dst[i] + dc );
dst += stride; dst += stride;
} }
} }
...@@ -172,13 +169,12 @@ void FUNCC(ff_h264_idct_dc_add)(uint8_t *p_dst, DCTELEM *block, int stride){ ...@@ -172,13 +169,12 @@ void FUNCC(ff_h264_idct_dc_add)(uint8_t *p_dst, DCTELEM *block, int stride){
void FUNCC(ff_h264_idct8_dc_add)(uint8_t *p_dst, DCTELEM *block, int stride){ void FUNCC(ff_h264_idct8_dc_add)(uint8_t *p_dst, DCTELEM *block, int stride){
int i, j; int i, j;
int dc = (((dctcoef*)block)[0] + 32) >> 6; int dc = (((dctcoef*)block)[0] + 32) >> 6;
INIT_CLIP
pixel *dst = (pixel*)p_dst; pixel *dst = (pixel*)p_dst;
stride >>= sizeof(pixel)-1; stride >>= sizeof(pixel)-1;
for( j = 0; j < 8; j++ ) for( j = 0; j < 8; j++ )
{ {
for( i = 0; i < 8; i++ ) for( i = 0; i < 8; i++ )
dst[i] = CLIP( dst[i] + dc ); dst[i] = av_clip_pixel( dst[i] + dc );
dst += stride; dst += stride;
} }
} }
......
...@@ -454,6 +454,8 @@ static av_cold int X264_init(AVCodecContext *avctx) ...@@ -454,6 +454,8 @@ static av_cold int X264_init(AVCodecContext *avctx)
x4->params.analyse.b_psnr = avctx->flags & CODEC_FLAG_PSNR; x4->params.analyse.b_psnr = avctx->flags & CODEC_FLAG_PSNR;
x4->params.i_threads = avctx->thread_count; x4->params.i_threads = avctx->thread_count;
if (avctx->thread_type)
x4->params.b_sliced_threads = avctx->thread_type == FF_THREAD_SLICE;
x4->params.b_interlaced = avctx->flags & CODEC_FLAG_INTERLACED_DCT; x4->params.b_interlaced = avctx->flags & CODEC_FLAG_INTERLACED_DCT;
...@@ -631,6 +633,7 @@ static const AVCodecDefault x264_defaults[] = { ...@@ -631,6 +633,7 @@ static const AVCodecDefault x264_defaults[] = {
{ "coder", "-1" }, { "coder", "-1" },
{ "cmp", "-1" }, { "cmp", "-1" },
{ "threads", AV_STRINGIFY(X264_THREADS_AUTO) }, { "threads", AV_STRINGIFY(X264_THREADS_AUTO) },
{ "thread_type", "0" },
{ NULL }, { NULL },
}; };
......
...@@ -102,15 +102,13 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){ ...@@ -102,15 +102,13 @@ static void rv34_inv_transform_noround_c(DCTELEM *block){
static void rv34_idct_dc_add_c(uint8_t *dst, ptrdiff_t stride, int dc) static void rv34_idct_dc_add_c(uint8_t *dst, ptrdiff_t stride, int dc)
{ {
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
int i, j; int i, j;
cm += (13*13*dc + 0x200) >> 10; dc = (13*13*dc + 0x200) >> 10;
for (i = 0; i < 4; i++) for (i = 0; i < 4; i++)
{ {
for (j = 0; j < 4; j++) for (j = 0; j < 4; j++)
dst[j] = cm[ dst[j] ]; dst[j] = av_clip_uint8( dst[j] + dc );
dst += stride; dst += stride;
} }
......
...@@ -132,7 +132,6 @@ void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block) ...@@ -132,7 +132,6 @@ void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block)
static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col) static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col)
{ {
int c0, c1, c2, c3, a0, a1, a2, a3; int c0, c1, c2, c3, a0, a1, a2, a3;
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
a0 = col[8*0]; a0 = col[8*0];
a1 = col[8*1]; a1 = col[8*1];
...@@ -142,13 +141,13 @@ static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col ...@@ -142,13 +141,13 @@ static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col
c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1)); c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1));
c1 = a1 * C1 + a3 * C2; c1 = a1 * C1 + a3 * C2;
c3 = a1 * C2 - a3 * C1; c3 = a1 * C2 - a3 * C1;
dest[0] = cm[dest[0] + ((c0 + c1) >> C_SHIFT)]; dest[0] = av_clip_uint8(dest[0] + ((c0 + c1) >> C_SHIFT));
dest += line_size; dest += line_size;
dest[0] = cm[dest[0] + ((c2 + c3) >> C_SHIFT)]; dest[0] = av_clip_uint8(dest[0] + ((c2 + c3) >> C_SHIFT));
dest += line_size; dest += line_size;
dest[0] = cm[dest[0] + ((c2 - c3) >> C_SHIFT)]; dest[0] = av_clip_uint8(dest[0] + ((c2 - c3) >> C_SHIFT));
dest += line_size; dest += line_size;
dest[0] = cm[dest[0] + ((c0 - c1) >> C_SHIFT)]; dest[0] = av_clip_uint8(dest[0] + ((c0 - c1) >> C_SHIFT));
} }
#define RN_SHIFT 15 #define RN_SHIFT 15
...@@ -160,7 +159,6 @@ static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col ...@@ -160,7 +159,6 @@ static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col
static inline void idct4row(DCTELEM *row) static inline void idct4row(DCTELEM *row)
{ {
int c0, c1, c2, c3, a0, a1, a2, a3; int c0, c1, c2, c3, a0, a1, a2, a3;
//const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
a0 = row[0]; a0 = row[0];
a1 = row[1]; a1 = row[1];
......
...@@ -224,50 +224,48 @@ static inline void FUNC(idctSparseColPut)(pixel *dest, int line_size, ...@@ -224,50 +224,48 @@ static inline void FUNC(idctSparseColPut)(pixel *dest, int line_size,
DCTELEM *col) DCTELEM *col)
{ {
int a0, a1, a2, a3, b0, b1, b2, b3; int a0, a1, a2, a3, b0, b1, b2, b3;
INIT_CLIP;
IDCT_COLS; IDCT_COLS;
dest[0] = CLIP((a0 + b0) >> COL_SHIFT); dest[0] = av_clip_pixel((a0 + b0) >> COL_SHIFT);
dest += line_size; dest += line_size;
dest[0] = CLIP((a1 + b1) >> COL_SHIFT); dest[0] = av_clip_pixel((a1 + b1) >> COL_SHIFT);
dest += line_size; dest += line_size;
dest[0] = CLIP((a2 + b2) >> COL_SHIFT); dest[0] = av_clip_pixel((a2 + b2) >> COL_SHIFT);
dest += line_size; dest += line_size;
dest[0] = CLIP((a3 + b3) >> COL_SHIFT); dest[0] = av_clip_pixel((a3 + b3) >> COL_SHIFT);
dest += line_size; dest += line_size;
dest[0] = CLIP((a3 - b3) >> COL_SHIFT); dest[0] = av_clip_pixel((a3 - b3) >> COL_SHIFT);
dest += line_size; dest += line_size;
dest[0] = CLIP((a2 - b2) >> COL_SHIFT); dest[0] = av_clip_pixel((a2 - b2) >> COL_SHIFT);
dest += line_size; dest += line_size;
dest[0] = CLIP((a1 - b1) >> COL_SHIFT); dest[0] = av_clip_pixel((a1 - b1) >> COL_SHIFT);
dest += line_size; dest += line_size;
dest[0] = CLIP((a0 - b0) >> COL_SHIFT); dest[0] = av_clip_pixel((a0 - b0) >> COL_SHIFT);
} }
static inline void FUNC(idctSparseColAdd)(pixel *dest, int line_size, static inline void FUNC(idctSparseColAdd)(pixel *dest, int line_size,
DCTELEM *col) DCTELEM *col)
{ {
int a0, a1, a2, a3, b0, b1, b2, b3; int a0, a1, a2, a3, b0, b1, b2, b3;
INIT_CLIP;
IDCT_COLS; IDCT_COLS;
dest[0] = CLIP(dest[0] + ((a0 + b0) >> COL_SHIFT)); dest[0] = av_clip_pixel(dest[0] + ((a0 + b0) >> COL_SHIFT));
dest += line_size; dest += line_size;
dest[0] = CLIP(dest[0] + ((a1 + b1) >> COL_SHIFT)); dest[0] = av_clip_pixel(dest[0] + ((a1 + b1) >> COL_SHIFT));
dest += line_size; dest += line_size;
dest[0] = CLIP(dest[0] + ((a2 + b2) >> COL_SHIFT)); dest[0] = av_clip_pixel(dest[0] + ((a2 + b2) >> COL_SHIFT));
dest += line_size; dest += line_size;
dest[0] = CLIP(dest[0] + ((a3 + b3) >> COL_SHIFT)); dest[0] = av_clip_pixel(dest[0] + ((a3 + b3) >> COL_SHIFT));
dest += line_size; dest += line_size;
dest[0] = CLIP(dest[0] + ((a3 - b3) >> COL_SHIFT)); dest[0] = av_clip_pixel(dest[0] + ((a3 - b3) >> COL_SHIFT));
dest += line_size; dest += line_size;
dest[0] = CLIP(dest[0] + ((a2 - b2) >> COL_SHIFT)); dest[0] = av_clip_pixel(dest[0] + ((a2 - b2) >> COL_SHIFT));
dest += line_size; dest += line_size;
dest[0] = CLIP(dest[0] + ((a1 - b1) >> COL_SHIFT)); dest[0] = av_clip_pixel(dest[0] + ((a1 - b1) >> COL_SHIFT));
dest += line_size; dest += line_size;
dest[0] = CLIP(dest[0] + ((a0 - b0) >> COL_SHIFT)); dest[0] = av_clip_pixel(dest[0] + ((a0 - b0) >> COL_SHIFT));
} }
static inline void FUNC(idctSparseCol)(DCTELEM *col) static inline void FUNC(idctSparseCol)(DCTELEM *col)
......
...@@ -139,8 +139,6 @@ static void vc1_h_s_overlap_c(DCTELEM *left, DCTELEM *right) ...@@ -139,8 +139,6 @@ static void vc1_h_s_overlap_c(DCTELEM *left, DCTELEM *right)
* @see 8.6 * @see 8.6
*/ */
static av_always_inline int vc1_filter_line(uint8_t* src, int stride, int pq){ static av_always_inline int vc1_filter_line(uint8_t* src, int stride, int pq){
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
int a0 = (2*(src[-2*stride] - src[ 1*stride]) - 5*(src[-1*stride] - src[ 0*stride]) + 4) >> 3; int a0 = (2*(src[-2*stride] - src[ 1*stride]) - 5*(src[-1*stride] - src[ 0*stride]) + 4) >> 3;
int a0_sign = a0 >> 31; /* Store sign */ int a0_sign = a0 >> 31; /* Store sign */
a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */ a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
...@@ -163,8 +161,8 @@ static av_always_inline int vc1_filter_line(uint8_t* src, int stride, int pq){ ...@@ -163,8 +161,8 @@ static av_always_inline int vc1_filter_line(uint8_t* src, int stride, int pq){
else{ else{
d = FFMIN(d, clip); d = FFMIN(d, clip);
d = (d ^ d_sign) - d_sign; /* Restore sign */ d = (d ^ d_sign) - d_sign; /* Restore sign */
src[-1*stride] = cm[src[-1*stride] - d]; src[-1*stride] = av_clip_uint8(src[-1*stride] - d);
src[ 0*stride] = cm[src[ 0*stride] + d]; src[ 0*stride] = av_clip_uint8(src[ 0*stride] + d);
} }
return 1; return 1;
} }
...@@ -234,19 +232,17 @@ static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block) ...@@ -234,19 +232,17 @@ static void vc1_inv_trans_8x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
{ {
int i; int i;
int dc = block[0]; int dc = block[0];
const uint8_t *cm;
dc = (3 * dc + 1) >> 1; dc = (3 * dc + 1) >> 1;
dc = (3 * dc + 16) >> 5; dc = (3 * dc + 16) >> 5;
cm = ff_cropTbl + MAX_NEG_CROP + dc;
for(i = 0; i < 8; i++){ for(i = 0; i < 8; i++){
dest[0] = cm[dest[0]]; dest[0] = av_clip_uint8(dest[0] + dc);
dest[1] = cm[dest[1]]; dest[1] = av_clip_uint8(dest[1] + dc);
dest[2] = cm[dest[2]]; dest[2] = av_clip_uint8(dest[2] + dc);
dest[3] = cm[dest[3]]; dest[3] = av_clip_uint8(dest[3] + dc);
dest[4] = cm[dest[4]]; dest[4] = av_clip_uint8(dest[4] + dc);
dest[5] = cm[dest[5]]; dest[5] = av_clip_uint8(dest[5] + dc);
dest[6] = cm[dest[6]]; dest[6] = av_clip_uint8(dest[6] + dc);
dest[7] = cm[dest[7]]; dest[7] = av_clip_uint8(dest[7] + dc);
dest += linesize; dest += linesize;
} }
} }
...@@ -326,19 +322,17 @@ static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block) ...@@ -326,19 +322,17 @@ static void vc1_inv_trans_8x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
{ {
int i; int i;
int dc = block[0]; int dc = block[0];
const uint8_t *cm;
dc = ( 3 * dc + 1) >> 1; dc = ( 3 * dc + 1) >> 1;
dc = (17 * dc + 64) >> 7; dc = (17 * dc + 64) >> 7;
cm = ff_cropTbl + MAX_NEG_CROP + dc;
for(i = 0; i < 4; i++){ for(i = 0; i < 4; i++){
dest[0] = cm[dest[0]]; dest[0] = av_clip_uint8(dest[0] + dc);
dest[1] = cm[dest[1]]; dest[1] = av_clip_uint8(dest[1] + dc);
dest[2] = cm[dest[2]]; dest[2] = av_clip_uint8(dest[2] + dc);
dest[3] = cm[dest[3]]; dest[3] = av_clip_uint8(dest[3] + dc);
dest[4] = cm[dest[4]]; dest[4] = av_clip_uint8(dest[4] + dc);
dest[5] = cm[dest[5]]; dest[5] = av_clip_uint8(dest[5] + dc);
dest[6] = cm[dest[6]]; dest[6] = av_clip_uint8(dest[6] + dc);
dest[7] = cm[dest[7]]; dest[7] = av_clip_uint8(dest[7] + dc);
dest += linesize; dest += linesize;
} }
} }
...@@ -403,15 +397,13 @@ static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block) ...@@ -403,15 +397,13 @@ static void vc1_inv_trans_4x8_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
{ {
int i; int i;
int dc = block[0]; int dc = block[0];
const uint8_t *cm;
dc = (17 * dc + 4) >> 3; dc = (17 * dc + 4) >> 3;
dc = (12 * dc + 64) >> 7; dc = (12 * dc + 64) >> 7;
cm = ff_cropTbl + MAX_NEG_CROP + dc;
for(i = 0; i < 8; i++){ for(i = 0; i < 8; i++){
dest[0] = cm[dest[0]]; dest[0] = av_clip_uint8(dest[0] + dc);
dest[1] = cm[dest[1]]; dest[1] = av_clip_uint8(dest[1] + dc);
dest[2] = cm[dest[2]]; dest[2] = av_clip_uint8(dest[2] + dc);
dest[3] = cm[dest[3]]; dest[3] = av_clip_uint8(dest[3] + dc);
dest += linesize; dest += linesize;
} }
} }
...@@ -476,15 +468,13 @@ static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block) ...@@ -476,15 +468,13 @@ static void vc1_inv_trans_4x4_dc_c(uint8_t *dest, int linesize, DCTELEM *block)
{ {
int i; int i;
int dc = block[0]; int dc = block[0];
const uint8_t *cm;
dc = (17 * dc + 4) >> 3; dc = (17 * dc + 4) >> 3;
dc = (17 * dc + 64) >> 7; dc = (17 * dc + 64) >> 7;
cm = ff_cropTbl + MAX_NEG_CROP + dc;
for(i = 0; i < 4; i++){ for(i = 0; i < 4; i++){
dest[0] = cm[dest[0]]; dest[0] = av_clip_uint8(dest[0] + dc);
dest[1] = cm[dest[1]]; dest[1] = av_clip_uint8(dest[1] + dc);
dest[2] = cm[dest[2]]; dest[2] = av_clip_uint8(dest[2] + dc);
dest[3] = cm[dest[3]]; dest[3] = av_clip_uint8(dest[3] + dc);
dest += linesize; dest += linesize;
} }
} }
......
...@@ -41,7 +41,6 @@ ...@@ -41,7 +41,6 @@
static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int type) static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int type)
{ {
int16_t *ip = input; int16_t *ip = input;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
int A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H; int A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
int Ed, Gd, Add, Bdd, Fd, Hd; int Ed, Gd, Add, Bdd, Fd, Hd;
...@@ -147,29 +146,29 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int ...@@ -147,29 +146,29 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
ip[5*8] = (Fd + Bdd ) >> 4; ip[5*8] = (Fd + Bdd ) >> 4;
ip[6*8] = (Fd - Bdd ) >> 4; ip[6*8] = (Fd - Bdd ) >> 4;
}else if(type==1){ }else if(type==1){
dst[0*stride] = cm[(Gd + Cd ) >> 4]; dst[0*stride] = av_clip_uint8((Gd + Cd ) >> 4);
dst[7*stride] = cm[(Gd - Cd ) >> 4]; dst[7*stride] = av_clip_uint8((Gd - Cd ) >> 4);
dst[1*stride] = cm[(Add + Hd ) >> 4]; dst[1*stride] = av_clip_uint8((Add + Hd ) >> 4);
dst[2*stride] = cm[(Add - Hd ) >> 4]; dst[2*stride] = av_clip_uint8((Add - Hd ) >> 4);
dst[3*stride] = cm[(Ed + Dd ) >> 4]; dst[3*stride] = av_clip_uint8((Ed + Dd ) >> 4);
dst[4*stride] = cm[(Ed - Dd ) >> 4]; dst[4*stride] = av_clip_uint8((Ed - Dd ) >> 4);
dst[5*stride] = cm[(Fd + Bdd ) >> 4]; dst[5*stride] = av_clip_uint8((Fd + Bdd ) >> 4);
dst[6*stride] = cm[(Fd - Bdd ) >> 4]; dst[6*stride] = av_clip_uint8((Fd - Bdd ) >> 4);
}else{ }else{
dst[0*stride] = cm[dst[0*stride] + ((Gd + Cd ) >> 4)]; dst[0*stride] = av_clip_uint8(dst[0*stride] + ((Gd + Cd ) >> 4));
dst[7*stride] = cm[dst[7*stride] + ((Gd - Cd ) >> 4)]; dst[7*stride] = av_clip_uint8(dst[7*stride] + ((Gd - Cd ) >> 4));
dst[1*stride] = cm[dst[1*stride] + ((Add + Hd ) >> 4)]; dst[1*stride] = av_clip_uint8(dst[1*stride] + ((Add + Hd ) >> 4));
dst[2*stride] = cm[dst[2*stride] + ((Add - Hd ) >> 4)]; dst[2*stride] = av_clip_uint8(dst[2*stride] + ((Add - Hd ) >> 4));
dst[3*stride] = cm[dst[3*stride] + ((Ed + Dd ) >> 4)]; dst[3*stride] = av_clip_uint8(dst[3*stride] + ((Ed + Dd ) >> 4));
dst[4*stride] = cm[dst[4*stride] + ((Ed - Dd ) >> 4)]; dst[4*stride] = av_clip_uint8(dst[4*stride] + ((Ed - Dd ) >> 4));
dst[5*stride] = cm[dst[5*stride] + ((Fd + Bdd ) >> 4)]; dst[5*stride] = av_clip_uint8(dst[5*stride] + ((Fd + Bdd ) >> 4));
dst[6*stride] = cm[dst[6*stride] + ((Fd - Bdd ) >> 4)]; dst[6*stride] = av_clip_uint8(dst[6*stride] + ((Fd - Bdd ) >> 4));
} }
} else { } else {
...@@ -190,18 +189,18 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int ...@@ -190,18 +189,18 @@ static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
dst[4*stride]= dst[4*stride]=
dst[5*stride]= dst[5*stride]=
dst[6*stride]= dst[6*stride]=
dst[7*stride]= cm[128 + ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20)]; dst[7*stride]= av_clip_uint8(128 + ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20));
}else{ }else{
if(ip[0*8]){ if(ip[0*8]){
int v= ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20); int v= ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20);
dst[0*stride] = cm[dst[0*stride] + v]; dst[0*stride] = av_clip_uint8(dst[0*stride] + v);
dst[1*stride] = cm[dst[1*stride] + v]; dst[1*stride] = av_clip_uint8(dst[1*stride] + v);
dst[2*stride] = cm[dst[2*stride] + v]; dst[2*stride] = av_clip_uint8(dst[2*stride] + v);
dst[3*stride] = cm[dst[3*stride] + v]; dst[3*stride] = av_clip_uint8(dst[3*stride] + v);
dst[4*stride] = cm[dst[4*stride] + v]; dst[4*stride] = av_clip_uint8(dst[4*stride] + v);
dst[5*stride] = cm[dst[5*stride] + v]; dst[5*stride] = av_clip_uint8(dst[5*stride] + v);
dst[6*stride] = cm[dst[6*stride] + v]; dst[6*stride] = av_clip_uint8(dst[6*stride] + v);
dst[7*stride] = cm[dst[7*stride] + v]; dst[7*stride] = av_clip_uint8(dst[7*stride] + v);
} }
} }
} }
...@@ -225,17 +224,16 @@ void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/* ...@@ -225,17 +224,16 @@ void ff_vp3_idct_add_c(uint8_t *dest/*align 8*/, int line_size, DCTELEM *block/*
void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/){ void ff_vp3_idct_dc_add_c(uint8_t *dest/*align 8*/, int line_size, const DCTELEM *block/*align 16*/){
int i, dc = (block[0] + 15) >> 5; int i, dc = (block[0] + 15) >> 5;
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
for(i = 0; i < 8; i++){ for(i = 0; i < 8; i++){
dest[0] = cm[dest[0]]; dest[0] = av_clip_uint8(dest[0] + dc);
dest[1] = cm[dest[1]]; dest[1] = av_clip_uint8(dest[1] + dc);
dest[2] = cm[dest[2]]; dest[2] = av_clip_uint8(dest[2] + dc);
dest[3] = cm[dest[3]]; dest[3] = av_clip_uint8(dest[3] + dc);
dest[4] = cm[dest[4]]; dest[4] = av_clip_uint8(dest[4] + dc);
dest[5] = cm[dest[5]]; dest[5] = av_clip_uint8(dest[5] + dc);
dest[6] = cm[dest[6]]; dest[6] = av_clip_uint8(dest[6] + dc);
dest[7] = cm[dest[7]]; dest[7] = av_clip_uint8(dest[7] + dc);
dest += line_size; dest += line_size;
} }
} }
......
...@@ -80,7 +80,6 @@ static void vp8_luma_dc_wht_dc_c(DCTELEM block[4][4][16], DCTELEM dc[16]) ...@@ -80,7 +80,6 @@ static void vp8_luma_dc_wht_dc_c(DCTELEM block[4][4][16], DCTELEM dc[16])
static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride) static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride)
{ {
int i, t0, t1, t2, t3; int i, t0, t1, t2, t3;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
DCTELEM tmp[16]; DCTELEM tmp[16];
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
...@@ -105,10 +104,10 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride) ...@@ -105,10 +104,10 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride)
t2 = MUL_35468(tmp[1*4+i]) - MUL_20091(tmp[3*4+i]); t2 = MUL_35468(tmp[1*4+i]) - MUL_20091(tmp[3*4+i]);
t3 = MUL_20091(tmp[1*4+i]) + MUL_35468(tmp[3*4+i]); t3 = MUL_20091(tmp[1*4+i]) + MUL_35468(tmp[3*4+i]);
dst[0] = cm[dst[0] + ((t0 + t3 + 4) >> 3)]; dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
dst[1] = cm[dst[1] + ((t1 + t2 + 4) >> 3)]; dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
dst[2] = cm[dst[2] + ((t1 - t2 + 4) >> 3)]; dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
dst[3] = cm[dst[3] + ((t0 - t3 + 4) >> 3)]; dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
dst += stride; dst += stride;
} }
} }
...@@ -116,14 +115,13 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride) ...@@ -116,14 +115,13 @@ static void vp8_idct_add_c(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride)
static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride) static void vp8_idct_dc_add_c(uint8_t *dst, DCTELEM block[16], ptrdiff_t stride)
{ {
int i, dc = (block[0] + 4) >> 3; int i, dc = (block[0] + 4) >> 3;
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP + dc;
block[0] = 0; block[0] = 0;
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++) {
dst[0] = cm[dst[0]]; dst[0] = av_clip_uint8(dst[0] + dc);
dst[1] = cm[dst[1]]; dst[1] = av_clip_uint8(dst[1] + dc);
dst[2] = cm[dst[2]]; dst[2] = av_clip_uint8(dst[2] + dc);
dst[3] = cm[dst[3]]; dst[3] = av_clip_uint8(dst[3] + dc);
dst += stride; dst += stride;
} }
} }
......
...@@ -104,7 +104,7 @@ cglobal sbr_hf_g_filt, 5, 6, 5 ...@@ -104,7 +104,7 @@ cglobal sbr_hf_g_filt, 5, 6, 5
movq m2, [r1] movq m2, [r1]
punpckldq m0, m0 punpckldq m0, m0
mulps m2, m0 mulps m2, m0
movq [r0], m2 movlps [r0], m2
add r0, 8 add r0, 8
add r2, 4 add r2, 4
add r1, STEP add r1, STEP
......
...@@ -153,7 +153,7 @@ ...@@ -153,7 +153,7 @@
*/ */
#define LIBAVUTIL_VERSION_MAJOR 51 #define LIBAVUTIL_VERSION_MAJOR 51
#define LIBAVUTIL_VERSION_MINOR 41 #define LIBAVUTIL_VERSION_MINOR 42
#define LIBAVUTIL_VERSION_MICRO 100 #define LIBAVUTIL_VERSION_MICRO 100
#define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \ #define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
......
...@@ -23,7 +23,7 @@ static int flags, checked; ...@@ -23,7 +23,7 @@ static int flags, checked;
void av_force_cpu_flags(int arg){ void av_force_cpu_flags(int arg){
flags = arg; flags = arg;
checked = 1; checked = arg != -1;
} }
int av_get_cpu_flags(void) int av_get_cpu_flags(void)
...@@ -39,6 +39,13 @@ int av_get_cpu_flags(void) ...@@ -39,6 +39,13 @@ int av_get_cpu_flags(void)
return flags; return flags;
} }
void av_set_cpu_flags_mask(int mask)
{
checked = 0;
flags = av_get_cpu_flags() & mask;
checked = 1;
}
#ifdef TEST #ifdef TEST
#undef printf #undef printf
......
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
#ifndef AVUTIL_CPU_H #ifndef AVUTIL_CPU_H
#define AVUTIL_CPU_H #define AVUTIL_CPU_H
#include "attributes.h"
#define AV_CPU_FLAG_FORCE 0x80000000 /* force usage of selected flags (OR) */ #define AV_CPU_FLAG_FORCE 0x80000000 /* force usage of selected flags (OR) */
/* lower 16 bits - CPU features */ /* lower 16 bits - CPU features */
...@@ -49,12 +51,19 @@ ...@@ -49,12 +51,19 @@
*/ */
int av_get_cpu_flags(void); int av_get_cpu_flags(void);
/** /**
* Disables cpu detection and forces the specified flags. * Disables cpu detection and forces the specified flags.
*/ */
void av_force_cpu_flags(int flags); void av_force_cpu_flags(int flags);
/**
* Set a mask on flags returned by av_get_cpu_flags().
* This function is mainly useful for testing.
* Please use av_force_cpu_flags() and av_get_cpu_flags() instead which are more flexible
*
* @warning this function is not thread safe.
*/
attribute_deprecated void av_set_cpu_flags_mask(int mask);
/* The following CPU-specific functions shall not be called directly. */ /* The following CPU-specific functions shall not be called directly. */
int ff_get_cpu_flags_arm(void); int ff_get_cpu_flags_arm(void);
......
...@@ -144,7 +144,7 @@ static void yuv2planeX_altivec(const int16_t *filter, int filterSize, ...@@ -144,7 +144,7 @@ static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW, static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter, const uint8_t *src, const int16_t *filter,
const int16_t *filterPos, int filterSize) const int32_t *filterPos, int filterSize)
{ {
register int i; register int i;
DECLARE_ALIGNED(16, int, tempo)[4]; DECLARE_ALIGNED(16, int, tempo)[4];
......
...@@ -63,7 +63,7 @@ static av_always_inline void fillPlane(uint8_t* plane, int stride, ...@@ -63,7 +63,7 @@ static av_always_inline void fillPlane(uint8_t* plane, int stride,
static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src, static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *_src,
const int16_t *filter, const int16_t *filter,
const int16_t *filterPos, int filterSize) const int32_t *filterPos, int filterSize)
{ {
int i; int i;
int32_t *dst = (int32_t *) _dst; int32_t *dst = (int32_t *) _dst;
...@@ -89,7 +89,7 @@ static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t ...@@ -89,7 +89,7 @@ static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t
static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src, static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *_src,
const int16_t *filter, const int16_t *filter,
const int16_t *filterPos, int filterSize) const int32_t *filterPos, int filterSize)
{ {
int i; int i;
const uint16_t *src = (const uint16_t *) _src; const uint16_t *src = (const uint16_t *) _src;
...@@ -113,7 +113,7 @@ static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t ...@@ -113,7 +113,7 @@ static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t
// bilinear / bicubic scaling // bilinear / bicubic scaling
static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src, static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *src,
const int16_t *filter, const int16_t *filterPos, const int16_t *filter, const int32_t *filterPos,
int filterSize) int filterSize)
{ {
int i; int i;
...@@ -131,7 +131,7 @@ static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t * ...@@ -131,7 +131,7 @@ static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW, const uint8_t *
} }
static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src, static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW, const uint8_t *src,
const int16_t *filter, const int16_t *filterPos, const int16_t *filter, const int32_t *filterPos,
int filterSize) int filterSize)
{ {
int i; int i;
...@@ -234,7 +234,7 @@ static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth, ...@@ -234,7 +234,7 @@ static void hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth, static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
const uint8_t *src_in[4], int srcW, int xInc, const uint8_t *src_in[4], int srcW, int xInc,
const int16_t *hLumFilter, const int16_t *hLumFilter,
const int16_t *hLumFilterPos, int hLumFilterSize, const int32_t *hLumFilterPos, int hLumFilterSize,
uint8_t *formatConvBuffer, uint8_t *formatConvBuffer,
uint32_t *pal, int isAlpha) uint32_t *pal, int isAlpha)
{ {
...@@ -282,7 +282,7 @@ static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2, ...@@ -282,7 +282,7 @@ static void hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth, static av_always_inline void hcscale(SwsContext *c, int16_t *dst1, int16_t *dst2, int dstWidth,
const uint8_t *src_in[4], const uint8_t *src_in[4],
int srcW, int xInc, const int16_t *hChrFilter, int srcW, int xInc, const int16_t *hChrFilter,
const int16_t *hChrFilterPos, int hChrFilterSize, const int32_t *hChrFilterPos, int hChrFilterSize,
uint8_t *formatConvBuffer, uint32_t *pal) uint8_t *formatConvBuffer, uint32_t *pal)
{ {
const uint8_t *src1 = src_in[1], *src2 = src_in[2]; const uint8_t *src1 = src_in[1], *src2 = src_in[2];
...@@ -326,10 +326,10 @@ static int swScale(SwsContext *c, const uint8_t* src[], ...@@ -326,10 +326,10 @@ static int swScale(SwsContext *c, const uint8_t* src[],
const int chrXInc= c->chrXInc; const int chrXInc= c->chrXInc;
const enum PixelFormat dstFormat= c->dstFormat; const enum PixelFormat dstFormat= c->dstFormat;
const int flags= c->flags; const int flags= c->flags;
int16_t *vLumFilterPos= c->vLumFilterPos; int32_t *vLumFilterPos= c->vLumFilterPos;
int16_t *vChrFilterPos= c->vChrFilterPos; int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *hLumFilterPos= c->hLumFilterPos; int32_t *hLumFilterPos= c->hLumFilterPos;
int16_t *hChrFilterPos= c->hChrFilterPos; int32_t *hChrFilterPos= c->hChrFilterPos;
int16_t *hLumFilter= c->hLumFilter; int16_t *hLumFilter= c->hLumFilter;
int16_t *hChrFilter= c->hChrFilter; int16_t *hChrFilter= c->hChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter; int32_t *lumMmxFilter= c->lumMmxFilter;
......
...@@ -299,10 +299,10 @@ typedef struct SwsContext { ...@@ -299,10 +299,10 @@ typedef struct SwsContext {
int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes. int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes.
int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes. int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes.
int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes. int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes.
int16_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes. int32_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes.
int16_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes. int32_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.
int16_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes. int32_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.
int16_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes. int32_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.
int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels. int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
int hChrFilterSize; ///< Horizontal filter size for chroma pixels. int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels. int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
...@@ -515,10 +515,10 @@ typedef struct SwsContext { ...@@ -515,10 +515,10 @@ typedef struct SwsContext {
/** @{ */ /** @{ */
void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW, void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter, const uint8_t *src, const int16_t *filter,
const int16_t *filterPos, int filterSize); const int32_t *filterPos, int filterSize);
void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW, void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter, const uint8_t *src, const int16_t *filter,
const int16_t *filterPos, int filterSize); const int32_t *filterPos, int filterSize);
/** @} */ /** @} */
/// Color range conversion function for luma plane if needed. /// Color range conversion function for luma plane if needed.
......
...@@ -191,7 +191,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist ...@@ -191,7 +191,7 @@ static double getSplineCoeff(double a, double b, double c, double d, double dist
dist-1.0); dist-1.0);
} }
static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSize, int xInc, static int initFilter(int16_t **outFilter, int32_t **filterPos, int *outFilterSize, int xInc,
int srcW, int dstW, int filterAlign, int one, int flags, int cpu_flags, int srcW, int dstW, int filterAlign, int one, int flags, int cpu_flags,
SwsVector *srcFilter, SwsVector *dstFilter, double param[2]) SwsVector *srcFilter, SwsVector *dstFilter, double param[2])
{ {
...@@ -207,7 +207,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi ...@@ -207,7 +207,7 @@ static int initFilter(int16_t **outFilter, int16_t **filterPos, int *outFilterSi
emms_c(); //FIXME this should not be required but it IS (even for non-MMX versions) emms_c(); //FIXME this should not be required but it IS (even for non-MMX versions)
// NOTE: the +3 is for the MMX(+1)/SSE(+3) scaler which reads over the end // NOTE: the +3 is for the MMX(+1)/SSE(+3) scaler which reads over the end
FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW+3)*sizeof(int16_t), fail); FF_ALLOC_OR_GOTO(NULL, *filterPos, (dstW+3)*sizeof(**filterPos), fail);
if (FFABS(xInc - 0x10000) <10) { // unscaled if (FFABS(xInc - 0x10000) <10) { // unscaled
int i; int i;
......
...@@ -38,7 +38,7 @@ SECTION .text ...@@ -38,7 +38,7 @@ SECTION .text
; (SwsContext *c, int{16,32}_t *dst, ; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src, ; int dstW, const uint{8,16}_t *src,
; const int16_t *filter, ; const int16_t *filter,
; const int16_t *filterPos, int filterSize); ; const int32_t *filterPos, int filterSize);
; ;
; Scale one horizontal line. Input is either 8-bits width or 16-bits width ; Scale one horizontal line. Input is either 8-bits width or 16-bits width
; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to ; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
...@@ -53,6 +53,9 @@ SECTION .text ...@@ -53,6 +53,9 @@ SECTION .text
cglobal hscale%1to%2_%4_%5, %6, 7, %7 cglobal hscale%1to%2_%4_%5, %6, 7, %7
%if ARCH_X86_64 %if ARCH_X86_64
movsxd r2, r2d movsxd r2, r2d
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64 %endif ; x86-64
%if %2 == 19 %if %2 == 19
%if mmsize == 8 ; mmx %if mmsize == 8 ; mmx
...@@ -95,14 +98,14 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -95,14 +98,14 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%else ; %2 == 19 %else ; %2 == 19
lea r1, [r1+r2*(4>>r2shr)] lea r1, [r1+r2*(4>>r2shr)]
%endif ; %2 == 15/19 %endif ; %2 == 15/19
lea r5, [r5+r2*(2>>r2shr)] lea r5, [r5+r2*(4>>r2shr)]
neg r2 neg r2
.loop: .loop:
%if %3 == 4 ; filterSize == 4 scaling %if %3 == 4 ; filterSize == 4 scaling
; load 2x4 or 4x4 source pixels into m0/m1 ; load 2x4 or 4x4 source pixels into m0/m1
movsx r0, word [r5+r2*2+0] ; filterPos[0] mov32 r0, dword [r5+r2*4+0] ; filterPos[0]
movsx r6, word [r5+r2*2+2] ; filterPos[1] mov32 r6, dword [r5+r2*4+4] ; filterPos[1]
movlh m0, [r3+r0*srcmul] ; src[filterPos[0] + {0,1,2,3}] movlh m0, [r3+r0*srcmul] ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8 %if mmsize == 8
movlh m1, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] movlh m1, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}]
...@@ -112,8 +115,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -112,8 +115,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%else ; %1 == 8 %else ; %1 == 8
movd m4, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}] movd m4, [r3+r6*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%endif %endif
movsx r0, word [r5+r2*2+4] ; filterPos[2] mov32 r0, dword [r5+r2*4+8] ; filterPos[2]
movsx r6, word [r5+r2*2+6] ; filterPos[3] mov32 r6, dword [r5+r2*4+12] ; filterPos[3]
movlh m1, [r3+r0*srcmul] ; src[filterPos[2] + {0,1,2,3}] movlh m1, [r3+r0*srcmul] ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8 %if %1 > 8
movhps m1, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}] movhps m1, [r3+r6*srcmul] ; src[filterPos[3] + {0,1,2,3}]
...@@ -156,8 +159,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -156,8 +159,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%endif ; mmx/sse2/ssse3/sse4 %endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling %else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5 ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
movsx r0, word [r5+r2*1+0] ; filterPos[0] mov32 r0, dword [r5+r2*2+0] ; filterPos[0]
movsx r6, word [r5+r2*1+2] ; filterPos[1] mov32 r6, dword [r5+r2*2+4] ; filterPos[1]
movbh m0, [r3+ r0 *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}] movbh m0, [r3+ r0 *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8 %if mmsize == 8
movbh m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}] movbh m1, [r3+(r0+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
...@@ -165,8 +168,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -165,8 +168,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
movbh m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}] movbh m5, [r3+(r6+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16 %else ; mmsize == 16
movbh m1, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}] movbh m1, [r3+ r6 *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
movsx r0, word [r5+r2*1+4] ; filterPos[2] mov32 r0, dword [r5+r2*2+8] ; filterPos[2]
movsx r6, word [r5+r2*1+6] ; filterPos[3] mov32 r6, dword [r5+r2*2+12] ; filterPos[3]
movbh m4, [r3+ r0 *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}] movbh m4, [r3+ r0 *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
movbh m5, [r3+ r6 *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}] movbh m5, [r3+ r6 *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16 %endif ; mmsize == 8/16
...@@ -251,7 +254,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -251,7 +254,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
%define r1x r1 %define r1x r1
%define filter2 r6m %define filter2 r6m
%endif ; x86-32/64 %endif ; x86-32/64
lea r5, [r5+r2*2] lea r5, [r5+r2*4]
%if %2 == 15 %if %2 == 15
lea r1, [r1+r2*2] lea r1, [r1+r2*2]
%else ; %2 == 19 %else ; %2 == 19
...@@ -261,8 +264,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -261,8 +264,8 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
neg r2 neg r2
.loop: .loop:
movsx r0, word [r5+r2*2+0] ; filterPos[0] mov32 r0, dword [r5+r2*4+0] ; filterPos[0]
movsx r1x, word [r5+r2*2+2] ; filterPos[1] mov32 r1x, dword [r5+r2*4+4] ; filterPos[1]
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)? ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
pxor m4, m4 pxor m4, m4
pxor m5, m5 pxor m5, m5
...@@ -293,7 +296,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7 ...@@ -293,7 +296,7 @@ cglobal hscale%1to%2_%4_%5, %6, 7, %7
jl .innerloop jl .innerloop
%ifidn %4, X4 %ifidn %4, X4
movsx r1x, word [r5+r2*2+2] ; filterPos[1] mov32 r1x, dword [r5+r2*4+4] ; filterPos[1]
movlh m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0] movlh m0, [src_reg+r0 *srcmul] ; split last 4 srcpx of dstpx[0]
sub r1x, r6 ; and first 4 srcpx of dstpx[1] sub r1x, r6 ; and first 4 srcpx of dstpx[1]
%if %1 > 8 %if %1 > 8
......
...@@ -94,8 +94,8 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI ...@@ -94,8 +94,8 @@ void updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufI
int16_t **alpPixBuf= c->alpPixBuf; int16_t **alpPixBuf= c->alpPixBuf;
const int vLumBufSize= c->vLumBufSize; const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize; const int vChrBufSize= c->vChrBufSize;
int16_t *vLumFilterPos= c->vLumFilterPos; int32_t *vLumFilterPos= c->vLumFilterPos;
int16_t *vChrFilterPos= c->vChrFilterPos; int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *vLumFilter= c->vLumFilter; int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter; int16_t *vChrFilter= c->vChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter; int32_t *lumMmxFilter= c->lumMmxFilter;
...@@ -266,7 +266,7 @@ extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( ...@@ -266,7 +266,7 @@ extern void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt(
SwsContext *c, int16_t *data, \ SwsContext *c, int16_t *data, \
int dstW, const uint8_t *src, \ int dstW, const uint8_t *src, \
const int16_t *filter, \ const int16_t *filter, \
const int16_t *filterPos, int filterSize) const int32_t *filterPos, int filterSize)
#define SCALE_FUNCS(filter_n, opt) \ #define SCALE_FUNCS(filter_n, opt) \
SCALE_FUNC(filter_n, 8, 15, opt); \ SCALE_FUNC(filter_n, 8, 15, opt); \
......
...@@ -1450,7 +1450,7 @@ static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, ...@@ -1450,7 +1450,7 @@ static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src, int dstWidth, const uint8_t *src,
int srcW, int xInc) int srcW, int xInc)
{ {
int16_t *filterPos = c->hLumFilterPos; int32_t *filterPos = c->hLumFilterPos;
int16_t *filter = c->hLumFilter; int16_t *filter = c->hLumFilter;
void *mmx2FilterCode= c->lumMmx2FilterCode; void *mmx2FilterCode= c->lumMmx2FilterCode;
int i; int i;
...@@ -1546,7 +1546,7 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2, ...@@ -1546,7 +1546,7 @@ static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1, int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc) const uint8_t *src2, int srcW, int xInc)
{ {
int16_t *filterPos = c->hChrFilterPos; int32_t *filterPos = c->hChrFilterPos;
int16_t *filter = c->hChrFilter; int16_t *filter = c->hChrFilter;
void *mmx2FilterCode= c->chrMmx2FilterCode; void *mmx2FilterCode= c->chrMmx2FilterCode;
int i; int i;
......
...@@ -142,7 +142,7 @@ fate:: $(FATE) ...@@ -142,7 +142,7 @@ fate:: $(FATE)
$(FATE): $(TOOL)$(EXESUF) $(FATE_UTILS:%=tests/%$(HOSTEXESUF)) $(FATE): $(TOOL)$(EXESUF) $(FATE_UTILS:%=tests/%$(HOSTEXESUF))
@echo "TEST $(@:fate-%=%)" @echo "TEST $(@:fate-%=%)"
$(Q)$(SRC_PATH)/tests/fate-run.sh $@ "$(SAMPLES)" "$(TARGET_EXEC)" "$(TARGET_PATH)" '$(CMD)' '$(CMP)' '$(REF)' '$(FUZZ)' '$(THREADS)' '$(THREAD_TYPE)' '$(TOOL)' $(Q)$(SRC_PATH)/tests/fate-run.sh $@ "$(SAMPLES)" "$(TARGET_EXEC)" "$(TARGET_PATH)" '$(CMD)' '$(CMP)' '$(REF)' '$(FUZZ)' '$(THREADS)' '$(THREAD_TYPE)' '$(CPUFLAGS)'
fate-list: fate-list:
@printf '%s\n' $(sort $(FATE)) @printf '%s\n' $(sort $(FATE))
......
...@@ -17,7 +17,7 @@ ref=${7:-"${base}/ref/fate/${test}"} ...@@ -17,7 +17,7 @@ ref=${7:-"${base}/ref/fate/${test}"}
fuzz=$8 fuzz=$8
threads=${9:-1} threads=${9:-1}
thread_type=${10:-frame+slice} thread_type=${10:-frame+slice}
tool=${11} cpuflags=${11:-all}
outdir="tests/data/fate" outdir="tests/data/fate"
outfile="${outdir}/${test}" outfile="${outdir}/${test}"
...@@ -51,7 +51,7 @@ run(){ ...@@ -51,7 +51,7 @@ run(){
} }
avconv(){ avconv(){
run $tool -nostats -threads $threads -thread_type $thread_type "$@" run ffmpeg -nostats -threads $threads -thread_type $thread_type -cpuflags $cpuflags "$@"
} }
framecrc(){ framecrc(){
...@@ -77,7 +77,7 @@ pcm(){ ...@@ -77,7 +77,7 @@ pcm(){
regtest(){ regtest(){
t="${test#$2-}" t="${test#$2-}"
ref=${base}/ref/$2/$t ref=${base}/ref/$2/$t
${base}/${1}-regression.sh $t $2 $3 "$target_exec" "$target_path" "$threads" "$thread_type" "$tool" "$samples" ${base}/${1}-regression.sh $t $2 $3 "$target_exec" "$target_path" "$threads" "$thread_type" "$cpuflags" "$samples"
} }
codectest(){ codectest(){
......
...@@ -10,7 +10,7 @@ raw_src_dir=$3 ...@@ -10,7 +10,7 @@ raw_src_dir=$3
target_exec=$4 target_exec=$4
target_path=$5 target_path=$5
threads=${6:-1} threads=${6:-1}
tool=$8 cpuflags=${8:-all}
samples=$9 samples=$9
datadir="./tests/data" datadir="./tests/data"
...@@ -20,7 +20,7 @@ this="$test.$test_ref" ...@@ -20,7 +20,7 @@ this="$test.$test_ref"
outfile="$datadir/$test_ref/" outfile="$datadir/$test_ref/"
# various files # various files
avconv="$target_exec ${target_path}/${tool}" avconv="$target_exec ${target_path}/ffmpeg"
tiny_psnr="tests/tiny_psnr" tiny_psnr="tests/tiny_psnr"
raw_src="${target_path}/$raw_src_dir/%02d.pgm" raw_src="${target_path}/$raw_src_dir/%02d.pgm"
raw_dst="$datadir/$this.out.yuv" raw_dst="$datadir/$this.out.yuv"
...@@ -45,7 +45,7 @@ echov(){ ...@@ -45,7 +45,7 @@ echov(){
. $(dirname $0)/md5.sh . $(dirname $0)/md5.sh
AVCONV_OPTS="-nostats -y" AVCONV_OPTS="-nostats -y -cpuflags $cpuflags"
COMMON_OPTS="-flags +bitexact -idct simple -sws_flags +accurate_rnd+bitexact" COMMON_OPTS="-flags +bitexact -idct simple -sws_flags +accurate_rnd+bitexact"
DEC_OPTS="$COMMON_OPTS -threads $threads" DEC_OPTS="$COMMON_OPTS -threads $threads"
ENC_OPTS="$COMMON_OPTS -threads 1 -dct fastint" ENC_OPTS="$COMMON_OPTS -threads 1 -dct fastint"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment