Commit c312bfac authored by James Almer's avatar James Almer

x86/cpu: add AV_CPU_FLAG_AVXSLOW flag

Reviewed-by: 's avatarMichael Niedermayer <michaelni@gmx.at>
Signed-off-by: 's avatarJames Almer <jamrial@gmail.com>
parent 52fc3e37
...@@ -15,6 +15,9 @@ libavutil: 2014-08-09 ...@@ -15,6 +15,9 @@ libavutil: 2014-08-09
API changes, most recent first: API changes, most recent first:
2015-05-27 - xxxxxxx - lavu 54.26.100 - cpu.h
Add AV_CPU_FLAG_AVXSLOW.
2015-05-26 - xxxxxxx - lavu 54.25.100 - rational.h 2015-05-26 - xxxxxxx - lavu 54.25.100 - rational.h
Add av_q2intfloat(). Add av_q2intfloat().
......
...@@ -59,6 +59,7 @@ void av_force_cpu_flags(int arg){ ...@@ -59,6 +59,7 @@ void av_force_cpu_flags(int arg){
AV_CPU_FLAG_SSE4 | AV_CPU_FLAG_SSE4 |
AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_SSE42 |
AV_CPU_FLAG_AVX | AV_CPU_FLAG_AVX |
AV_CPU_FLAG_AVXSLOW |
AV_CPU_FLAG_XOP | AV_CPU_FLAG_XOP |
AV_CPU_FLAG_FMA3 | AV_CPU_FLAG_FMA3 |
AV_CPU_FLAG_FMA4 | AV_CPU_FLAG_FMA4 |
...@@ -111,6 +112,7 @@ int av_parse_cpu_flags(const char *s) ...@@ -111,6 +112,7 @@ int av_parse_cpu_flags(const char *s)
#define CPUFLAG_SSE4 (AV_CPU_FLAG_SSE4 | CPUFLAG_SSSE3) #define CPUFLAG_SSE4 (AV_CPU_FLAG_SSE4 | CPUFLAG_SSSE3)
#define CPUFLAG_SSE42 (AV_CPU_FLAG_SSE42 | CPUFLAG_SSE4) #define CPUFLAG_SSE42 (AV_CPU_FLAG_SSE42 | CPUFLAG_SSE4)
#define CPUFLAG_AVX (AV_CPU_FLAG_AVX | CPUFLAG_SSE42) #define CPUFLAG_AVX (AV_CPU_FLAG_AVX | CPUFLAG_SSE42)
#define CPUFLAG_AVXSLOW (AV_CPU_FLAG_AVXSLOW | CPUFLAG_AVX)
#define CPUFLAG_XOP (AV_CPU_FLAG_XOP | CPUFLAG_AVX) #define CPUFLAG_XOP (AV_CPU_FLAG_XOP | CPUFLAG_AVX)
#define CPUFLAG_FMA3 (AV_CPU_FLAG_FMA3 | CPUFLAG_AVX) #define CPUFLAG_FMA3 (AV_CPU_FLAG_FMA3 | CPUFLAG_AVX)
#define CPUFLAG_FMA4 (AV_CPU_FLAG_FMA4 | CPUFLAG_AVX) #define CPUFLAG_FMA4 (AV_CPU_FLAG_FMA4 | CPUFLAG_AVX)
...@@ -133,6 +135,7 @@ int av_parse_cpu_flags(const char *s) ...@@ -133,6 +135,7 @@ int av_parse_cpu_flags(const char *s)
{ "sse4.1" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE4 }, .unit = "flags" }, { "sse4.1" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE4 }, .unit = "flags" },
{ "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE42 }, .unit = "flags" }, { "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_SSE42 }, .unit = "flags" },
{ "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX }, .unit = "flags" }, { "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVX }, .unit = "flags" },
{ "avxslow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_AVXSLOW }, .unit = "flags" },
{ "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_XOP }, .unit = "flags" }, { "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_XOP }, .unit = "flags" },
{ "fma3" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA3 }, .unit = "flags" }, { "fma3" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA3 }, .unit = "flags" },
{ "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA4 }, .unit = "flags" }, { "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = CPUFLAG_FMA4 }, .unit = "flags" },
...@@ -192,6 +195,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s) ...@@ -192,6 +195,7 @@ int av_parse_cpu_caps(unsigned *flags, const char *s)
{ "sse4.1" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SSE4 }, .unit = "flags" }, { "sse4.1" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SSE4 }, .unit = "flags" },
{ "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SSE42 }, .unit = "flags" }, { "sse4.2" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_SSE42 }, .unit = "flags" },
{ "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX }, .unit = "flags" }, { "avx" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVX }, .unit = "flags" },
{ "avxslow" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_AVXSLOW }, .unit = "flags" },
{ "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_XOP }, .unit = "flags" }, { "xop" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_XOP }, .unit = "flags" },
{ "fma3" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA3 }, .unit = "flags" }, { "fma3" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA3 }, .unit = "flags" },
{ "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA4 }, .unit = "flags" }, { "fma4" , NULL, 0, AV_OPT_TYPE_CONST, { .i64 = AV_CPU_FLAG_FMA4 }, .unit = "flags" },
...@@ -320,6 +324,7 @@ static const struct { ...@@ -320,6 +324,7 @@ static const struct {
{ AV_CPU_FLAG_SSE4, "sse4.1" }, { AV_CPU_FLAG_SSE4, "sse4.1" },
{ AV_CPU_FLAG_SSE42, "sse4.2" }, { AV_CPU_FLAG_SSE42, "sse4.2" },
{ AV_CPU_FLAG_AVX, "avx" }, { AV_CPU_FLAG_AVX, "avx" },
{ AV_CPU_FLAG_AVXSLOW, "avxslow" },
{ AV_CPU_FLAG_XOP, "xop" }, { AV_CPU_FLAG_XOP, "xop" },
{ AV_CPU_FLAG_FMA3, "fma3" }, { AV_CPU_FLAG_FMA3, "fma3" },
{ AV_CPU_FLAG_FMA4, "fma4" }, { AV_CPU_FLAG_FMA4, "fma4" },
......
...@@ -43,6 +43,7 @@ ...@@ -43,6 +43,7 @@
#define AV_CPU_FLAG_SSE4 0x0100 ///< Penryn SSE4.1 functions #define AV_CPU_FLAG_SSE4 0x0100 ///< Penryn SSE4.1 functions
#define AV_CPU_FLAG_SSE42 0x0200 ///< Nehalem SSE4.2 functions #define AV_CPU_FLAG_SSE42 0x0200 ///< Nehalem SSE4.2 functions
#define AV_CPU_FLAG_AVX 0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used #define AV_CPU_FLAG_AVX 0x4000 ///< AVX functions: requires OS support even if YMM registers aren't used
#define AV_CPU_FLAG_AVXSLOW 0x8000000 ///< AVX supported, but slow when using YMM registers (e.g. Bulldozer)
#define AV_CPU_FLAG_XOP 0x0400 ///< Bulldozer XOP functions #define AV_CPU_FLAG_XOP 0x0400 ///< Bulldozer XOP functions
#define AV_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions #define AV_CPU_FLAG_FMA4 0x0800 ///< Bulldozer FMA4 functions
// #if LIBAVUTIL_VERSION_MAJOR <52 // #if LIBAVUTIL_VERSION_MAJOR <52
......
...@@ -56,7 +56,7 @@ ...@@ -56,7 +56,7 @@
*/ */
#define LIBAVUTIL_VERSION_MAJOR 54 #define LIBAVUTIL_VERSION_MAJOR 54
#define LIBAVUTIL_VERSION_MINOR 25 #define LIBAVUTIL_VERSION_MINOR 26
#define LIBAVUTIL_VERSION_MICRO 100 #define LIBAVUTIL_VERSION_MICRO 100
#define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \ #define LIBAVUTIL_VERSION_INT AV_VERSION_INT(LIBAVUTIL_VERSION_MAJOR, \
......
...@@ -167,6 +167,7 @@ int ff_get_cpu_flags_x86(void) ...@@ -167,6 +167,7 @@ int ff_get_cpu_flags_x86(void)
if (ext_caps & (1 << 22)) if (ext_caps & (1 << 22))
rval |= AV_CPU_FLAG_MMXEXT; rval |= AV_CPU_FLAG_MMXEXT;
if (!strncmp(vendor.c, "AuthenticAMD", 12)) {
/* Allow for selectively disabling SSE2 functions on AMD processors /* Allow for selectively disabling SSE2 functions on AMD processors
with SSE2 support but not SSE4a. This includes Athlon64, some with SSE2 support but not SSE4a. This includes Athlon64, some
Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster Opteron, and some Sempron processors. MMX, SSE, or 3DNow! are faster
...@@ -174,9 +175,19 @@ int ff_get_cpu_flags_x86(void) ...@@ -174,9 +175,19 @@ int ff_get_cpu_flags_x86(void)
AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case AV_CPU_FLAG_SSE2 and AV_CPU_FLAG_SSE2SLOW are both set in this case
so that SSE2 is used unless explicitly disabled by checking so that SSE2 is used unless explicitly disabled by checking
AV_CPU_FLAG_SSE2SLOW. */ AV_CPU_FLAG_SSE2SLOW. */
if (!strncmp(vendor.c, "AuthenticAMD", 12) && if (rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040))
rval & AV_CPU_FLAG_SSE2 && !(ecx & 0x00000040)) { rval |= AV_CPU_FLAG_SSE2SLOW;
rval |= AV_CPU_FLAG_SSE2SLOW;
/* Similar to the above but for AVX functions on AMD processors.
This is necessary only for functions using YMM registers on Bulldozer
based CPUs as they lack 256-bits execution units. SSE/AVX functions
using XMM registers are always faster on them.
AV_CPU_FLAG_AVX and AV_CPU_FLAG_AVXSLOW are both set so that AVX is
used unless explicitly disabled by checking AV_CPU_FLAG_AVXSLOW.
TODO: Confirm if Excavator is affected or not by this once it's
released, and update the check if necessary. Same for btver2. */
if (family == 0x15 && (rval & AV_CPU_FLAG_AVX))
rval |= AV_CPU_FLAG_AVXSLOW;
} }
/* XOP and FMA4 use the AVX instruction coding scheme, so they can't be /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment