avcodec: optimize mathops for Loongson-3 v1

HAVE_LOONGSON is replaced by HAVE_LOONGSON3. Loongson-2E and 2F also support the Loongson SIMD instructions, but their decoding performance is low. We plan to focus on optimizing for Loongson-3A1000, 3B1500 and 3A1500, and to modify the configure file later to support the Loongson-2 series by adding HAVE_LOONGSON2. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>

avcodec: optimize mathops for Loongson-3 v1
HAVE_LOONGSON is replaced by HAVE_LOONGSON3. Loongson-2E and 2F also support the Loongson SIMD instructions, but their decoding performance is low. We plan to focus on optimizing for Loongson-3A1000, 3B1500 and 3A1500, and to modify the configure file later to support the Loongson-2 series by adding HAVE_LOONGSON2. Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
0ace686a · ZhouXiaoyong · Michael Niedermayer · 4b8a8194 · 0ace686a · 0ace686a
Commit 0ace686a authored Apr 16, 2015 by ZhouXiaoyong Committed by Michael Niedermayer Apr 27, 2015
Show whitespace changes
Inline Side-by-side

Showing with 73 additions and 8 deletions

mathops.h libavcodec/mathops.h +3 -0

mathops.h libavcodec/mips/mathops.h +70 -8

No files found.
--- a/libavcodec/mathops.h
+++ b/libavcodec/mathops.h
@@ -211,6 +211,8 @@ if ((y) < (x)) {\
 #   define FASTDIV(a,b) ((uint32_t)((((uint64_t)a) * ff_inverse[b]) >> 32))
 #endif /* FASTDIV */

+#ifndef ff_sqrt
+#define ff_sqrt ff_sqrt
 static inline av_const unsigned int ff_sqrt(unsigned int a)
 {
    unsigned int b;
@@ -230,6 +232,7 @@ static inline av_const unsigned int ff_sqrt(unsigned int a)

    return b - (a < b * b);
 }
+#endif

 static inline int8_t ff_u8_to_s8(uint8_t a)
 {

--- a/libavcodec/mips/mathops.h
+++ b/libavcodec/mips/mathops.h
 /*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
+ * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
@@ -27,14 +28,73 @@

 #if HAVE_INLINE_ASM

-#if HAVE_LOONGSON
+#if HAVE_LOONGSON3
+
+#define MULH MULH
+static inline av_const int MULH(int a, int b)
+{
+    int c;
+    __asm__ ("dmult %1, %2      \n\t"
+             "mflo %0           \n\t"
+             "dsrl %0, %0, 32   \n\t"
+             : "=r"(c)
+             : "r"(a),"r"(b)
+             : "hi", "lo");
+    return c;
+}
+
+#define UMULH UMULH
+static inline av_const unsigned UMULH(unsigned a, unsigned b)
+{
+    unsigned c;
+    __asm__ ("dmultu %1, %2     \n\t"
+             "mflo %0           \n\t"
+             "dsrl %0, %0, 32   \n\t"
+             : "=r"(c)
+             : "r"(a),"r"(b)
+             : "hi", "lo");
+    return c;
+}
+
+#define mid_pred mid_pred
+static inline av_const int mid_pred(int a, int b, int c)
+{
+    int t = b;
+    __asm__ ("sgt $8, %1, %2    \n\t"
+             "movn %0, %1, $8   \n\t"
+             "movn %1, %2, $8   \n\t"
+             "sgt $8, %1, %3    \n\t"
+             "movz %1, %3, $8   \n\t"
+             "sgt $8, %0, %1    \n\t"
+             "movn %0, %1, $8   \n\t"
+             : "+&r"(t),"+&r"(a)
+             : "r"(b),"r"(c)
+             : "$8");
+    return t;
+}
+
+#define ff_sqrt ff_sqrt
+static inline av_const unsigned int ff_sqrt(unsigned int a)
+{
+    unsigned int b;
+
+    __asm__ ("ctc1 %1, $f0      \n\t"
+             "sqrt.s $f2, $f0   \n\t"
+             "cvt.w.s $f0, $f2  \n\t"
+             "cfc1 %0, $f0      \n\t"
+             : "=r"(b)
+             : "r"(a));
+    return b;
+}

 static inline av_const int64_t MAC64(int64_t d, int a, int b)
 {
    int64_t m;
-    __asm__ ("dmult.g %1, %2, %3 \n\t"
+    __asm__ ("dmult %2, %3     \n\t"
+             "mflo  %1         \n\t"
             "daddu %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b));
+             : "+r"(d), "=&r"(m) : "r"(a), "r"(b)
+             : "hi", "lo");
    return d;
 }
 #define MAC64(d, a, b) ((d) = MAC64(d, a, b))
@@ -42,14 +102,16 @@ static inline av_const int64_t MAC64(int64_t d, int a, int b)
 static inline av_const int64_t MLS64(int64_t d, int a, int b)
 {
    int64_t m;
-    __asm__ ("dmult.g %1, %2, %3 \n\t"
+    __asm__ ("dmult %2, %3     \n\t"
+             "mflo  %1         \n\t"
             "dsubu %0, %0, %1 \n\t"
-             : "+r"(d), "=&r"(m) : "r"(a), "r"(b));
+             : "+r"(d), "=&r"(m) : "r"(a), "r"(b)
+             : "hi", "lo");
    return d;
 }
 #define MLS64(d, a, b) ((d) = MLS64(d, a, b))

-#endif
+#endif /* HAVE_LOONGSON3 */

 #endif /* HAVE_INLINE_ASM */