Commit ef1b4bdf authored by Kaustubh Raste, committed by Michael Niedermayer

libavutil/mips: Updated msa generic macros

Reduced msa load-store code.
Removed inline asm of GP load-store for 64 bit.
Updated variable names in GP load-store macros for naming consistency.
Corrected macro descriptions.
Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
Reviewed-by: Manojkumar Bhosale <Manojkumar.Bhosale@imgtec.com>
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
parent 67af24da
...@@ -27,202 +27,163 @@ ...@@ -27,202 +27,163 @@
#define ALIGNMENT 16 #define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1))) #define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc)) #define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) #define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) #define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc)) #define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) #define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) #define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc)) #define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__) #define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) #define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) #define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) #define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) #define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
#if (__mips_isa_rev >= 6) #if (__mips_isa_rev >= 6)
#define LW(psrc) \ #define LH(psrc) \
( { \ ( { \
uint8_t *psrc_m = (uint8_t *) (psrc); \ uint16_t val_lh_m = *(uint16_t *)(psrc); \
uint32_t val_m; \ val_lh_m; \
\ } )
__asm__ volatile ( \
"lw %[val_m], %[psrc_m] \n\t" \ #define LW(psrc) \
\ ( { \
: [val_m] "=r" (val_m) \ uint32_t val_lw_m = *(uint32_t *)(psrc); \
: [psrc_m] "m" (*psrc_m) \ val_lw_m; \
); \
\
val_m; \
} ) } )
#if (__mips == 64) #if (__mips == 64)
#define LD(psrc) \ #define LD(psrc) \
( { \ ( { \
uint8_t *psrc_m = (uint8_t *) (psrc); \ uint64_t val_ld_m = *(uint64_t *)(psrc); \
uint64_t val_m = 0; \ val_ld_m; \
\
__asm__ volatile ( \
"ld %[val_m], %[psrc_m] \n\t" \
\
: [val_m] "=r" (val_m) \
: [psrc_m] "m" (*psrc_m) \
); \
\
val_m; \
} ) } )
#else // !(__mips == 64) #else // !(__mips == 64)
#define LD(psrc) \ #define LD(psrc) \
( { \ ( { \
uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
uint32_t val0_m, val1_m; \ uint32_t val0_ld_m, val1_ld_m; \
uint64_t val_m = 0; \ uint64_t val_ld_m = 0; \
\ \
val0_m = LW(psrc_ld_m); \ val0_ld_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \ val1_ld_m = LW(psrc_ld_m + 4); \
\ \
val_m = (uint64_t) (val1_m); \ val_ld_m = (uint64_t) (val1_ld_m); \
val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \ val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
val_m = (uint64_t) (val_m | (uint64_t) val0_m); \ val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
\ \
val_m; \ val_ld_m; \
} ) } )
#endif // (__mips == 64) #endif // (__mips == 64)
#define SH(val, pdst) \ #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
{ \ #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
uint8_t *pdst_m = (uint8_t *) (pdst); \ #define SD(val, pdst) *(uint64_t *)(pdst) = (val);
uint16_t val_m = (val); \
\
__asm__ volatile ( \
"sh %[val_m], %[pdst_m] \n\t" \
\
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m) \
); \
}
#define SW(val, pdst) \
{ \
uint8_t *pdst_m = (uint8_t *) (pdst); \
uint32_t val_m = (val); \
\
__asm__ volatile ( \
"sw %[val_m], %[pdst_m] \n\t" \
\
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m) \
); \
}
#define SD(val, pdst) \
{ \
uint8_t *pdst_m = (uint8_t *) (pdst); \
uint64_t val_m = (val); \
\
__asm__ volatile ( \
"sd %[val_m], %[pdst_m] \n\t" \
\
: [pdst_m] "=m" (*pdst_m) \
: [val_m] "r" (val_m) \
); \
}
#else // !(__mips_isa_rev >= 6) #else // !(__mips_isa_rev >= 6)
#define LW(psrc) \ #define LH(psrc) \
( { \ ( { \
uint8_t *psrc_m = (uint8_t *) (psrc); \ uint8_t *psrc_lh_m = (uint8_t *) (psrc); \
uint32_t val_m; \ uint16_t val_lh_m; \
\ \
__asm__ volatile ( \ __asm__ volatile ( \
"ulw %[val_m], %[psrc_m] \n\t" \ "ulh %[val_lh_m], %[psrc_lh_m] \n\t" \
\ \
: [val_m] "=r" (val_m) \ : [val_lh_m] "=r" (val_lh_m) \
: [psrc_m] "m" (*psrc_m) \ : [psrc_lh_m] "m" (*psrc_lh_m) \
); \ ); \
\ \
val_m; \ val_lh_m; \
} )
#define LW(psrc) \
( { \
uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
uint32_t val_lw_m; \
\
__asm__ volatile ( \
"ulw %[val_lw_m], %[psrc_lw_m] \n\t" \
\
: [val_lw_m] "=r" (val_lw_m) \
: [psrc_lw_m] "m" (*psrc_lw_m) \
); \
\
val_lw_m; \
} ) } )
#if (__mips == 64) #if (__mips == 64)
#define LD(psrc) \ #define LD(psrc) \
( { \ ( { \
uint8_t *psrc_m = (uint8_t *) (psrc); \ uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
uint64_t val_m = 0; \ uint64_t val_ld_m = 0; \
\ \
__asm__ volatile ( \ __asm__ volatile ( \
"uld %[val_m], %[psrc_m] \n\t" \ "uld %[val_ld_m], %[psrc_ld_m] \n\t" \
\ \
: [val_m] "=r" (val_m) \ : [val_ld_m] "=r" (val_ld_m) \
: [psrc_m] "m" (*psrc_m) \ : [psrc_ld_m] "m" (*psrc_ld_m) \
); \ ); \
\ \
val_m; \ val_ld_m; \
} ) } )
#else // !(__mips == 64) #else // !(__mips == 64)
#define LD(psrc) \ #define LD(psrc) \
( { \ ( { \
uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
uint32_t val0_m, val1_m; \ uint32_t val0_ld_m, val1_ld_m; \
uint64_t val_m = 0; \ uint64_t val_ld_m = 0; \
\ \
val0_m = LW(psrc_ld_m); \ val0_ld_m = LW(psrc_ld_m); \
val1_m = LW(psrc_ld_m + 4); \ val1_ld_m = LW(psrc_ld_m + 4); \
\ \
val_m = (uint64_t) (val1_m); \ val_ld_m = (uint64_t) (val1_ld_m); \
val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \ val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
val_m = (uint64_t) (val_m | (uint64_t) val0_m); \ val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
\ \
val_m; \ val_ld_m; \
} ) } )
#endif // (__mips == 64) #endif // (__mips == 64)
#define SH(val, pdst) \ #define SH(val, pdst) \
{ \ { \
uint8_t *pdst_m = (uint8_t *) (pdst); \ uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
uint16_t val_m = (val); \ uint16_t val_sh_m = (val); \
\ \
__asm__ volatile ( \ __asm__ volatile ( \
"ush %[val_m], %[pdst_m] \n\t" \ "ush %[val_sh_m], %[pdst_sh_m] \n\t" \
\ \
: [pdst_m] "=m" (*pdst_m) \ : [pdst_sh_m] "=m" (*pdst_sh_m) \
: [val_m] "r" (val_m) \ : [val_sh_m] "r" (val_sh_m) \
); \ ); \
} }
#define SW(val, pdst) \ #define SW(val, pdst) \
{ \ { \
uint8_t *pdst_m = (uint8_t *) (pdst); \ uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
uint32_t val_m = (val); \ uint32_t val_sw_m = (val); \
\ \
__asm__ volatile ( \ __asm__ volatile ( \
"usw %[val_m], %[pdst_m] \n\t" \ "usw %[val_sw_m], %[pdst_sw_m] \n\t" \
\ \
: [pdst_m] "=m" (*pdst_m) \ : [pdst_sw_m] "=m" (*pdst_sw_m) \
: [val_m] "r" (val_m) \ : [val_sw_m] "r" (val_sw_m) \
); \ ); \
} }
#define SD(val, pdst) \ #define SD(val, pdst) \
{ \ { \
uint8_t *pdst_m1 = (uint8_t *) (pdst); \ uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
uint32_t val0_m, val1_m; \ uint32_t val0_sd_m, val1_sd_m; \
\ \
val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
\ \
SW(val0_m, pdst_m1); \ SW(val0_sd_m, pdst_sd_m); \
SW(val1_m, pdst_m1 + 4); \ SW(val1_sd_m, pdst_sd_m + 4); \
} }
#endif // (__mips_isa_rev >= 6) #endif // (__mips_isa_rev >= 6)
...@@ -291,122 +252,91 @@ ...@@ -291,122 +252,91 @@
SD(in3, (pdst) + 3 * stride); \ SD(in3, (pdst) + 3 * stride); \
} }
/* Description : Load vectors with 16 byte elements with stride /* Description : Load vector elements with stride
Arguments : Inputs - psrc (source pointer to load from) Arguments : Inputs - psrc (source pointer to load from)
- stride - stride
Outputs - out0, out1 Outputs - out0, out1
Return Type - as per RTYPE Return Type - as per RTYPE
Details : Loads 16 byte elements in 'out0' from (psrc) Details : Loads elements in 'out0' from (psrc)
Loads 16 byte elements in 'out1' from (psrc + stride) Loads elements in 'out1' from (psrc + stride)
*/ */
#define LD_B2(RTYPE, psrc, stride, out0, out1) \ #define LD_V2(RTYPE, psrc, stride, out0, out1) \
{ \ { \
out0 = LD_B(RTYPE, (psrc)); \ out0 = LD_V(RTYPE, (psrc)); \
out1 = LD_B(RTYPE, (psrc) + stride); \ out1 = LD_V(RTYPE, (psrc) + stride); \
} }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) #define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) #define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ #define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
{ \ { \
LD_B2(RTYPE, (psrc), stride, out0, out1); \ LD_V2(RTYPE, (psrc), stride, out0, out1); \
out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ out2 = LD_V(RTYPE, (psrc) + 2 * stride); \
} }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) #define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__) #define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ #define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \ { \
LD_B2(RTYPE, (psrc), stride, out0, out1); \ LD_V2(RTYPE, (psrc), stride, out0, out1); \
LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ LD_V2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
} }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) #define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) #define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ #define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
{ \ { \
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ out4 = LD_V(RTYPE, (psrc) + 4 * stride); \
} }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) #define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) #define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)
#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \ #define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \ { \
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \ LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
} }
#define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__) #define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__) #define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)
#define LD_B7(RTYPE, psrc, stride, \ #define LD_V7(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6) \ out0, out1, out2, out3, out4, out5, out6) \
{ \ { \
LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
} }
#define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__) #define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) #define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)
#define LD_B8(RTYPE, psrc, stride, \ #define LD_V8(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6, out7) \ out0, out1, out2, out3, out4, out5, out6, out7) \
{ \ { \
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
/* Description : Load vectors with 8 halfword elements with stride
Arguments : Inputs - psrc (source pointer to load from)
- stride
Outputs - out0, out1
Details : Loads 8 halfword elements in 'out0' from (psrc)
Loads 8 halfword elements in 'out1' from (psrc + stride)
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
{ \
out0 = LD_H(RTYPE, (psrc)); \
out1 = LD_H(RTYPE, (psrc) + (stride)); \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
LD_H2(RTYPE, (psrc), stride, out0, out1); \
LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
} }
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__) #define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__) #define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_H8(RTYPE, psrc, stride, \ #define LD_V16(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
#define LD_H16(RTYPE, psrc, stride, \
out0, out1, out2, out3, out4, out5, out6, out7, \ out0, out1, out2, out3, out4, out5, out6, out7, \
out8, out9, out10, out11, out12, out13, out14, out15) \ out8, out9, out10, out11, out12, out13, out14, out15) \
{ \ { \
LD_H8(RTYPE, (psrc), stride, \ LD_V8(RTYPE, (psrc), stride, \
out0, out1, out2, out3, out4, out5, out6, out7); \ out0, out1, out2, out3, out4, out5, out6, out7); \
LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ LD_V8(RTYPE, (psrc) + 8 * stride, stride, \
out8, out9, out10, out11, out12, out13, out14, out15); \ out8, out9, out10, out11, out12, out13, out14, out15); \
} }
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) #define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
/* Description : Load as 4x4 block of signed halfword elements from 1D source /* Description : Load as 4x4 block of signed halfword elements from 1D source
data into 4 vectors (Each vector with 4 signed halfwords) data into 4 vectors (Each vector with 4 signed halfwords)
...@@ -421,103 +351,48 @@ ...@@ -421,103 +351,48 @@
out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2); \ out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2); \
} }
/* Description : Load 2 vectors of signed word elements with stride /* Description : Store vectors with stride
Arguments : Inputs - psrc (source pointer to load from)
- stride
Outputs - out0, out1
Return Type - signed word
*/
#define LD_SW2(psrc, stride, out0, out1) \
{ \
out0 = LD_SW((psrc)); \
out1 = LD_SW((psrc) + stride); \
}
/* Description : Store vectors of 16 byte elements with stride
Arguments : Inputs - in0, in1, stride
Outputs - pdst (destination pointer to store to)
Details : Stores 16 byte elements from 'in0' to (pdst)
Stores 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
{ \
ST_B(RTYPE, in0, (pdst)); \
ST_B(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
ST_B2(RTYPE, in0, in1, (pdst), stride); \
ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
pdst, stride) \
{ \
ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
/* Description : Store vectors of 8 halfword elements with stride
Arguments : Inputs - in0, in1, stride Arguments : Inputs - in0, in1, stride
Outputs - pdst (destination pointer to store to) Outputs - pdst (destination pointer to store to)
Details : Stores 8 halfword elements from 'in0' to (pdst) Details : Stores elements from 'in0' to (pdst)
Stores 8 halfword elements from 'in1' to (pdst + stride) Stores elements from 'in1' to (pdst + stride)
*/ */
#define ST_H2(RTYPE, in0, in1, pdst, stride) \ #define ST_V2(RTYPE, in0, in1, pdst, stride) \
{ \ { \
ST_H(RTYPE, in0, (pdst)); \ ST_V(RTYPE, in0, (pdst)); \
ST_H(RTYPE, in1, (pdst) + stride); \ ST_V(RTYPE, in1, (pdst) + stride); \
} }
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) #define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) #define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ #define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \ { \
ST_H2(RTYPE, in0, in1, (pdst), stride); \ ST_V2(RTYPE, in0, in1, (pdst), stride); \
ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
} }
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) #define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)
#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \ #define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
{ \ { \
ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \ ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \
} }
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__) #define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)
#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ #define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \ { \
ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
/* Description : Store vectors of word elements with stride
Arguments : Inputs - in0, in1, stride
Outputs - pdst (destination pointer to store to)
Return Type - signed word
Details : Stores 4 word elements from 'in0' to (pdst)
Stores 4 word elements from 'in1' to (pdst + stride)
*/
#define ST_SW2(in0, in1, pdst, stride) \
{ \
ST_SW(in0, (pdst)); \
ST_SW(in1, (pdst) + stride); \
}
#define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7, \
pdst, stride) \
{ \
ST_SW2(in0, in1, (pdst), stride); \
ST_SW2(in2, in3, (pdst) + 2 * stride, stride); \
ST_SW2(in4, in5, (pdst) + 4 * stride, stride); \
ST_SW2(in6, in7, (pdst) + 6 * stride, stride); \
} }
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
/* Description : Store as 2x4 byte block to destination memory from input vector /* Description : Store as 2x4 byte block to destination memory from input vector
Arguments : Inputs - in, stidx, pdst, stride Arguments : Inputs - in, stidx, pdst, stride
...@@ -776,7 +651,7 @@ ...@@ -776,7 +651,7 @@
/* Description : average with rounding (in0 + in1 + 1) / 2. /* Description : average with rounding (in0 + in1 + 1) / 2.
Arguments : Inputs - in0, in1, in2, in3, Arguments : Inputs - in0, in1, in2, in3,
Outputs - out0, out1 Outputs - out0, out1
Return Type - signed byte Return Type - as per RTYPE
Details : Each byte element from 'in0' vector is added with each byte Details : Each byte element from 'in0' vector is added with each byte
element from 'in1' vector. The addition of the elements plus 1 element from 'in1' vector. The addition of the elements plus 1
(for rounding) is done unsigned with full precision, (for rounding) is done unsigned with full precision,
...@@ -941,7 +816,7 @@ ...@@ -941,7 +816,7 @@
Arguments : Inputs - mult0, mult1 Arguments : Inputs - mult0, mult1
cnst0, cnst1 cnst0, cnst1
Outputs - out0, out1 Outputs - out0, out1
Return Type - unsigned halfword Return Type - as per RTYPE
Details : Unsigned byte elements from mult0 are multiplied with Details : Unsigned byte elements from mult0 are multiplied with
unsigned byte elements from cnst0 producing a result unsigned byte elements from cnst0 producing a result
twice the size of input i.e. unsigned halfword. twice the size of input i.e. unsigned halfword.
...@@ -969,7 +844,7 @@ ...@@ -969,7 +844,7 @@
Arguments : Inputs - mult0, mult1 Arguments : Inputs - mult0, mult1
cnst0, cnst1 cnst0, cnst1
Outputs - out0, out1 Outputs - out0, out1
Return Type - signed halfword Return Type - as per RTYPE
Details : Signed byte elements from mult0 are multiplied with Details : Signed byte elements from mult0 are multiplied with
signed byte elements from cnst0 producing a result signed byte elements from cnst0 producing a result
twice the size of input i.e. signed halfword. twice the size of input i.e. signed halfword.
...@@ -1004,7 +879,7 @@ ...@@ -1004,7 +879,7 @@
Arguments : Inputs - mult0, mult1 Arguments : Inputs - mult0, mult1
cnst0, cnst1 cnst0, cnst1
Outputs - out0, out1 Outputs - out0, out1
Return Type - signed word Return Type - as per RTYPE
Details : Signed halfword elements from mult0 are multiplied with Details : Signed halfword elements from mult0 are multiplied with
signed halfword elements from cnst0 producing a result signed halfword elements from cnst0 producing a result
twice the size of input i.e. signed word. twice the size of input i.e. signed word.
...@@ -1032,7 +907,7 @@ ...@@ -1032,7 +907,7 @@
Arguments : Inputs - mult0, mult1 Arguments : Inputs - mult0, mult1
cnst0, cnst1 cnst0, cnst1
Outputs - out0, out1 Outputs - out0, out1
Return Type - signed halfword Return Type - as per RTYPE
Details : Signed byte elements from mult0 are multiplied with Details : Signed byte elements from mult0 are multiplied with
signed byte elements from cnst0 producing a result signed byte elements from cnst0 producing a result
twice the size of input i.e. signed halfword. twice the size of input i.e. signed halfword.
...@@ -1061,7 +936,7 @@ ...@@ -1061,7 +936,7 @@
Arguments : Inputs - mult0, mult1 Arguments : Inputs - mult0, mult1
cnst0, cnst1 cnst0, cnst1
Outputs - out0, out1 Outputs - out0, out1
Return Type - unsigned halfword Return Type - as per RTYPE
Details : Unsigned byte elements from mult0 are multiplied with Details : Unsigned byte elements from mult0 are multiplied with
unsigned byte elements from cnst0 producing a result unsigned byte elements from cnst0 producing a result
twice the size of input i.e. unsigned halfword. twice the size of input i.e. unsigned halfword.
...@@ -1082,7 +957,7 @@ ...@@ -1082,7 +957,7 @@
Arguments : Inputs - mult0, mult1 Arguments : Inputs - mult0, mult1
cnst0, cnst1 cnst0, cnst1
Outputs - out0, out1 Outputs - out0, out1
Return Type - signed word Return Type - as per RTYPE
Details : Signed halfword elements from mult0 are multiplied with Details : Signed halfword elements from mult0 are multiplied with
signed halfword elements from cnst0 producing a result signed halfword elements from cnst0 producing a result
twice the size of input i.e. signed word. twice the size of input i.e. signed word.
...@@ -1111,7 +986,7 @@ ...@@ -1111,7 +986,7 @@
either vector are copied to the output vector either vector are copied to the output vector
Arguments : Inputs - in0, in1, min_vec Arguments : Inputs - in0, in1, min_vec
Outputs - in0, in1, (in place) Outputs - in0, in1, (in place)
Return Type - unsigned halfword Return Type - as per RTYPE
Details : Minimum of unsigned halfword element values from 'in0' and Details : Minimum of unsigned halfword element values from 'in0' and
'min_value' are written to output vector 'in0' 'min_value' are written to output vector 'in0'
*/ */
...@@ -1202,7 +1077,7 @@ ...@@ -1202,7 +1077,7 @@
\ \
res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \ res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \
res1_m = __msa_splati_d(res0_m, 1); \ res1_m = __msa_splati_d(res0_m, 1); \
res0_m = res0_m + res1_m; \ res0_m += res1_m; \
sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \ sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \
sum_m; \ sum_m; \
} ) } )
...@@ -1223,7 +1098,7 @@ ...@@ -1223,7 +1098,7 @@
res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \ res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \
res0_m = __msa_hadd_u_d(res_m, res_m); \ res0_m = __msa_hadd_u_d(res_m, res_m); \
res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \ res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \
res0_m = res0_m + res1_m; \ res0_m += res1_m; \
sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \ sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \
sum_m; \ sum_m; \
} ) } )
...@@ -1573,7 +1448,7 @@ ...@@ -1573,7 +1448,7 @@
/* Description : Interleave right half of halfword elements from vectors /* Description : Interleave right half of halfword elements from vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3 Outputs - out0, out1, out2, out3
Return Type - signed halfword Return Type - as per RTYPE
Details : Right half of halfword elements of in0 and right half of Details : Right half of halfword elements of in0 and right half of
halfword elements of in1 are interleaved and copied to out0. halfword elements of in1 are interleaved and copied to out0.
Right half of halfword elements of in2 and right half of Right half of halfword elements of in2 and right half of
...@@ -1625,16 +1500,16 @@ ...@@ -1625,16 +1500,16 @@
/* Description : Interleave right half of double word elements from vectors /* Description : Interleave right half of double word elements from vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3 Outputs - out0, out1, out2, out3
Return Type - unsigned double word Return Type - as per RTYPE
Details : Right half of double word elements of in0 and right half of Details : Right half of double word elements of in0 and right half of
double word elements of in1 are interleaved and copied to out0. double word elements of in1 are interleaved and copied to out0.
Right half of double word elements of in2 and right half of Right half of double word elements of in2 and right half of
double word elements of in3 are interleaved and copied to out1. double word elements of in3 are interleaved and copied to out1.
*/ */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \ { \
out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \ out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \ out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3); \
} }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
...@@ -1643,7 +1518,7 @@ ...@@ -1643,7 +1518,7 @@
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{ \ { \
ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5)); \ out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5); \
} }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
...@@ -1698,14 +1573,14 @@ ...@@ -1698,14 +1573,14 @@
5-bit signed immediate value are copied to the output vector 5-bit signed immediate value are copied to the output vector
Arguments : Inputs - in0, in1, in2, in3, max_val Arguments : Inputs - in0, in1, in2, in3, max_val
Outputs - in0, in1, in2, in3 (in place) Outputs - in0, in1, in2, in3 (in place)
Return Type - unsigned halfword Return Type - as per RTYPE
Details : Maximum of signed halfword element values from 'in0' and Details : Maximum of signed halfword element values from 'in0' and
'max_val' are written to output vector 'in0' 'max_val' are written to output vector 'in0'
*/ */
#define MAXI_SH2(RTYPE, in0, in1, max_val) \ #define MAXI_SH2(RTYPE, in0, in1, max_val) \
{ \ { \
in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val)); \ in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \
in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val)); \ in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \
} }
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__) #define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__) #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
...@@ -1722,7 +1597,7 @@ ...@@ -1722,7 +1597,7 @@
The element data width remains unchanged The element data width remains unchanged
Arguments : Inputs - in0, in1, in2, in3, sat_val Arguments : Inputs - in0, in1, in2, in3, sat_val
Outputs - in0, in1, in2, in3 (in place) Outputs - in0, in1, in2, in3 (in place)
Return Type - unsigned halfword Return Type - as per RTYPE
Details : Each unsigned halfword element from 'in0' is saturated to the Details : Each unsigned halfword element from 'in0' is saturated to the
value generated with (sat_val+1) bit range value generated with (sat_val+1) bit range
Results are in placed to original vectors Results are in placed to original vectors
...@@ -1738,7 +1613,7 @@ ...@@ -1738,7 +1613,7 @@
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
{ \ { \
SAT_UH2(RTYPE, in0, in1, sat_val); \ SAT_UH2(RTYPE, in0, in1, sat_val); \
SAT_UH2(RTYPE, in2, in3, sat_val) \ SAT_UH2(RTYPE, in2, in3, sat_val); \
} }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
...@@ -1747,7 +1622,7 @@ ...@@ -1747,7 +1622,7 @@
The element data width remains unchanged The element data width remains unchanged
Arguments : Inputs - in0, in1, in2, in3, sat_val Arguments : Inputs - in0, in1, in2, in3, sat_val
Outputs - in0, in1, in2, in3 (in place) Outputs - in0, in1, in2, in3 (in place)
Return Type - unsigned halfword Return Type - as per RTYPE
Details : Each unsigned halfword element from 'in0' is saturated to the Details : Each unsigned halfword element from 'in0' is saturated to the
value generated with (sat_val+1) bit range value generated with (sat_val+1) bit range
Results are in placed to original vectors Results are in placed to original vectors
...@@ -1761,7 +1636,7 @@ ...@@ -1761,7 +1636,7 @@
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \ #define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
{ \ { \
SAT_SH2(RTYPE, in0, in1, sat_val) \ SAT_SH2(RTYPE, in0, in1, sat_val); \
in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \ in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
} }
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__) #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
...@@ -1778,7 +1653,7 @@ ...@@ -1778,7 +1653,7 @@
The element data width remains unchanged The element data width remains unchanged
Arguments : Inputs - in0, in1, in2, in3, sat_val Arguments : Inputs - in0, in1, in2, in3, sat_val
Outputs - in0, in1, in2, in3 (in place) Outputs - in0, in1, in2, in3 (in place)
Return Type - unsigned word Return Type - as per RTYPE
Details : Each unsigned word element from 'in0' is saturated to the Details : Each unsigned word element from 'in0' is saturated to the
value generated with (sat_val+1) bit range value generated with (sat_val+1) bit range
Results are written in place to the original vectors Results are written in place to the original vectors
...@@ -1930,7 +1805,7 @@ ...@@ -1930,7 +1805,7 @@
/* Description : Pack even double word elements of vector pairs /* Description : Pack even double word elements of vector pairs
Arguments : Inputs - in0, in1, in2, in3 Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1 Outputs - out0, out1
Return Type - unsigned byte Return Type - as per RTYPE
Details : Even double word elements of in0 are copied to the left half of Details : Even double word elements of in0 are copied to the left half of
out0 & even double word elements of in1 are copied to the right out0 & even double word elements of in1 are copied to the right
half of out0. half of out0.
...@@ -2100,7 +1975,7 @@ ...@@ -2100,7 +1975,7 @@
/* Description : Shift right logical all halfword elements of vector /* Description : Shift right logical all halfword elements of vector
Arguments : Inputs - in0, in1, in2, in3, shift Arguments : Inputs - in0, in1, in2, in3, shift
Outputs - in0, in1, in2, in3 (in place) Outputs - in0, in1, in2, in3 (in place)
Return Type - unsigned halfword Return Type - as per RTYPE
Details : Each element of vector 'in0' is shifted right logical by Details : Each element of vector 'in0' is shifted right logical by
number of bits respective element holds in vector 'shift' and number of bits respective element holds in vector 'shift' and
result is in place written to 'in0' result is in place written to 'in0'
...@@ -2119,7 +1994,7 @@ ...@@ -2119,7 +1994,7 @@
/* Description : Shift right arithmetic rounded halfwords /* Description : Shift right arithmetic rounded halfwords
Arguments : Inputs - in0, in1, shift Arguments : Inputs - in0, in1, shift
Outputs - in0, in1, (in place) Outputs - in0, in1, (in place)
Return Type - unsigned halfword Return Type - as per RTYPE
Details : Each element of vector 'in0' is shifted right arithmetic by Details : Each element of vector 'in0' is shifted right arithmetic by
number of bits respective element holds in vector 'shift'. number of bits respective element holds in vector 'shift'.
The last discarded bit is added to shifted value for rounding The last discarded bit is added to shifted value for rounding
...@@ -2445,7 +2320,7 @@ ...@@ -2445,7 +2320,7 @@
/* Description : Transposes input 8x4 byte block into 4x8 /* Description : Transposes input 8x4 byte block into 4x8
Arguments : Inputs - in0, in1, in2, in3 (input 8x4 byte block) Arguments : Inputs - in0, in1, in2, in3 (input 8x4 byte block)
Outputs - out0, out1, out2, out3 (output 4x8 byte block) Outputs - out0, out1, out2, out3 (output 4x8 byte block)
Return Type - unsigned byte Return Type - as per RTYPE
Details : Details :
*/ */
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
...@@ -2472,7 +2347,7 @@ ...@@ -2472,7 +2347,7 @@
(input 8x8 byte block) (input 8x8 byte block)
Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
(output 8x8 byte block) (output 8x8 byte block)
Return Type - unsigned byte Return Type - as per RTYPE
Details : Details :
*/ */
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
...@@ -2596,7 +2471,7 @@ ...@@ -2596,7 +2471,7 @@
/* Description : Transposes 8x8 block with half word elements in vectors /* Description : Transposes 8x8 block with half word elements in vectors
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
Outputs - out0, out1, out2, out3, out4, out5, out6, out7 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
Return Type - signed halfword Return Type - as per RTYPE
Details : Details :
*/ */
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
...@@ -2646,8 +2521,6 @@ ...@@ -2646,8 +2521,6 @@
/* Description : Average byte elements from pair of vectors and store 8x4 byte /* Description : Average byte elements from pair of vectors and store 8x4 byte
block in destination memory block in destination memory
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
Outputs -
Return Type -
Details : Each byte element from input vector pair 'in0' and 'in1' are Details : Each byte element from input vector pair 'in0' and 'in1' are
averaged (a + b)/2 and stored in 'tmp0_m' averaged (a + b)/2 and stored in 'tmp0_m'
Each byte element from input vector pair 'in2' and 'in3' are Each byte element from input vector pair 'in2' and 'in3' are
...@@ -2679,8 +2552,6 @@ ...@@ -2679,8 +2552,6 @@
/* Description : Average byte elements from pair of vectors and store 16x4 byte /* Description : Average byte elements from pair of vectors and store 16x4 byte
block in destination memory block in destination memory
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
Outputs -
Return Type -
Details : Each byte element from input vector pair 'in0' and 'in1' are Details : Each byte element from input vector pair 'in0' and 'in1' are
averaged (a + b)/2 and stored in 'tmp0_m' averaged (a + b)/2 and stored in 'tmp0_m'
Each byte element from input vector pair 'in2' and 'in3' are Each byte element from input vector pair 'in2' and 'in3' are
...@@ -2707,8 +2578,6 @@ ...@@ -2707,8 +2578,6 @@
/* Description : Average rounded byte elements from pair of vectors and store /* Description : Average rounded byte elements from pair of vectors and store
8x4 byte block in destination memory 8x4 byte block in destination memory
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
Outputs -
Return Type -
Details : Each byte element from input vector pair 'in0' and 'in1' are Details : Each byte element from input vector pair 'in0' and 'in1' are
average rounded (a + b + 1)/2 and stored in 'tmp0_m' average rounded (a + b + 1)/2 and stored in 'tmp0_m'
Each byte element from input vector pair 'in2' and 'in3' are Each byte element from input vector pair 'in2' and 'in3' are
...@@ -2738,8 +2607,6 @@ ...@@ -2738,8 +2607,6 @@
/* Description : Average rounded byte elements from pair of vectors and store /* Description : Average rounded byte elements from pair of vectors and store
16x4 byte block in destination memory 16x4 byte block in destination memory
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
Outputs -
Return Type -
Details : Each byte element from input vector pair 'in0' and 'in1' are Details : Each byte element from input vector pair 'in0' and 'in1' are
average rounded (a + b + 1)/2 and stored in 'tmp0_m' average rounded (a + b + 1)/2 and stored in 'tmp0_m'
Each byte element from input vector pair 'in2' and 'in3' are Each byte element from input vector pair 'in2' and 'in3' are
...@@ -2764,8 +2631,6 @@ ...@@ -2764,8 +2631,6 @@
average rounded with destination and store 8x4 byte block average rounded with destination and store 8x4 byte block
in destination memory in destination memory
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
Outputs -
Return Type -
Details : Each byte element from input vector pair 'in0' and 'in1' are Details : Each byte element from input vector pair 'in0' and 'in1' are
average rounded (a + b + 1)/2 and stored in 'tmp0_m' average rounded (a + b + 1)/2 and stored in 'tmp0_m'
Each byte element from input vector pair 'in2' and 'in3' are Each byte element from input vector pair 'in2' and 'in3' are
...@@ -2794,8 +2659,6 @@ ...@@ -2794,8 +2659,6 @@
average rounded with destination and store 16x4 byte block average rounded with destination and store 16x4 byte block
in destination memory in destination memory
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
Outputs -
Return Type -
Details : Each byte element from input vector pair 'in0' and 'in1' are Details : Each byte element from input vector pair 'in0' and 'in1' are
average rounded (a + b + 1)/2 and stored in 'tmp0_m' average rounded (a + b + 1)/2 and stored in 'tmp0_m'
Each byte element from input vector pair 'in2' and 'in3' are Each byte element from input vector pair 'in2' and 'in3' are
...@@ -2822,8 +2685,6 @@ ...@@ -2822,8 +2685,6 @@
/* Description : Add block 4x4 /* Description : Add block 4x4
Arguments : Inputs - in0, in1, in2, in3, pdst, stride Arguments : Inputs - in0, in1, in2, in3, pdst, stride
Outputs -
Return Type - unsigned bytes
Details : Least significant 4 bytes from each input vector are added to Details : Least significant 4 bytes from each input vector are added to
the destination bytes, clipped between 0-255 and then stored. the destination bytes, clipped between 0-255 and then stored.
*/ */
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment