Commit 452ac2aa authored by James Almer, committed by Michael Niedermayer

lavu/ripemd: Fully unroll the transform function loops

crypto_bench RIPEMD-160 results using an AMD Athlon X2 7750+, mingw32-w64 GCC 4.8.1 x86_64

Before:
lavu       RIPEMD-160   size: 1048576  runs:   1024  time:   12.342 +- 0.199

After:
lavu       RIPEMD-160   size: 1048576  runs:   1024  time:   10.143 +- 0.192
Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Michael Niedermayer <michaelni@gmx.at>
parent b4e1630d
...@@ -128,37 +128,42 @@ static void ripemd128_transform(uint32_t *state, const uint8_t buffer[64], int e ...@@ -128,37 +128,42 @@ static void ripemd128_transform(uint32_t *state, const uint8_t buffer[64], int e
for (n = 0; n < 16; n++) for (n = 0; n < 16; n++)
block[n] = AV_RL32(buffer + 4 * n); block[n] = AV_RL32(buffer + 4 * n);
n = 0;
for (n = 0; n < 16;) { #define R128_0 \
ROUND128_0_TO_15(a,b,c,d,e,f,g,h); ROUND128_0_TO_15(a,b,c,d,e,f,g,h); \
ROUND128_0_TO_15(d,a,b,c,h,e,f,g); ROUND128_0_TO_15(d,a,b,c,h,e,f,g); \
ROUND128_0_TO_15(c,d,a,b,g,h,e,f); ROUND128_0_TO_15(c,d,a,b,g,h,e,f); \
ROUND128_0_TO_15(b,c,d,a,f,g,h,e); ROUND128_0_TO_15(b,c,d,a,f,g,h,e)
}
R128_0; R128_0; R128_0; R128_0;
SWAP(a,e) SWAP(a,e)
for (; n < 32;) { #define R128_16 \
ROUND128_16_TO_31(a,b,c,d,e,f,g,h); ROUND128_16_TO_31(a,b,c,d,e,f,g,h); \
ROUND128_16_TO_31(d,a,b,c,h,e,f,g); ROUND128_16_TO_31(d,a,b,c,h,e,f,g); \
ROUND128_16_TO_31(c,d,a,b,g,h,e,f); ROUND128_16_TO_31(c,d,a,b,g,h,e,f); \
ROUND128_16_TO_31(b,c,d,a,f,g,h,e); ROUND128_16_TO_31(b,c,d,a,f,g,h,e)
}
R128_16; R128_16; R128_16; R128_16;
SWAP(b,f) SWAP(b,f)
for (; n < 48;) { #define R128_32 \
ROUND128_32_TO_47(a,b,c,d,e,f,g,h); ROUND128_32_TO_47(a,b,c,d,e,f,g,h); \
ROUND128_32_TO_47(d,a,b,c,h,e,f,g); ROUND128_32_TO_47(d,a,b,c,h,e,f,g); \
ROUND128_32_TO_47(c,d,a,b,g,h,e,f); ROUND128_32_TO_47(c,d,a,b,g,h,e,f); \
ROUND128_32_TO_47(b,c,d,a,f,g,h,e); ROUND128_32_TO_47(b,c,d,a,f,g,h,e)
}
R128_32; R128_32; R128_32; R128_32;
SWAP(c,g) SWAP(c,g)
for (; n < 64;) { #define R128_48 \
ROUND128_48_TO_63(a,b,c,d,e,f,g,h); ROUND128_48_TO_63(a,b,c,d,e,f,g,h); \
ROUND128_48_TO_63(d,a,b,c,h,e,f,g); ROUND128_48_TO_63(d,a,b,c,h,e,f,g); \
ROUND128_48_TO_63(c,d,a,b,g,h,e,f); ROUND128_48_TO_63(c,d,a,b,g,h,e,f); \
ROUND128_48_TO_63(b,c,d,a,f,g,h,e); ROUND128_48_TO_63(b,c,d,a,f,g,h,e)
}
R128_48; R128_48; R128_48; R128_48;
SWAP(d,h) SWAP(d,h)
if (ext) { if (ext) {
...@@ -222,54 +227,60 @@ static void ripemd160_transform(uint32_t *state, const uint8_t buffer[64], int e ...@@ -222,54 +227,60 @@ static void ripemd160_transform(uint32_t *state, const uint8_t buffer[64], int e
for (n = 0; n < 16; n++) for (n = 0; n < 16; n++)
block[n] = AV_RL32(buffer + 4 * n); block[n] = AV_RL32(buffer + 4 * n);
n = 0;
for (n = 0; n < 16 - 1;) { #define R160_0 \
ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j); ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j); \
ROUND160_0_TO_15(e,a,b,c,d,j,f,g,h,i); ROUND160_0_TO_15(e,a,b,c,d,j,f,g,h,i); \
ROUND160_0_TO_15(d,e,a,b,c,i,j,f,g,h); ROUND160_0_TO_15(d,e,a,b,c,i,j,f,g,h); \
ROUND160_0_TO_15(c,d,e,a,b,h,i,j,f,g); ROUND160_0_TO_15(c,d,e,a,b,h,i,j,f,g); \
ROUND160_0_TO_15(b,c,d,e,a,g,h,i,j,f); ROUND160_0_TO_15(b,c,d,e,a,g,h,i,j,f)
}
R160_0; R160_0; R160_0;
ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j); ROUND160_0_TO_15(a,b,c,d,e,f,g,h,i,j);
SWAP(a,f) SWAP(a,f)
for (; n < 32 - 1;) { #define R160_16 \
ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i); ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i); \
ROUND160_16_TO_31(d,e,a,b,c,i,j,f,g,h); ROUND160_16_TO_31(d,e,a,b,c,i,j,f,g,h); \
ROUND160_16_TO_31(c,d,e,a,b,h,i,j,f,g); ROUND160_16_TO_31(c,d,e,a,b,h,i,j,f,g); \
ROUND160_16_TO_31(b,c,d,e,a,g,h,i,j,f); ROUND160_16_TO_31(b,c,d,e,a,g,h,i,j,f); \
ROUND160_16_TO_31(a,b,c,d,e,f,g,h,i,j); ROUND160_16_TO_31(a,b,c,d,e,f,g,h,i,j)
}
R160_16; R160_16; R160_16;
ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i); ROUND160_16_TO_31(e,a,b,c,d,j,f,g,h,i);
SWAP(b,g) SWAP(b,g)
for (; n < 48 - 1;) { #define R160_32 \
ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h); ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h); \
ROUND160_32_TO_47(c,d,e,a,b,h,i,j,f,g); ROUND160_32_TO_47(c,d,e,a,b,h,i,j,f,g); \
ROUND160_32_TO_47(b,c,d,e,a,g,h,i,j,f); ROUND160_32_TO_47(b,c,d,e,a,g,h,i,j,f); \
ROUND160_32_TO_47(a,b,c,d,e,f,g,h,i,j); ROUND160_32_TO_47(a,b,c,d,e,f,g,h,i,j); \
ROUND160_32_TO_47(e,a,b,c,d,j,f,g,h,i); ROUND160_32_TO_47(e,a,b,c,d,j,f,g,h,i)
}
R160_32; R160_32; R160_32;
ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h); ROUND160_32_TO_47(d,e,a,b,c,i,j,f,g,h);
SWAP(c,h) SWAP(c,h)
for (; n < 64 - 1;) { #define R160_48 \
ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g); ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g); \
ROUND160_48_TO_63(b,c,d,e,a,g,h,i,j,f); ROUND160_48_TO_63(b,c,d,e,a,g,h,i,j,f); \
ROUND160_48_TO_63(a,b,c,d,e,f,g,h,i,j); ROUND160_48_TO_63(a,b,c,d,e,f,g,h,i,j); \
ROUND160_48_TO_63(e,a,b,c,d,j,f,g,h,i); ROUND160_48_TO_63(e,a,b,c,d,j,f,g,h,i); \
ROUND160_48_TO_63(d,e,a,b,c,i,j,f,g,h); ROUND160_48_TO_63(d,e,a,b,c,i,j,f,g,h)
}
R160_48; R160_48; R160_48;
ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g); ROUND160_48_TO_63(c,d,e,a,b,h,i,j,f,g);
SWAP(d,i) SWAP(d,i)
for (; n < 75;) { #define R160_64 \
ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f); ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f); \
ROUND160_64_TO_79(a,b,c,d,e,f,g,h,i,j); ROUND160_64_TO_79(a,b,c,d,e,f,g,h,i,j); \
ROUND160_64_TO_79(e,a,b,c,d,j,f,g,h,i); ROUND160_64_TO_79(e,a,b,c,d,j,f,g,h,i); \
ROUND160_64_TO_79(d,e,a,b,c,i,j,f,g,h); ROUND160_64_TO_79(d,e,a,b,c,i,j,f,g,h); \
ROUND160_64_TO_79(c,d,e,a,b,h,i,j,f,g); ROUND160_64_TO_79(c,d,e,a,b,h,i,j,f,g)
}
R160_64; R160_64; R160_64;
ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f); ROUND160_64_TO_79(b,c,d,e,a,g,h,i,j,f);
SWAP(e,j) SWAP(e,j)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment