avfilter/vf_transpose: rewrite for x86 SIMD

Transpose first in chunks of 8x8 blocks. 15% faster overall.

avfilter/vf_transpose: rewrite for x86 SIMD
Transpose first in chunks of 8x8 blocks. 15% faster overall.
7b19e76a · Paul B Mahol · cbd8e070 · 7b19e76a
Commit 7b19e76a authored Aug 15, 2017 by Paul B Mahol
Show whitespace changes
Inline Side-by-side

Showing with 141 additions and 41 deletions

vf_transpose.c libavfilter/vf_transpose.c +141 -41

No files found.
--- a/libavfilter/vf_transpose.c
+++ b/libavfilter/vf_transpose.c
@@ -58,6 +58,12 @@ typedef struct TransContext {

    int passthrough;    ///< PassthroughType, landscape passthrough mode enabled
    int dir;            ///< TransposeDir
+
+    void (*transpose_8x8)(uint8_t *src, ptrdiff_t src_linesize,
+                          uint8_t *dst, ptrdiff_t dst_linesize);
+    void (*transpose_block)(uint8_t *src, ptrdiff_t src_linesize,
+                            uint8_t *dst, ptrdiff_t dst_linesize,
+                            int w, int h);
 } TransContext;

 static int query_formats(AVFilterContext *ctx)
@@ -79,6 +85,109 @@ static int query_formats(AVFilterContext *ctx)
    return ff_set_common_formats(ctx, pix_fmts);
 }

+static inline void transpose_block_8_c(uint8_t *src, ptrdiff_t src_linesize,
+                                       uint8_t *dst, ptrdiff_t dst_linesize,
+                                       int w, int h)
+{
+    int x, y;
+    for (y = 0; y < h; y++, dst += dst_linesize, src++)
+        for (x = 0; x < w; x++)
+            dst[x] = src[x*src_linesize];
+}
+
+static void transpose_8x8_8_c(uint8_t *src, ptrdiff_t src_linesize,
+                              uint8_t *dst, ptrdiff_t dst_linesize)
+{
+    transpose_block_8_c(src, src_linesize, dst, dst_linesize, 8, 8);
+}
+
+static inline void transpose_block_16_c(uint8_t *src, ptrdiff_t src_linesize,
+                                        uint8_t *dst, ptrdiff_t dst_linesize,
+                                        int w, int h)
+{
+    int x, y;
+    for (y = 0; y < h; y++, dst += dst_linesize, src += 2)
+        for (x = 0; x < w; x++)
+            *((uint16_t *)(dst + 2*x)) = *((uint16_t *)(src + x*src_linesize));
+}
+
+static void transpose_8x8_16_c(uint8_t *src, ptrdiff_t src_linesize,
+                               uint8_t *dst, ptrdiff_t dst_linesize)
+{
+    transpose_block_16_c(src, src_linesize, dst, dst_linesize, 8, 8);
+}
+
+static inline void transpose_block_24_c(uint8_t *src, ptrdiff_t src_linesize,
+                                        uint8_t *dst, ptrdiff_t dst_linesize,
+                                        int w, int h)
+{
+    int x, y;
+    for (y = 0; y < h; y++, dst += dst_linesize) {
+        for (x = 0; x < w; x++) {
+            int32_t v = AV_RB24(src + x*src_linesize + y*3);
+            AV_WB24(dst + 3*x, v);
+        }
+    }
+}
+
+static void transpose_8x8_24_c(uint8_t *src, ptrdiff_t src_linesize,
+                               uint8_t *dst, ptrdiff_t dst_linesize)
+{
+    transpose_block_24_c(src, src_linesize, dst, dst_linesize, 8, 8);
+}
+
+static inline void transpose_block_32_c(uint8_t *src, ptrdiff_t src_linesize,
+                                        uint8_t *dst, ptrdiff_t dst_linesize,
+                                        int w, int h)
+{
+    int x, y;
+    for (y = 0; y < h; y++, dst += dst_linesize, src += 4) {
+        for (x = 0; x < w; x++)
+            *((uint32_t *)(dst + 4*x)) = *((uint32_t *)(src + x*src_linesize));
+    }
+}
+
+static void transpose_8x8_32_c(uint8_t *src, ptrdiff_t src_linesize,
+                               uint8_t *dst, ptrdiff_t dst_linesize)
+{
+    transpose_block_32_c(src, src_linesize, dst, dst_linesize, 8, 8);
+}
+
+static inline void transpose_block_48_c(uint8_t *src, ptrdiff_t src_linesize,
+                                        uint8_t *dst, ptrdiff_t dst_linesize,
+                                        int w, int h)
+{
+    int x, y;
+    for (y = 0; y < h; y++, dst += dst_linesize, src += 6) {
+        for (x = 0; x < w; x++) {
+            int64_t v = AV_RB48(src + x*src_linesize);
+            AV_WB48(dst + 6*x, v);
+        }
+    }
+}
+
+static void transpose_8x8_48_c(uint8_t *src, ptrdiff_t src_linesize,
+                               uint8_t *dst, ptrdiff_t dst_linesize)
+{
+    transpose_block_48_c(src, src_linesize, dst, dst_linesize, 8, 8);
+}
+
+static inline void transpose_block_64_c(uint8_t *src, ptrdiff_t src_linesize,
+                                        uint8_t *dst, ptrdiff_t dst_linesize,
+                                        int w, int h)
+{
+    int x, y;
+    for (y = 0; y < h; y++, dst += dst_linesize, src += 8)
+        for (x = 0; x < w; x++)
+            *((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + x*src_linesize));
+}
+
+static void transpose_8x8_64_c(uint8_t *src, ptrdiff_t src_linesize,
+                               uint8_t *dst, ptrdiff_t dst_linesize)
+{
+    transpose_block_64_c(src, src_linesize, dst, dst_linesize, 8, 8);
+}
+
 static int config_props_output(AVFilterLink *outlink)
 {
    AVFilterContext *ctx = outlink->src;
@@ -118,6 +227,21 @@ static int config_props_output(AVFilterLink *outlink)
    else
        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;

+    switch (s->pixsteps[0]) {
+    case 1: s->transpose_block = transpose_block_8_c;
+            s->transpose_8x8   = transpose_8x8_8_c;  break;
+    case 2: s->transpose_block = transpose_block_16_c;
+            s->transpose_8x8   = transpose_8x8_16_c; break;
+    case 3: s->transpose_block = transpose_block_24_c;
+            s->transpose_8x8   = transpose_8x8_24_c; break;
+    case 4: s->transpose_block = transpose_block_32_c;
+            s->transpose_8x8   = transpose_8x8_32_c; break;
+    case 6: s->transpose_block = transpose_block_48_c;
+            s->transpose_8x8   = transpose_8x8_48_c; break;
+    case 8: s->transpose_block = transpose_block_64_c;
+            s->transpose_8x8   = transpose_8x8_64_c; break;
+    }
+
    av_log(ctx, AV_LOG_VERBOSE,
           "w:%d h:%d dir:%d -> w:%d h:%d rotation:%s vflip:%d\n",
           inlink->w, inlink->h, s->dir, outlink->w, outlink->h,
@@ -176,49 +300,25 @@ static int filter_slice(AVFilterContext *ctx, void *arg, int jobnr,
            dstlinesize *= -1;
        }

-        switch (pixstep) {
-        case 1:
-            for (y = start; y < end; y++, dst += dstlinesize)
-                for (x = 0; x < outw; x++)
-                    dst[x] = src[x * srclinesize + y];
-            break;
-        case 2:
-            for (y = start; y < end; y++, dst += dstlinesize) {
-                for (x = 0; x < outw; x++)
-                    *((uint16_t *)(dst + 2 * x)) =
-                        *((uint16_t *)(src + x * srclinesize + y * 2));
-            }
-            break;
-        case 3:
-            for (y = start; y < end; y++, dst += dstlinesize) {
-                for (x = 0; x < outw; x++) {
-                    int32_t v = AV_RB24(src + x * srclinesize + y * 3);
-                    AV_WB24(dst + 3 * x, v);
+        for (y = start; y < end - 7; y += 8) {
+            for (x = 0; x < outw - 7; x += 8) {
+                s->transpose_8x8(src + x * srclinesize + y * pixstep,
+                                 srclinesize,
+                                 dst + (y - start) * dstlinesize + x * pixstep,
+                                 dstlinesize);
            }
+            if (outw - x > 0 && end - y > 0)
+                s->transpose_block(src + x * srclinesize + y * pixstep,
+                                   srclinesize,
+                                   dst + (y - start) * dstlinesize + x * pixstep,
+                                   dstlinesize, outw - x, end - y);
        }
-            break;
-        case 4:
-            for (y = start; y < end; y++, dst += dstlinesize) {
-                for (x = 0; x < outw; x++)
-                    *((uint32_t *)(dst + 4 * x)) =
-                        *((uint32_t *)(src + x * srclinesize + y * 4));
-            }
-            break;
-        case 6:
-            for (y = start; y < end; y++, dst += dstlinesize) {
-                for (x = 0; x < outw; x++) {
-                    int64_t v = AV_RB48(src + x * srclinesize + y*6);
-                    AV_WB48(dst + 6*x, v);
-                }
-            }
-            break;
-        case 8:
-            for (y = start; y < end; y++, dst += dstlinesize) {
-                for (x = 0; x < outw; x++)
-                    *((uint64_t *)(dst + 8*x)) = *((uint64_t *)(src + x * srclinesize + y*8));
-            }
-            break;
-        }
+
+        if (end - y > 0)
+            s->transpose_block(src + 0 * srclinesize + y * pixstep,
+                               srclinesize,
+                               dst + (y - start) * dstlinesize + 0 * pixstep,
+                               dstlinesize, outw, end - y);
    }

    return 0;