;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max_10 pw_1023
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_m2:           times 8 dw -2
pd_1 :           times 4 dd  1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    movd             m0, %1
    movd             m2, %2
    movd             m1, %3
    movd             m3, %4

    punpcklbw        m0, m2
    punpcklbw        m1, m3
    punpcklwd        m0, m1

    movd             m4, %5
    movd             m6, %6
    movd             m5, %7
    movd             m3, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m3
    punpcklwd        m4, m5

    punpckhdq        m2, m0, m4
    punpckldq        m0, m4

    pxor             m5, m5
    punpckhbw        m1, m0, m5
    punpcklbw        m0, m5
    punpckhbw        m3, m2, m5
    punpcklbw        m2, m5
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    packuswb         m0, m2
    packuswb         m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    movd             %1, m0
    pshufd           m0, m0, 0x39
    movd             %2, m0
    pshufd           m0, m0, 0x39
    movd             %3, m0
    pshufd           m0, m0, 0x39
    movd             %4, m0

    movd             %5, m1
    pshufd           m1, m1, 0x39
    movd             %6, m1
    pshufd           m1, m1, 0x39
    movd             %7, m1
    pshufd           m1, m1, 0x39
    movd             %8, m1
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6

%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8 (clipped to [0, %9])
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W     0, 1, 2, 3, 4

    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, %9
    CLIPW            m1, m5, %9
    CLIPW            m2, m5, %9
    CLIPW            m3, m5, %9

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
    movq             m7, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklbw        m7, m2
    punpcklbw        m1, m3
    punpcklwd        m3, m7, m1
    punpckhwd        m7, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq            m15, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m15
    punpcklwd        m9, m4, m5
    punpckhwd        m4, m5

    punpckldq        m1, m3, m9;  0, 1
    punpckhdq        m3, m9;  2, 3

    punpckldq        m5, m7, m4;  4, 5
    punpckhdq        m7, m4;  6, 7

    pxor            m13, m13

    punpcklbw        m0, m1, m13; 0 in 16 bit
    punpckhbw        m1, m13; 1 in 16 bit

    punpcklbw        m2, m3, m13; 2
    punpckhbw        m3, m13; 3

    punpcklbw        m4, m5, m13; 4
    punpckhbw        m5, m13; 5

    punpcklbw        m6, m7, m13; 6
    punpckhbw        m7, m13; 7
%endmacro


; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
    packuswb         m0, m4
    packuswb         m1, m5
    packuswb         m2, m6
    packuswb         m3, m7
    TRANSPOSE2x4x4B   0, 1, 2, 3, 4

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8 (clipped to [0, %9])
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, %9
    CLIPW            m1, m8, %9
    CLIPW            m2, m8, %9
    CLIPW            m3, m8, %9
    CLIPW            m4, m8, %9
    CLIPW            m5, m8, %9
    CLIPW            m6, m8, %9
    CLIPW            m7, m8, %9

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro


; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
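; A rough C-style sketch of the select these copy macros perform (added for
; orientation, not part of the original source):
;   dst = (filtered & mask) | (dst & ~mask)
; lanes whose mask is all-ones take the filtered value from %2, the other
; lanes keep the original value in %1.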
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and -mask
    por              %2, m10
    mova             %1, %2
%endmacro

; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and -mask
    por              %2, %3
    mova             %1, %2
%endmacro

ALIGN 16
; input in m0 ... m3 and tcs in r2. Output in m1 and m2
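; C-like sketch of the chroma filter computed below (for orientation only;
; tc is read from tcq and, for bit depths above 8, scaled by << (BIT_DEPTH - 8)):
;   delta = av_clip((((q0 - p0) << 2) + p1 - q1 + 4) >> 3, -tc, tc)
;   p0' = p0 + delta
;   q0' = q0 - delta
; Clipping of p0'/q0' to the valid pixel range is left to the callers.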
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4;

    ;tc calculations
    movq             m6, [tcq]; tc0
    punpcklwd        m6, m6
    pshufd           m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw           m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

%if %1 > 8
    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw           m5, m4
    pminsw           m5, m6
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro

; input in m0 ... m7, beta in r2, tcs in r3. Output in m1 ... m6
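; Hedged C-like outline of the decisions made below (the inline comments carry
; the exact per-step arithmetic):
;   dp = |p2 - 2*p1 + p0|, dq = |q2 - 2*q1 + q0|  for columns 0 and 3
;   filter only if d0 + d3 < beta, where d = dp + dq
;   strong filtering is chosen per half-block when 2*d < (beta >> 2),
;   |p3 - p0| + |q3 - q0| < (beta >> 3) and |p0 - q0| < (5*tc + 1) >> 1;
;   otherwise the weak filter applies delta0 = (9*(q0-p0) - 3*(q1-p1) + 8) >> 4.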
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl             betaq, %1 - 8
%endif
    movd            m13, betad
    SPLATW          m13, m13, 0
    ;end beta calculations

    paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14
    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
    test            r13, r13
    je              .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps        r6, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    pshufd           m8, m8, 0x4E
    movd            r8d, m8

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    ; end calc for weak filter

    ; filtering mask
    mov             r11, r13
    shr             r11, 3
    movd            m15, r11d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl             r11, 1
    or              r13, r11

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov            r11d, [tcq];
%if %1 > 8
    shl             r11, %1 - 8
%endif
    movd             m8, r11d; tc0
    mov             r3d, [tcq+4];
%if %1 > 8
    shl              r3, %1 - 8
%endif
    add            r11d, r3d; tc0 + tc1
    jz             .bypassluma
    movd             m9, r3d; tc1
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps        r11, m13;
    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps        r11, m8;
    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov             r11, r6;
    shr             r11, 1;
    and             r6, r11; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    paddw            m9, m9

    and             r6, 5; 0b101
    mov             r11, r6; strong mask
    shr             r6, 2;
    movd            m12, r6d; store to xmm for mask generation
    shl             r6, 1
    and             r11, 1
    movd            m10, r11d; store to xmm for mask generation
    or              r6, r11; final strong mask, bits 1 and 0
    jz      .weakfilter

    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;          p1 +   p0 +   q0
    mova            m10, m12; copy
    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    paddw            m8, m8;   2*p3 + 2*p2
    paddw            m8, m1;   2*p3 + 3*p2
    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
    paddw           m13, m13
    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;         p0 +   q0
    paddw            m8, m5;         p0 +   q0 +   q1
    paddw            m8, m8;       2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
    psubw            m8, m4;
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;   p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;   p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 +  q2 + 2
    paddw           m13, m13;   2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12

.weakfilter:
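    ; Weak filter reference (C-like sketch for orientation):
    ;   delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4, used only where
    ;   abs(delta0) < 10*tc, then clipped to [-tc, tc];
    ;   p1/q1 get an extra correction clipped to [-tc/2, tc/2] when the
    ;   corresponding dp/dq sums are below (beta + (beta >> 1)) >> 3.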
    not             r6; strong mask -> weak mask
    and             r6, r13; final weak filtering mask, bits 0 and 1
    jz             .store

    ; weak filtering mask
    mov             r11, r6
    shr             r11, 1
    movd            m12, r11d
    and             r6, 1
    movd            m11, r6d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, betaq
    shr             r13, 1;
    add             betaq, r13
    shr             betaq, 3; ((beta + (beta >> 1)) >> 3))

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * ( q1 - p1 )
    paddw           m10, m8; 3 * ( q1 - p1 )
    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4 , delta0
    PABSW           m13, m12; abs(delta0)


    psllw           m10, m9, 2; 8 * tc
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)

    psraw            m9, 1;   tc -> tc / 2
%if cpuflag(ssse3)
    psignw          m14, m9, [pw_m1]; -tc / 2
%else
    pmullw          m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, betad
    SPLATW          m10, m10, 0

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
    psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;
    movd            m15, r10d;
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
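; Note (added for orientation): for vertical edges the samples to filter sit in
; a column, so each function transposes 8 rows of 4 pixels (4 words each for
; the >8 bit variants) into registers, runs CHROMA_DEBLOCK_BODY, and transposes
; back on store.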
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
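; Note (added for orientation): for horizontal edges the rows are contiguous in
; memory; p1/p0 are loaded from pix - 2*stride and pix - stride, q0/q1 from pix
; and pix + stride, so no transpose is needed.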
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];    p1
    movq             m1, [pix0q+strideq]; p0
    movq             m2, [pixq];    q0
    movq             m3, [pixq+strideq]; q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY  8
    packuswb         m1, m2
    movh [pix0q+strideq], m1
    movhps       [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_10]
    CLIPW           m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_12]
    CLIPW           m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
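; The luma filters are built only on x86-64: LUMA_DEBLOCK_BODY uses xmm8-xmm15
; and several extra GPRs (r6-r13) that do not exist in 32-bit mode.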
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
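; Note (added for orientation): each luma entry point defines the .store and
; .bypassluma labels used by LUMA_DEBLOCK_BODY; the macro jumps to .bypassluma
; when nothing needs filtering and to .store once the filtered rows in m1..m6
; are ready to be written back.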
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 4
    lea           pix0q, [3 * r1]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];               p3
    movq             m1, [pix0q +     strideq]; p2
    movq             m2, [pix0q + 2 * strideq]; p1
    movq             m3, [pix0q + src3strideq]; p0
    movq             m4, [pixq];                q0
    movq             m5, [pixq +     strideq];  q1
    movq             m6, [pixq + 2 * strideq];  q2
    movq             m7, [pixq + src3strideq];  q3
    pxor             m8, m8
    punpcklbw        m0, m8
    punpcklbw        m1, m8
    punpcklbw        m2, m8
    punpcklbw        m3, m8
    punpcklbw        m4, m8
    punpcklbw        m5, m8
    punpcklbw        m6, m8
    punpcklbw        m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb          m1, m2
    packuswb          m3, m4
    packuswb          m5, m6
    movh   [pix0q +     strideq], m1
    movhps [pix0q + 2 * strideq], m1
    movh   [pix0q + src3strideq], m3
    movhps [pixq               ], m3
    movh   [pixq  +     strideq], m5
    movhps [pixq  + 2 * strideq], m5
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea                  src3strideq, [3 * strideq]
    mov                        pix0q, pixq
    sub                        pix0q, src3strideq
    sub                        pix0q, strideq
    movdqu                        m0, [pix0q];               p3
    movdqu                        m1, [pix0q +     strideq]; p2
    movdqu                        m2, [pix0q + 2 * strideq]; p1
    movdqu                        m3, [pix0q + src3strideq]; p0
    movdqu                        m4, [pixq];                q0
    movdqu                        m5, [pixq  +     strideq]; q1
    movdqu                        m6, [pixq  + 2 * strideq]; q2
    movdqu                        m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY             10, h
.store:
    pxor                          m8, m8; zeros reg
    CLIPW                         m1, m8, [pw_pixel_max_10]
    CLIPW                         m2, m8, [pw_pixel_max_10]
    CLIPW                         m3, m8, [pw_pixel_max_10]
    CLIPW                         m4, m8, [pw_pixel_max_10]
    CLIPW                         m5, m8, [pw_pixel_max_10]
    CLIPW                         m6, m8, [pw_pixel_max_10]
    movdqu     [pix0q +     strideq], m1;  p2
    movdqu     [pix0q + 2 * strideq], m2;  p1
    movdqu     [pix0q + src3strideq], m3;  p0
    movdqu     [pixq               ], m4;  q0
    movdqu     [pixq  +     strideq], m5;  q1
    movdqu     [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea                  src3strideq, [3 * strideq]
    mov                        pix0q, pixq
    sub                        pix0q, src3strideq
    sub                        pix0q, strideq
    movdqu                        m0, [pix0q];               p3
    movdqu                        m1, [pix0q +     strideq]; p2
    movdqu                        m2, [pix0q + 2 * strideq]; p1
    movdqu                        m3, [pix0q + src3strideq]; p0
    movdqu                        m4, [pixq];                q0
    movdqu                        m5, [pixq  +     strideq]; q1
    movdqu                        m6, [pixq  + 2 * strideq]; q2
    movdqu                        m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY             12, h
.store:
    pxor                          m8, m8; zeros reg
    CLIPW                         m1, m8, [pw_pixel_max_12]
    CLIPW                         m2, m8, [pw_pixel_max_12]
    CLIPW                         m3, m8, [pw_pixel_max_12]
    CLIPW                         m4, m8, [pw_pixel_max_12]
    CLIPW                         m5, m8, [pw_pixel_max_12]
    CLIPW                         m6, m8, [pw_pixel_max_12]
    movdqu     [pix0q +     strideq], m1;  p2
    movdqu     [pix0q + 2 * strideq], m2;  p1
    movdqu     [pix0q + src3strideq], m3;  p0
    movdqu     [pixq               ], m4;  q0
    movdqu     [pixq  +     strideq], m5;  q1
    movdqu     [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif