;*****************************************************************************
;* SSE2-optimized HEVC deblocking code
;*****************************************************************************
;* Copyright (C) 2013 VTT
;*
;* Authors: Seppo Tomperi <seppo.tomperi@vtt.fi>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max_10 pw_1023
pw_pixel_max_12: times 8 dw ((1 << 12)-1)
pw_m2:           times 8 dw -2
pd_1 :           times 4 dd  1

cextern pw_4
cextern pw_8
cextern pw_m1

SECTION .text
INIT_XMM sse2

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
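; e.g. with base3 = base + 3*stride and stride3 = 3*stride (the combination the
; callers below pass) this expands to [base], [base+stride], ..., [base+7*stride]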

; in: 8 rows of 4 bytes in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8B_LOAD 8
    movd             m0, %1
    movd             m2, %2
    movd             m1, %3
    movd             m3, %4

    punpcklbw        m0, m2
    punpcklbw        m1, m3
    punpcklwd        m0, m1

    movd             m4, %5
    movd             m6, %6
    movd             m5, %7
    movd             m3, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m3
    punpcklwd        m4, m5

    punpckhdq        m2, m0, m4
    punpckldq        m0, m4

    pxor             m5, m5
    punpckhbw        m1, m0, m5
    punpcklbw        m0, m5
    punpckhbw        m3, m2, m5
    punpcklbw        m2, m5
%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    packuswb         m0, m2
    packuswb         m1, m3
    SBUTTERFLY bw, 0, 1, 2
    SBUTTERFLY wd, 0, 1, 2

    movd             %1, m0
    pshufd           m0, m0, 0x39
    movd             %2, m0
    pshufd           m0, m0, 0x39
    movd             %3, m0
    pshufd           m0, m0, 0x39
    movd             %4, m0

    movd             %5, m1
    pshufd           m1, m1, 0x39
    movd             %6, m1
    pshufd           m1, m1, 0x39
    movd             %7, m1
    pshufd           m1, m1, 0x39
    movd             %8, m1
%endmacro

; in: 8 rows of 4 words in %1..%8
; out: 4 rows of 8 words in m0..m3
%macro TRANSPOSE4x8W_LOAD 8
    movq             m0, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklwd        m0, m2
    punpcklwd        m1, m3
    punpckhdq        m2, m0, m1
    punpckldq        m0, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq             m3, %8

    punpcklwd        m4, m6
    punpcklwd        m5, m3
    punpckhdq        m6, m4, m5
    punpckldq        m4, m5

    punpckhqdq       m1, m0, m4
    punpcklqdq       m0, m4
    punpckhqdq       m3, m2, m6
    punpcklqdq       m2, m6

%endmacro

; in: 4 rows of 8 words in m0..m3
; out: 8 rows of 4 words in %1..%8
%macro TRANSPOSE8x4W_STORE 9
    TRANSPOSE4x4W     0, 1, 2, 3, 4

    pxor             m5, m5; zeros reg
    CLIPW            m0, m5, %9
    CLIPW            m1, m5, %9
    CLIPW            m2, m5, %9
    CLIPW            m3, m5, %9

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 bytes in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8B_LOAD 8
    movq             m7, %1
    movq             m2, %2
    movq             m1, %3
    movq             m3, %4

    punpcklbw        m7, m2
    punpcklbw        m1, m3
    punpcklwd        m3, m7, m1
    punpckhwd        m7, m1

    movq             m4, %5
    movq             m6, %6
    movq             m5, %7
    movq            m15, %8

    punpcklbw        m4, m6
    punpcklbw        m5, m15
    punpcklwd        m9, m4, m5
    punpckhwd        m4, m5

    punpckldq        m1, m3, m9;  0, 1
    punpckhdq        m3, m9;  2, 3

    punpckldq        m5, m7, m4;  4, 5
    punpckhdq        m7, m4;  6, 7

    pxor            m13, m13

    punpcklbw        m0, m1, m13; 0 in 16 bit
    punpckhbw        m1, m13; 1 in 16 bit

    punpcklbw        m2, m3, m13; 2
    punpckhbw        m3, m13; 3

    punpcklbw        m4, m5, m13; 4
    punpckhbw        m5, m13; 5

    punpcklbw        m6, m7, m13; 6
    punpckhbw        m7, m13; 7
%endmacro


; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 bytes in %1..%8
%macro TRANSPOSE8x8B_STORE 8
    packuswb         m0, m4
    packuswb         m1, m5
    packuswb         m2, m6
    packuswb         m3, m7
    TRANSPOSE2x4x4B   0, 1, 2, 3, 4

    movq             %1, m0
    movhps           %2, m0
    movq             %3, m1
    movhps           %4, m1
    movq             %5, m2
    movhps           %6, m2
    movq             %7, m3
    movhps           %8, m3
%endmacro

; in: 8 rows of 8 words in %1..%8
; out: 8 rows of 8 words in m0..m7
%macro TRANSPOSE8x8W_LOAD 8
    movdqu           m0, %1
    movdqu           m1, %2
    movdqu           m2, %3
    movdqu           m3, %4
    movdqu           m4, %5
    movdqu           m5, %6
    movdqu           m6, %7
    movdqu           m7, %8
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8
%endmacro

; in: 8 rows of 8 words in m0..m7
; out: 8 rows of 8 words in %1..%8
%macro TRANSPOSE8x8W_STORE 9
    TRANSPOSE8x8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    pxor             m8, m8
    CLIPW            m0, m8, %9
    CLIPW            m1, m8, %9
    CLIPW            m2, m8, %9
    CLIPW            m3, m8, %9
    CLIPW            m4, m8, %9
    CLIPW            m5, m8, %9
    CLIPW            m6, m8, %9
    CLIPW            m7, m8, %9

    movdqu           %1, m0
    movdqu           %2, m1
    movdqu           %3, m2
    movdqu           %4, m3
    movdqu           %5, m4
    movdqu           %6, m5
    movdqu           %7, m6
    movdqu           %8, m7
%endmacro


; in: %2 clobbered
; out: %1
; mask in m11
; clobbers m10
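; C-style sketch of the blend performed below:
;     %1 = (%2 & m11) | (%1 & ~m11)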
%macro MASKED_COPY 2
    pand             %2, m11 ; and mask
    pandn           m10, m11, %1; and -mask
    por              %2, m10
    mova             %1, %2
%endmacro

; in: %2 clobbered
; out: %1
; mask in %3, will be clobbered
%macro MASKED_COPY2 3
    pand             %2, %3 ; and mask
    pandn            %3, %1; and -mask
    por              %2, %3
    mova             %1, %2
%endmacro

ALIGN 16
; input: p1, p0, q0, q1 in m0..m3 and the two tc values at [tcq]. Output in m1 and m2
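; C-style sketch of the chroma filter computed below (bit-depth scaling of tc elided):
;     delta = av_clip(((q0 - p0) * 4 + p1 - q1 + 4) >> 3, -tc, tc);
;     p0 += delta;
;     q0 -= delta;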
%macro CHROMA_DEBLOCK_BODY 1
    psubw            m4, m2, m1; q0 - p0
    psubw            m5, m0, m3; p1 - q1
    psllw            m4, 2; << 2
    paddw            m5, m4;

    ;tc calculations
    movq             m6, [tcq]; tc0
    punpcklwd        m6, m6
    pshufd           m6, m6, 0xA0; tc0, tc1
%if cpuflag(ssse3)
    psignw           m4, m6, [pw_m1]; -tc0, -tc1
%else
    pmullw           m4, m6, [pw_m1]; -tc0, -tc1
%endif
    ;end tc calculations

    paddw            m5, [pw_4]; +4
    psraw            m5, 3; >> 3

%if %1 > 8
    psllw            m4, %1-8; << (BIT_DEPTH - 8)
    psllw            m6, %1-8; << (BIT_DEPTH - 8)
%endif
    pmaxsw           m5, m4
    pminsw           m5, m6
    paddw            m1, m5; p0 + delta0
    psubw            m2, m5; q0 - delta0
%endmacro

; input: p3..p0 in m0..m3, q0..q3 in m4..m7, beta in betad, the two tc values at [tcq]. Output in m1..m6
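; Sketch of the per-segment decisions made below, with dp = |p2 - 2*p1 + p0|,
; dq = |q2 - 2*q1 + q0| and d = dp + dq:
;     filter at all  when d0 + d3 < beta
;     strong filter  when, for both line 0 and line 3,
;                    2*d < (beta >> 2), |p3 - p0| + |q3 - q0| < (beta >> 3)
;                    and |p0 - q0| < (5*tc + 1) >> 1
;     otherwise the weak filter is applied (see .weakfilter)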
%macro LUMA_DEBLOCK_BODY 2
    psllw            m9, m2, 1; *2
    psubw           m10, m1, m9
    paddw           m10, m3
    ABS1            m10, m11 ; 0dp0, 0dp3 , 1dp0, 1dp3

    psllw            m9, m5, 1; *2
    psubw           m11, m6, m9
    paddw           m11, m4
    ABS1            m11, m13 ; 0dq0, 0dq3 , 1dq0, 1dq3

    ;beta calculations
%if %1 > 8
    shl             betaq, %1 - 8
%endif
    movd            m13, betad
    SPLATW          m13, m13, 0
    ;end beta calculations

    paddw            m9, m10, m11;   0d0, 0d3  ,  1d0, 1d3

    pshufhw         m14, m9, 0x0f ;0b00001111;  0d3 0d3 0d0 0d0 in high
    pshuflw         m14, m14, 0x0f ;0b00001111;  1d3 1d3 1d0 1d0 in low

    pshufhw          m9, m9, 0xf0 ;0b11110000; 0d0 0d0 0d3 0d3
    pshuflw          m9, m9, 0xf0 ;0b11110000; 1d0 1d0 1d3 1d3

    paddw           m14, m9; 0d0+0d3, 1d0+1d3

    ;compare
    pcmpgtw         m15, m13, m14
    movmskps        r13, m15 ;filtering mask 0d0 + 0d3 < beta0 (bit 2 or 3) , 1d0 + 1d3 < beta1 (bit 0 or 1)
    test            r13, r13
    je              .bypassluma

    ;weak / strong decision compare to beta_2
    psraw           m15, m13, 2;   beta >> 2
    psllw            m8, m9, 1;
    pcmpgtw         m15, m8; (d0 << 1) < beta_2, (d3 << 1) < beta_2
    movmskps        r6, m15;
    ;end weak / strong decision

    ; weak filter nd_p/q calculation
    pshufd           m8, m10, 0x31
    psrld            m8, 16
    paddw            m8, m10
    movd            r7d, m8
    pshufd           m8, m8, 0x4E
    movd            r8d, m8

    pshufd           m8, m11, 0x31
    psrld            m8, 16
    paddw            m8, m11
    movd            r9d, m8
    pshufd           m8, m8, 0x4E
    movd           r10d, m8
    ; end calc for weak filter

    ; filtering mask
    mov             r11, r13
    shr             r11, 3
    movd            m15, r11d
    and             r13, 1
    movd            m11, r13d
    shufps          m11, m15, 0
    shl             r11, 1
    or              r13, r11

    pcmpeqd         m11, [pd_1]; filtering mask

    ;decide between strong and weak filtering
    ;tc25 calculations
    mov            r11d, [tcq];
%if %1 > 8
    shl             r11, %1 - 8
%endif
    movd             m8, r11d; tc0
    mov             r3d, [tcq+4];
%if %1 > 8
    shl              r3, %1 - 8
%endif
    add            r11d, r3d; tc0 + tc1
    jz             .bypassluma
    movd             m9, r3d; tc1
    punpcklwd        m8, m8
    punpcklwd        m9, m9
    shufps           m8, m9, 0; tc0, tc1
    mova             m9, m8
    psllw            m8, 2; tc << 2
    pavgw            m8, m9; tc25 = ((tc * 5 + 1) >> 1)
    ;end tc25 calculations

    ;----beta_3 comparison-----
    psubw           m12, m0, m3;      p3 - p0
    ABS1            m12, m14; abs(p3 - p0)

    psubw           m15, m7, m4;      q3 - q0
    ABS1            m15, m14; abs(q3 - q0)

    paddw           m12, m15; abs(p3 - p0) + abs(q3 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    psraw           m13, 3; beta >> 3
    pcmpgtw         m13, m12;
    movmskps        r11, m13;
    and             r6, r11; strong mask , beta_2 and beta_3 comparisons
    ;----beta_3 comparison end-----
    ;----tc25 comparison---
    psubw           m12, m3, m4;      p0 - q0
    ABS1            m12, m14; abs(p0 - q0)

    pshufhw         m12, m12, 0xf0 ;0b11110000;
    pshuflw         m12, m12, 0xf0 ;0b11110000;

    pcmpgtw          m8, m12; tc25 comparisons
    movmskps        r11, m8;
    and             r6, r11; strong mask, beta_2, beta_3 and tc25 comparisons
    ;----tc25 comparison end---
    mov             r11, r6;
    shr             r11, 1;
    and             r6, r11; strong mask, bits 2 and 0

    pmullw          m14, m9, [pw_m2]; -tc * 2
    paddw            m9, m9

    and             r6, 5; 0b101
    mov             r11, r6; strong mask
    shr             r6, 2;
    movd            m12, r6d; store to xmm for mask generation
    shl             r6, 1
    and             r11, 1
    movd            m10, r11d; store to xmm for mask generation
    or              r6, r11; final strong mask, bits 1 and 0
    jz      .weakfilter
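    ; strong filter: each output below is clipped to +/- 2*tc around the input sample
    ;     p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
    ;     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
    ;     p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    ;     q0', q1' and q2' are computed symmetrically from the q side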

    shufps          m10, m12, 0
    pcmpeqd         m10, [pd_1]; strong mask

    mova            m13, [pw_4]; 4 in every cell
    pand            m11, m10; combine filtering mask and strong mask
    paddw           m12, m2, m3;          p1 +   p0
    paddw           m12, m4;          p1 +   p0 +   q0
    mova            m10, m12; copy
    paddw           m12, m12;       2*p1 + 2*p0 + 2*q0
    paddw           m12, m1;   p2 + 2*p1 + 2*p0 + 2*q0
    paddw           m12, m5;   p2 + 2*p1 + 2*p0 + 2*q0 + q1
    paddw           m12, m13;  p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4
    psraw           m12, 3;  ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3)
    psubw           m12, m3; ((p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3) - p0
    pmaxsw          m12, m14
    pminsw          m12, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m12, m3; p0'

    paddw           m15, m1, m10; p2 + p1 + p0 + q0
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13; p2 + p1 + p0 + q0 + 2
    psraw           m15, 2;  (p2 + p1 + p0 + q0 + 2) >> 2
    psubw           m15, m2;((p2 + p1 + p0 + q0 + 2) >> 2) - p1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m2; p1'

    paddw            m8, m1, m0;     p3 +   p2
    paddw            m8, m8;   2*p3 + 2*p2
    paddw            m8, m1;   2*p3 + 3*p2
    paddw            m8, m10;  2*p3 + 3*p2 + p1 + p0 + q0
    paddw           m13, m13
    paddw            m8, m13;  2*p3 + 3*p2 + p1 + p0 + q0 + 4
    psraw            m8, 3;   (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
    psubw            m8, m1; ((2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3) - p2
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m1; p2'
    MASKED_COPY      m1, m8

    paddw            m8, m3, m4;         p0 +   q0
    paddw            m8, m5;         p0 +   q0 +   q1
    paddw            m8, m8;       2*p0 + 2*q0 + 2*q1
    paddw            m8, m2;  p1 + 2*p0 + 2*q0 + 2*q1
    paddw            m8, m6;  p1 + 2*p0 + 2*q0 + 2*q1 + q2
    paddw            m8, m13; p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4
    psraw            m8, 3;  (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >>3
    psubw            m8, m4;
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip( , -2 * tc, 2 * tc)
    paddw            m8, m4; q0'
    MASKED_COPY      m2, m15

    paddw           m15, m3, m4;   p0 + q0
    paddw           m15, m5;   p0 + q0 + q1
    mova            m10, m15;
    paddw           m15, m6;   p0 + q0 + q1 + q2
    psrlw           m13, 1; 2 in every cell
    paddw           m15, m13;  p0 + q0 + q1 + q2 + 2
    psraw           m15, 2;   (p0 + q0 + q1 + q2 + 2) >> 2
    psubw           m15, m5; ((p0 + q0 + q1 + q2 + 2) >> 2) - q1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m15, m5; q1'

    paddw           m13, m7;      q3 + 2
    paddw           m13, m6;      q3 +  q2 + 2
    paddw           m13, m13;   2*q3 + 2*q2 + 4
    paddw           m13, m6;    2*q3 + 3*q2 + 4
    paddw           m13, m10;   2*q3 + 3*q2 + q1 + q0 + p0 + 4
    psraw           m13, 3;    (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
    psubw           m13, m6;  ((2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3) - q2
    pmaxsw          m13, m14
    pminsw          m13, m9; av_clip( , -2 * tc, 2 * tc)
    paddw           m13, m6; q2'

    MASKED_COPY      m6, m13
    MASKED_COPY      m5, m15
    MASKED_COPY      m4, m8
    MASKED_COPY      m3, m12

.weakfilter:
    not             r6; strong mask -> weak mask
    and             r6, r13; final weak filtering mask, bits 0 and 1
    jz             .store
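    ; weak filter, applied only where |delta0| < 10*tc (a sketch of the arithmetic below):
    ;     delta0 = av_clip((9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4, -tc, tc)
    ;     p0' = p0 + delta0,  q0' = q0 - delta0
    ;     where dp0 + dp3 (resp. dq0 + dq3) < (beta + (beta >> 1)) >> 3:
    ;         p1' = p1 + av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc/2, tc/2)
    ;         q1' = q1 + av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc/2, tc/2)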

    ; weak filtering mask
    mov             r11, r6
    shr             r11, 1
    movd            m12, r11d
    and             r6, 1
    movd            m11, r6d
    shufps          m11, m12, 0
    pcmpeqd         m11, [pd_1]; filtering mask

    mov             r13, betaq
    shr             r13, 1;
    add             betaq, r13
    shr             betaq, 3; ((beta + (beta >> 1)) >> 3))

    mova            m13, [pw_8]
    psubw           m12, m4, m3 ; q0 - p0
    psllw           m10, m12, 3; 8 * (q0 - p0)
    paddw           m12, m10 ; 9 * (q0 - p0)

    psubw           m10, m5, m2 ; q1 - p1
    psllw            m8, m10, 1; 2 * ( q1 - p1 )
    paddw           m10, m8; 3 * ( q1 - p1 )
    psubw           m12, m10; 9 * (q0 - p0) - 3 * ( q1 - p1 )
    paddw           m12, m13; + 8
    psraw           m12, 4; >> 4 , delta0
    PABSW           m13, m12; abs(delta0)


    psllw           m10, m9, 2; 8 * tc
    paddw           m10, m9; 10 * tc
    pcmpgtw         m10, m13
    pand            m11, m10

    psraw            m9, 1;   tc * 2 -> tc
    psraw           m14, 1; -tc * 2 -> -tc

    pmaxsw          m12, m14
    pminsw          m12, m9;  av_clip(delta0, -tc, tc)

    psraw            m9, 1;   tc -> tc / 2
%if cpuflag(ssse3)
    psignw          m14, m9, [pw_m1]; -tc / 2
%else
    pmullw          m14, m9, [pw_m1]; -tc / 2
%endif

    pavgw           m15, m1, m3;   (p2 + p0 + 1) >> 1
    psubw           m15, m2;  ((p2 + p0 + 1) >> 1) - p1
    paddw           m15, m12; ((p2 + p0 + 1) >> 1) - p1 + delta0
    psraw           m15, 1;   (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
    pmaxsw          m15, m14
    pminsw          m15, m9; av_clip(deltap1, -tc/2, tc/2)
    paddw           m15, m2; p1'

    ;beta calculations
    movd            m10, betad
    SPLATW          m10, m10, 0

    movd            m13, r7d; 1dp0 + 1dp3
    movd             m8, r8d; 0dp0 + 0dp3
    punpcklwd        m8, m8
    punpcklwd       m13, m13
    shufps          m13, m8, 0;
    pcmpgtw          m8, m10, m13
    pand             m8, m11
    ;end beta calculations
    MASKED_COPY2     m2, m15, m8; write p1'

    pavgw            m8, m6, m4;   (q2 + q0 + 1) >> 1
    psubw            m8, m5;  ((q2 + q0 + 1) >> 1) - q1
    psubw            m8, m12; ((q2 + q0 + 1) >> 1) - q1 - delta0)
    psraw            m8, 1;   ((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
    pmaxsw           m8, m14
    pminsw           m8, m9; av_clip(deltaq1, -tc/2, tc/2)
    paddw            m8, m5; q1'

    movd            m13, r9d;
    movd            m15, r10d;
    punpcklwd       m15, m15
    punpcklwd       m13, m13
    shufps          m13, m15, 0; dq0 + dq3

    pcmpgtw         m10, m13; compare to ((beta+(beta>>1))>>3)
    pand            m10, m11
    MASKED_COPY2     m5, m8, m10; write q1'

    paddw           m15, m3, m12 ; p0 + delta0
    MASKED_COPY      m3, m15

    psubw            m8, m4, m12 ; q0 - delta0
    MASKED_COPY      m4, m8
%endmacro

;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
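; The v_loop_filter variants handle vertical edges by loading the 8 rows around
; the edge, transposing them so that the edge becomes horizontal, running the
; same deblock body, and transposing back on store.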
%macro LOOP_FILTER_CHROMA 0
cglobal hevc_v_loop_filter_chroma_8, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 2
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8B_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 8
    TRANSPOSE8x4B_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    RET

cglobal hevc_v_loop_filter_chroma_10, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 10
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_10]
    RET

cglobal hevc_v_loop_filter_chroma_12, 3, 5, 7, pix, stride, tc, pix0, r3stride
    sub            pixq, 4
    lea       r3strideq, [3*strideq]
    mov           pix0q, pixq
    add            pixq, r3strideq
    TRANSPOSE4x8W_LOAD  PASS8ROWS(pix0q, pixq, strideq, r3strideq)
    CHROMA_DEBLOCK_BODY 12
    TRANSPOSE8x4W_STORE PASS8ROWS(pix0q, pixq, strideq, r3strideq), [pw_pixel_max_12]
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_chroma(uint8_t *_pix, ptrdiff_t _stride, int32_t *tc,
;                                   uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_chroma_8, 3, 4, 7, pix, stride, tc, pix0
    mov           pix0q, pixq
    sub           pix0q, strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];    p1
    movq             m1, [pix0q+strideq]; p0
    movq             m2, [pixq];    q0
    movq             m3, [pixq+strideq]; q1
    pxor             m5, m5; zeros reg
    punpcklbw        m0, m5
    punpcklbw        m1, m5
    punpcklbw        m2, m5
    punpcklbw        m3, m5
    CHROMA_DEBLOCK_BODY  8
    packuswb         m1, m2
    movh [pix0q+strideq], m1
    movhps       [pixq], m1
    RET

cglobal hevc_h_loop_filter_chroma_10, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 10
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_10]
    CLIPW           m2, m5, [pw_pixel_max_10]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET

cglobal hevc_h_loop_filter_chroma_12, 3, 4, 7, pix, stride, tc, pix0
    mov          pix0q, pixq
    sub          pix0q, strideq
    sub          pix0q, strideq
    movu            m0, [pix0q];    p1
    movu            m1, [pix0q+strideq]; p0
    movu            m2, [pixq];    q0
    movu            m3, [pixq+strideq]; q1
    CHROMA_DEBLOCK_BODY 12
    pxor            m5, m5; zeros reg
    CLIPW           m1, m5, [pw_pixel_max_12]
    CLIPW           m2, m5, [pw_pixel_max_12]
    movu [pix0q+strideq], m1
    movu        [pixq], m2
    RET
%endmacro

INIT_XMM sse2
LOOP_FILTER_CHROMA
INIT_XMM avx
LOOP_FILTER_CHROMA

%if ARCH_X86_64
%macro LOOP_FILTER_LUMA 0
;-----------------------------------------------------------------------------
; void ff_hevc_v_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_v_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 4
    lea           pix0q, [3 * r1]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8B_LOAD  PASS8ROWS(src3strideq, pixq, r1, pix0q)
    LUMA_DEBLOCK_BODY 8, v
.store:
    TRANSPOSE8x8B_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q)
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 10, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_10]
.bypassluma:
    RET

cglobal hevc_v_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    sub            pixq, 8
    lea           pix0q, [3 * strideq]
    mov     src3strideq, pixq
    add            pixq, pix0q
    TRANSPOSE8x8W_LOAD  PASS8ROWS(src3strideq, pixq, strideq, pix0q)
    LUMA_DEBLOCK_BODY 12, v
.store:
    TRANSPOSE8x8W_STORE PASS8ROWS(src3strideq, pixq, r1, pix0q), [pw_pixel_max_12]
.bypassluma:
    RET

;-----------------------------------------------------------------------------
; void ff_hevc_h_loop_filter_luma(uint8_t *_pix, ptrdiff_t _stride, int beta,
;                                 int32_t *tc, uint8_t *_no_p, uint8_t *_no_q);
;-----------------------------------------------------------------------------
cglobal hevc_h_loop_filter_luma_8, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea     src3strideq, [3 * strideq]
    mov           pix0q, pixq
    sub           pix0q, src3strideq
    sub           pix0q, strideq
    movq             m0, [pix0q];               p3
    movq             m1, [pix0q +     strideq]; p2
    movq             m2, [pix0q + 2 * strideq]; p1
    movq             m3, [pix0q + src3strideq]; p0
    movq             m4, [pixq];                q0
    movq             m5, [pixq +     strideq];  q1
    movq             m6, [pixq + 2 * strideq];  q2
    movq             m7, [pixq + src3strideq];  q3
    pxor             m8, m8
    punpcklbw        m0, m8
    punpcklbw        m1, m8
    punpcklbw        m2, m8
    punpcklbw        m3, m8
    punpcklbw        m4, m8
    punpcklbw        m5, m8
    punpcklbw        m6, m8
    punpcklbw        m7, m8
    LUMA_DEBLOCK_BODY 8, h
.store:
    packuswb          m1, m2
    packuswb          m3, m4
    packuswb          m5, m6
    movh   [pix0q +     strideq], m1
    movhps [pix0q + 2 * strideq], m1
    movh   [pix0q + src3strideq], m3
    movhps [pixq               ], m3
    movh   [pixq  +     strideq], m5
    movhps [pixq  + 2 * strideq], m5
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_10, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea                  src3strideq, [3 * strideq]
    mov                        pix0q, pixq
    sub                        pix0q, src3strideq
    sub                        pix0q, strideq
    movdqu                        m0, [pix0q];               p3
    movdqu                        m1, [pix0q +     strideq]; p2
    movdqu                        m2, [pix0q + 2 * strideq]; p1
    movdqu                        m3, [pix0q + src3strideq]; p0
    movdqu                        m4, [pixq];                q0
    movdqu                        m5, [pixq  +     strideq]; q1
    movdqu                        m6, [pixq  + 2 * strideq]; q2
    movdqu                        m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY             10, h
.store:
    pxor                          m8, m8; zeros reg
    CLIPW                         m1, m8, [pw_pixel_max_10]
    CLIPW                         m2, m8, [pw_pixel_max_10]
    CLIPW                         m3, m8, [pw_pixel_max_10]
    CLIPW                         m4, m8, [pw_pixel_max_10]
    CLIPW                         m5, m8, [pw_pixel_max_10]
    CLIPW                         m6, m8, [pw_pixel_max_10]
    movdqu     [pix0q +     strideq], m1;  p2
    movdqu     [pix0q + 2 * strideq], m2;  p1
    movdqu     [pix0q + src3strideq], m3;  p0
    movdqu     [pixq               ], m4;  q0
    movdqu     [pixq  +     strideq], m5;  q1
    movdqu     [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

cglobal hevc_h_loop_filter_luma_12, 4, 14, 16, pix, stride, beta, tc, pix0, src3stride
    lea                  src3strideq, [3 * strideq]
    mov                        pix0q, pixq
    sub                        pix0q, src3strideq
    sub                        pix0q, strideq
    movdqu                        m0, [pix0q];               p3
    movdqu                        m1, [pix0q +     strideq]; p2
    movdqu                        m2, [pix0q + 2 * strideq]; p1
    movdqu                        m3, [pix0q + src3strideq]; p0
    movdqu                        m4, [pixq];                q0
    movdqu                        m5, [pixq  +     strideq]; q1
    movdqu                        m6, [pixq  + 2 * strideq]; q2
    movdqu                        m7, [pixq  + src3strideq]; q3
    LUMA_DEBLOCK_BODY             12, h
.store:
    pxor                          m8, m8; zeros reg
    CLIPW                         m1, m8, [pw_pixel_max_12]
    CLIPW                         m2, m8, [pw_pixel_max_12]
    CLIPW                         m3, m8, [pw_pixel_max_12]
    CLIPW                         m4, m8, [pw_pixel_max_12]
    CLIPW                         m5, m8, [pw_pixel_max_12]
    CLIPW                         m6, m8, [pw_pixel_max_12]
    movdqu     [pix0q +     strideq], m1;  p2
    movdqu     [pix0q + 2 * strideq], m2;  p1
    movdqu     [pix0q + src3strideq], m3;  p0
    movdqu     [pixq               ], m4;  q0
    movdqu     [pixq  +     strideq], m5;  q1
    movdqu     [pixq  + 2 * strideq], m6;  q2
.bypassluma:
    RET

%endmacro

INIT_XMM sse2
LOOP_FILTER_LUMA
INIT_XMM ssse3
LOOP_FILTER_LUMA
INIT_XMM avx
LOOP_FILTER_LUMA
%endif