Bug 1846318 update dav1d to e58afe4dd9057591882a01c31382c203e8a61c92 r=chunmin

Depends on D187495

Differential Revision: https://phabricator.services.mozilla.com/D187496
Karl Tomlinson 2023-09-25 19:34:48 +00:00
parent bfae62bdc0
commit 0d02f04be0
40 changed files with 1831 additions and 734 deletions

View file

@ -133,6 +133,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/mc16_sse.asm',
'../../../third_party/dav1d/src/x86/mc_sse.asm',
'../../../third_party/dav1d/src/x86/msac.asm',
'../../../third_party/dav1d/src/x86/pal.asm',
'../../../third_party/dav1d/src/x86/refmvs.asm',
]

View file

@ -85,6 +85,7 @@ SOURCES += [
'../../third_party/dav1d/src/mem.c',
'../../third_party/dav1d/src/msac.c',
'../../third_party/dav1d/src/obu.c',
'../../third_party/dav1d/src/pal.c',
'../../third_party/dav1d/src/picture.c',
'../../third_party/dav1d/src/qm.c',
'../../third_party/dav1d/src/ref.c',

View file

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 616bfd1506a8a75c6a358e578cbec9ca11931502 (2023-07-01T11:36:39.000+03:00).
release: e58afe4dd9057591882a01c31382c203e8a61c92 (2023-07-25T16:10:07.000+02:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 616bfd1506a8a75c6a358e578cbec9ca11931502
revision: e58afe4dd9057591882a01c31382c203e8a61c92
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View file

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "616bfd1506a8a75c6a358e578cbec9ca11931502"
#define DAV1D_VERSION "e58afe4dd9057591882a01c31382c203e8a61c92"

View file

@ -34,7 +34,7 @@
#include "common/attributes.h"
#if !defined(BITDEPTH)
typedef void pixel;
typedef uint8_t pixel; /* can't be void due to pointer-to-array usage */
typedef void coef;
#define HIGHBD_DECL_SUFFIX /* nothing */
#define HIGHBD_CALL_SUFFIX /* nothing */

View file

@ -1481,8 +1481,8 @@ function fgy_32x32_8bpc_neon, export=1
calc_offset r6, lr, r6, 0, 0
add_offset r5, r6, lr, r5, r9
add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r4, r4, #32 // grain_lut += FG_BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
ldr r10, [sp, #120] // type
adr r11, L(fgy_loop_tbl)
@ -1490,8 +1490,8 @@ function fgy_32x32_8bpc_neon, export=1
tst r10, #1
ldr r10, [r11, r10, lsl #2]
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r8, r8, #32 // grain_lut += FG_BLOCK_SIZE * bx
add r11, r11, r10
@ -1695,10 +1695,10 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
calc_offset r8, r12, r8, \sx, \sy
add_offset r5, r8, r12, r5, r10
add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r4, r4, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
movrel_local r12, overlap_coeffs_\sx
ldr lr, [sp, #132] // type

View file

@ -1353,8 +1353,8 @@ function fgy_32x32_16bpc_neon, export=1
calc_offset r6, lr, r6, 0, 0
add_offset r5, r6, lr, r5, r9
add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r4, r4, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
ldr r10, [sp, #120] // type
adr r11, L(fgy_loop_tbl)
@ -1362,8 +1362,8 @@ function fgy_32x32_16bpc_neon, export=1
tst r10, #1
ldr r10, [r11, r10, lsl #2]
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r8, r8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
add r11, r11, r10
@ -1651,10 +1651,10 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
vmov.16 d31[3], r7 // overlap y [1]
add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r4, r4, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r11, r11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
movrel_local r12, overlap_coeffs_\sx
ldr lr, [sp, #132] // type

View file

@ -1576,17 +1576,17 @@ L(ipred_filter_tbl):
endfunc
// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h);
function pal_pred_8bpc_neon, export=1
push {r4-r5, lr}
ldrd r4, r5, [sp, #12]
vld1.16 {q0}, [r2, :128]
vld1.8 {d0}, [r2, :64]
clz lr, r4
adr r12, L(pal_pred_tbl)
sub lr, lr, #25
vmov.i8 q15, #7
ldr lr, [r12, lr, lsl #2]
vmovn.i16 d0, q0
add r12, r12, lr
add r2, r0, r1
bx r12
@ -1602,8 +1602,11 @@ L(pal_pred_tbl):
40:
lsl r1, r1, #1
4:
vld1.8 {q1}, [r3, :128]!
vld1.8 {d2}, [r3, :64]!
subs r5, r5, #4
vshr.u8 d3, d2, #4
vand.u8 d2, d2, d30
vzip.8 d2, d3
vtbl.8 d2, {d0}, d2
vtbl.8 d3, {d0}, d3
vst1.32 {d2[0]}, [r0, :32], r1
@ -1615,8 +1618,11 @@ L(pal_pred_tbl):
80:
lsl r1, r1, #1
8:
vld1.8 {q1, q2}, [r3, :128]!
vld1.8 {q1}, [r3, :64]!
subs r5, r5, #4
vshr.u8 q2, q1, #4
vand.u8 q1, q1, q15
vzip.8 q1, q2
vtbl.8 d2, {d0}, d2
vtbl.8 d3, {d0}, d3
vst1.8 {d2}, [r0, :64], r1
@ -1630,9 +1636,14 @@ L(pal_pred_tbl):
160:
lsl r1, r1, #1
16:
vld1.8 {q8, q9}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #4
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q8, q10, q15
vshr.u8 q9, q10, #4
vand.u8 q10, q11, q15
vshr.u8 q11, q11, #4
vzip.8 q8, q9
vzip.8 q10, q11
vtbl.8 d16, {d0}, d16
vtbl.8 d17, {d0}, d17
vtbl.8 d18, {d0}, d18
@ -1650,9 +1661,14 @@ L(pal_pred_tbl):
320:
lsl r1, r1, #1
32:
vld1.8 {q8, q9}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #2
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q8, q10, q15
vshr.u8 q9, q10, #4
vand.u8 q10, q11, q15
vshr.u8 q11, q11, #4
vzip.8 q8, q9
vzip.8 q10, q11
vtbl.8 d16, {d0}, d16
vtbl.8 d17, {d0}, d17
vtbl.8 d18, {d0}, d18
@ -1668,9 +1684,14 @@ L(pal_pred_tbl):
640:
sub r1, r1, #32
64:
vld1.8 {q8, q9}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #1
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q8, q10, q15
vshr.u8 q9, q10, #4
vand.u8 q10, q11, q15
vshr.u8 q11, q11, #4
vzip.8 q8, q9
vzip.8 q10, q11
vtbl.8 d16, {d0}, d16
vtbl.8 d17, {d0}, d17
vtbl.8 d18, {d0}, d18

View file

@ -1732,7 +1732,7 @@ function ipred_filter_16bpc_neon, export=1
endfunc
// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h);
function pal_pred_16bpc_neon, export=1
push {r4-r5, lr}
@ -1742,6 +1742,7 @@ function pal_pred_16bpc_neon, export=1
clz lr, r4
adr r12, L(pal_pred_tbl)
sub lr, lr, #25
vmov.i8 q13, #7
ldr lr, [r12, lr, lsl #2]
vmov.i16 q15, #0x100
add r12, r12, lr
@ -1759,8 +1760,11 @@ L(pal_pred_tbl):
40:
lsl r1, r1, #1
4:
vld1.8 {q1}, [r3, :128]!
vld1.8 {d2}, [r3, :64]!
subs r5, r5, #4
vshr.u8 d3, d2, #4
vand.u8 d2, d2, d26
vzip.8 d2, d3
// Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
vadd.i8 q0, q1, q1
vadd.i8 q1, q1, q1
@ -1780,8 +1784,11 @@ L(pal_pred_tbl):
80:
lsl r1, r1, #1
8:
vld1.8 {q1, q2}, [r3, :128]!
vld1.8 {q1}, [r3, :64]!
subs r5, r5, #4
vshr.u8 q2, q1, #4
vand.u8 q1, q1, q13
vzip.8 q1, q2
// Prefer doing the adds twice, instead of chaining a vmov after
// the add.
vadd.i8 q0, q1, q1
@ -1811,9 +1818,14 @@ L(pal_pred_tbl):
160:
lsl r1, r1, #1
16:
vld1.8 {q2, q3}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #4
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q2, q10, q13
vshr.u8 q3, q10, #4
vand.u8 q10, q11, q13
vshr.u8 q11, q11, #4
vzip.8 q2, q3
vzip.8 q10, q11
vadd.i8 q0, q2, q2
vadd.i8 q1, q2, q2
vadd.i8 q2, q3, q3
@ -1860,9 +1872,14 @@ L(pal_pred_tbl):
lsl r1, r1, #1
sub r1, r1, #32
32:
vld1.8 {q2, q3}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #2
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q2, q10, q13
vshr.u8 q3, q10, #4
vand.u8 q10, q11, q13
vshr.u8 q11, q11, #4
vzip.8 q2, q3
vzip.8 q10, q11
vadd.i8 q0, q2, q2
vadd.i8 q1, q2, q2
vadd.i8 q2, q3, q3
@ -1908,9 +1925,14 @@ L(pal_pred_tbl):
640:
sub r1, r1, #96
64:
vld1.8 {q2, q3}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #1
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q2, q10, q13
vshr.u8 q3, q10, #4
vand.u8 q10, q11, q13
vshr.u8 q11, q11, #4
vzip.8 q2, q3
vzip.8 q10, q11
vadd.i8 q0, q2, q2
vadd.i8 q1, q2, q2
vadd.i8 q2, q3, q3

View file

@ -1409,14 +1409,14 @@ function fgy_32x32_8bpc_neon, export=1
ldr w11, [sp, #24] // type
adr x13, L(fgy_loop_tbl)
add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
tst w11, #1
ldrh w11, [x13, w11, uxtw #1]
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx
sub x11, x13, w11, uxtw
@ -1638,10 +1638,10 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
add_offset x17, w16, x17, x5, x10
add_offset x5, w8, x11, x5, x10
add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x4, x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
ldr w13, [sp, #64] // type

View file

@ -1308,14 +1308,14 @@ function fgy_32x32_16bpc_neon, export=1
ldr w11, [sp, #88] // type
adr x13, L(fgy_loop_tbl)
add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
tst w11, #1
ldrh w11, [x13, w11, uxtw #1]
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
sub x11, x13, w11, uxtw
@ -1581,10 +1581,10 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
add_offset x17, w16, x17, x5, x10
add_offset x5, w8, x11, x5, x10
add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x4, x13, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x11, x11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
ldr w13, [sp, #112] // type

View file

@ -3921,23 +3921,26 @@ L(ipred_filter_tbl):
endfunc
// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h);
function pal_pred_8bpc_neon, export=1
ld1 {v0.8h}, [x2]
ld1 {v0.8b}, [x2]
clz w9, w4
adr x6, L(pal_pred_tbl)
sub w9, w9, #25
movi v31.16b, #7
ldrh w9, [x6, w9, uxtw #1]
xtn v0.8b, v0.8h
sub x6, x6, w9, uxtw
add x2, x0, x1
lsl x1, x1, #1
br x6
4:
AARCH64_VALID_JUMP_TARGET
ld1 {v1.16b}, [x3], #16
ld1 {v1.8b}, [x3], #8
subs w5, w5, #4
ushr v3.8b, v1.8b, #4
and v2.8b, v1.8b, v31.8b
zip1 v1.16b, v2.16b, v3.16b
tbl v1.16b, {v0.16b}, v1.16b
st1 {v1.s}[0], [x0], x1
st1 {v1.s}[1], [x2], x1
@ -3947,8 +3950,12 @@ function pal_pred_8bpc_neon, export=1
ret
8:
AARCH64_VALID_JUMP_TARGET
ld1 {v1.16b, v2.16b}, [x3], #32
ld1 {v1.16b}, [x3], #16
subs w5, w5, #4
ushr v4.16b, v1.16b, #4
and v3.16b, v1.16b, v31.16b
zip1 v1.16b, v3.16b, v4.16b
zip2 v2.16b, v3.16b, v4.16b
tbl v1.16b, {v0.16b}, v1.16b
st1 {v1.d}[0], [x0], x1
tbl v2.16b, {v0.16b}, v2.16b
@ -3959,9 +3966,17 @@ function pal_pred_8bpc_neon, export=1
ret
16:
AARCH64_VALID_JUMP_TARGET
ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
ld1 {v1.16b, v2.16b}, [x3], #32
subs w5, w5, #4
ushr v5.16b, v1.16b, #4
and v4.16b, v1.16b, v31.16b
ushr v7.16b, v2.16b, #4
and v6.16b, v2.16b, v31.16b
zip1 v1.16b, v4.16b, v5.16b
zip2 v2.16b, v4.16b, v5.16b
zip1 v3.16b, v6.16b, v7.16b
tbl v1.16b, {v0.16b}, v1.16b
zip2 v4.16b, v6.16b, v7.16b
tbl v2.16b, {v0.16b}, v2.16b
st1 {v1.16b}, [x0], x1
tbl v3.16b, {v0.16b}, v3.16b
@ -3974,10 +3989,25 @@ function pal_pred_8bpc_neon, export=1
32:
AARCH64_VALID_JUMP_TARGET
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
subs w5, w5, #4
ushr v21.16b, v16.16b, #4
and v20.16b, v16.16b, v31.16b
ushr v23.16b, v17.16b, #4
and v22.16b, v17.16b, v31.16b
ushr v25.16b, v18.16b, #4
and v24.16b, v18.16b, v31.16b
ushr v27.16b, v19.16b, #4
and v26.16b, v19.16b, v31.16b
zip1 v16.16b, v20.16b, v21.16b
zip2 v17.16b, v20.16b, v21.16b
zip1 v18.16b, v22.16b, v23.16b
zip2 v19.16b, v22.16b, v23.16b
zip1 v20.16b, v24.16b, v25.16b
zip2 v21.16b, v24.16b, v25.16b
tbl v16.16b, {v0.16b}, v16.16b
zip1 v22.16b, v26.16b, v27.16b
tbl v17.16b, {v0.16b}, v17.16b
zip2 v23.16b, v26.16b, v27.16b
tbl v18.16b, {v0.16b}, v18.16b
tbl v19.16b, {v0.16b}, v19.16b
tbl v20.16b, {v0.16b}, v20.16b
@ -3993,10 +4023,25 @@ function pal_pred_8bpc_neon, export=1
64:
AARCH64_VALID_JUMP_TARGET
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
subs w5, w5, #2
ushr v21.16b, v16.16b, #4
and v20.16b, v16.16b, v31.16b
ushr v23.16b, v17.16b, #4
and v22.16b, v17.16b, v31.16b
ushr v25.16b, v18.16b, #4
and v24.16b, v18.16b, v31.16b
ushr v27.16b, v19.16b, #4
and v26.16b, v19.16b, v31.16b
zip1 v16.16b, v20.16b, v21.16b
zip2 v17.16b, v20.16b, v21.16b
zip1 v18.16b, v22.16b, v23.16b
zip2 v19.16b, v22.16b, v23.16b
zip1 v20.16b, v24.16b, v25.16b
zip2 v21.16b, v24.16b, v25.16b
tbl v16.16b, {v0.16b}, v16.16b
zip1 v22.16b, v26.16b, v27.16b
tbl v17.16b, {v0.16b}, v17.16b
zip2 v23.16b, v26.16b, v27.16b
tbl v18.16b, {v0.16b}, v18.16b
tbl v19.16b, {v0.16b}, v19.16b
st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1

View file

@ -4179,13 +4179,14 @@ function ipred_filter_16bpc_neon, export=1
endfunc
// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h);
function pal_pred_16bpc_neon, export=1
ld1 {v30.8h}, [x2]
clz w9, w4
adr x6, L(pal_pred_tbl)
sub w9, w9, #25
movi v29.16b, #7
ldrh w9, [x6, w9, uxtw #1]
movi v31.8h, #1, lsl #8
sub x6, x6, w9, uxtw
@ -4195,8 +4196,11 @@ function pal_pred_16bpc_neon, export=1
add x2, x0, x1
lsl x1, x1, #1
4:
ld1 {v1.16b}, [x3], #16
ld1 {v1.8b}, [x3], #8
subs w5, w5, #4
ushr v3.8b, v1.8b, #4
and v2.8b, v1.8b, v29.8b
zip1 v1.16b, v2.16b, v3.16b
// Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
add v1.16b, v1.16b, v1.16b
zip1 v0.16b, v1.16b, v1.16b
@ -4216,8 +4220,12 @@ function pal_pred_16bpc_neon, export=1
add x2, x0, x1
lsl x1, x1, #1
8:
ld1 {v2.16b, v3.16b}, [x3], #32
ld1 {v2.16b}, [x3], #16
subs w5, w5, #4
ushr v4.16b, v2.16b, #4
and v3.16b, v2.16b, v29.16b
zip1 v2.16b, v3.16b, v4.16b
zip2 v3.16b, v3.16b, v4.16b
add v2.16b, v2.16b, v2.16b
add v3.16b, v3.16b, v3.16b
zip1 v0.16b, v2.16b, v2.16b
@ -4243,8 +4251,16 @@ function pal_pred_16bpc_neon, export=1
add x2, x0, x1
lsl x1, x1, #1
16:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
ld1 {v4.16b, v5.16b}, [x3], #32
subs w5, w5, #4
ushr v7.16b, v4.16b, #4
and v6.16b, v4.16b, v29.16b
ushr v3.16b, v5.16b, #4
and v2.16b, v5.16b, v29.16b
zip1 v4.16b, v6.16b, v7.16b
zip2 v5.16b, v6.16b, v7.16b
zip1 v6.16b, v2.16b, v3.16b
zip2 v7.16b, v2.16b, v3.16b
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b
@ -4284,8 +4300,16 @@ function pal_pred_16bpc_neon, export=1
add x2, x0, x1
lsl x1, x1, #1
32:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
ld1 {v4.16b, v5.16b}, [x3], #32
subs w5, w5, #2
ushr v7.16b, v4.16b, #4
and v6.16b, v4.16b, v29.16b
ushr v3.16b, v5.16b, #4
and v2.16b, v5.16b, v29.16b
zip1 v4.16b, v6.16b, v7.16b
zip2 v5.16b, v6.16b, v7.16b
zip1 v6.16b, v2.16b, v3.16b
zip2 v7.16b, v2.16b, v3.16b
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b
@ -4322,8 +4346,16 @@ function pal_pred_16bpc_neon, export=1
AARCH64_VALID_JUMP_TARGET
add x2, x0, #64
64:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
ld1 {v4.16b, v5.16b}, [x3], #32
subs w5, w5, #1
ushr v7.16b, v4.16b, #4
and v6.16b, v4.16b, v29.16b
ushr v3.16b, v5.16b, #4
and v2.16b, v5.16b, v29.16b
zip1 v4.16b, v6.16b, v7.16b
zip2 v5.16b, v6.16b, v7.16b
zip1 v6.16b, v2.16b, v3.16b
zip2 v7.16b, v2.16b, v3.16b
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b

View file

@ -91,8 +91,8 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
// process this row in FG_BLOCK_SIZE^2 blocks
for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
if (data->overlap_flag && bx) {
// shift previous offsets left
@ -155,8 +155,8 @@ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
\
int offsets[2 /* col offset */][2 /* row offset */]; \
\
/* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
/* process this row in FG_BLOCK_SIZE^2 blocks (subsampled) */ \
for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { \
if (data->overlap_flag && bx) { \
/* shift previous offsets left */ \
for (int i = 0; i < rows; i++) \

View file

@ -22,7 +22,7 @@ BEGIN
VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
VALUE "InternalName", "dav1d"
VALUE "OriginalFilename", "libdav1d.dll"
VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
VALUE "LegalCopyright", L"Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
END
END
BLOCK "VarFileInfo"

View file

@ -370,142 +370,6 @@ static inline int findoddzero(const uint8_t *buf, int len) {
return 0;
}
static void read_pal_plane(Dav1dTaskContext *const t, Av1Block *const b,
const int pl, const int sz_ctx,
const int bx4, const int by4)
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
uint16_t cache[16], used_cache[8];
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
int n_cache = 0;
// don't reuse above palette outside SB64 boundaries
int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
const uint16_t *l = t->al_pal[1][by4][pl], *a = t->al_pal[0][bx4][pl];
// fill/sort cache
while (l_cache && a_cache) {
if (*l < *a) {
if (!n_cache || cache[n_cache - 1] != *l)
cache[n_cache++] = *l;
l++;
l_cache--;
} else {
if (*a == *l) {
l++;
l_cache--;
}
if (!n_cache || cache[n_cache - 1] != *a)
cache[n_cache++] = *a;
a++;
a_cache--;
}
}
if (l_cache) {
do {
if (!n_cache || cache[n_cache - 1] != *l)
cache[n_cache++] = *l;
l++;
} while (--l_cache > 0);
} else if (a_cache) {
do {
if (!n_cache || cache[n_cache - 1] != *a)
cache[n_cache++] = *a;
a++;
} while (--a_cache > 0);
}
// find reused cache entries
int i = 0;
for (int n = 0; n < n_cache && i < pal_sz; n++)
if (dav1d_msac_decode_bool_equi(&ts->msac))
used_cache[i++] = cache[n];
const int n_used_cache = i;
// parse new entries
uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl];
if (i < pal_sz) {
int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
if (i < pal_sz) {
int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
const int max = (1 << f->cur.p.bpc) - 1;
do {
const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
prev = pal[i++] = imin(prev + delta + !pl, max);
if (prev + !pl >= max) {
for (; i < pal_sz; i++)
pal[i] = max;
break;
}
bits = imin(bits, 1 + ulog2(max - prev - !pl));
} while (i < pal_sz);
}
// merge cache+new entries
int n = 0, m = n_used_cache;
for (i = 0; i < pal_sz; i++) {
if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
pal[i] = used_cache[n++];
} else {
assert(m < pal_sz);
pal[i] = pal[m++];
}
}
} else {
memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
for (int n = 0; n < n_cache; n++)
printf("%c%02x", n ? ' ' : '[', cache[n]);
printf("%s, pal=", n_cache ? "]" : "[]");
for (int n = 0; n < pal_sz; n++)
printf("%c%02x", n ? ' ' : '[', pal[n]);
printf("]\n");
}
}
static void read_pal_uv(Dav1dTaskContext *const t, Av1Block *const b,
const int sz_ctx, const int bx4, const int by4)
{
read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
// V pal coding
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2];
if (dav1d_msac_decode_bool_equi(&ts->msac)) {
const int bits = f->cur.p.bpc - 4 +
dav1d_msac_decode_bools(&ts->msac, 2);
int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
const int max = (1 << f->cur.p.bpc) - 1;
for (int i = 1; i < b->pal_sz[1]; i++) {
int delta = dav1d_msac_decode_bools(&ts->msac, bits);
if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
prev = pal[i] = (prev + delta) & max;
}
} else {
for (int i = 0; i < b->pal_sz[1]; i++)
pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
for (int n = 0; n < b->pal_sz[1]; n++)
printf("%c%02x", n ? ' ' : '[', pal[n]);
printf("]\n");
}
}
// meant to be SIMD'able, so that theoretical complexity of this function
// times block size goes from w4*h4 to w4+h4-1
// a and b are previous two lines containing (a) top/left entries or (b)
@ -584,7 +448,8 @@ static void read_pal_indices(Dav1dTaskContext *const t,
Dav1dTileState *const ts = t->ts;
const ptrdiff_t stride = bw4 * 4;
assert(pal_idx);
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
pixel *const pal_tmp = t->scratch.pal_idx_uv;
pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
uint16_t (*const color_map_cdf)[8] =
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
uint8_t (*const order)[8] = t->scratch.pal_order;
@ -593,23 +458,16 @@ static void read_pal_indices(Dav1dTaskContext *const t,
// top/left-to-bottom/right diagonals ("wave-front")
const int first = imin(i, w4 * 4 - 1);
const int last = imax(0, i - h4 * 4 + 1);
order_palette(pal_idx, stride, i, first, last, order, ctx);
order_palette(pal_tmp, stride, i, first, last, order, ctx);
for (int j = first, m = 0; j >= last; j--, m++) {
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
pal_idx[(i - j) * stride + j] = order[m][color_idx];
pal_tmp[(i - j) * stride + j] = order[m][color_idx];
}
}
// fill invisible edges
if (bw4 > w4)
for (int y = 0; y < 4 * h4; y++)
memset(&pal_idx[y * stride + 4 * w4],
pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
if (h4 < bh4) {
const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)];
for (int y = h4 * 4; y < bh4 * 4; y++)
memcpy(&pal_idx[y * stride], src, bw4 * 4);
}
t->c->pal_dsp.pal_idx_finish(pal_idx, pal_tmp, bw4 * 4, bh4 * 4,
w4 * 4, h4 * 4);
}
static void read_vartx_tree(Dav1dTaskContext *const t,
@ -1306,7 +1164,7 @@ static int decode_b(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
if (use_y_pal)
read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
f->bd_fn.read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
}
if (has_chroma && b->uv_mode == DC_PRED) {
@ -1316,7 +1174,7 @@ static int decode_b(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
read_pal_uv(t, b, sz_ctx, bx4, by4);
f->bd_fn.read_pal_uv(t, b, sz_ctx, bx4, by4);
}
}
@ -1341,9 +1199,9 @@ static int decode_b(Dav1dTaskContext *const t,
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
} else
pal_idx = t->scratch.pal_idx;
pal_idx = t->scratch.pal_idx_y;
read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
if (DEBUG_BLOCK_INFO)
printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
@ -1355,9 +1213,9 @@ static int decode_b(Dav1dTaskContext *const t,
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
} else
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
pal_idx = t->scratch.pal_idx_uv;
read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
if (DEBUG_BLOCK_INFO)
printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
@ -1430,34 +1288,16 @@ static int decode_b(Dav1dTaskContext *const t,
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (b->pal_sz[0]) {
uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
for (int x = 0; x < bw4; x++)
memcpy(t->al_pal[0][bx4 + x][0], pal, 16);
for (int y = 0; y < bh4; y++)
memcpy(t->al_pal[1][by4 + y][0], pal, 16);
}
if (b->pal_sz[0])
f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4);
if (has_chroma) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
case_set(cbh4, l., 1, cby4);
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
if (b->pal_sz[1]) {
const uint16_t (*const pal)[8] = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) *
(f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] :
t->scratch.pal;
// see aomedia bug 2183 for why we use luma coordinates here
for (int pl = 1; pl <= 2; pl++) {
for (int x = 0; x < bw4; x++)
memcpy(t->al_pal[0][bx4 + x][pl], pal[pl], 16);
for (int y = 0; y < bh4; y++)
memcpy(t->al_pal[1][by4 + y][pl], pal[pl], 16);
}
}
if (b->pal_sz[1])
f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4);
}
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)
splat_intraref(f->c, t, bs, bw4, bh4);
@ -2642,7 +2482,10 @@ static void setup_tile(Dav1dTileState *const ts,
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
for (int p = 0; p < 2; p++) {
ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
NULL;
ts->frame_thread[p].cbi = f->frame_thread.cbi ?
&f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] :
NULL;
ts->frame_thread[p].cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
@ -3015,6 +2858,19 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
}
const int cbi_sz = num_sb128 * size_mul[0];
if (cbi_sz != f->frame_thread.cbi_sz) {
dav1d_free_aligned(f->frame_thread.cbi);
f->frame_thread.cbi =
dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
cbi_sz * 32 * 32 / 4, 64);
if (!f->frame_thread.cbi) {
f->frame_thread.cbi_sz = 0;
goto error;
}
f->frame_thread.cbi_sz = cbi_sz;
}
const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
if (cf_sz != f->frame_thread.cf_sz) {
dav1d_free_aligned(f->frame_thread.cf);
@ -3029,16 +2885,17 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
if (f->frame_hdr->allow_screen_content_tools) {
if (num_sb128 != f->frame_thread.pal_sz) {
const int pal_sz = num_sb128 << hbd;
if (pal_sz != f->frame_thread.pal_sz) {
dav1d_free_aligned(f->frame_thread.pal);
f->frame_thread.pal =
dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
num_sb128 * 16 * 16, 64);
pal_sz * 16 * 16, 64);
if (!f->frame_thread.pal) {
f->frame_thread.pal_sz = 0;
goto error;
}
f->frame_thread.pal_sz = num_sb128;
f->frame_thread.pal_sz = pal_sz;
}
const int pal_idx_sz = num_sb128 * size_mul[1];
@ -3046,7 +2903,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
dav1d_free_aligned(f->frame_thread.pal_idx);
f->frame_thread.pal_idx =
dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
pal_idx_sz * 128 * 128 / 4, 64);
pal_idx_sz * 128 * 128 / 8, 64);
if (!f->frame_thread.pal_idx) {
f->frame_thread.pal_idx_sz = 0;
goto error;
@ -3171,12 +3028,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
if (c->n_fc > 1) {
dav1d_free(f->frame_thread.b);
dav1d_free(f->frame_thread.cbi);
f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
num_sb128 * 32 * 32);
f->frame_thread.cbi = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
num_sb128 * 32 * 32);
if (!f->frame_thread.b || !f->frame_thread.cbi) {
if (!f->frame_thread.b) {
f->lf.mask_sz = 0;
goto error;
}
@ -3584,7 +3438,11 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \
f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \
f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \
f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \
f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc
if (!f->seq_hdr->hbd) {
#if CONFIG_8BPC
assign_bitdepth_case(8);

View file

@ -172,14 +172,14 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
const int cpw = (out->p.w + ss_x) >> ss_x;
const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
pixel *const luma_src =
((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
((pixel *) in->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(in->stride[0]);
#if BITDEPTH != 8
const int bitdepth_max = (1 << out->p.bpc) - 1;
#endif
if (data->num_y_points) {
const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[0]),
luma_src, out->stride[0], data,
out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
}
@ -190,7 +190,7 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
return;
}
const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> ss_y;
// extend padding pixels
if (out->p.w & ss_x) {
@ -201,7 +201,7 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
}
}
const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
if (data->chroma_scaling_from_luma) {
for (int pl = 0; pl < 2; pl++)
dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
@ -232,7 +232,7 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
#else
uint8_t scaling[3][SCALING_SIZE];
#endif
const int rows = (out->p.h + 31) >> 5;
const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
for (int row = 0; row < rows; row++)

View file

@ -34,7 +34,7 @@
#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73
#define BLOCK_SIZE 32
#define FG_BLOCK_SIZE 32
#if !defined(BITDEPTH) || BITDEPTH == 8
#define SCALING_SIZE 256
typedef int8_t entry;

View file

@ -162,8 +162,8 @@ static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
const int randval = offsets[bx][by];
const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by]
[offx + x + (BLOCK_SIZE >> subx) * bx];
return grain_lut[offy + y + (FG_BLOCK_SIZE >> suby) * by]
[offx + x + (FG_BLOCK_SIZE >> subx) * bx];
}
static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
@ -195,13 +195,13 @@ static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
const int bw = imin(BLOCK_SIZE, (int) pw - bx);
// process this row in FG_BLOCK_SIZE^2 blocks
for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
const int bw = imin(FG_BLOCK_SIZE, (int) pw - bx);
if (data->overlap_flag && bx) {
// shift previous offsets left
@ -306,13 +306,13 @@ fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks (subsampled)
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx));
// process this row in FG_BLOCK_SIZE^2 blocks (subsampled)
for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) {
const int bw = imin(FG_BLOCK_SIZE >> sx, (int)(pw - bx));
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)

View file

@ -53,6 +53,7 @@ typedef struct Dav1dTask Dav1dTask;
#include "src/looprestoration.h"
#include "src/mc.h"
#include "src/msac.h"
#include "src/pal.h"
#include "src/picture.h"
#include "src/recon.h"
#include "src/refmvs.h"
@ -174,6 +175,7 @@ struct Dav1dContext {
CdfThreadContext cdf[8];
Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
Dav1dPalDSPContext pal_dsp;
Dav1dRefmvsDSPContext refmvs_dsp;
Dav1dPicAllocator allocator;
@ -253,6 +255,10 @@ struct Dav1dFrameContext {
filter_sbrow_fn filter_sbrow_lr;
backup_ipred_edge_fn backup_ipred_edge;
read_coef_blocks_fn read_coef_blocks;
copy_pal_block_fn copy_pal_block_y;
copy_pal_block_fn copy_pal_block_uv;
read_pal_plane_fn read_pal_plane;
read_pal_uv_fn read_pal_uv;
} bd_fn;
int ipred_edge_sz;
@ -274,14 +280,14 @@ struct Dav1dFrameContext {
atomic_uint *frame_progress, *copy_lpf_progress;
// indexed using t->by * f->b4_stride + t->bx
Av1Block *b;
int16_t (*cbi)[3 /* plane */]; /* bits 0-4: txtp, bits 5-15: eob */
int16_t *cbi; /* bits 0-4: txtp, bits 5-15: eob */
// indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
uint16_t (*pal)[3 /* plane */][8 /* idx */];
pixel (*pal)[3 /* plane */][8 /* idx */];
// iterated over inside tile state
uint8_t *pal_idx;
coef *cf;
int prog_sz;
int pal_sz, pal_idx_sz, cf_sz;
int cbi_sz, pal_sz, pal_idx_sz, cf_sz;
// start offsets per tile
int *tile_start_off;
} frame_thread;
@ -358,6 +364,7 @@ struct Dav1dTileState {
atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
struct {
uint8_t *pal_idx;
int16_t *cbi;
coef *cf;
} frame_thread[2 /* 0: reconstruction, 1: entropy */];
@ -387,9 +394,10 @@ struct Dav1dTaskContext {
int16_t cf_8bpc [32 * 32];
int32_t cf_16bpc[32 * 32];
};
// FIXME types can be changed to pixel (and dynamically allocated)
// which would make copy/assign operations slightly faster?
uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
union {
uint8_t al_pal_8bpc [2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
uint16_t al_pal_16bpc[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
};
uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
ALIGN(union, 64) {
struct {
@ -419,16 +427,18 @@ struct Dav1dTaskContext {
int16_t ac[32 * 32]; // intra-only
uint8_t txtp_map[32 * 32]; // inter-only
};
uint8_t pal_idx[2 * 64 * 64];
uint16_t pal[3 /* plane */][8 /* palette_idx */];
ALIGN(union, 64) {
uint8_t pal_idx_y[32 * 64];
uint8_t pal_idx_uv[64 * 64]; /* also used as pre-pack scratch buffer */
union {
struct {
uint8_t interintra_8bpc[64 * 64];
uint8_t edge_8bpc[257];
ALIGN(uint8_t pal_8bpc[3 /* plane */][8 /* palette_idx */], 8);
};
struct {
uint16_t interintra_16bpc[64 * 64];
uint16_t edge_16bpc[257];
ALIGN(uint16_t pal_16bpc[3 /* plane */][8 /* palette_idx */], 16);
};
};
};

View file

@ -74,7 +74,7 @@ typedef decl_cfl_pred_fn(*cfl_pred_fn);
* - only 16-byte alignment is guaranteed for idx.
*/
#define decl_pal_pred_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
void (name)(pixel *dst, ptrdiff_t stride, const pixel *pal, \
const uint8_t *idx, int w, int h)
typedef decl_pal_pred_fn(*pal_pred_fn);

View file

@ -715,13 +715,16 @@ cfl_ac_fn(422, 1, 0)
cfl_ac_fn(444, 0, 0)
static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
const uint16_t *const pal, const uint8_t *idx,
const pixel *const pal, const uint8_t *idx,
const int w, const int h)
{
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
dst[x] = (pixel) pal[idx[x]];
idx += w;
for (int x = 0; x < w; x += 2) {
const int i = *idx++;
assert(!(i & 0x88));
dst[x + 0] = pal[i & 7];
dst[x + 1] = pal[i >> 4];
}
dst += PXSTRIDE(stride);
}
}
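(Illustration, not part of the upstream change: with the new packed index format each idx byte carries two palette indices, low nibble first, which is what the assert(!(i & 0x88)) guard and the pal[i & 7] / pal[i >> 4] lookups above rely on. A minimal sketch, assuming 8 bpc and a hypothetical packed byte:)

/* illustration only: expanding one packed index byte */
const uint8_t pal[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
const uint8_t packed = 0x31;             /* low nibble 1, high nibble 3 */
const uint8_t left   = pal[packed & 7];  /* pal[1] == 20 */
const uint8_t right  = pal[packed >> 4]; /* pal[3] == 40 */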

View file

@ -52,11 +52,10 @@
static COLD void init_internal(void) {
dav1d_init_cpu();
dav1d_init_interintra_masks();
dav1d_init_ii_wedge_masks();
dav1d_init_intra_edge_tree();
dav1d_init_qm_tables();
dav1d_init_thread();
dav1d_init_wedge_masks();
}
COLD const char *dav1d_version(void) {
@ -287,6 +286,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
t->task_thread.td.inited = 1;
}
}
dav1d_pal_dsp_init(&c->pal_dsp);
dav1d_refmvs_dsp_init(&c->refmvs_dsp);
pthread_attr_destroy(&thread_attr);
@ -641,11 +641,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
if (c->n_fc > 1) {
dav1d_free(f->tile_thread.lowest_pixel_mem);
dav1d_free(f->frame_thread.b);
dav1d_free_aligned(f->frame_thread.cbi);
dav1d_free_aligned(f->frame_thread.pal_idx);
dav1d_free_aligned(f->frame_thread.cf);
dav1d_free(f->frame_thread.tile_start_off);
dav1d_free_aligned(f->frame_thread.pal);
dav1d_free(f->frame_thread.cbi);
}
if (c->n_tc > 1) {
pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);

View file

@ -42,6 +42,7 @@ libdav1d_sources = files(
'mem.c',
'msac.c',
'obu.c',
'pal.c',
'picture.c',
'qm.c',
'ref.c',
@ -167,6 +168,7 @@ if is_asm_enabled
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/msac.asm',
'x86/pal.asm',
'x86/refmvs.asm',
'x86/itx_avx512.asm',
'x86/cdef_avx2.asm',

third_party/dav1d/src/pal.c (new vendored file, 77 lines)
View file

@ -0,0 +1,77 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <string.h>
#include "common/attributes.h"
#include "src/pal.h"
// fill invisible edges and pack to 4-bit (2 pixels per byte)
static void pal_idx_finish_c(uint8_t *dst, const uint8_t *src,
const int bw, const int bh,
const int w, const int h)
{
assert(bw >= 4 && bw <= 64 && !(bw & (bw - 1)));
assert(bh >= 4 && bh <= 64 && !(bh & (bh - 1)));
assert(w >= 4 && w <= bw && !(w & 3));
assert(h >= 4 && h <= bh && !(h & 3));
const int dst_w = w / 2;
const int dst_bw = bw / 2;
for (int y = 0; y < h; y++, src += bw, dst += dst_bw) {
for (int x = 0; x < dst_w; x++)
dst[x] = src[x * 2 + 0] | (src[x * 2 + 1] << 4);
if (dst_w < dst_bw)
memset(dst + dst_w, src[w - 1] * 0x11, dst_bw - dst_w);
}
if (h < bh) {
const uint8_t *const last_row = &dst[-dst_bw];
for (int y = h; y < bh; y++, dst += dst_bw)
memcpy(dst, last_row, dst_bw);
}
}
#if HAVE_ASM
#if ARCH_X86
#include "src/x86/pal.h"
#endif
#endif
COLD void dav1d_pal_dsp_init(Dav1dPalDSPContext *const c) {
c->pal_idx_finish = pal_idx_finish_c;
#if HAVE_ASM
#if ARCH_X86
pal_dsp_init_x86(c);
#endif
#endif
}
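(A minimal usage sketch, illustration only and not part of the commit; the example() wrapper and the buffer sizes are hypothetical, chosen to satisfy the asserts above. Callers reach pal_idx_finish through the new DSP context, passing an 8-bit index buffer of bw*bh bytes and getting back the 4-bit packed layout, bw/2 bytes per row, with the invisible right/bottom edges replicated.)

#include <stdint.h>
#include "src/pal.h"

static void example(void) {
    Dav1dPalDSPContext dsp;
    dav1d_pal_dsp_init(&dsp);

    uint8_t src[8 * 8] = { 0 };   /* one 8-bit index per pixel, bw = bh = 8 */
    uint8_t dst[8 * 8 / 2];       /* packed output, two pixels per byte */
    /* visible area is 4x4; pal_idx_finish fills the remaining edges */
    dsp.pal_idx_finish(dst, src, 8 /* bw */, 8 /* bh */, 4 /* w */, 4 /* h */);
}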

third_party/dav1d/src/pal.h (new vendored file, 43 lines)
View file

@ -0,0 +1,43 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_PAL_H
#define DAV1D_SRC_PAL_H
#include <stdint.h>
#define decl_pal_idx_finish_fn(name) \
void (name)(uint8_t *dst, const uint8_t *src, int bw, int bh, int w, int h)
typedef decl_pal_idx_finish_fn(*pal_idx_finish_fn);
typedef struct Dav1dPalDSPContext {
pal_idx_finish_fn pal_idx_finish;
} Dav1dPalDSPContext;
void dav1d_pal_dsp_init(Dav1dPalDSPContext *dsp);
#endif /* DAV1D_SRC_PAL_H */

View file

@ -57,6 +57,18 @@ typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);
void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);
#define decl_copy_pal_block_fn(name) \
void (name)(Dav1dTaskContext *t, int bx4, int by4, int bw4, int bh4)
typedef decl_copy_pal_block_fn(*copy_pal_block_fn);
#define decl_read_pal_plane_fn(name) \
void (name)(Dav1dTaskContext *t, Av1Block *b, int pl, int sz_ctx, int bx4, int by4)
typedef decl_read_pal_plane_fn(*read_pal_plane_fn);
#define decl_read_pal_uv_fn(name) \
void (name)(Dav1dTaskContext *t, Av1Block *b, int sz_ctx, int bx4, int by4)
typedef decl_read_pal_uv_fn(*read_pal_uv_fn);
decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);
decl_recon_b_intra_fn(dav1d_recon_b_intra_16bpc);
@ -82,4 +94,13 @@ decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);
decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc);
decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_y_8bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_y_16bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_8bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_16bpc);
decl_read_pal_plane_fn(dav1d_read_pal_plane_8bpc);
decl_read_pal_plane_fn(dav1d_read_pal_plane_16bpc);
decl_read_pal_uv_fn(dav1d_read_pal_uv_8bpc);
decl_read_pal_uv_fn(dav1d_read_pal_uv_16bpc);
#endif /* DAV1D_SRC_RECON_H */

View file

@ -770,14 +770,12 @@ static void read_coef_tree(Dav1dTaskContext *const t,
uint8_t cf_ctx;
int eob;
coef *cf;
int16_t *cbi;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].cf);
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
cbi = f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
} else {
cf = bitfn(t->cf);
}
@ -804,10 +802,11 @@ static void read_coef_tree(Dav1dTaskContext *const t,
case_set_upto16(txw,,,);
#undef set_ctx
if (t->frame_thread.pass == 1)
cbi[0] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
} else {
eob = cbi[0] >> 5;
txtp = cbi[0] & 0x1f;
const int cbi = *ts->frame_thread[0].cbi++;
eob = cbi >> 5;
txtp = cbi & 0x1f;
}
if (!(t->frame_thread.pass & 1)) {
assert(dst);
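(Worked example, illustration only: a cbi entry packs txtp into bits 0-4 and eob into bits 5-15, so eob = 12 with txtp = 3 is stored as 12 * 32 + 3 = 387, and 387 >> 5 and 387 & 0x1f recover the two fields.)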
@ -872,8 +871,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
for (y = init_y, t->by += init_y; y < sub_h4;
y += t_dim->h, t->by += t_dim->h, y_off++)
{
int16_t (*const cbi)[3] =
&f->frame_thread.cbi[t->by * f->b4_stride];
int x_off = !!init_x;
for (x = init_x, t->bx += init_x; x < sub_w4;
x += t_dim->w, t->bx += t_dim->w, x_off++)
@ -891,7 +888,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
b->tx, txtp, eob, ts->msac.rng);
cbi[t->bx][0] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
@ -917,8 +914,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
{
int16_t (*const cbi)[3] =
&f->frame_thread.cbi[t->by * f->b4_stride];
for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
{
@ -936,7 +931,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n",
pl, b->uvtx, txtp, eob, ts->msac.rng);
cbi[t->bx][pl + 1] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
@ -1236,13 +1231,14 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
} else {
pal_idx = t->scratch.pal_idx;
pal_idx = t->scratch.pal_idx_y;
}
const uint16_t *const pal = t->frame_thread.pass ?
const pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
((t->bx >> 1) + (t->by & 1))][0] :
bytefn(t->scratch.pal)[0];
f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
pal_idx, bw4 * 4, bh4 * 4);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@ -1319,10 +1315,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
enum TxfmType txtp;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][0];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
@ -1428,7 +1423,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
} else if (b->pal_sz[1]) {
const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
(t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
const uint16_t (*pal)[8];
const pixel (*pal)[8];
const uint8_t *pal_idx;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
@ -1436,10 +1431,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))];
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
} else {
pal = t->scratch.pal;
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
pal = bytefn(t->scratch.pal);
pal_idx = t->scratch.pal_idx_uv;
}
f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
@ -1543,10 +1538,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
coef *cf;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
@ -1682,12 +1676,8 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
HIGHBD_CALL_SUFFIX);
const uint8_t *const ii_mask =
b->interintra_type == INTER_INTRA_BLEND ?
dav1d_ii_masks[bs][0][b->interintra_mode] :
dav1d_wedge_masks[bs][0][0][b->wedge_idx];
dsp->mc.blend(dst, f->cur.stride[0], tmp,
bw4 * 4, bh4 * 4, ii_mask);
bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
}
if (!has_chroma) goto skip_inter_chroma_pred;
@ -1790,10 +1780,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
// FIXME for 8x32 with 4:2:2 subsampling, this probably does
// the wrong thing since it will select 4x16, not 4x32, as a
// transform size...
const uint8_t *const ii_mask =
b->interintra_type == INTER_INTRA_BLEND ?
dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
for (int pl = 0; pl < 2; pl++) {
pixel *const tmp = bitfn(t->scratch.interintra);
@ -1871,12 +1858,12 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
mask = seg_mask;
break;
case COMP_INTER_WEDGE:
mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
dsp->mc.mask(dst, f->cur.stride[0],
tmp[b->mask_sign], tmp[!b->mask_sign],
bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
if (has_chroma)
mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
break;
}
@ -1993,10 +1980,9 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
enum TxfmType txtp;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
@ -2198,3 +2184,178 @@ void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
4 * (ts->tiling.col_end - x_off) >> ss_hor);
}
}
void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
const int bx4, const int by4,
const int bw4, const int bh4)
{
const Dav1dFrameContext *const f = t->f;
pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] :
bytefn(t->scratch.pal)[0];
for (int x = 0; x < bw4; x++)
memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
for (int y = 0; y < bh4; y++)
memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
}
void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
const int bx4, const int by4,
const int bw4, const int bh4)
{
const Dav1dFrameContext *const f = t->f;
const pixel (*const pal)[8] = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))] :
bytefn(t->scratch.pal);
// see aomedia bug 2183 for why we use luma coordinates here
for (int pl = 1; pl <= 2; pl++) {
for (int x = 0; x < bw4; x++)
memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
for (int y = 0; y < bh4; y++)
memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
}
}
void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
const int pl, const int sz_ctx,
const int bx4, const int by4)
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
pixel cache[16], used_cache[8];
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
int n_cache = 0;
// don't reuse above palette outside SB64 boundaries
int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
const pixel *l = bytefn(t->al_pal)[1][by4][pl];
const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
// fill/sort cache
while (l_cache && a_cache) {
if (*l < *a) {
if (!n_cache || cache[n_cache - 1] != *l)
cache[n_cache++] = *l;
l++;
l_cache--;
} else {
if (*a == *l) {
l++;
l_cache--;
}
if (!n_cache || cache[n_cache - 1] != *a)
cache[n_cache++] = *a;
a++;
a_cache--;
}
}
if (l_cache) {
do {
if (!n_cache || cache[n_cache - 1] != *l)
cache[n_cache++] = *l;
l++;
} while (--l_cache > 0);
} else if (a_cache) {
do {
if (!n_cache || cache[n_cache - 1] != *a)
cache[n_cache++] = *a;
a++;
} while (--a_cache > 0);
}
// find reused cache entries
int i = 0;
for (int n = 0; n < n_cache && i < pal_sz; n++)
if (dav1d_msac_decode_bool_equi(&ts->msac))
used_cache[i++] = cache[n];
const int n_used_cache = i;
// parse new entries
pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][pl] :
bytefn(t->scratch.pal)[pl];
if (i < pal_sz) {
const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
if (i < pal_sz) {
int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
const int max = (1 << bpc) - 1;
do {
const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
prev = pal[i++] = imin(prev + delta + !pl, max);
if (prev + !pl >= max) {
for (; i < pal_sz; i++)
pal[i] = max;
break;
}
bits = imin(bits, 1 + ulog2(max - prev - !pl));
} while (i < pal_sz);
}
// merge cache+new entries
int n = 0, m = n_used_cache;
for (i = 0; i < pal_sz; i++) {
if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
pal[i] = used_cache[n++];
} else {
assert(m < pal_sz);
pal[i] = pal[m++];
}
}
} else {
memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
for (int n = 0; n < n_cache; n++)
printf("%c%02x", n ? ' ' : '[', cache[n]);
printf("%s, pal=", n_cache ? "]" : "[]");
for (int n = 0; n < pal_sz; n++)
printf("%c%02x", n ? ' ' : '[', pal[n]);
printf("]\n");
}
}
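
The cache handling above first merges the above/left palettes into one sorted, de-duplicated candidate list, lets the bitstream reuse entries from it with one boolean each, and then delta-codes the remaining entries in ascending order (the + !pl term keeps luma entries strictly increasing). The final loop interleaves the two sorted lists back into pal[]; a minimal sketch of that step, with an invented example, is:

/* Sketch only, not the decoder code: used_cache[] holds the reused
 * entries and pal[n_used_cache .. pal_sz-1] the newly parsed ones; both
 * are sorted ascending, so a single merge pass yields a sorted palette,
 * e.g. used_cache = {3, 20}, new = {10, 41, 77} -> {3, 10, 20, 41, 77}.
 * The in-place read of pal[m] is safe because i can never overtake m. */
static void merge_pal(pixel *const pal, const pixel *const used_cache,
                      const int n_used_cache, const int pal_sz)
{
    int n = 0, m = n_used_cache;
    for (int i = 0; i < pal_sz; i++) {
        if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m]))
            pal[i] = used_cache[n++]; /* next reused entry is smaller */
        else
            pal[i] = pal[m++];        /* next new entry is smaller */
    }
}
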
void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
const int sz_ctx, const int bx4, const int by4)
{
bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
// V pal coding
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][2] :
bytefn(t->scratch.pal)[2];
const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
if (dav1d_msac_decode_bool_equi(&ts->msac)) {
const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
const int max = (1 << bpc) - 1;
for (int i = 1; i < b->pal_sz[1]; i++) {
int delta = dav1d_msac_decode_bools(&ts->msac, bits);
if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
prev = pal[i] = (prev + delta) & max;
}
} else {
for (int i = 0; i < b->pal_sz[1]; i++)
pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
for (int n = 0; n < b->pal_sz[1]; n++)
printf("%c%02x", n ? ' ' : '[', pal[n]);
printf("]\n");
}
}
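
Unlike the U entries read by dav1d_read_pal_plane(), the V palette above can be delta-coded with an explicit sign bit, and (prev + delta) & max wraps modulo 1 << bpc since max is (1 << bpc) - 1. A small sketch of that wraparound, assuming 8 bpc:

/* Sketch, not decoder code: at bpc = 8, prev = 250 with delta = +10
 * gives (250 + 10) & 255 = 4, and prev = 3 with delta = -10 gives
 * (3 - 10) & 255 = 249. */
static int wrap_add(const int prev, const int delta, const int bpc) {
    return (prev + delta) & ((1 << bpc) - 1);
}
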

View file

@ -500,7 +500,7 @@ static inline void delayed_fg_task(const Dav1dContext *const c,
case DAV1D_TASK_TYPE_FG_APPLY:;
int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
pthread_mutex_unlock(&ttd->lock);
int progmax = (out->p.h + 31) >> 5;
int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
fg_apply_loop:
if (row + 1 < progmax)
pthread_cond_signal(&ttd->cond);
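
The row count here is now derived from FG_BLOCK_SIZE instead of the hard-coded 32-row shift: (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE is the usual ceiling division and gives the same result as the old (out->p.h + 31) >> 5 when FG_BLOCK_SIZE is 32; for a 1080-pixel-tall frame, for example, both yield 34 grain rows.
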

View file

@ -83,37 +83,7 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = {
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 * 8], 64);
static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 * 8 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_8x8 [2 * 16 * 8 * 8], 64);
static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 * 8 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_8x8 [2 * 16 * 8 * 8], 64);
static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 * 4 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 * 4 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_4x8 [2 * 16 * 4 * 8], 32);
static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 * 4], 64);
static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_8x8 [2 * 16 * 8 * 8], 64);
static uint8_t ALIGN(wedge_masks_420_8x4 [2 * 16 * 8 * 4], 64);
static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 * 4 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_4x8 [2 * 16 * 4 * 8], 32);
static uint8_t ALIGN(wedge_masks_420_4x4 [2 * 16 * 4 * 4], 16);
const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16];
Dav1dMasks dav1d_masks;
static void insert_border(uint8_t *const dst, const uint8_t *const src,
const int ctr)
@ -136,29 +106,33 @@ static void hflip(uint8_t *const dst, const uint8_t *const src) {
dst[y_off + 64 - 1 - x] = src[y_off + x];
}
static void invert(uint8_t *const dst, const uint8_t *const src,
const int w, const int h)
{
for (int y = 0, y_off = 0; y < h; y++, y_off += w)
for (int x = 0; x < w; x++)
dst[y_off + x] = 64 - src[y_off + x];
}
static void copy2d(uint8_t *dst, const uint8_t *src,
static void copy2d(uint8_t *dst, const uint8_t *src, int sign,
const int w, const int h, const int x_off, const int y_off)
{
src += y_off * 64 + x_off;
for (int y = 0; y < h; y++) {
memcpy(dst, src, w);
src += 64;
dst += w;
if (sign) {
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
dst[x] = 64 - src[x];
src += 64;
dst += w;
}
} else {
for (int y = 0; y < h; y++) {
memcpy(dst, src, w);
src += 64;
dst += w;
}
}
}
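
With the old invert() pass folded into copy2d(), the flipped variant of a wedge mask is produced as 64 - m during the copy, so a mask and its complement always sum to 64. That is the invariant a 6-bit mask blend of the form below relies on; this is a minimal sketch of such a blend, not the dav1d kernel itself:

/* Sketch: weighting two predictions with a 0..64 mask; flipping the
 * mask (m -> 64 - m) is equivalent to swapping the two inputs. */
static inline int blend_px(const int a, const int b, const int m) {
    return (a * m + b * (64 - m) + 32) >> 6;
}
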
static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
const int sign, const int w, const int h,
const int ss_ver)
#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
static COLD uint16_t init_chroma(uint8_t *chroma, const uint8_t *luma,
const int sign, const int w, const int h,
const int ss_ver)
{
const uint16_t offset = MASK_OFFSET(chroma);
for (int y = 0; y < h; y += 1 + ss_ver) {
for (int x = 0; x < w; x += 2) {
int sum = luma[x] + luma[x + 1] + 1;
@ -168,62 +142,69 @@ static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
luma += w << ss_ver;
chroma += w >> 1;
}
return offset;
}
static COLD void fill2d_16x2(uint8_t *dst, const int w, const int h,
const enum BlockSize bs,
static COLD void fill2d_16x2(const int w, const int h, const enum BlockSize bs,
const uint8_t (*const master)[64 * 64],
const wedge_code_type *const cb,
uint8_t *masks_444, uint8_t *masks_422,
uint8_t *masks_420, const unsigned signs)
uint8_t *masks_420, unsigned signs)
{
uint8_t *ptr = dst;
for (int n = 0; n < 16; n++) {
copy2d(ptr, master[cb[n].direction], w, h,
32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
ptr += w * h;
}
for (int n = 0, off = 0; n < 16; n++, off += w * h)
invert(ptr + off, dst + off, w, h);
const int n_stride_444 = (w * h);
const int n_stride_422 = n_stride_444 >> 1;
const int n_stride_420 = n_stride_444 >> 2;
const int sign_stride_444 = 16 * n_stride_444;
const int sign_stride_422 = 16 * n_stride_422;
const int sign_stride_420 = 16 * n_stride_420;
// assign pointers in externally visible array
// assign pointer offsets in lookup table
for (int n = 0; n < 16; n++) {
const int sign = (signs >> n) & 1;
dav1d_wedge_masks[bs][0][0][n] = &masks_444[ sign * sign_stride_444];
const int sign = signs & 1;
copy2d(masks_444, master[cb[n].direction], sign, w, h,
32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
// not using !sign is intentional here, since 444 does not require
// any rounding since no chroma subsampling is applied.
dav1d_wedge_masks[bs][0][1][n] = &masks_444[ sign * sign_stride_444];
dav1d_wedge_masks[bs][1][0][n] = &masks_422[ sign * sign_stride_422];
dav1d_wedge_masks[bs][1][1][n] = &masks_422[!sign * sign_stride_422];
dav1d_wedge_masks[bs][2][0][n] = &masks_420[ sign * sign_stride_420];
dav1d_wedge_masks[bs][2][1][n] = &masks_420[!sign * sign_stride_420];
dav1d_masks.offsets[0][bs].wedge[0][n] =
dav1d_masks.offsets[0][bs].wedge[1][n] = MASK_OFFSET(masks_444);
dav1d_masks.offsets[1][bs].wedge[0][n] =
init_chroma(&masks_422[ sign * sign_stride_422], masks_444, 0, w, h, 0);
dav1d_masks.offsets[1][bs].wedge[1][n] =
init_chroma(&masks_422[!sign * sign_stride_422], masks_444, 1, w, h, 0);
dav1d_masks.offsets[2][bs].wedge[0][n] =
init_chroma(&masks_420[ sign * sign_stride_420], masks_444, 0, w, h, 1);
dav1d_masks.offsets[2][bs].wedge[1][n] =
init_chroma(&masks_420[!sign * sign_stride_420], masks_444, 1, w, h, 1);
signs >>= 1;
masks_444 += n_stride_444;
masks_422 += n_stride_422;
masks_420 += n_stride_420;
// since the pointers come from inside, we know that
// violation of the const is OK here. Any other approach
// means we would have to duplicate the sign correction
// logic in two places, which isn't very nice, or mark
// the table faced externally as non-const, which also sucks
init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][0][n],
dav1d_wedge_masks[bs][0][0][n], 0, w, h, 0);
init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][1][n],
dav1d_wedge_masks[bs][0][0][n], 1, w, h, 0);
init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][0][n],
dav1d_wedge_masks[bs][0][0][n], 0, w, h, 1);
init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][1][n],
dav1d_wedge_masks[bs][0][0][n], 1, w, h, 1);
}
}
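
The per-size signs word passed into fill2d_16x2() carries one flip bit per wedge index and is now consumed LSB-first (signs & 1, then signs >>= 1) rather than indexed with (signs >> n) & 1; both forms visit the same bits. For the BS_32x32 word 0x7bfb, for example, wedge indices 2, 10 and 15 come out with sign 0 and the rest with sign 1.
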
COLD void dav1d_init_wedge_masks(void) {
static COLD void build_nondc_ii_masks(uint8_t *const mask_v, const int w,
const int h, const int step)
{
static const uint8_t ii_weights_1d[32] = {
60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
};
uint8_t *const mask_h = &mask_v[w * h];
uint8_t *const mask_sm = &mask_h[w * h];
for (int y = 0, off = 0; y < h; y++, off += w) {
memset(&mask_v[off], ii_weights_1d[y * step], w);
for (int x = 0; x < w; x++) {
mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
mask_h[off + x] = ii_weights_1d[x * step];
}
}
}
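
build_nondc_ii_masks() fills the vertical, horizontal and smooth inter-intra masks for one block size in a single pass over the shared ii_weights_1d table: row y of the vertical mask gets ii_weights_1d[y * step], column x of the horizontal mask gets ii_weights_1d[x * step], and the smooth mask uses the smaller of the two coordinates. In the 8x8 case (step = 4), for example, mask_v row 2 is filled with ii_weights_1d[8] = 19, mask_h column 5 uses ii_weights_1d[20] = 4, and mask_sm[2][5] = ii_weights_1d[imin(2, 5) * 4] = 19.
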
COLD void dav1d_init_ii_wedge_masks(void) {
// This function is guaranteed to be called only once
enum WedgeMasterLineType {
@ -257,9 +238,11 @@ COLD void dav1d_init_wedge_masks(void) {
hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]);
#define fill(w, h, sz_422, sz_420, hvsw, signs) \
fill2d_16x2((uint8_t *) wedge_masks_444_##w##x##h, w, h, BS_##w##x##h, \
master, wedge_codebook_16_##hvsw, wedge_masks_444_##w##x##h, \
wedge_masks_422_##sz_422, wedge_masks_420_##sz_420, signs)
fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
master, wedge_codebook_16_##hvsw, \
dav1d_masks.wedge_444_##w##x##h, \
dav1d_masks.wedge_422_##sz_422, \
dav1d_masks.wedge_420_##sz_420, signs)
fill(32, 32, 16x32, 16x16, heqw, 0x7bfb);
fill(32, 16, 16x16, 16x8, hltw, 0x7beb);
@ -271,72 +254,46 @@ COLD void dav1d_init_wedge_masks(void) {
fill( 8, 16, 4x16, 4x8, hgtw, 0x7beb);
fill( 8, 8, 4x8, 4x4, heqw, 0x7bfb);
#undef fill
}
#define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1)
static uint8_t ALIGN(ii_dc_mask[32 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_8x8 [N_II_PRED_MODES][ 8 * 8], 64);
static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_4x8 [N_II_PRED_MODES][ 4 * 8], 32);
static uint8_t ALIGN(ii_nondc_mask_4x4 [N_II_PRED_MODES][ 4 * 4], 16);
#undef N_II_PRED_MODES
memset(dav1d_masks.ii_dc, 32, 32 * 32);
for (int c = 0; c < 3; c++) {
dav1d_masks.offsets[c][BS_32x32-BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_32x16-BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_16x32-BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_16x16-BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_16x8 -BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_8x16 -BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_8x8 -BS_32x32].ii[II_DC_PRED] =
MASK_OFFSET(dav1d_masks.ii_dc);
}
#define set1(sz) \
[II_DC_PRED] = ii_dc_mask, \
[II_VERT_PRED] = ii_nondc_mask_##sz[II_VERT_PRED - 1], \
[II_HOR_PRED] = ii_nondc_mask_##sz[II_HOR_PRED - 1], \
[II_SMOOTH_PRED] = ii_nondc_mask_##sz[II_SMOOTH_PRED - 1]
#define set(sz_444, sz_422, sz_420) \
{ { set1(sz_444) }, { set1(sz_422) }, { set1(sz_420) } }
const uint8_t *dav1d_ii_masks[N_BS_SIZES][3][N_INTER_INTRA_PRED_MODES] = {
[BS_8x8] = set( 8x8, 4x8, 4x4),
[BS_8x16] = set( 8x16, 4x16, 4x8),
[BS_16x8] = set(16x16, 8x8, 8x8),
[BS_16x16] = set(16x16, 8x16, 8x8),
[BS_16x32] = set(16x32, 8x32, 8x16),
[BS_32x16] = set(32x32, 16x16, 16x16),
[BS_32x32] = set(32x32, 16x32, 16x16),
};
#undef set
#undef set1
#define BUILD_NONDC_II_MASKS(w, h, step) \
build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
static COLD void build_nondc_ii_masks(uint8_t *const mask_v,
uint8_t *const mask_h,
uint8_t *const mask_sm,
const int w, const int h, const int step)
{
static const uint8_t ii_weights_1d[] = {
60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
};
#define ASSIGN_NONDC_II_OFFSET(bs, w444, h444, w422, h422, w420, h420) \
dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
for (int y = 0, off = 0; y < h; y++, off += w) {
memset(&mask_v[off], ii_weights_1d[y * step], w);
for (int x = 0; x < w; x++) {
mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
mask_h[off + x] = ii_weights_1d[x * step];
}
BUILD_NONDC_II_MASKS(32, 32, 1);
BUILD_NONDC_II_MASKS(16, 32, 1);
BUILD_NONDC_II_MASKS(16, 16, 2);
BUILD_NONDC_II_MASKS( 8, 32, 1);
BUILD_NONDC_II_MASKS( 8, 16, 2);
BUILD_NONDC_II_MASKS( 8, 8, 4);
BUILD_NONDC_II_MASKS( 4, 16, 2);
BUILD_NONDC_II_MASKS( 4, 8, 4);
BUILD_NONDC_II_MASKS( 4, 4, 8);
for (int p = 0; p < 3; p++) {
ASSIGN_NONDC_II_OFFSET(BS_32x32, 32, 32, 16, 32, 16, 16);
ASSIGN_NONDC_II_OFFSET(BS_32x16, 32, 32, 16, 16, 16, 16);
ASSIGN_NONDC_II_OFFSET(BS_16x32, 16, 32, 8, 32, 8, 16);
ASSIGN_NONDC_II_OFFSET(BS_16x16, 16, 16, 8, 16, 8, 8);
ASSIGN_NONDC_II_OFFSET(BS_16x8, 16, 16, 8, 8, 8, 8);
ASSIGN_NONDC_II_OFFSET(BS_8x16, 8, 16, 4, 16, 4, 8);
ASSIGN_NONDC_II_OFFSET(BS_8x8, 8, 8, 4, 8, 4, 4);
}
}
COLD void dav1d_init_interintra_masks(void) {
// This function is guaranteed to be called only once
memset(ii_dc_mask, 32, 32 * 32);
#define set(a) a[II_VERT_PRED - 1], a[II_HOR_PRED - 1], a[II_SMOOTH_PRED - 1]
build_nondc_ii_masks(set(ii_nondc_mask_32x32), 32, 32, 1);
build_nondc_ii_masks(set(ii_nondc_mask_16x32), 16, 32, 1);
build_nondc_ii_masks(set(ii_nondc_mask_16x16), 16, 16, 2);
build_nondc_ii_masks(set(ii_nondc_mask_8x32), 8, 32, 1);
build_nondc_ii_masks(set(ii_nondc_mask_8x16), 8, 16, 2);
build_nondc_ii_masks(set(ii_nondc_mask_8x8), 8, 8, 4);
build_nondc_ii_masks(set(ii_nondc_mask_4x16), 4, 16, 2);
build_nondc_ii_masks(set(ii_nondc_mask_4x8), 4, 8, 4);
build_nondc_ii_masks(set(ii_nondc_mask_4x4), 4, 4, 8);
#undef set
}

View file

@ -30,12 +30,67 @@
#include "src/levels.h"
void dav1d_init_wedge_masks(void);
EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
[2 /* sign */][16 /* wedge_idx */];
typedef struct {
/* Offsets, in units of 8 bytes, relative to the start of the struct. */
struct {
uint16_t wedge[2 /* sign */][16 /* wedge_idx */];
uint16_t ii[N_INTER_INTRA_PRED_MODES];
} offsets[3 /* 444, 422, 420 */][BS_8x8 - BS_32x32 + 1];
void dav1d_init_interintra_masks(void);
EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
[N_INTER_INTRA_PRED_MODES];
uint8_t ALIGN(wedge_444_32x32[ 16 * 32 * 32], 64);
uint8_t ALIGN(wedge_444_32x16[ 16 * 32 * 16], 64);
uint8_t ALIGN(wedge_444_32x8 [ 16 * 32 * 8], 64);
uint8_t ALIGN(wedge_444_16x32[ 16 * 16 * 32], 64);
uint8_t ALIGN(wedge_444_16x16[ 16 * 16 * 16], 64);
uint8_t ALIGN(wedge_444_16x8 [ 16 * 16 * 8], 64);
uint8_t ALIGN(wedge_444_8x32 [ 16 * 8 * 32], 64);
uint8_t ALIGN(wedge_444_8x16 [ 16 * 8 * 16], 64);
uint8_t ALIGN(wedge_444_8x8 [ 16 * 8 * 8], 64);
uint8_t ALIGN(wedge_422_16x32[2 * 16 * 16 * 32], 64);
uint8_t ALIGN(wedge_422_16x16[2 * 16 * 16 * 16], 64);
uint8_t ALIGN(wedge_422_16x8 [2 * 16 * 16 * 8], 64);
uint8_t ALIGN(wedge_422_8x32 [2 * 16 * 8 * 32], 64);
uint8_t ALIGN(wedge_422_8x16 [2 * 16 * 8 * 16], 64);
uint8_t ALIGN(wedge_422_8x8 [2 * 16 * 8 * 8], 64);
uint8_t ALIGN(wedge_422_4x32 [2 * 16 * 4 * 32], 64);
uint8_t ALIGN(wedge_422_4x16 [2 * 16 * 4 * 16], 64);
uint8_t ALIGN(wedge_422_4x8 [2 * 16 * 4 * 8], 64);
uint8_t ALIGN(wedge_420_16x16[2 * 16 * 16 * 16], 64);
uint8_t ALIGN(wedge_420_16x8 [2 * 16 * 16 * 8], 64);
uint8_t ALIGN(wedge_420_16x4 [2 * 16 * 16 * 4], 64);
uint8_t ALIGN(wedge_420_8x16 [2 * 16 * 8 * 16], 64);
uint8_t ALIGN(wedge_420_8x8 [2 * 16 * 8 * 8], 64);
uint8_t ALIGN(wedge_420_8x4 [2 * 16 * 8 * 4], 64);
uint8_t ALIGN(wedge_420_4x16 [2 * 16 * 4 * 16], 64);
uint8_t ALIGN(wedge_420_4x8 [2 * 16 * 4 * 8], 64);
uint8_t ALIGN(wedge_420_4x4 [2 * 16 * 4 * 4], 64);
uint8_t ALIGN(ii_dc [ 32 * 32], 64);
uint8_t ALIGN(ii_nondc_32x32[3 * 32 * 32], 64);
uint8_t ALIGN(ii_nondc_16x32[3 * 16 * 32], 64);
uint8_t ALIGN(ii_nondc_16x16[3 * 16 * 16], 64);
uint8_t ALIGN(ii_nondc_8x32 [3 * 8 * 32], 64);
uint8_t ALIGN(ii_nondc_8x16 [3 * 8 * 16], 64);
uint8_t ALIGN(ii_nondc_8x8 [3 * 8 * 8], 64);
uint8_t ALIGN(ii_nondc_4x16 [3 * 4 * 16], 64);
uint8_t ALIGN(ii_nondc_4x8 [3 * 4 * 8], 32);
uint8_t ALIGN(ii_nondc_4x4 [3 * 4 * 4], 16);
} Dav1dMasks;
#define II_MASK(c, bs, b) \
((const uint8_t*)((uintptr_t)&dav1d_masks + \
(size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \
dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \
dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8))
#define WEDGE_MASK(c, bs, sign, idx) \
((const uint8_t*)((uintptr_t)&dav1d_masks + \
(size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8))
EXTERN Dav1dMasks dav1d_masks;
void dav1d_init_ii_wedge_masks(void);
#endif /* DAV1D_SRC_WEDGE_H */
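
The mask tables are now addressed through 16-bit offsets, counted in 8-byte units from the start of the single dav1d_masks struct, instead of full pointers; II_MASK and WEDGE_MASK above simply undo the MASK_OFFSET scaling. A minimal sketch of the round trip, assuming every stored mask starts on an 8-byte boundary inside the struct (so the >> 3 is exact) and the struct stays under 512 KiB (so the scaled offset fits in 16 bits):

/* Sketch of what WEDGE_MASK/II_MASK expand to, not dav1d code. */
static const uint8_t *mask_from_offset(const uint16_t off) {
    return (const uint8_t *)&dav1d_masks + (size_t)off * 8;
}
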

View file

@ -4885,24 +4885,26 @@ cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
jg .w32_wpad
jmp .w32_hpad
cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m3, [palq]
cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
lea r2, [pal_pred_16bpc_avx2_table]
tzcnt wd, wm
vbroadcasti128 m4, [pal_pred_shuf]
vbroadcasti128 m5, [pal_pred_shuf]
movifnidn hd, hm
movsxd wq, [r2+wq*4]
pshufb m3, m4
punpckhqdq m4, m3, m3
pshufb m4, m5
punpckhqdq m5, m4, m4
add wq, r2
DEFINE_ARGS dst, stride, stride3, idx, w, h
lea stride3q, [strideq*3]
jmp wq
.w4:
mova xm2, [idxq]
add idxq, 16
pshufb xm1, xm3, xm2
pshufb xm2, xm4, xm2
movq xm0, [idxq]
add idxq, 8
psrlw xm1, xm0, 4
punpcklbw xm0, xm1
pshufb xm1, xm4, xm0
pshufb xm2, xm5, xm0
punpcklbw xm0, xm1, xm2
punpckhbw xm1, xm2
movq [dstq+strideq*0], xm0
@ -4914,10 +4916,12 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w4
RET
.w8:
movu m2, [idxq] ; only 16-byte alignment
add idxq, 32
pshufb m1, m3, m2
pshufb m2, m4, m2
pmovzxbw m2, [idxq]
add idxq, 16
psllw m1, m2, 4
por m2, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], xm0
@ -4929,19 +4933,22 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w8
RET
.w16:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
@ -4949,41 +4956,47 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w16
RET
.w32:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+32], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+32], m1
mova [dstq+ 0], m0
mova [dstq+32], m1
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+32], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32
RET
.w64:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+ 0], m0
mova [dstq+32], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+64], m0
mova [dstq+96], m1
mova [dstq+32*0], m0
mova [dstq+32*1], m1
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+32*2], m0
mova [dstq+32*3], m1
add dstq, strideq
dec hd
jg .w64
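
The pal_pred changes in this and the following assembly files all stem from the palette index buffer now holding two 4-bit indices per byte, low nibble first (matching the lo + hi * 16 packing done by pal_idx_finish in the new pal.asm later in this commit); the nibbles are expanded back into byte lanes with psrlw/punpcklbw here, or vpmultishiftqb in the AVX-512 versions, before the usual pshufb/vpermw palette lookup. A scalar sketch of the packed lookup, with the exact layout treated as an assumption:

/* Sketch only: w is even, idx holds w/2 packed bytes per row, pal has
 * up to 8 entries, stride is in pixels. */
static void pal_pred_packed(pixel *dst, const ptrdiff_t stride,
                            const pixel *const pal, const uint8_t *idx,
                            const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x += 2) {
            const int b = idx[x >> 1];
            dst[x + 0] = pal[b & 0xf]; /* low nibble = left pixel   */
            dst[x + 1] = pal[b >> 4];  /* high nibble = right pixel */
        }
        idx += w >> 1;
        dst += stride;
    }
}
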

View file

@ -38,10 +38,10 @@ smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
times 4 db 10, 11, 12, 13, 2, 3, -1, -1
filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7
@ -57,6 +57,8 @@ filter_shift: times 2 dw 6
dd 0
times 2 dw 4
dd 9
pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44
db 16, 24, 20, 28, 48, 56, 52, 60
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@ -610,20 +612,23 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
jg .w64_loop
RET
cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
lea r6, [pal_pred_16bpc_avx512icl_table]
tzcnt wd, wm
mova m2, [pal_pred_perm]
movsxd wq, [r6+wq*4]
mova xm3, [palq]
mova m3, [pal_pred_perm]
movifnidn hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastq m4, [pal_unpack+0]
vpbroadcastq m5, [pal_unpack+8]
add wq, r6
vbroadcasti32x4 m6, [palq]
lea stride3q, [strideq*3]
jmp wq
.w4:
pmovzxbw ym0, [idxq]
add idxq, 16
vpermw ym0, ym0, ym3
pmovzxbd ym0, [idxq]
add idxq, 8
vpmultishiftqb ym0, ym4, ym0
vpermw ym0, ym0, ym6
vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
@ -634,9 +639,10 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w4
RET
.w8:
pmovzxbw m0, [idxq]
add idxq, 32
vpermw m0, m0, m3
pmovzxbd m0, [idxq]
add idxq, 16
vpmultishiftqb m0, m4, m0
vpermw m0, m0, m6
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
@ -646,11 +652,13 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w8
RET
.w16:
vpermb m1, m2, [idxq]
add idxq, 64
vpermw m0, m1, m3
movu ym1, [idxq]
add idxq, 32
vpermb m1, m3, m1
vpmultishiftqb m1, m4, m1
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
vpermw m1, m1, m6
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
@ -660,27 +668,41 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w16
RET
.w32:
vpermb m1, m2, [idxq]
vpermb m2, m3, [idxq]
add idxq, 64
vpermw m0, m1, m3
vpmultishiftqb m1, m4, m2
vpmultishiftqb m2, m5, m2
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
vpermw m1, m1, m6
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
vpermw m0, m2, m6
psrlw m2, 8
vpermw m1, m2, m6
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
RET
.w64:
vpermb m1, m2, [idxq]
vpermb m2, m3, [idxq]
add idxq, 64
vpermw m0, m1, m3
vpmultishiftqb m1, m4, m2
vpmultishiftqb m2, m5, m2
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, strideq
dec hd
vpermw m1, m1, m6
mova [dstq+ 0], m0
mova [dstq+64], m1
vpermw m0, m2, m6
psrlw m2, 8
vpermw m1, m2, m6
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+64], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w64
RET

View file

@ -3964,25 +3964,27 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
jg .w32_hpad_loop
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
%define hd r2d
%endif
mova m3, [palq]
mova m4, [palq]
LEA r2, pal_pred_16bpc_ssse3_table
tzcnt wd, wm
pshufb m3, [base+pal_pred_shuf]
pshufb m4, [base+pal_pred_shuf]
movsxd wq, [r2+wq*4]
pshufd m4, m3, q1032
pshufd m5, m4, q1032
add wq, r2
movifnidn hd, hm
jmp wq
.w4:
mova m0, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
movq m0, [idxq]
add idxq, 8
psrlw m1, m0, 4
punpcklbw m0, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
movq [dstq+strideq*0], m0
@ -3995,77 +3997,102 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
jg .w4
RET
.w8:
mova m0, [idxq]
movu m3, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 4
jg .w8
RET
.w16:
mova m0, [idxq]
movu m3, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
mova [dstq+ 0], m0
mova [dstq+16], m1
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+16], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16
RET
.w32:
mova m0, [idxq+16*0]
pshufb m1, m3, m0
pshufb m2, m4, m0
movu m3, [idxq]
add idxq, 16
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova m2, [idxq+16*1]
add idxq, 16*2
mova [dstq+16*0], m0
pshufb m0, m3, m2
mova [dstq+16*1], m1
pshufb m1, m4, m2
punpcklbw m2, m0, m1
punpckhbw m0, m1
mova [dstq+16*2], m2
mova [dstq+16*3], m0
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*2], m0
mova [dstq+16*3], m1
add dstq, strideq
dec hd
jg .w32
RET
.w64:
mova m0, [idxq+16*0]
pshufb m1, m3, m0
pshufb m2, m4, m0
movu m3, [idxq+16*0]
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova m2, [idxq+16*1]
mova [dstq+16*0], m0
pshufb m0, m3, m2
mova [dstq+16*1], m1
pshufb m1, m4, m2
punpcklbw m2, m0, m1
punpckhbw m0, m1
mova m1, [idxq+16*2]
mova [dstq+16*2], m2
pshufb m2, m3, m1
mova [dstq+16*3], m0
pshufb m0, m4, m1
punpcklbw m1, m2, m0
punpckhbw m2, m0
mova m0, [idxq+16*3]
add idxq, 16*4
mova [dstq+16*4], m1
pshufb m1, m3, m0
mova [dstq+16*5], m2
pshufb m2, m4, m0
pshufb m1, m4, m3
pshufb m2, m5, m3
movu m3, [idxq+16*1]
add idxq, 32
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*2], m0
mova [dstq+16*3], m1
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*4], m0
mova [dstq+16*5], m1
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*6], m0

View file

@ -5307,18 +5307,20 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_
RET
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
vpbroadcastq m4, [palq]
lea r2, [pal_pred_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r2+wq*4]
packuswb m4, m4
add wq, r2
lea r2, [strideq*3]
jmp wq
.w4:
pshufb xm0, xm4, [idxq]
add idxq, 16
movq xm0, [idxq]
add idxq, 8
psrlw xm1, xm0, 4
punpcklbw xm0, xm1
pshufb xm0, xm4, xm0
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
pextrd [dstq+strideq*2], xm0, 2
@ -5327,11 +5329,14 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
sub hd, 4
jg .w4
RET
ALIGN function_align
.w8:
pshufb xm0, xm4, [idxq+16*0]
pshufb xm1, xm4, [idxq+16*1]
add idxq, 16*2
movu xm2, [idxq]
add idxq, 16
pshufb xm1, xm4, xm2
psrlw xm2, 4
pshufb xm2, xm4, xm2
punpcklbw xm0, xm1, xm2
punpckhbw xm1, xm2
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
@ -5340,47 +5345,48 @@ ALIGN function_align
sub hd, 4
jg .w8
RET
ALIGN function_align
.w16:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
add idxq, 32*2
movu m2, [idxq]
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], xm1
mova [dstq+strideq*1], xm1
vextracti128 [dstq+strideq*2], m0, 1
vextracti128 [dstq+r2 ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
ALIGN function_align
.w32:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
pshufb m2, m4, [idxq+32*2]
pshufb m3, m4, [idxq+32*3]
add idxq, 32*4
vpermq m2, [idxq], q3120
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r2 ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
RET
ALIGN function_align
.w64:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
pshufb m2, m4, [idxq+32*2]
pshufb m3, m4, [idxq+32*3]
add idxq, 32*4
mova [dstq+strideq*0+32*0], m0
mova [dstq+strideq*0+32*1], m1
mova [dstq+strideq*1+32*0], m2
mova [dstq+strideq*1+32*1], m3
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32
RET
.w64:
vpermq m2, [idxq], q3120
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+32*0], m0
mova [dstq+32*1], m1
add dstq, strideq
dec hd
jg .w64
RET

View file

@ -95,6 +95,8 @@ smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_127_m127: times 2 db 127, -127
pb_128: times 4 db 128
@ -126,7 +128,6 @@ JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64
SECTION .text
@ -1111,19 +1112,20 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
jg .w64_loop
RET
cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
lea r6, [pal_pred_8bpc_avx512icl_table]
tzcnt wd, wm
vbroadcasti32x4 m4, [palq]
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
movifnidn wd, wm
movifnidn hd, hm
movsxd wq, [r6+wq*4]
packuswb m4, m4
add wq, r6
lea stride3q, [strideq*3]
jmp wq
cmp wd, 8
jg .w32
movq xmm3, [palq]
je .w8
.w4:
pshufb xmm0, xm4, [idxq]
add idxq, 16
movq xmm0, [idxq]
add idxq, 8
psrlw xmm1, xmm0, 4
punpcklbw xmm0, xmm1
pshufb xmm0, xmm3, xmm0
movd [dstq+strideq*0], xmm0
pextrd [dstq+strideq*1], xmm0, 1
pextrd [dstq+strideq*2], xmm0, 2
@ -1133,9 +1135,13 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w4
RET
.w8:
pshufb xmm0, xm4, [idxq+16*0]
pshufb xmm1, xm4, [idxq+16*1]
add idxq, 16*2
movu xmm2, [idxq]
add idxq, 16
pshufb xmm1, xmm3, xmm2
psrlw xmm2, 4
pshufb xmm2, xmm3, xmm2
punpcklbw xmm0, xmm1, xmm2
punpckhbw xmm1, xmm2
movq [dstq+strideq*0], xmm0
movhps [dstq+strideq*1], xmm0
movq [dstq+strideq*2], xmm1
@ -1145,8 +1151,10 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w8
RET
.w16:
pshufb m0, m4, [idxq]
add idxq, 64
pmovzxdq m0, [idxq]
add idxq, 32
vpmultishiftqb m0, m3, m0
pshufb m0, m5, m0
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
@ -1156,29 +1164,39 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w16
RET
.w32:
pshufb m0, m4, [idxq+64*0]
pshufb m1, m4, [idxq+64*1]
add idxq, 64*2
vpbroadcastq m3, [pal_unpack+0]
vpbroadcastq m5, [palq]
cmp wd, 32
jl .w16
pmovzxbd m2, [pal_perm]
vpbroadcastq m4, [pal_unpack+8]
jg .w64
.w32_loop:
vpermd m1, m2, [idxq]
add idxq, 64
vpmultishiftqb m0, m3, m1
vpmultishiftqb m1, m4, m1
pshufb m0, m5, m0
pshufb m1, m5, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
jg .w32_loop
RET
.w64:
pshufb m0, m4, [idxq+64*0]
pshufb m1, m4, [idxq+64*1]
pshufb m2, m4, [idxq+64*2]
pshufb m3, m4, [idxq+64*3]
add idxq, 64*4
vpermd m1, m2, [idxq]
add idxq, 64
vpmultishiftqb m0, m3, m1
vpmultishiftqb m1, m4, m1
pshufb m0, m5, m0
pshufb m1, m5, m1
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w64
RET

View file

@ -3479,26 +3479,28 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
jg .end_transpose_loop
RET
;---------------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
; const uint8_t *idx, const int w, const int h);
;---------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
; const uint8_t *idx, int w, int h);
;-------------------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
mova m4, [palq]
movq m4, [palq]
LEA r2, pal_pred_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r2+wq*4]
packuswb m4, m4
add wq, r2
lea r2, [strideq*3]
jmp wq
.w4:
pshufb m0, m4, [idxq]
add idxq, 16
movd [dstq ], m0
movq m1, [idxq]
add idxq, 8
psrlw m0, m1, 4
punpcklbw m1, m0
pshufb m0, m4, m1
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq ], m1
movd [dstq+strideq*1], m1
punpckhqdq m0, m0
movd [dstq+strideq*2], m0
psrlq m0, 32
@ -3507,60 +3509,68 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
sub hd, 4
jg .w4
RET
ALIGN function_align
.w8:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
add idxq, 32
movq [dstq ], m0
movhps [dstq+strideq ], m0
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [dstq+strideq*2], m1
movhps [dstq+r2 ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8
RET
ALIGN function_align
.w16:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+strideq ], m1
mova [dstq+strideq*2], m2
mova [dstq+r2 ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
ALIGN function_align
.w32:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+16 ], m1
mova [dstq+strideq ], m2
mova [dstq+strideq+16], m3
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16
RET
.w32:
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
jg .w32
RET
ALIGN function_align
.w64:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
movu m0, [idxq+16*0]
movu m2, [idxq+16*1]
add idxq, 32
pshufb m1, m4, m0
psrlw m0, 4
pshufb m3, m4, m0
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+16*0], m0
mova [dstq+16*1], m1
pshufb m1, m4, m2
psrlw m2, 4
pshufb m3, m4, m2
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+16*2], m0
mova [dstq+16*3], m1
add dstq, strideq
sub hd, 1
jg .w64

641 third_party/dav1d/src/x86/pal.asm vendored Normal file
View file

@ -0,0 +1,641 @@
; Copyright © 2023, VideoLAN and dav1d authors
; Copyright © 2023, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 64
pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%if ARCH_X86_64
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
%endif
pal_idx_w8_padh: db 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11
pb_1_16: times 4 db 1, 16
%if ARCH_X86_64
pb_32: times 4 db 32
%endif
%macro JMP_TABLE 2-*
%xdefine %1_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1)
%%table:
%rep %0 - 1
dd %%base %+ .w%2 - (%%table - 2*4)
%rotate 1
%endrep
%endmacro
JMP_TABLE pal_idx_finish_ssse3, 4, 8, 16, 32, 64
%if ARCH_X86_64
JMP_TABLE pal_idx_finish_avx2, 4, 8, 16, 32, 64
JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64
%endif
SECTION .text
INIT_XMM ssse3
cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_ssse3_table
LEA r6, pal_idx_finish_ssse3_table
tzcnt bwd, bwm
movifnidn bhd, bhm
movifnidn wd, wm
movifnidn hd, hm
movsxd bwq, [r6+bwq*4]
movddup m3, [base+pb_1_16]
add bwq, r6
sub bhd, hd
jmp bwq
.w4:
mova m0, [srcq]
add srcq, 16
pmaddubsw m0, m3
packuswb m0, m0
movq [dstq], m0
add dstq, 8
sub hd, 4
jg .w4
test bhd, bhd
jz .w4_end
pshuflw m0, m0, q3333
.w4_padv:
movq [dstq], m0
add dstq, 8
sub bhd, 4
jg .w4_padv
.w4_end:
RET
.w8_padh:
pshufb m0, m2
pshufb m1, m2
jmp .w8_main
.w8:
mova m2, [base+pal_idx_w8_padh]
.w8_loop:
mova m0, [srcq+16*0]
mova m1, [srcq+16*1]
cmp wd, 8
jl .w8_padh
.w8_main:
pmaddubsw m0, m3
pmaddubsw m1, m3
add srcq, 16*2
packuswb m0, m1
movu [dstq], m0
add dstq, 16
sub hd, 4
jg .w8_loop
test bhd, bhd
jz .w8_end
pshufd m0, m0, q3333
.w8_padv:
movu [dstq], m0
add dstq, 16
sub bhd, 4
jg .w8_padv
.w8_end:
RET
.w16_padh:
pshufb m0, m4
pshufb m1, m4
jmp .w16_main
.w16:
cmp wd, 16
je .w16_loop
call .setup_padh
.w16_loop:
mova m0, [srcq+16*0]
mova m1, [srcq+16*1]
cmp wd, 16
jl .w16_padh
.w16_main:
pmaddubsw m0, m3
pmaddubsw m1, m3
add srcq, 16*2
packuswb m0, m1
movu [dstq], m0
add dstq, 16
sub hd, 2
jg .w16_loop
test bhd, bhd
jz .w16_end
punpckhqdq m0, m0
.w16_padv:
movu [dstq+16*0], m0
movu [dstq+16*1], m0
add dstq, 16*2
sub bhd, 4
jg .w16_padv
.w16_end:
RET
.w32_padh:
cmp wd, 16
jg .w32_padh2
pshufb m1, m0, m5
pshufb m0, m4
jmp .w32_main
.w32_padh2:
pshufb m1, m4
jmp .w32_main
.w32:
cmp wd, 32
je .w32_loop
call .setup_padh
.w32_loop:
mova m0, [srcq+16*0]
mova m1, [srcq+16*1]
cmp wd, 32
jl .w32_padh
.w32_main:
pmaddubsw m0, m3
pmaddubsw m1, m3
add srcq, 16*2
packuswb m0, m1
movu [dstq], m0
add dstq, 16
dec hd
jg .w32_loop
test bhd, bhd
jz .w32_end
.w32_padv:
movu [dstq+16*0], m0
movu [dstq+16*1], m0
movu [dstq+16*2], m0
movu [dstq+16*3], m0
add dstq, 16*4
sub bhd, 4
jg .w32_padv
.w32_end:
RET
.w64_padh:
cmp wd, 16
jg .w64_padh2
pshufb m1, m0, m5
pshufb m0, m4
pmaddubsw m0, m3
pmaddubsw m1, m3
packuswb m0, m1
packuswb m1, m1
jmp .w64_main
.w64_padh2:
pshufb m1, m4
pmaddubsw m0, m3
pmaddubsw m2, m1, m3
pshufb m1, m5
pmaddubsw m1, m3
packuswb m0, m2
packuswb m1, m1
jmp .w64_main
.w64_padh3:
cmp wd, 48
jg .w64_padh4
pshufb m2, m1, m5
pshufb m1, m4
jmp .w64_main2
.w64_padh4:
pshufb m2, m4
jmp .w64_main2
.w64:
cmp wd, 64
je .w64_loop
call .setup_padh
.w64_loop:
mova m0, [srcq+16*0]
mova m1, [srcq+16*1]
cmp wd, 32
jle .w64_padh
pmaddubsw m0, m3
pmaddubsw m1, m3
packuswb m0, m1
mova m1, [srcq+16*2]
mova m2, [srcq+16*3]
cmp wd, 64
jl .w64_padh3
.w64_main2:
pmaddubsw m1, m3
pmaddubsw m2, m3
packuswb m1, m2
.w64_main:
add srcq, 16*4
movu [dstq+16*0], m0
movu [dstq+16*1], m1
add dstq, 16*2
dec hd
jg .w64_loop
test bhd, bhd
jz .w64_end
.w64_padv:
movu [dstq+16*0], m0
movu [dstq+16*1], m1
movu [dstq+16*2], m0
movu [dstq+16*3], m1
add dstq, 16*4
sub bhd, 2
jg .w64_padv
.w64_end:
RET
.setup_padh:
mova m4, [base+pb_0to63]
lea r6d, [wq-1]
and r6d, 15
movd m5, r6d
pxor m0, m0
pshufb m5, m0
pminub m4, m5
ret
%if ARCH_X86_64
INIT_YMM avx2
cglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_avx2_table
lea r6, [pal_idx_finish_avx2_table]
tzcnt bwd, bwd
movifnidn wd, wm
movifnidn hd, hm
movsxd bwq, [r6+bwq*4]
vpbroadcastd m2, [base+pb_1_16]
dec wd
add bwq, r6
sub bhd, hd
jmp bwq
.w4:
mova xm0, [srcq]
add srcq, 16
pmaddubsw xm0, xm2
packuswb xm0, xm0
movq [dstq], xm0
add dstq, 8
sub hd, 4
jg .w4
test bhd, bhd
jz .w4_end
pshuflw xm0, xm0, q3333
.w4_padv:
movq [dstq], xm0
add dstq, 8
sub bhd, 4
jg .w4_padv
.w4_end:
RET
.w8_padh:
pshufb xm0, xm3
pshufb xm1, xm3
jmp .w8_main
.w8:
mova xm3, [base+pal_idx_w8_padh]
.w8_loop:
mova xm0, [srcq+16*0]
mova xm1, [srcq+16*1]
cmp wd, 7
jl .w8_padh
.w8_main:
pmaddubsw xm0, xm2
pmaddubsw xm1, xm2
add srcq, 16*2
packuswb xm0, xm1
movu [dstq], xm0
add dstq, 16
sub hd, 4
jg .w8_loop
test bhd, bhd
jz .w8_end
pshufd xm0, xm0, q3333
.w8_padv:
movu [dstq], xm0
add dstq, 16
sub bhd, 4
jg .w8_padv
.w8_end:
RET
.w16_padh:
pshufb m0, m3
pshufb m1, m3
jmp .w16_main
.w16:
cmp wd, 15
je .w16_loop
vbroadcasti128 m0, [base+pb_0to63]
movd xm3, wd
vpbroadcastb m3, xm3
pminub m3, m0
.w16_loop:
mova m0, [srcq+32*0]
mova m1, [srcq+32*1]
cmp wd, 15
jl .w16_padh
.w16_main:
pmaddubsw m0, m2
pmaddubsw m1, m2
add srcq, 32*2
packuswb m0, m1
vpermq m1, m0, q3120
movu [dstq], m1
add dstq, 32
sub hd, 4
jg .w16_loop
test bhd, bhd
jz .w16_end
vpermq m0, m0, q3333
.w16_padv:
movu [dstq], m0
add dstq, 32
sub bhd, 4
jg .w16_padv
.w16_end:
RET
.w32_padh:
cmp wd, 15
jg .w32_padh2
vinserti128 m0, xm0, 1
vinserti128 m1, xm1, 1
.w32_padh2:
pshufb m0, m3
pshufb m1, m3
jmp .w32_main
.w32:
cmp wd, 31
je .w32_loop
movd xm3, wd
vpbroadcastb m3, xm3
pminub m3, [base+pb_0to63]
.w32_loop:
mova m0, [srcq+32*0]
mova m1, [srcq+32*1]
cmp wd, 31
jl .w32_padh
.w32_main:
pmaddubsw m0, m2
pmaddubsw m1, m2
add srcq, 32*2
packuswb m0, m1
vpermq m1, m0, q3120
movu [dstq], m1
add dstq, 32
sub hd, 2
jg .w32_loop
test bhd, bhd
jz .w32_end
vpermq m0, m0, q3131
.w32_padv:
movu [dstq+32*0], m0
movu [dstq+32*1], m0
add dstq, 32*2
sub bhd, 4
jg .w32_padv
.w32_end:
RET
.w64_padh:
cmp wd, 15
jg .w64_padh2
vinserti128 m1, m0, xm0, 1
pshufb m0, m1, m3
pshufb m1, m4
jmp .w64_main
.w64_padh2:
cmp wd, 31
jg .w64_padh3
vperm2i128 m1, m0, m0, 0x11
pshufb m0, m3
pshufb m1, m4
jmp .w64_main
.w64_padh3:
cmp wd, 47
jg .w64_padh4
vinserti128 m1, xm1, 1
.w64_padh4:
pshufb m1, m3
jmp .w64_main
.w64:
cmp wd, 63
je .w64_loop
mov r6d, wd
and r6d, 31
movd xm4, r6d
vpbroadcastb m4, xm4
pminub m3, m4, [pb_0to63]
.w64_loop:
mova m0, [srcq+32*0]
mova m1, [srcq+32*1]
cmp wd, 63
jl .w64_padh
.w64_main:
pmaddubsw m0, m2
pmaddubsw m1, m2
add srcq, 32*2
packuswb m0, m1
vpermq m0, m0, q3120
movu [dstq], m0
add dstq, 32
dec hd
jg .w64_loop
test bhd, bhd
jz .w64_end
.w64_padv:
movu [dstq+32*0], m0
movu [dstq+32*1], m0
movu [dstq+32*2], m0
movu [dstq+32*3], m0
add dstq, 32*4
sub bhd, 4
jg .w64_padv
.w64_end:
RET
INIT_ZMM avx512icl
cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_avx512icl_table
lea r6, [pal_idx_finish_avx512icl_table]
tzcnt bwd, bwd
movifnidn wd, wm
movifnidn hd, hm
movsxd bwq, [r6+bwq*4]
vpbroadcastd m4, [base+pb_1_16]
dec wd
add bwq, r6
sub bhd, hd
jmp bwq
.w4:
mova xmm0, [srcq]
add srcq, 16
pmaddubsw xmm0, xm4
packuswb xmm0, xmm0
movq [dstq], xmm0
add dstq, 8
sub hd, 4
jg .w4
test bhd, bhd
jz .w4_end
pshuflw xmm0, xmm0, q3333
.w4_padv:
movq [dstq], xmm0
add dstq, 8
sub bhd, 4
jg .w4_padv
.w4_end:
RET
.w8_padh:
pshufb xmm0, xmm2
pshufb xmm1, xmm2
jmp .w8_main
.w8:
mova xmm2, [base+pal_idx_w8_padh]
.w8_loop:
mova xmm0, [srcq+16*0]
mova xmm1, [srcq+16*1]
cmp wd, 7
jl .w8_padh
.w8_main:
pmaddubsw xmm0, xm4
pmaddubsw xmm1, xm4
add srcq, 16*2
packuswb xmm0, xmm1
movu [dstq], xmm0
add dstq, 16
sub hd, 4
jg .w8_loop
test bhd, bhd
jz .w8_end
pshufd xmm0, xmm0, q3333
.w8_padv:
movu [dstq], xmm0
add dstq, 16
sub bhd, 4
jg .w8_padv
.w8_end:
RET
.w16_padh:
pshufb m0, m2
jmp .w16_main
.w16:
cmp wd, 15
je .w16_loop
vbroadcasti32x4 m2, [base+pb_0to63]
vpbroadcastb m0, wd
pminub m2, m0
.w16_loop:
mova m0, [srcq]
cmp wd, 15
jl .w16_padh
.w16_main:
pmaddubsw m0, m4
add srcq, 64
vpmovwb ym0, m0
movu [dstq], ym0
add dstq, 32
sub hd, 4
jg .w16_loop
test bhd, bhd
jz .w16_end
vpermq ym0, ym0, q3333
.w16_padv:
movu [dstq], ym0
add dstq, 32
sub bhd, 4
jg .w16_padv
.w16_end:
RET
.w32_padh:
vpermb m0, m2, m0
vpermb m1, m2, m1
jmp .w32_main
.w32:
mova m2, [base+pb_0to63]
paddb m3, m2, m2
cmp wd, 31
je .w32_loop
vpbroadcastb m0, wd
mov r6d, 0xff00
kmovw k1, r6d
vpaddd m0{k1}, [pb_32] {1to16}
pminub m2, m0
.w32_loop:
mova m0, [srcq+64*0]
mova m1, [srcq+64*1]
cmp wd, 31
jl .w32_padh
.w32_main:
pmaddubsw m0, m4
pmaddubsw m1, m4
add srcq, 64*2
vpermt2b m0, m3, m1
movu [dstq], m0
add dstq, 64
sub hd, 4
jg .w32_loop
test bhd, bhd
jz .w32_end
vshufi32x4 m0, m0, q3333
.w32_padv:
movu [dstq], m0
add dstq, 64
sub bhd, 4
jg .w32_padv
.w32_end:
RET
.w64_padh:
REPX {vpermb x, m5, x}, m0, m1, m2, m3
jmp .w64_main
.w64:
mova m5, [base+pb_0to63]
paddb m6, m5, m5
cmp wd, 63
je .w64_loop
vpbroadcastb m0, wd
pminub m5, m0
.w64_loop:
mova m0, [srcq+64*0]
mova m1, [srcq+64*1]
mova m2, [srcq+64*2]
mova m3, [srcq+64*3]
cmp wd, 63
jl .w64_padh
.w64_main:
REPX {pmaddubsw x, m4}, m0, m1, m2, m3
add srcq, 64*4
vpermt2b m0, m6, m1
vpermt2b m2, m6, m3
movu [dstq+64*0], m0
movu [dstq+64*1], m2
add dstq, 64*2
sub hd, 4
jg .w64_loop
test bhd, bhd
jz .w64_end
vshufi32x4 m2, m2, q3232
.w64_padv:
movu [dstq+64*0], m2
movu [dstq+64*1], m2
add dstq, 64*2
sub bhd, 4
jg .w64_padv
.w64_end:
RET
%endif ; ARCH_X86_64
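
pal_idx_finish packs the per-pixel palette indices two to a byte and pads the result out to the full bw x bh block: the pmaddubsw against the {1, 16} pattern computes src[2i] + src[2i+1] * 16 for each pair, the *_padh paths replicate the last real column, and the *_padv loops replicate the last packed row. A rough C model, with parameter meanings inferred from the assembly rather than taken from the real prototype:

/* Hedged sketch, not the dav1d C reference: src rows are bw bytes
 * apart, dst rows bw/2 bytes apart, w x h is the coded area and
 * bw x bh the padded block; imin() is dav1d's integer min helper. */
static void pal_idx_finish_sketch(uint8_t *dst, const uint8_t *src,
                                  const int bw, const int bh,
                                  const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < bw; x += 2) {
            const int i0 = src[imin(x,     w - 1)];
            const int i1 = src[imin(x + 1, w - 1)];
            dst[x >> 1] = i0 | (i1 << 4); /* lo + hi * 16 */
        }
        src += bw;
        dst += bw >> 1;
    }
    for (int y = h; y < bh; y++, dst += bw >> 1) /* vertical padding */
        memcpy(dst, dst - (bw >> 1), bw >> 1);
}
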

50 third_party/dav1d/src/x86/pal.h vendored Normal file
View file

@ -0,0 +1,50 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_ssse3);
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx2);
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx512icl);
static ALWAYS_INLINE void pal_dsp_init_x86(Dav1dPalDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->pal_idx_finish = dav1d_pal_idx_finish_ssse3;
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
c->pal_idx_finish = dav1d_pal_idx_finish_avx2;
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
c->pal_idx_finish = dav1d_pal_idx_finish_avx512icl;
#endif
}
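
Each check above returns as soon as a required CPU flag is missing, so the last assignment that runs selects the widest SIMD level available. The bitdepth-independent src/pal.c presumably installs the C fallback first and then calls this; a purely hypothetical sketch of such a caller (names assumed, not taken from the source):

/* Hypothetical caller; the real init in src/pal.c may differ. */
static void pal_dsp_init(Dav1dPalDSPContext *const c) {
    c->pal_idx_finish = pal_idx_finish_c; /* assumed C fallback name */
#if HAVE_ASM && ARCH_X86
    pal_dsp_init_x86(c);                  /* upgrades to SSSE3/AVX2/AVX-512 */
#endif
}
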

View file

@ -35,6 +35,7 @@ if is_asm_enabled
checkasm_sources = files(
'checkasm/checkasm.c',
'checkasm/msac.c',
'checkasm/pal.c',
'checkasm/refmvs.c',
)