forked from mirrors/gecko-dev
Bug 1846318 update dav1d to e58afe4dd9057591882a01c31382c203e8a61c92 r=chunmin
Depends on D187495

Differential Revision: https://phabricator.services.mozilla.com/D187496
This commit is contained in:
parent bfae62bdc0
commit 0d02f04be0
40 changed files with 1831 additions and 734 deletions
@@ -133,6 +133,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
         '../../../third_party/dav1d/src/x86/mc16_sse.asm',
         '../../../third_party/dav1d/src/x86/mc_sse.asm',
         '../../../third_party/dav1d/src/x86/msac.asm',
+        '../../../third_party/dav1d/src/x86/pal.asm',
         '../../../third_party/dav1d/src/x86/refmvs.asm',
     ]
@@ -85,6 +85,7 @@ SOURCES += [
     '../../third_party/dav1d/src/mem.c',
     '../../third_party/dav1d/src/msac.c',
     '../../third_party/dav1d/src/obu.c',
+    '../../third_party/dav1d/src/pal.c',
     '../../third_party/dav1d/src/picture.c',
     '../../third_party/dav1d/src/qm.c',
     '../../third_party/dav1d/src/ref.c',
@@ -20,11 +20,11 @@ origin:
   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 616bfd1506a8a75c6a358e578cbec9ca11931502 (2023-07-01T11:36:39.000+03:00).
+  release: e58afe4dd9057591882a01c31382c203e8a61c92 (2023-07-25T16:10:07.000+02:00).

   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: 616bfd1506a8a75c6a358e578cbec9ca11931502
+  revision: e58afe4dd9057591882a01c31382c203e8a61c92

   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "616bfd1506a8a75c6a358e578cbec9ca11931502"
+#define DAV1D_VERSION "e58afe4dd9057591882a01c31382c203e8a61c92"
2 third_party/dav1d/include/common/bitdepth.h vendored
@@ -34,7 +34,7 @@
 #include "common/attributes.h"

 #if !defined(BITDEPTH)
-typedef void pixel;
+typedef uint8_t pixel; /* can't be void due to pointer-to-array usage */
 typedef void coef;
 #define HIGHBD_DECL_SUFFIX /* nothing */
 #define HIGHBD_CALL_SUFFIX /* nothing */
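The replacement typedef fixes a language-level constraint rather than a style nit: bitdepth-agnostic translation units now declare pointer-to-array types of pixel (see the palette fields added to internal.h later in this diff), and C forbids arrays of an incomplete type such as void. A minimal sketch of the distinction, with names invented for illustration:

    typedef uint8_t pixel;   /* as in the !defined(BITDEPTH) path above */

    /* Legal with a complete type, ill-formed with "typedef void pixel;":
     * array element types must have a known size. */
    pixel (*pal)[3][8];      /* pointer to a 3-plane x 8-entry palette block */
    pixel row[64];           /* sizeof(pixel) must be known here */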
16 third_party/dav1d/src/arm/32/filmgrain.S vendored
@@ -1481,8 +1481,8 @@ function fgy_32x32_8bpc_neon, export=1
         calc_offset     r6,  lr,  r6,  0,  0
         add_offset      r5,  r6,  lr,  r5,  r9

-        add             r4,  r4,  #32            // grain_lut += BLOCK_SIZE * bx
-        add             r6,  r11, r9,  lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             r4,  r4,  #32            // grain_lut += FG_BLOCK_SIZE * bx
+        add             r6,  r11, r9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by

         ldr             r10, [sp, #120]          // type
         adr             r11, L(fgy_loop_tbl)
@@ -1490,8 +1490,8 @@ function fgy_32x32_8bpc_neon, export=1
         tst             r10, #1
         ldr             r10, [r11, r10, lsl #2]

-        add             r8,  r8,  r9,  lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             r8,  r8,  #32            // grain_lut += BLOCK_SIZE * bx
+        add             r8,  r8,  r9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             r8,  r8,  #32            // grain_lut += FG_BLOCK_SIZE * bx

         add             r11, r11, r10
@@ -1695,10 +1695,10 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
         calc_offset     r8,  r12, r8,  \sx, \sy
         add_offset      r5,  r8,  r12, r5,  r10

-        add             r4,  r4,  #(32 >> \sx)          // grain_lut += BLOCK_SIZE * bx
-        add             r8,  lr,  r10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             r11, r11, r10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             r11, r11, #(32 >> \sx)          // grain_lut += BLOCK_SIZE * bx
+        add             r4,  r4,  #(32 >> \sx)          // grain_lut += FG_BLOCK_SIZE * bx
+        add             r8,  lr,  r10, lsl #(5 - \sy)   // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             r11, r11, r10, lsl #(5 - \sy)   // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             r11, r11, #(32 >> \sx)          // grain_lut += FG_BLOCK_SIZE * bx

         movrel_local    r12, overlap_coeffs_\sx
         ldr             lr,  [sp, #132]                 // type
16 third_party/dav1d/src/arm/32/filmgrain16.S vendored
@@ -1353,8 +1353,8 @@ function fgy_32x32_16bpc_neon, export=1
         calc_offset     r6,  lr,  r6,  0,  0
         add_offset      r5,  r6,  lr,  r5,  r9

-        add             r4,  r4,  #32*2          // grain_lut += BLOCK_SIZE * bx
-        add             r6,  r11, r9,  lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             r4,  r4,  #32*2          // grain_lut += FG_BLOCK_SIZE * bx
+        add             r6,  r11, r9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by

         ldr             r10, [sp, #120]          // type
         adr             r11, L(fgy_loop_tbl)
@@ -1362,8 +1362,8 @@ function fgy_32x32_16bpc_neon, export=1
         tst             r10, #1
         ldr             r10, [r11, r10, lsl #2]

-        add             r8,  r8,  r9,  lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             r8,  r8,  #32*2          // grain_lut += BLOCK_SIZE * bx
+        add             r8,  r8,  r9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             r8,  r8,  #32*2          // grain_lut += FG_BLOCK_SIZE * bx

         add             r11, r11, r10
@@ -1651,10 +1651,10 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1

         vmov.16         d31[3], r7                      // overlap y [1]

-        add             r4,  r4,  #2*(32 >> \sx)        // grain_lut += BLOCK_SIZE * bx
-        add             r8,  lr,  r10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             r11, r11, r10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             r11, r11, #2*(32 >> \sx)        // grain_lut += BLOCK_SIZE * bx
+        add             r4,  r4,  #2*(32 >> \sx)        // grain_lut += FG_BLOCK_SIZE * bx
+        add             r8,  lr,  r10, lsl #(5 - \sy)   // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             r11, r11, r10, lsl #(5 - \sy)   // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             r11, r11, #2*(32 >> \sx)        // grain_lut += FG_BLOCK_SIZE * bx

         movrel_local    r12, overlap_coeffs_\sx
         ldr             lr,  [sp, #132]                 // type
43 third_party/dav1d/src/arm/32/ipred.S vendored
@@ -1576,17 +1576,17 @@ L(ipred_filter_tbl):
 endfunc

 // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-//                         const uint16_t *const pal, const uint8_t *idx,
+//                         const pixel *const pal, const uint8_t *idx,
 //                         const int w, const int h);
 function pal_pred_8bpc_neon, export=1
         push            {r4-r5, lr}
         ldrd            r4,  r5,  [sp, #12]
-        vld1.16         {q0},  [r2, :128]
+        vld1.8          {d0},  [r2, :64]
         clz             lr,  r4
         adr             r12, L(pal_pred_tbl)
         sub             lr,  lr,  #25
+        vmov.i8         q15, #7
         ldr             lr,  [r12, lr, lsl #2]
-        vmovn.i16       d0,  q0
         add             r12, r12, lr
         add             r2,  r0,  r1
         bx              r12
@@ -1602,8 +1602,11 @@ L(pal_pred_tbl):
 40:
         lsl             r1,  r1,  #1
 4:
-        vld1.8          {q1},  [r3, :128]!
+        vld1.8          {d2},  [r3, :64]!
         subs            r5,  r5,  #4
+        vshr.u8         d3,  d2,  #4
+        vand.u8         d2,  d2,  d30
+        vzip.8          d2,  d3
         vtbl.8          d2,  {d0}, d2
         vtbl.8          d3,  {d0}, d3
         vst1.32         {d2[0]}, [r0, :32], r1
@@ -1615,8 +1618,11 @@ L(pal_pred_tbl):
 80:
         lsl             r1,  r1,  #1
 8:
-        vld1.8          {q1, q2}, [r3, :128]!
+        vld1.8          {q1},  [r3, :64]!
         subs            r5,  r5,  #4
+        vshr.u8         q2,  q1,  #4
+        vand.u8         q1,  q1,  q15
+        vzip.8          q1,  q2
         vtbl.8          d2,  {d0}, d2
         vtbl.8          d3,  {d0}, d3
         vst1.8          {d2},  [r0, :64], r1
@@ -1630,9 +1636,14 @@ L(pal_pred_tbl):
 160:
         lsl             r1,  r1,  #1
 16:
-        vld1.8          {q8,  q9},  [r3, :128]!
+        vld1.8          {q10, q11}, [r3, :64]!
         subs            r5,  r5,  #4
-        vld1.8          {q10, q11}, [r3, :128]!
+        vand.u8         q8,  q10, q15
+        vshr.u8         q9,  q10, #4
+        vand.u8         q10, q11, q15
+        vshr.u8         q11, q11, #4
+        vzip.8          q8,  q9
+        vzip.8          q10, q11
         vtbl.8          d16, {d0}, d16
         vtbl.8          d17, {d0}, d17
         vtbl.8          d18, {d0}, d18
@@ -1650,9 +1661,14 @@ L(pal_pred_tbl):
 320:
         lsl             r1,  r1,  #1
 32:
-        vld1.8          {q8,  q9},  [r3, :128]!
+        vld1.8          {q10, q11}, [r3, :64]!
         subs            r5,  r5,  #2
-        vld1.8          {q10, q11}, [r3, :128]!
+        vand.u8         q8,  q10, q15
+        vshr.u8         q9,  q10, #4
+        vand.u8         q10, q11, q15
+        vshr.u8         q11, q11, #4
+        vzip.8          q8,  q9
+        vzip.8          q10, q11
         vtbl.8          d16, {d0}, d16
         vtbl.8          d17, {d0}, d17
         vtbl.8          d18, {d0}, d18
@@ -1668,9 +1684,14 @@ L(pal_pred_tbl):
 640:
         sub             r1,  r1,  #32
 64:
-        vld1.8          {q8,  q9},  [r3, :128]!
+        vld1.8          {q10, q11}, [r3, :64]!
         subs            r5,  r5,  #1
-        vld1.8          {q10, q11}, [r3, :128]!
+        vand.u8         q8,  q10, q15
+        vshr.u8         q9,  q10, #4
+        vand.u8         q10, q11, q15
+        vshr.u8         q11, q11, #4
+        vzip.8          q8,  q9
+        vzip.8          q10, q11
         vtbl.8          d16, {d0}, d16
         vtbl.8          d17, {d0}, d17
         vtbl.8          d18, {d0}, d18
40 third_party/dav1d/src/arm/32/ipred16.S vendored
@@ -1732,7 +1732,7 @@ function ipred_filter_16bpc_neon, export=1
 endfunc

 // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-//                          const uint16_t *const pal, const uint8_t *idx,
+//                          const pixel *const pal, const uint8_t *idx,
 //                          const int w, const int h);
 function pal_pred_16bpc_neon, export=1
         push            {r4-r5, lr}
@@ -1742,6 +1742,7 @@ function pal_pred_16bpc_neon, export=1
         clz             lr,  r4
         adr             r12, L(pal_pred_tbl)
         sub             lr,  lr,  #25
+        vmov.i8         q13, #7
         ldr             lr,  [r12, lr, lsl #2]
         vmov.i16        q15, #0x100
         add             r12, r12, lr
@@ -1759,8 +1760,11 @@ L(pal_pred_tbl):
 40:
         lsl             r1,  r1,  #1
 4:
-        vld1.8          {q1},  [r3, :128]!
+        vld1.8          {d2},  [r3, :64]!
         subs            r5,  r5,  #4
+        vshr.u8         d3,  d2,  #4
+        vand.u8         d2,  d2,  d26
+        vzip.8          d2,  d3
         // Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
         vadd.i8         q0,  q1,  q1
         vadd.i8         q1,  q1,  q1
@@ -1780,8 +1784,11 @@ L(pal_pred_tbl):
 80:
         lsl             r1,  r1,  #1
 8:
-        vld1.8          {q1, q2}, [r3, :128]!
+        vld1.8          {q1},  [r3, :64]!
         subs            r5,  r5,  #4
+        vshr.u8         q2,  q1,  #4
+        vand.u8         q1,  q1,  q13
+        vzip.8          q1,  q2
         // Prefer doing the adds twice, instead of chaining a vmov after
         // the add.
         vadd.i8         q0,  q1,  q1
@@ -1811,9 +1818,14 @@ L(pal_pred_tbl):
 160:
         lsl             r1,  r1,  #1
 16:
-        vld1.8          {q2,  q3},  [r3, :128]!
+        vld1.8          {q10, q11}, [r3, :64]!
         subs            r5,  r5,  #4
-        vld1.8          {q10, q11}, [r3, :128]!
+        vand.u8         q2,  q10, q13
+        vshr.u8         q3,  q10, #4
+        vand.u8         q10, q11, q13
+        vshr.u8         q11, q11, #4
+        vzip.8          q2,  q3
+        vzip.8          q10, q11
         vadd.i8         q0,  q2,  q2
         vadd.i8         q1,  q2,  q2
         vadd.i8         q2,  q3,  q3
@@ -1860,9 +1872,14 @@ L(pal_pred_tbl):
         lsl             r1,  r1,  #1
         sub             r1,  r1,  #32
 32:
-        vld1.8          {q2,  q3},  [r3, :128]!
+        vld1.8          {q10, q11}, [r3, :64]!
         subs            r5,  r5,  #2
-        vld1.8          {q10, q11}, [r3, :128]!
+        vand.u8         q2,  q10, q13
+        vshr.u8         q3,  q10, #4
+        vand.u8         q10, q11, q13
+        vshr.u8         q11, q11, #4
+        vzip.8          q2,  q3
+        vzip.8          q10, q11
         vadd.i8         q0,  q2,  q2
         vadd.i8         q1,  q2,  q2
         vadd.i8         q2,  q3,  q3
@@ -1908,9 +1925,14 @@ L(pal_pred_tbl):
 640:
         sub             r1,  r1,  #96
 64:
-        vld1.8          {q2,  q3},  [r3, :128]!
+        vld1.8          {q10, q11}, [r3, :64]!
         subs            r5,  r5,  #1
-        vld1.8          {q10, q11}, [r3, :128]!
+        vand.u8         q2,  q10, q13
+        vshr.u8         q3,  q10, #4
+        vand.u8         q10, q11, q13
+        vshr.u8         q11, q11, #4
+        vzip.8          q2,  q3
+        vzip.8          q10, q11
         vadd.i8         q0,  q2,  q2
         vadd.i8         q1,  q2,  q2
         vadd.i8         q2,  q3,  q3
16 third_party/dav1d/src/arm/64/filmgrain.S vendored
@@ -1409,14 +1409,14 @@ function fgy_32x32_8bpc_neon, export=1
         ldr             w11, [sp, #24]           // type
         adr             x13, L(fgy_loop_tbl)

-        add             x4,  x12, #32            // grain_lut += BLOCK_SIZE * bx
-        add             x6,  x14, x9,  lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             x4,  x12, #32            // grain_lut += FG_BLOCK_SIZE * bx
+        add             x6,  x14, x9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by

         tst             w11, #1
         ldrh            w11, [x13, w11, uxtw #1]

-        add             x8,  x16, x9,  lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             x8,  x8,  #32            // grain_lut += BLOCK_SIZE * bx
+        add             x8,  x16, x9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x8,  x8,  #32            // grain_lut += FG_BLOCK_SIZE * bx

         sub             x11, x13, w11, uxtw
@@ -1638,10 +1638,10 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
         add_offset      x17, w16, x17, x5,  x10
         add_offset      x5,  w8,  x11, x5,  x10

-        add             x4,  x13, #(32 >> \sx)          // grain_lut += BLOCK_SIZE * bx
-        add             x8,  x15, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             x11, x17, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             x11, x11, #(32 >> \sx)          // grain_lut += BLOCK_SIZE * bx
+        add             x4,  x13, #(32 >> \sx)          // grain_lut += FG_BLOCK_SIZE * bx
+        add             x8,  x15, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x11, x17, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x11, x11, #(32 >> \sx)          // grain_lut += FG_BLOCK_SIZE * bx

         ldr             w13, [sp, #64]                  // type
16 third_party/dav1d/src/arm/64/filmgrain16.S vendored
@@ -1308,14 +1308,14 @@ function fgy_32x32_16bpc_neon, export=1
         ldr             w11, [sp, #88]           // type
         adr             x13, L(fgy_loop_tbl)

-        add             x4,  x12, #32*2          // grain_lut += BLOCK_SIZE * bx
-        add             x6,  x14, x9,  lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
+        add             x4,  x12, #32*2          // grain_lut += FG_BLOCK_SIZE * bx
+        add             x6,  x14, x9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by

         tst             w11, #1
         ldrh            w11, [x13, w11, uxtw #1]

-        add             x8,  x16, x9,  lsl #5    // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             x8,  x8,  #32*2          // grain_lut += BLOCK_SIZE * bx
+        add             x8,  x16, x9,  lsl #5    // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x8,  x8,  #32*2          // grain_lut += FG_BLOCK_SIZE * bx

         sub             x11, x13, w11, uxtw
@@ -1581,10 +1581,10 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
         add_offset      x17, w16, x17, x5,  x10
         add_offset      x5,  w8,  x11, x5,  x10

-        add             x4,  x13, #2*(32 >> \sx)        // grain_lut += BLOCK_SIZE * bx
-        add             x8,  x15, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             x11, x17, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * BLOCK_SIZE * by
-        add             x11, x11, #2*(32 >> \sx)        // grain_lut += BLOCK_SIZE * bx
+        add             x4,  x13, #2*(32 >> \sx)        // grain_lut += FG_BLOCK_SIZE * bx
+        add             x8,  x15, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x11, x17, x10, lsl #(5 - \sy)   // grain_lut += grain_stride * FG_BLOCK_SIZE * by
+        add             x11, x11, #2*(32 >> \sx)        // grain_lut += FG_BLOCK_SIZE * bx

         ldr             w13, [sp, #112]                 // type
61 third_party/dav1d/src/arm/64/ipred.S vendored
@@ -3921,23 +3921,26 @@ L(ipred_filter_tbl):
 endfunc

 // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
-//                         const uint16_t *const pal, const uint8_t *idx,
+//                         const pixel *const pal, const uint8_t *idx,
 //                         const int w, const int h);
 function pal_pred_8bpc_neon, export=1
-        ld1             {v0.8h},  [x2]
+        ld1             {v0.8b},  [x2]
         clz             w9,  w4
         adr             x6,  L(pal_pred_tbl)
         sub             w9,  w9,  #25
+        movi            v31.16b, #7
         ldrh            w9,  [x6, w9, uxtw #1]
-        xtn             v0.8b,  v0.8h
         sub             x6,  x6,  w9, uxtw
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
         br              x6
 4:
         AARCH64_VALID_JUMP_TARGET
-        ld1             {v1.16b}, [x3], #16
+        ld1             {v1.8b},  [x3], #8
         subs            w5,  w5,  #4
+        ushr            v3.8b,  v1.8b,  #4
+        and             v2.8b,  v1.8b,  v31.8b
+        zip1            v1.16b, v2.16b, v3.16b
         tbl             v1.16b, {v0.16b}, v1.16b
         st1             {v1.s}[0], [x0], x1
         st1             {v1.s}[1], [x2], x1
@@ -3947,8 +3950,12 @@ function pal_pred_8bpc_neon, export=1
         ret
 8:
         AARCH64_VALID_JUMP_TARGET
-        ld1             {v1.16b, v2.16b}, [x3], #32
+        ld1             {v1.16b}, [x3], #16
         subs            w5,  w5,  #4
+        ushr            v4.16b, v1.16b, #4
+        and             v3.16b, v1.16b, v31.16b
+        zip1            v1.16b, v3.16b, v4.16b
+        zip2            v2.16b, v3.16b, v4.16b
         tbl             v1.16b, {v0.16b}, v1.16b
         st1             {v1.d}[0], [x0], x1
         tbl             v2.16b, {v0.16b}, v2.16b
@@ -3959,9 +3966,17 @@ function pal_pred_8bpc_neon, export=1
         ret
 16:
         AARCH64_VALID_JUMP_TARGET
-        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
+        ld1             {v1.16b, v2.16b}, [x3], #32
         subs            w5,  w5,  #4
+        ushr            v5.16b, v1.16b, #4
+        and             v4.16b, v1.16b, v31.16b
+        ushr            v7.16b, v2.16b, #4
+        and             v6.16b, v2.16b, v31.16b
+        zip1            v1.16b, v4.16b, v5.16b
+        zip2            v2.16b, v4.16b, v5.16b
+        zip1            v3.16b, v6.16b, v7.16b
         tbl             v1.16b, {v0.16b}, v1.16b
+        zip2            v4.16b, v6.16b, v7.16b
         tbl             v2.16b, {v0.16b}, v2.16b
         st1             {v1.16b}, [x0], x1
         tbl             v3.16b, {v0.16b}, v3.16b
@@ -3974,10 +3989,25 @@ function pal_pred_8bpc_neon, export=1
 32:
         AARCH64_VALID_JUMP_TARGET
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
-        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
         subs            w5,  w5,  #4
+        ushr            v21.16b, v16.16b, #4
+        and             v20.16b, v16.16b, v31.16b
+        ushr            v23.16b, v17.16b, #4
+        and             v22.16b, v17.16b, v31.16b
+        ushr            v25.16b, v18.16b, #4
+        and             v24.16b, v18.16b, v31.16b
+        ushr            v27.16b, v19.16b, #4
+        and             v26.16b, v19.16b, v31.16b
+        zip1            v16.16b, v20.16b, v21.16b
+        zip2            v17.16b, v20.16b, v21.16b
+        zip1            v18.16b, v22.16b, v23.16b
+        zip2            v19.16b, v22.16b, v23.16b
+        zip1            v20.16b, v24.16b, v25.16b
+        zip2            v21.16b, v24.16b, v25.16b
         tbl             v16.16b, {v0.16b}, v16.16b
+        zip1            v22.16b, v26.16b, v27.16b
         tbl             v17.16b, {v0.16b}, v17.16b
+        zip2            v23.16b, v26.16b, v27.16b
         tbl             v18.16b, {v0.16b}, v18.16b
         tbl             v19.16b, {v0.16b}, v19.16b
         tbl             v20.16b, {v0.16b}, v20.16b
@@ -3993,10 +4023,25 @@ function pal_pred_8bpc_neon, export=1
 64:
         AARCH64_VALID_JUMP_TARGET
         ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
-        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
         subs            w5,  w5,  #2
+        ushr            v21.16b, v16.16b, #4
+        and             v20.16b, v16.16b, v31.16b
+        ushr            v23.16b, v17.16b, #4
+        and             v22.16b, v17.16b, v31.16b
+        ushr            v25.16b, v18.16b, #4
+        and             v24.16b, v18.16b, v31.16b
+        ushr            v27.16b, v19.16b, #4
+        and             v26.16b, v19.16b, v31.16b
+        zip1            v16.16b, v20.16b, v21.16b
+        zip2            v17.16b, v20.16b, v21.16b
+        zip1            v18.16b, v22.16b, v23.16b
+        zip2            v19.16b, v22.16b, v23.16b
+        zip1            v20.16b, v24.16b, v25.16b
+        zip2            v21.16b, v24.16b, v25.16b
         tbl             v16.16b, {v0.16b}, v16.16b
+        zip1            v22.16b, v26.16b, v27.16b
         tbl             v17.16b, {v0.16b}, v17.16b
+        zip2            v23.16b, v26.16b, v27.16b
         tbl             v18.16b, {v0.16b}, v18.16b
         tbl             v19.16b, {v0.16b}, v19.16b
         st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
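Every width case above applies the same three-step unpack before the unchanged tbl lookups: ushr pulls the high nibbles out, and masks the low nibbles against the #7 splat, and zip1/zip2 re-interleave them into pixel order. A scalar C model of one 16-byte register's worth of that idiom (a sketch, not part of the source):

    #include <stdint.h>

    /* 16 packed bytes in -> 32 palette indices out, low nibble first,
     * mirroring the ushr/and/zip sequence in the NEON code above. */
    static void unpack_idx16(const uint8_t in[16], uint8_t out[32]) {
        for (int i = 0; i < 16; i++) {
            out[2 * i + 0] = in[i] & 7;   /* "and v, v, #7 splat" lanes */
            out[2 * i + 1] = in[i] >> 4;  /* "ushr v, v, #4" lanes */
        }
    }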
44 third_party/dav1d/src/arm/64/ipred16.S vendored
@@ -4179,13 +4179,14 @@ function ipred_filter_16bpc_neon, export=1
 endfunc

 // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
-//                          const uint16_t *const pal, const uint8_t *idx,
+//                          const pixel *const pal, const uint8_t *idx,
 //                          const int w, const int h);
 function pal_pred_16bpc_neon, export=1
         ld1             {v30.8h}, [x2]
         clz             w9,  w4
         adr             x6,  L(pal_pred_tbl)
         sub             w9,  w9,  #25
+        movi            v29.16b, #7
         ldrh            w9,  [x6, w9, uxtw #1]
         movi            v31.8h, #1, lsl #8
         sub             x6,  x6,  w9, uxtw
@@ -4195,8 +4196,11 @@ function pal_pred_16bpc_neon, export=1
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
 4:
-        ld1             {v1.16b}, [x3], #16
+        ld1             {v1.8b},  [x3], #8
         subs            w5,  w5,  #4
+        ushr            v3.8b,  v1.8b,  #4
+        and             v2.8b,  v1.8b,  v29.8b
+        zip1            v1.16b, v2.16b, v3.16b
         // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
         add             v1.16b, v1.16b, v1.16b
         zip1            v0.16b, v1.16b, v1.16b
@@ -4216,8 +4220,12 @@ function pal_pred_16bpc_neon, export=1
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
 8:
-        ld1             {v2.16b, v3.16b}, [x3], #32
+        ld1             {v2.16b}, [x3], #16
         subs            w5,  w5,  #4
+        ushr            v4.16b, v2.16b, #4
+        and             v3.16b, v2.16b, v29.16b
+        zip1            v2.16b, v3.16b, v4.16b
+        zip2            v3.16b, v3.16b, v4.16b
         add             v2.16b, v2.16b, v2.16b
         add             v3.16b, v3.16b, v3.16b
         zip1            v0.16b, v2.16b, v2.16b
@@ -4243,8 +4251,16 @@ function pal_pred_16bpc_neon, export=1
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
 16:
-        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        ld1             {v4.16b, v5.16b}, [x3], #32
         subs            w5,  w5,  #4
+        ushr            v7.16b, v4.16b, #4
+        and             v6.16b, v4.16b, v29.16b
+        ushr            v3.16b, v5.16b, #4
+        and             v2.16b, v5.16b, v29.16b
+        zip1            v4.16b, v6.16b, v7.16b
+        zip2            v5.16b, v6.16b, v7.16b
+        zip1            v6.16b, v2.16b, v3.16b
+        zip2            v7.16b, v2.16b, v3.16b
         add             v4.16b, v4.16b, v4.16b
         add             v5.16b, v5.16b, v5.16b
         add             v6.16b, v6.16b, v6.16b
@@ -4284,8 +4300,16 @@ function pal_pred_16bpc_neon, export=1
         add             x2,  x0,  x1
         lsl             x1,  x1,  #1
 32:
-        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        ld1             {v4.16b, v5.16b}, [x3], #32
         subs            w5,  w5,  #2
+        ushr            v7.16b, v4.16b, #4
+        and             v6.16b, v4.16b, v29.16b
+        ushr            v3.16b, v5.16b, #4
+        and             v2.16b, v5.16b, v29.16b
+        zip1            v4.16b, v6.16b, v7.16b
+        zip2            v5.16b, v6.16b, v7.16b
+        zip1            v6.16b, v2.16b, v3.16b
+        zip2            v7.16b, v2.16b, v3.16b
         add             v4.16b, v4.16b, v4.16b
         add             v5.16b, v5.16b, v5.16b
         add             v6.16b, v6.16b, v6.16b
@@ -4322,8 +4346,16 @@ function pal_pred_16bpc_neon, export=1
         AARCH64_VALID_JUMP_TARGET
         add             x2,  x0,  #64
 64:
-        ld1             {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
+        ld1             {v4.16b, v5.16b}, [x3], #32
         subs            w5,  w5,  #1
+        ushr            v7.16b, v4.16b, #4
+        and             v6.16b, v4.16b, v29.16b
+        ushr            v3.16b, v5.16b, #4
+        and             v2.16b, v5.16b, v29.16b
+        zip1            v4.16b, v6.16b, v7.16b
+        zip2            v5.16b, v6.16b, v7.16b
+        zip1            v6.16b, v2.16b, v3.16b
+        zip2            v7.16b, v2.16b, v3.16b
         add             v4.16b, v4.16b, v4.16b
         add             v5.16b, v5.16b, v5.16b
         add             v6.16b, v6.16b, v6.16b
8 third_party/dav1d/src/arm/filmgrain.h vendored
@@ -91,8 +91,8 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,

     int offsets[2 /* col offset */][2 /* row offset */];

-    // process this row in BLOCK_SIZE^2 blocks
-    for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
+    // process this row in FG_BLOCK_SIZE^2 blocks
+    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {

         if (data->overlap_flag && bx) {
             // shift previous offsets left
@@ -155,8 +155,8 @@ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
     \
     int offsets[2 /* col offset */][2 /* row offset */]; \
     \
-    /* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
-    for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
+    /* process this row in FG_BLOCK_SIZE^2 blocks (subsampled) */ \
+    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { \
         if (data->overlap_flag && bx) { \
             /* shift previous offsets left */ \
             for (int i = 0; i < rows; i++) \
2 third_party/dav1d/src/dav1d.rc.in vendored
@@ -22,7 +22,7 @@ BEGIN
             VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
             VALUE "InternalName", "dav1d"
             VALUE "OriginalFilename", "libdav1d.dll"
-            VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
+            VALUE "LegalCopyright", L"Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
         END
     END
     BLOCK "VarFileInfo"
232 third_party/dav1d/src/decode.c vendored
@@ -370,142 +370,6 @@ static inline int findoddzero(const uint8_t *buf, int len) {
     return 0;
 }

-static void read_pal_plane(Dav1dTaskContext *const t, Av1Block *const b,
-                           const int pl, const int sz_ctx,
-                           const int bx4, const int by4)
-{
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
-                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
-    uint16_t cache[16], used_cache[8];
-    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
-    int n_cache = 0;
-    // don't reuse above palette outside SB64 boundaries
-    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
-    const uint16_t *l = t->al_pal[1][by4][pl], *a = t->al_pal[0][bx4][pl];
-
-    // fill/sort cache
-    while (l_cache && a_cache) {
-        if (*l < *a) {
-            if (!n_cache || cache[n_cache - 1] != *l)
-                cache[n_cache++] = *l;
-            l++;
-            l_cache--;
-        } else {
-            if (*a == *l) {
-                l++;
-                l_cache--;
-            }
-            if (!n_cache || cache[n_cache - 1] != *a)
-                cache[n_cache++] = *a;
-            a++;
-            a_cache--;
-        }
-    }
-    if (l_cache) {
-        do {
-            if (!n_cache || cache[n_cache - 1] != *l)
-                cache[n_cache++] = *l;
-            l++;
-        } while (--l_cache > 0);
-    } else if (a_cache) {
-        do {
-            if (!n_cache || cache[n_cache - 1] != *a)
-                cache[n_cache++] = *a;
-            a++;
-        } while (--a_cache > 0);
-    }
-
-    // find reused cache entries
-    int i = 0;
-    for (int n = 0; n < n_cache && i < pal_sz; n++)
-        if (dav1d_msac_decode_bool_equi(&ts->msac))
-            used_cache[i++] = cache[n];
-    const int n_used_cache = i;
-
-    // parse new entries
-    uint16_t *const pal = t->frame_thread.pass ?
-        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                            ((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl];
-    if (i < pal_sz) {
-        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
-
-        if (i < pal_sz) {
-            int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
-            const int max = (1 << f->cur.p.bpc) - 1;
-
-            do {
-                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
-                prev = pal[i++] = imin(prev + delta + !pl, max);
-                if (prev + !pl >= max) {
-                    for (; i < pal_sz; i++)
-                        pal[i] = max;
-                    break;
-                }
-                bits = imin(bits, 1 + ulog2(max - prev - !pl));
-            } while (i < pal_sz);
-        }
-
-        // merge cache+new entries
-        int n = 0, m = n_used_cache;
-        for (i = 0; i < pal_sz; i++) {
-            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
-                pal[i] = used_cache[n++];
-            } else {
-                assert(m < pal_sz);
-                pal[i] = pal[m++];
-            }
-        }
-    } else {
-        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
-    }
-
-    if (DEBUG_BLOCK_INFO) {
-        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
-               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
-        for (int n = 0; n < n_cache; n++)
-            printf("%c%02x", n ? ' ' : '[', cache[n]);
-        printf("%s, pal=", n_cache ? "]" : "[]");
-        for (int n = 0; n < pal_sz; n++)
-            printf("%c%02x", n ? ' ' : '[', pal[n]);
-        printf("]\n");
-    }
-}
-
-static void read_pal_uv(Dav1dTaskContext *const t, Av1Block *const b,
-                        const int sz_ctx, const int bx4, const int by4)
-{
-    read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
-
-    // V pal coding
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    uint16_t *const pal = t->frame_thread.pass ?
-        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                            ((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2];
-    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
-        const int bits = f->cur.p.bpc - 4 +
-                         dav1d_msac_decode_bools(&ts->msac, 2);
-        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
-        const int max = (1 << f->cur.p.bpc) - 1;
-        for (int i = 1; i < b->pal_sz[1]; i++) {
-            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
-            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
-            prev = pal[i] = (prev + delta) & max;
-        }
-    } else {
-        for (int i = 0; i < b->pal_sz[1]; i++)
-            pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
-    }
-    if (DEBUG_BLOCK_INFO) {
-        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
-        for (int n = 0; n < b->pal_sz[1]; n++)
-            printf("%c%02x", n ? ' ' : '[', pal[n]);
-        printf("]\n");
-    }
-}
-
 // meant to be SIMD'able, so that theoretical complexity of this function
 // times block size goes from w4*h4 to w4+h4-1
 // a and b are previous two lines containing (a) top/left entries or (b)
@@ -584,7 +448,8 @@ static void read_pal_indices(Dav1dTaskContext *const t,
     Dav1dTileState *const ts = t->ts;
     const ptrdiff_t stride = bw4 * 4;
-    assert(pal_idx);
-    pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
+    pixel *const pal_tmp = t->scratch.pal_idx_uv;
+    pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
     uint16_t (*const color_map_cdf)[8] =
         ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
     uint8_t (*const order)[8] = t->scratch.pal_order;
@@ -593,23 +458,16 @@ static void read_pal_indices(Dav1dTaskContext *const t,
         // top/left-to-bottom/right diagonals ("wave-front")
         const int first = imin(i, w4 * 4 - 1);
         const int last = imax(0, i - h4 * 4 + 1);
-        order_palette(pal_idx, stride, i, first, last, order, ctx);
+        order_palette(pal_tmp, stride, i, first, last, order, ctx);
         for (int j = first, m = 0; j >= last; j--, m++) {
             const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                       color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
-            pal_idx[(i - j) * stride + j] = order[m][color_idx];
+            pal_tmp[(i - j) * stride + j] = order[m][color_idx];
         }
     }
-    // fill invisible edges
-    if (bw4 > w4)
-        for (int y = 0; y < 4 * h4; y++)
-            memset(&pal_idx[y * stride + 4 * w4],
-                   pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
-    if (h4 < bh4) {
-        const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)];
-        for (int y = h4 * 4; y < bh4 * 4; y++)
-            memcpy(&pal_idx[y * stride], src, bw4 * 4);
-    }
+
+    t->c->pal_dsp.pal_idx_finish(pal_idx, pal_tmp, bw4 * 4, bh4 * 4,
+                                 w4 * 4, h4 * 4);
 }

 static void read_vartx_tree(Dav1dTaskContext *const t,
@@ -1306,7 +1164,7 @@ static int decode_b(Dav1dTaskContext *const t,
             if (DEBUG_BLOCK_INFO)
                 printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
             if (use_y_pal)
-                read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
+                f->bd_fn.read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
         }

         if (has_chroma && b->uv_mode == DC_PRED) {
@@ -1316,7 +1174,7 @@ static int decode_b(Dav1dTaskContext *const t,
             if (DEBUG_BLOCK_INFO)
                 printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
             if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
-                read_pal_uv(t, b, sz_ctx, bx4, by4);
+                f->bd_fn.read_pal_uv(t, b, sz_ctx, bx4, by4);
         }
     }

@@ -1341,9 +1199,9 @@ static int decode_b(Dav1dTaskContext *const t,
                 const int p = t->frame_thread.pass & 1;
                 assert(ts->frame_thread[p].pal_idx);
                 pal_idx = ts->frame_thread[p].pal_idx;
-                ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
+                ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
             } else
-                pal_idx = t->scratch.pal_idx;
+                pal_idx = t->scratch.pal_idx_y;
             read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
@@ -1355,9 +1213,9 @@ static int decode_b(Dav1dTaskContext *const t,
                 const int p = t->frame_thread.pass & 1;
                 assert(ts->frame_thread[p].pal_idx);
                 pal_idx = ts->frame_thread[p].pal_idx;
-                ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
+                ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
             } else
-                pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
+                pal_idx = t->scratch.pal_idx_uv;
             read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
             if (DEBUG_BLOCK_INFO)
                 printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
@@ -1430,34 +1288,16 @@ static int decode_b(Dav1dTaskContext *const t,
         case_set(bh4, l., 1, by4);
         case_set(bw4, a->, 0, bx4);
 #undef set_ctx
-        if (b->pal_sz[0]) {
-            uint16_t *const pal = t->frame_thread.pass ?
-                f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
-                                    ((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
-            for (int x = 0; x < bw4; x++)
-                memcpy(t->al_pal[0][bx4 + x][0], pal, 16);
-            for (int y = 0; y < bh4; y++)
-                memcpy(t->al_pal[1][by4 + y][0], pal, 16);
-        }
+        if (b->pal_sz[0])
+            f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4);
         if (has_chroma) {
 #define set_ctx(type, dir, diridx, off, mul, rep_macro) \
             rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
             case_set(cbh4, l., 1, cby4);
             case_set(cbw4, a->, 0, cbx4);
 #undef set_ctx
-            if (b->pal_sz[1]) {
-                const uint16_t (*const pal)[8] = t->frame_thread.pass ?
-                    f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) *
-                                        (f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] :
-                    t->scratch.pal;
-                // see aomedia bug 2183 for why we use luma coordinates here
-                for (int pl = 1; pl <= 2; pl++) {
-                    for (int x = 0; x < bw4; x++)
-                        memcpy(t->al_pal[0][bx4 + x][pl], pal[pl], 16);
-                    for (int y = 0; y < bh4; y++)
-                        memcpy(t->al_pal[1][by4 + y][pl], pal[pl], 16);
-                }
-            }
+            if (b->pal_sz[1])
+                f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4);
         }
         if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)
             splat_intraref(f->c, t, bs, bw4, bh4);
@@ -2642,7 +2482,10 @@ static void setup_tile(Dav1dTileState *const ts,
     const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
     for (int p = 0; p < 2; p++) {
         ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
-            &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
+            &f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
             NULL;
+        ts->frame_thread[p].cbi = f->frame_thread.cbi ?
+            &f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] :
+            NULL;
         ts->frame_thread[p].cf = f->frame_thread.cf ?
             (uint8_t*)f->frame_thread.cf +
@@ -3015,6 +2858,19 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
         }
     }

+    const int cbi_sz = num_sb128 * size_mul[0];
+    if (cbi_sz != f->frame_thread.cbi_sz) {
+        dav1d_free_aligned(f->frame_thread.cbi);
+        f->frame_thread.cbi =
+            dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
+                                cbi_sz * 32 * 32 / 4, 64);
+        if (!f->frame_thread.cbi) {
+            f->frame_thread.cbi_sz = 0;
+            goto error;
+        }
+        f->frame_thread.cbi_sz = cbi_sz;
+    }
+
     const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
     if (cf_sz != f->frame_thread.cf_sz) {
         dav1d_free_aligned(f->frame_thread.cf);
@@ -3029,16 +2885,17 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
     }

     if (f->frame_hdr->allow_screen_content_tools) {
-        if (num_sb128 != f->frame_thread.pal_sz) {
+        const int pal_sz = num_sb128 << hbd;
+        if (pal_sz != f->frame_thread.pal_sz) {
             dav1d_free_aligned(f->frame_thread.pal);
             f->frame_thread.pal =
                 dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
-                                    num_sb128 * 16 * 16, 64);
+                                    pal_sz * 16 * 16, 64);
             if (!f->frame_thread.pal) {
                 f->frame_thread.pal_sz = 0;
                 goto error;
             }
-            f->frame_thread.pal_sz = num_sb128;
+            f->frame_thread.pal_sz = pal_sz;
         }

         const int pal_idx_sz = num_sb128 * size_mul[1];
@@ -3046,7 +2903,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
             dav1d_free_aligned(f->frame_thread.pal_idx);
             f->frame_thread.pal_idx =
                 dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
-                                    pal_idx_sz * 128 * 128 / 4, 64);
+                                    pal_idx_sz * 128 * 128 / 8, 64);
             if (!f->frame_thread.pal_idx) {
                 f->frame_thread.pal_idx_sz = 0;
                 goto error;
@@ -3171,12 +3028,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
     }
     if (c->n_fc > 1) {
         dav1d_free(f->frame_thread.b);
-        dav1d_free(f->frame_thread.cbi);
         f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
                                          num_sb128 * 32 * 32);
-        f->frame_thread.cbi = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
-                                           num_sb128 * 32 * 32);
-        if (!f->frame_thread.b || !f->frame_thread.cbi) {
+        if (!f->frame_thread.b) {
             f->lf.mask_sz = 0;
             goto error;
         }
@@ -3584,7 +3438,11 @@ int dav1d_submit_frame(Dav1dContext *const c) {
         f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
         f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
         f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
-        f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
+        f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \
+        f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \
+        f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \
+        f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \
+        f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc
     if (!f->seq_hdr->hbd) {
 #if CONFIG_8BPC
         assign_bitdepth_case(8);
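All the pal_idx arithmetic in decode.c moves in lockstep with the new 4-bit packing: the per-block frame-thread stride drops from bw4 * bh4 * 16 bytes (one byte per pixel, 16 pixels per 4x4 unit) to bw4 * bh4 * 8, and the frame-wide allocation from 128 * 128 / 4 to 128 * 128 / 8 per superblock size unit. A small standalone check of that bookkeeping (helper name invented for the example):

    #include <assert.h>

    /* A palette block of bw4 x bh4 4x4 units holds bw4*bh4*16 pixel
     * indices; packed at 4 bits per index, two indices share a byte. */
    static int pal_idx_bytes(int bw4, int bh4) {
        const int pixels = bw4 * bh4 * 16;
        return pixels / 2;          /* == bw4 * bh4 * 8, as in decode.c */
    }

    int main(void) {
        assert(pal_idx_bytes(16, 16) == 16 * 16 * 8); /* 64x64 luma block */
        return 0;
    }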
12 third_party/dav1d/src/fg_apply_tmpl.c vendored
@@ -172,14 +172,14 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
     const int cpw = (out->p.w + ss_x) >> ss_x;
     const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
     pixel *const luma_src =
-        ((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
+        ((pixel *) in->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(in->stride[0]);
 #if BITDEPTH != 8
     const int bitdepth_max = (1 << out->p.bpc) - 1;
 #endif

     if (data->num_y_points) {
-        const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
-        dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
+        const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
+        dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[0]),
                          luma_src, out->stride[0], data,
                          out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
     }
@@ -190,7 +190,7 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
         return;
     }

-    const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
+    const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> ss_y;

     // extend padding pixels
     if (out->p.w & ss_x) {
@@ -201,7 +201,7 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
         }
     }

-    const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
+    const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
     if (data->chroma_scaling_from_luma) {
         for (int pl = 0; pl < 2; pl++)
             dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
@@ -232,7 +232,7 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
 #else
     uint8_t scaling[3][SCALING_SIZE];
 #endif
-    const int rows = (out->p.h + 31) >> 5;
+    const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;

     bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
     for (int row = 0; row < rows; row++)
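The rewritten rows computation is behavior-preserving: with FG_BLOCK_SIZE fixed at 32, the explicit ceiling division spells out what (h + 31) >> 5 encoded. A quick standalone check (a sketch, not part of the commit):

    #include <assert.h>

    #define FG_BLOCK_SIZE 32

    int main(void) {
        /* For any non-negative height, ceil(h / 32) can be written either way. */
        for (int h = 0; h < 10000; h++)
            assert((h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE == (h + 31) >> 5);
        return 0;
    }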
2 third_party/dav1d/src/filmgrain.h vendored
@@ -34,7 +34,7 @@

 #define GRAIN_WIDTH 82
 #define GRAIN_HEIGHT 73
-#define BLOCK_SIZE 32
+#define FG_BLOCK_SIZE 32
 #if !defined(BITDEPTH) || BITDEPTH == 8
 #define SCALING_SIZE 256
 typedef int8_t entry;
20 third_party/dav1d/src/filmgrain_tmpl.c vendored
@@ -162,8 +162,8 @@ static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
     const int randval = offsets[bx][by];
     const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
     const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
-    return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by]
-                    [offx + x + (BLOCK_SIZE >> subx) * bx];
+    return grain_lut[offy + y + (FG_BLOCK_SIZE >> suby) * by]
+                    [offx + x + (FG_BLOCK_SIZE >> subx) * bx];
 }

 static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
@@ -195,13 +195,13 @@ static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
         seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
     }

-    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+    assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);

     int offsets[2 /* col offset */][2 /* row offset */];

-    // process this row in BLOCK_SIZE^2 blocks
-    for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
-        const int bw = imin(BLOCK_SIZE, (int) pw - bx);
+    // process this row in FG_BLOCK_SIZE^2 blocks
+    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
+        const int bw = imin(FG_BLOCK_SIZE, (int) pw - bx);

         if (data->overlap_flag && bx) {
             // shift previous offsets left
@@ -306,13 +306,13 @@ fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
         seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
     }

-    assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
+    assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);

     int offsets[2 /* col offset */][2 /* row offset */];

-    // process this row in BLOCK_SIZE^2 blocks (subsampled)
-    for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
-        const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx));
+    // process this row in FG_BLOCK_SIZE^2 blocks (subsampled)
+    for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) {
+        const int bw = imin(FG_BLOCK_SIZE >> sx, (int)(pw - bx));
         if (data->overlap_flag && bx) {
             // shift previous offsets left
             for (int i = 0; i < rows; i++)
28 third_party/dav1d/src/internal.h vendored
@@ -53,6 +53,7 @@ typedef struct Dav1dTask Dav1dTask;
 #include "src/looprestoration.h"
 #include "src/mc.h"
 #include "src/msac.h"
+#include "src/pal.h"
 #include "src/picture.h"
 #include "src/recon.h"
 #include "src/refmvs.h"
@@ -174,6 +175,7 @@ struct Dav1dContext {
     CdfThreadContext cdf[8];

     Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
+    Dav1dPalDSPContext pal_dsp;
     Dav1dRefmvsDSPContext refmvs_dsp;

     Dav1dPicAllocator allocator;
@@ -253,6 +255,10 @@ struct Dav1dFrameContext {
         filter_sbrow_fn filter_sbrow_lr;
         backup_ipred_edge_fn backup_ipred_edge;
         read_coef_blocks_fn read_coef_blocks;
+        copy_pal_block_fn copy_pal_block_y;
+        copy_pal_block_fn copy_pal_block_uv;
+        read_pal_plane_fn read_pal_plane;
+        read_pal_uv_fn read_pal_uv;
     } bd_fn;

     int ipred_edge_sz;
@@ -274,14 +280,14 @@ struct Dav1dFrameContext {
         atomic_uint *frame_progress, *copy_lpf_progress;
         // indexed using t->by * f->b4_stride + t->bx
         Av1Block *b;
-        int16_t (*cbi)[3 /* plane */]; /* bits 0-4: txtp, bits 5-15: eob */
+        int16_t *cbi; /* bits 0-4: txtp, bits 5-15: eob */
         // indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
-        uint16_t (*pal)[3 /* plane */][8 /* idx */];
+        pixel (*pal)[3 /* plane */][8 /* idx */];
         // iterated over inside tile state
         uint8_t *pal_idx;
         coef *cf;
         int prog_sz;
-        int pal_sz, pal_idx_sz, cf_sz;
+        int cbi_sz, pal_sz, pal_idx_sz, cf_sz;
         // start offsets per tile
         int *tile_start_off;
     } frame_thread;
@@ -358,6 +364,7 @@ struct Dav1dTileState {
     atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
     struct {
         uint8_t *pal_idx;
+        int16_t *cbi;
         coef *cf;
     } frame_thread[2 /* 0: reconstruction, 1: entropy */];

@@ -387,9 +394,10 @@ struct Dav1dTaskContext {
         int16_t cf_8bpc [32 * 32];
         int32_t cf_16bpc[32 * 32];
     };
-    // FIXME types can be changed to pixel (and dynamically allocated)
-    // which would make copy/assign operations slightly faster?
-    uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
+    union {
+        uint8_t  al_pal_8bpc [2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
+        uint16_t al_pal_16bpc[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
+    };
     uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
     ALIGN(union, 64) {
         struct {
@@ -419,16 +427,18 @@ struct Dav1dTaskContext {
                 int16_t ac[32 * 32]; // intra-only
                 uint8_t txtp_map[32 * 32]; // inter-only
             };
-            uint8_t pal_idx[2 * 64 * 64];
-            uint16_t pal[3 /* plane */][8 /* palette_idx */];
             ALIGN(union, 64) {
+                uint8_t pal_idx_y[32 * 64];
+                uint8_t pal_idx_uv[64 * 64]; /* also used as pre-pack scratch buffer */
+                union {
                     struct {
                         uint8_t interintra_8bpc[64 * 64];
                         uint8_t edge_8bpc[257];
+                        ALIGN(uint8_t pal_8bpc[3 /* plane */][8 /* palette_idx */], 8);
                     };
                     struct {
                         uint16_t interintra_16bpc[64 * 64];
                         uint16_t edge_16bpc[257];
+                        ALIGN(uint16_t pal_16bpc[3 /* plane */][8 /* palette_idx */], 16);
                     };
+                };
             };
2 third_party/dav1d/src/ipred.h vendored
@@ -74,7 +74,7 @@ typedef decl_cfl_pred_fn(*cfl_pred_fn);
  * - only 16-byte alignment is guaranteed for idx.
  */
 #define decl_pal_pred_fn(name) \
-void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
+void (name)(pixel *dst, ptrdiff_t stride, const pixel *pal, \
             const uint8_t *idx, int w, int h)
 typedef decl_pal_pred_fn(*pal_pred_fn);
11 third_party/dav1d/src/ipred_tmpl.c vendored
@@ -715,13 +715,16 @@ cfl_ac_fn(422, 1, 0)
 cfl_ac_fn(444, 0, 0)

 static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
-                       const uint16_t *const pal, const uint8_t *idx,
+                       const pixel *const pal, const uint8_t *idx,
                        const int w, const int h)
 {
     for (int y = 0; y < h; y++) {
-        for (int x = 0; x < w; x++)
-            dst[x] = (pixel) pal[idx[x]];
-        idx += w;
+        for (int x = 0; x < w; x += 2) {
+            const int i = *idx++;
+            assert(!(i & 0x88));
+            dst[x + 0] = pal[i & 7];
+            dst[x + 1] = pal[i >> 4];
+        }
         dst += PXSTRIDE(stride);
     }
 }
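The new pal_pred_c loop documents the packed index convention: each byte carries two 3-bit palette indices, even pixel in the low nibble, and the assert enforces that bit 3 of each nibble stays clear. A standalone round-trip of that convention (names local to the example):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        /* Pack two palette indices (0..7 each) the way the 4-bit idx
         * buffer stores them: first pixel in the low nibble. */
        const uint8_t idx0 = 5, idx1 = 3;
        const uint8_t packed = idx0 | (idx1 << 4);

        assert(!(packed & 0x88));      /* same invariant pal_pred_c asserts */
        assert((packed & 7) == idx0);  /* low nibble -> even pixel */
        assert((packed >> 4) == idx1); /* high nibble -> odd pixel */
        return 0;
    }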
6 third_party/dav1d/src/lib.c vendored
@@ -52,11 +52,10 @@

 static COLD void init_internal(void) {
     dav1d_init_cpu();
-    dav1d_init_interintra_masks();
+    dav1d_init_ii_wedge_masks();
     dav1d_init_intra_edge_tree();
     dav1d_init_qm_tables();
     dav1d_init_thread();
-    dav1d_init_wedge_masks();
 }

 COLD const char *dav1d_version(void) {
@@ -287,6 +286,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
             t->task_thread.td.inited = 1;
         }
     }
+    dav1d_pal_dsp_init(&c->pal_dsp);
     dav1d_refmvs_dsp_init(&c->refmvs_dsp);

     pthread_attr_destroy(&thread_attr);
@@ -641,11 +641,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
         if (c->n_fc > 1) {
             dav1d_free(f->tile_thread.lowest_pixel_mem);
             dav1d_free(f->frame_thread.b);
+            dav1d_free_aligned(f->frame_thread.cbi);
             dav1d_free_aligned(f->frame_thread.pal_idx);
             dav1d_free_aligned(f->frame_thread.cf);
             dav1d_free(f->frame_thread.tile_start_off);
             dav1d_free_aligned(f->frame_thread.pal);
-            dav1d_free(f->frame_thread.cbi);
         }
         if (c->n_tc > 1) {
             pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);
2 third_party/dav1d/src/meson.build vendored
@@ -42,6 +42,7 @@ libdav1d_sources = files(
     'mem.c',
     'msac.c',
     'obu.c',
+    'pal.c',
     'picture.c',
     'qm.c',
     'ref.c',
@@ -167,6 +168,7 @@ if is_asm_enabled
         libdav1d_sources_asm = files(
             'x86/cpuid.asm',
             'x86/msac.asm',
+            'x86/pal.asm',
             'x86/refmvs.asm',
             'x86/itx_avx512.asm',
             'x86/cdef_avx2.asm',
77 third_party/dav1d/src/pal.c vendored Normal file
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+
+#include <string.h>
+
+#include "common/attributes.h"
+
+#include "src/pal.h"
+
+// fill invisible edges and pack to 4-bit (2 pixels per byte)
+static void pal_idx_finish_c(uint8_t *dst, const uint8_t *src,
+                             const int bw, const int bh,
+                             const int w, const int h)
+{
+    assert(bw >= 4 && bw <= 64 && !(bw & (bw - 1)));
+    assert(bh >= 4 && bh <= 64 && !(bh & (bh - 1)));
+    assert(w >= 4 && w <= bw && !(w & 3));
+    assert(h >= 4 && h <= bh && !(h & 3));
+
+    const int dst_w = w / 2;
+    const int dst_bw = bw / 2;
+
+    for (int y = 0; y < h; y++, src += bw, dst += dst_bw) {
+        for (int x = 0; x < dst_w; x++)
+            dst[x] = src[x * 2 + 0] | (src[x * 2 + 1] << 4);
+        if (dst_w < dst_bw)
+            memset(dst + dst_w, src[w - 1] * 0x11, dst_bw - dst_w);
+    }
+
+    if (h < bh) {
+        const uint8_t *const last_row = &dst[-dst_bw];
+        for (int y = h; y < bh; y++, dst += dst_bw)
+            memcpy(dst, last_row, dst_bw);
+    }
+}
+
+#if HAVE_ASM
+#if ARCH_X86
+#include "src/x86/pal.h"
+#endif
+#endif
+
+COLD void dav1d_pal_dsp_init(Dav1dPalDSPContext *const c) {
+    c->pal_idx_finish = pal_idx_finish_c;
+
+#if HAVE_ASM
+#if ARCH_X86
+    pal_dsp_init_x86(c);
+#endif
+#endif
+}
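For orientation, pal_idx_finish_c halves the index buffer: src holds one index per byte at stride bw, dst gets two per byte at stride bw / 2, with the right edge replicated via the 0x11 multiplier (the last visible index in both nibbles) and the bottom edge copied from the last written row. A hypothetical call against the reference above, assuming it is in scope:

    void demo(void) {
        uint8_t src[8 * 8] = {0}, dst[8 * 8 / 2];
        for (int y = 0; y < 4; y++)
            for (int x = 0; x < 4; x++)
                src[y * 8 + x] = (uint8_t)((x + y) & 7); /* toy indices */
        pal_idx_finish_c(dst, src, 8, 8, 4, 4);
        /* Each dst row is now 4 bytes (8 px / 2); bytes 2..3 of a row hold
         * that row's last visible index in both nibbles, and rows 4..7
         * duplicate row 3. */
    }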
43 third_party/dav1d/src/pal.h vendored Normal file
@@ -0,0 +1,43 @@
+/*
+ * Copyright © 2023, VideoLAN and dav1d authors
+ * Copyright © 2023, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef DAV1D_SRC_PAL_H
+#define DAV1D_SRC_PAL_H
+
+#include <stdint.h>
+
+#define decl_pal_idx_finish_fn(name) \
+void (name)(uint8_t *dst, const uint8_t *src, int bw, int bh, int w, int h)
+typedef decl_pal_idx_finish_fn(*pal_idx_finish_fn);
+
+typedef struct Dav1dPalDSPContext {
+    pal_idx_finish_fn pal_idx_finish;
+} Dav1dPalDSPContext;
+
+void dav1d_pal_dsp_init(Dav1dPalDSPContext *dsp);
+
+#endif /* DAV1D_SRC_PAL_H */
21 third_party/dav1d/src/recon.h vendored
@@ -57,6 +57,18 @@ typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);
 void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
 typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);

+#define decl_copy_pal_block_fn(name) \
+void (name)(Dav1dTaskContext *t, int bx4, int by4, int bw4, int bh4)
+typedef decl_copy_pal_block_fn(*copy_pal_block_fn);
+
+#define decl_read_pal_plane_fn(name) \
+void (name)(Dav1dTaskContext *t, Av1Block *b, int pl, int sz_ctx, int bx4, int by4)
+typedef decl_read_pal_plane_fn(*read_pal_plane_fn);
+
+#define decl_read_pal_uv_fn(name) \
+void (name)(Dav1dTaskContext *t, Av1Block *b, int sz_ctx, int bx4, int by4)
+typedef decl_read_pal_uv_fn(*read_pal_uv_fn);
+
 decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);
 decl_recon_b_intra_fn(dav1d_recon_b_intra_16bpc);

@ -82,4 +94,13 @@ decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);
|
|||
decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc);
decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc);

decl_copy_pal_block_fn(dav1d_copy_pal_block_y_8bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_y_16bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_8bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_16bpc);
decl_read_pal_plane_fn(dav1d_read_pal_plane_8bpc);
decl_read_pal_plane_fn(dav1d_read_pal_plane_16bpc);
decl_read_pal_uv_fn(dav1d_read_pal_uv_8bpc);
decl_read_pal_uv_fn(dav1d_read_pal_uv_16bpc);

#endif /* DAV1D_SRC_RECON_H */
233
third_party/dav1d/src/recon_tmpl.c
vendored

@ -770,14 +770,12 @@ static void read_coef_tree(Dav1dTaskContext *const t,
    uint8_t cf_ctx;
    int eob;
    coef *cf;
    int16_t *cbi;

    if (t->frame_thread.pass) {
        const int p = t->frame_thread.pass & 1;
        assert(ts->frame_thread[p].cf);
        cf = ts->frame_thread[p].cf;
        ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
        cbi = f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
    } else {
        cf = bitfn(t->cf);
    }

@ -804,10 +802,11 @@ static void read_coef_tree(Dav1dTaskContext *const t,
        case_set_upto16(txw,,,);
#undef set_ctx
        if (t->frame_thread.pass == 1)
            cbi[0] = eob * (1 << 5) + txtp;
            *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
    } else {
        eob = cbi[0] >> 5;
        txtp = cbi[0] & 0x1f;
        const int cbi = *ts->frame_thread[0].cbi++;
        eob = cbi >> 5;
        txtp = cbi & 0x1f;
    }
    if (!(t->frame_thread.pass & 1)) {
        assert(dst);
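
Both sides of this hunk keep the same bit layout in the coefficient-block info word: the transform type occupies the low 5 bits and the end-of-block position the remaining high bits. What changes is that pass 1 now appends each word to a sequential per-tile stream (ts->frame_thread[1].cbi) that pass 2 consumes in order, instead of indexing a per-block array. A sketch of the packing invariant (the helper names are illustrative):

static inline int cbi_pack(const int eob, const int txtp) {
    return eob * (1 << 5) + txtp; /* txtp must fit in 5 bits (0..31) */
}
static inline int cbi_eob(const int cbi)  { return cbi >> 5;   }
static inline int cbi_txtp(const int cbi) { return cbi & 0x1f; }
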

@ -872,8 +871,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
        for (y = init_y, t->by += init_y; y < sub_h4;
             y += t_dim->h, t->by += t_dim->h, y_off++)
        {
            int16_t (*const cbi)[3] =
                &f->frame_thread.cbi[t->by * f->b4_stride];
            int x_off = !!init_x;
            for (x = init_x, t->bx += init_x; x < sub_w4;
                 x += t_dim->w, t->bx += t_dim->w, x_off++)

@ -891,7 +888,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
                if (DEBUG_BLOCK_INFO)
                    printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
                           b->tx, txtp, eob, ts->msac.rng);
                cbi[t->bx][0] = eob * (1 << 5) + txtp;
                *ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
                ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
                rep_macro(type, t->dir lcoef, off, mul * cf_ctx)

@ -917,8 +914,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
        for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
             y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
        {
            int16_t (*const cbi)[3] =
                &f->frame_thread.cbi[t->by * f->b4_stride];
            for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
                 x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
            {

@ -936,7 +931,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
|
||||
"txtp=%d,eob=%d]: r=%d\n",
|
||||
pl, b->uvtx, txtp, eob, ts->msac.rng);
|
||||
cbi[t->bx][pl + 1] = eob * (1 << 5) + txtp;
|
||||
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
|
||||
ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
|
||||
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
|
||||
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
|
||||
|
|
@ -1236,13 +1231,14 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
|
|||
const int p = t->frame_thread.pass & 1;
|
||||
assert(ts->frame_thread[p].pal_idx);
|
||||
pal_idx = ts->frame_thread[p].pal_idx;
|
||||
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
|
||||
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
|
||||
} else {
|
||||
pal_idx = t->scratch.pal_idx;
|
||||
pal_idx = t->scratch.pal_idx_y;
|
||||
}
|
||||
const uint16_t *const pal = t->frame_thread.pass ?
|
||||
const pixel *const pal = t->frame_thread.pass ?
|
||||
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
|
||||
((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
|
||||
((t->bx >> 1) + (t->by & 1))][0] :
|
||||
bytefn(t->scratch.pal)[0];
|
||||
f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
|
||||
pal_idx, bw4 * 4, bh4 * 4);
|
||||
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
|
||||
|
|
@ -1319,10 +1315,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
|
|||
                enum TxfmType txtp;
                if (t->frame_thread.pass) {
                    const int p = t->frame_thread.pass & 1;
                    const int cbi = *ts->frame_thread[p].cbi++;
                    cf = ts->frame_thread[p].cf;
                    ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
                    const int cbi =
                        f->frame_thread.cbi[t->by * f->b4_stride + t->bx][0];
                    eob = cbi >> 5;
                    txtp = cbi & 0x1f;
                } else {

@ -1428,7 +1423,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
    } else if (b->pal_sz[1]) {
        const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
                                         (t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
        const uint16_t (*pal)[8];
        const pixel (*pal)[8];
        const uint8_t *pal_idx;
        if (t->frame_thread.pass) {
            const int p = t->frame_thread.pass & 1;

@ -1436,10 +1431,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
            pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                                      ((t->bx >> 1) + (t->by & 1))];
            pal_idx = ts->frame_thread[p].pal_idx;
            ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
            ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
        } else {
            pal = t->scratch.pal;
            pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
            pal = bytefn(t->scratch.pal);
            pal_idx = t->scratch.pal_idx_uv;
        }

        f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,

@ -1543,10 +1538,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
                    coef *cf;
                    if (t->frame_thread.pass) {
                        const int p = t->frame_thread.pass & 1;
                        const int cbi = *ts->frame_thread[p].cbi++;
                        cf = ts->frame_thread[p].cf;
                        ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
                        const int cbi =
                            f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
                        eob = cbi >> 5;
                        txtp = cbi & 0x1f;
                    } else {

@ -1682,12 +1676,8 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
        dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
                                 tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
                                 HIGHBD_CALL_SUFFIX);
        const uint8_t *const ii_mask =
            b->interintra_type == INTER_INTRA_BLEND ?
                 dav1d_ii_masks[bs][0][b->interintra_mode] :
                 dav1d_wedge_masks[bs][0][0][b->wedge_idx];
        dsp->mc.blend(dst, f->cur.stride[0], tmp,
                      bw4 * 4, bh4 * 4, ii_mask);
                      bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
    }

    if (!has_chroma) goto skip_inter_chroma_pred;

@ -1790,10 +1780,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
            // FIXME for 8x32 with 4:2:2 subsampling, this probably does
            // the wrong thing since it will select 4x16, not 4x32, as a
            // transform size...
            const uint8_t *const ii_mask =
                b->interintra_type == INTER_INTRA_BLEND ?
                     dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
                     dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
            const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);

            for (int pl = 0; pl < 2; pl++) {
                pixel *const tmp = bitfn(t->scratch.interintra);

@ -1871,12 +1858,12 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
                mask = seg_mask;
                break;
            case COMP_INTER_WEDGE:
                mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
                mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
                dsp->mc.mask(dst, f->cur.stride[0],
                             tmp[b->mask_sign], tmp[!b->mask_sign],
                             bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
                if (has_chroma)
                    mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
                    mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
                break;
            }

@ -1993,10 +1980,9 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
                    enum TxfmType txtp;
                    if (t->frame_thread.pass) {
                        const int p = t->frame_thread.pass & 1;
                        const int cbi = *ts->frame_thread[p].cbi++;
                        cf = ts->frame_thread[p].cf;
                        ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
                        const int cbi =
                            f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
                        eob = cbi >> 5;
                        txtp = cbi & 0x1f;
                    } else {

@ -2198,3 +2184,178 @@ void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
                   4 * (ts->tiling.col_end - x_off) >> ss_hor);
    }
}

void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
                                    const int bx4, const int by4,
                                    const int bw4, const int bh4)

{
    const Dav1dFrameContext *const f = t->f;
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][0] :
        bytefn(t->scratch.pal)[0];
    for (int x = 0; x < bw4; x++)
        memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
    for (int y = 0; y < bh4; y++)
        memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
}

void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
                                     const int bx4, const int by4,
                                     const int bw4, const int bh4)

{
    const Dav1dFrameContext *const f = t->f;
    const pixel (*const pal)[8] = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))] :
        bytefn(t->scratch.pal);
    // see aomedia bug 2183 for why we use luma coordinates here
    for (int pl = 1; pl <= 2; pl++) {
        for (int x = 0; x < bw4; x++)
            memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
        for (int y = 0; y < bh4; y++)
            memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
    }
}

void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
                                  const int pl, const int sz_ctx,
                                  const int bx4, const int by4)
{
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
                                           ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
    pixel cache[16], used_cache[8];
    int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
    int n_cache = 0;
    // don't reuse above palette outside SB64 boundaries
    int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
    const pixel *l = bytefn(t->al_pal)[1][by4][pl];
    const pixel *a = bytefn(t->al_pal)[0][bx4][pl];

    // fill/sort cache
    while (l_cache && a_cache) {
        if (*l < *a) {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
            l_cache--;
        } else {
            if (*a == *l) {
                l++;
                l_cache--;
            }
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
            a_cache--;
        }
    }
    if (l_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *l)
                cache[n_cache++] = *l;
            l++;
        } while (--l_cache > 0);
    } else if (a_cache) {
        do {
            if (!n_cache || cache[n_cache - 1] != *a)
                cache[n_cache++] = *a;
            a++;
        } while (--a_cache > 0);
    }

    // find reused cache entries
    int i = 0;
    for (int n = 0; n < n_cache && i < pal_sz; n++)
        if (dav1d_msac_decode_bool_equi(&ts->msac))
            used_cache[i++] = cache[n];
    const int n_used_cache = i;

    // parse new entries
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][pl] :
        bytefn(t->scratch.pal)[pl];
    if (i < pal_sz) {
        const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
        int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);

        if (i < pal_sz) {
            int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
            const int max = (1 << bpc) - 1;

            do {
                const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
                prev = pal[i++] = imin(prev + delta + !pl, max);
                if (prev + !pl >= max) {
                    for (; i < pal_sz; i++)
                        pal[i] = max;
                    break;
                }
                bits = imin(bits, 1 + ulog2(max - prev - !pl));
            } while (i < pal_sz);
        }

        // merge cache+new entries
        int n = 0, m = n_used_cache;
        for (i = 0; i < pal_sz; i++) {
            if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
                pal[i] = used_cache[n++];
            } else {
                assert(m < pal_sz);
                pal[i] = pal[m++];
            }
        }
    } else {
        memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
    }

    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
               pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
        for (int n = 0; n < n_cache; n++)
            printf("%c%02x", n ? ' ' : '[', cache[n]);
        printf("%s, pal=", n_cache ? "]" : "[]");
        for (int n = 0; n < pal_sz; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}
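
The merge step above relies on both inputs already being sorted: used_cache[] is drawn from the sorted neighbor cache, and new entries are decoded in strictly increasing order. The source merges in place (new entries sit at pal[n_used_cache..]); a standalone out-of-place sketch of the same two-way merge, with illustrative names:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void merge_sorted_u8(uint8_t *const out, const int out_sz,
                            const uint8_t *const a, const int a_sz,
                            const uint8_t *const b /* out_sz - a_sz entries */)
{
    int n = 0, m = 0;
    for (int i = 0; i < out_sz; i++) {
        if (n < a_sz && (m >= out_sz - a_sz || a[n] <= b[m]))
            out[i] = a[n++]; /* take the smaller head, cache side */
        else
            out[i] = b[m++]; /* otherwise take the new-entry head */
    }
}

int main(void) {
    const uint8_t cached[3] = { 10, 40, 90 }, fresh[3] = { 20, 50, 60 };
    uint8_t pal[6];
    merge_sorted_u8(pal, 6, cached, 3, fresh);
    const uint8_t expect[6] = { 10, 20, 40, 50, 60, 90 };
    assert(!memcmp(pal, expect, 6));
    return 0;
}
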

void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
                               const int sz_ctx, const int bx4, const int by4)
{
    bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);

    // V pal coding
    Dav1dTileState *const ts = t->ts;
    const Dav1dFrameContext *const f = t->f;
    pixel *const pal = t->frame_thread.pass ?
        f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
                            ((t->bx >> 1) + (t->by & 1))][2] :
        bytefn(t->scratch.pal)[2];
    const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
    if (dav1d_msac_decode_bool_equi(&ts->msac)) {
        const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
        int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
        const int max = (1 << bpc) - 1;
        for (int i = 1; i < b->pal_sz[1]; i++) {
            int delta = dav1d_msac_decode_bools(&ts->msac, bits);
            if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
            prev = pal[i] = (prev + delta) & max;
        }
    } else {
        for (int i = 0; i < b->pal_sz[1]; i++)
            pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
    }
    if (DEBUG_BLOCK_INFO) {
        printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
        for (int n = 0; n < b->pal_sz[1]; n++)
            printf("%c%02x", n ? ' ' : '[', pal[n]);
        printf("]\n");
    }
}
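
Unlike the U plane above, whose new entries are coded as monotonically increasing deltas, the V palette uses signed deltas with wraparound: (prev + delta) & max keeps each entry within bpc bits while letting a delta cross zero in either direction. A small worked check (helper name illustrative):

#include <assert.h>

static int wrap_add(const int prev, const int delta, const int bpc) {
    return (prev + delta) & ((1 << bpc) - 1);
}

int main(void) {
    assert(wrap_add(3, -5, 8) == 254);  /* wraps below zero */
    assert(wrap_add(250, 10, 8) == 4);  /* wraps above max  */
    return 0;
}
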
2
third_party/dav1d/src/thread_task.c
vendored

@ -500,7 +500,7 @@ static inline void delayed_fg_task(const Dav1dContext *const c,
    case DAV1D_TASK_TYPE_FG_APPLY:;
        int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
        pthread_mutex_unlock(&ttd->lock);
        int progmax = (out->p.h + 31) >> 5;
        int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
    fg_apply_loop:
        if (row + 1 < progmax)
            pthread_cond_signal(&ttd->cond);
255
third_party/dav1d/src/wedge.c
vendored

@ -83,37 +83,7 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = {
    { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};

static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 * 8], 64);
static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 * 8 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_8x8 [2 * 16 * 8 * 8], 64);

static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 * 8 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_8x8 [2 * 16 * 8 * 8], 64);
static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 * 4 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 * 4 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_4x8 [2 * 16 * 4 * 8], 32);

static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 * 4], 64);
static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_8x8 [2 * 16 * 8 * 8], 64);
static uint8_t ALIGN(wedge_masks_420_8x4 [2 * 16 * 8 * 4], 64);
static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 * 4 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_4x8 [2 * 16 * 4 * 8], 32);
static uint8_t ALIGN(wedge_masks_420_4x4 [2 * 16 * 4 * 4], 16);

const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16];
Dav1dMasks dav1d_masks;

static void insert_border(uint8_t *const dst, const uint8_t *const src,
                          const int ctr)

@ -136,29 +106,33 @@ static void hflip(uint8_t *const dst, const uint8_t *const src) {
            dst[y_off + 64 - 1 - x] = src[y_off + x];
}

static void invert(uint8_t *const dst, const uint8_t *const src,
                   const int w, const int h)
{
    for (int y = 0, y_off = 0; y < h; y++, y_off += w)
        for (int x = 0; x < w; x++)
            dst[y_off + x] = 64 - src[y_off + x];
}

static void copy2d(uint8_t *dst, const uint8_t *src,
static void copy2d(uint8_t *dst, const uint8_t *src, int sign,
                   const int w, const int h, const int x_off, const int y_off)
{
    src += y_off * 64 + x_off;
    for (int y = 0; y < h; y++) {
        memcpy(dst, src, w);
        src += 64;
        dst += w;
    if (sign) {
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < w; x++)
                dst[x] = 64 - src[x];
            src += 64;
            dst += w;
        }
    } else {
        for (int y = 0; y < h; y++) {
            memcpy(dst, src, w);
            src += 64;
            dst += w;
        }
    }
}
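
Folding the old invert() pass into copy2d() works because wedge masks are 6-bit blend weights: the sign-flipped mask is simply 64 - m per sample, so both variants of a mask come out of the master template in a single pass. A trivial property check (values illustrative):

#include <assert.h>
#include <stdint.h>

int main(void) {
    const uint8_t m[4] = { 0, 21, 43, 64 };
    for (int i = 0; i < 4; i++) {
        const uint8_t inv = 64 - m[i];
        assert(m[i] + inv == 64); /* the two weights always blend to unity */
    }
    return 0;
}
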

static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
                             const int sign, const int w, const int h,
                             const int ss_ver)
#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))

static COLD uint16_t init_chroma(uint8_t *chroma, const uint8_t *luma,
                                 const int sign, const int w, const int h,
                                 const int ss_ver)
{
    const uint16_t offset = MASK_OFFSET(chroma);
    for (int y = 0; y < h; y += 1 + ss_ver) {
        for (int x = 0; x < w; x += 2) {
            int sum = luma[x] + luma[x + 1] + 1;

@ -168,62 +142,69 @@ static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
        luma += w << ss_ver;
        chroma += w >> 1;
    }
    return offset;
}

static COLD void fill2d_16x2(uint8_t *dst, const int w, const int h,
                             const enum BlockSize bs,
static COLD void fill2d_16x2(const int w, const int h, const enum BlockSize bs,
                             const uint8_t (*const master)[64 * 64],
                             const wedge_code_type *const cb,
                             uint8_t *masks_444, uint8_t *masks_422,
                             uint8_t *masks_420, const unsigned signs)
                             uint8_t *masks_420, unsigned signs)
{
    uint8_t *ptr = dst;
    for (int n = 0; n < 16; n++) {
        copy2d(ptr, master[cb[n].direction], w, h,
               32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
        ptr += w * h;
    }
    for (int n = 0, off = 0; n < 16; n++, off += w * h)
        invert(ptr + off, dst + off, w, h);

    const int n_stride_444 = (w * h);
    const int n_stride_422 = n_stride_444 >> 1;
    const int n_stride_420 = n_stride_444 >> 2;
    const int sign_stride_444 = 16 * n_stride_444;
    const int sign_stride_422 = 16 * n_stride_422;
    const int sign_stride_420 = 16 * n_stride_420;
    // assign pointers in externally visible array

    // assign pointer offsets in lookup table
    for (int n = 0; n < 16; n++) {
        const int sign = (signs >> n) & 1;
        dav1d_wedge_masks[bs][0][0][n] = &masks_444[ sign * sign_stride_444];
        const int sign = signs & 1;

        copy2d(masks_444, master[cb[n].direction], sign, w, h,
               32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));

        // not using !sign is intentional here, since 444 does not require
        // any rounding since no chroma subsampling is applied.
        dav1d_wedge_masks[bs][0][1][n] = &masks_444[ sign * sign_stride_444];
        dav1d_wedge_masks[bs][1][0][n] = &masks_422[ sign * sign_stride_422];
        dav1d_wedge_masks[bs][1][1][n] = &masks_422[!sign * sign_stride_422];
        dav1d_wedge_masks[bs][2][0][n] = &masks_420[ sign * sign_stride_420];
        dav1d_wedge_masks[bs][2][1][n] = &masks_420[!sign * sign_stride_420];
        dav1d_masks.offsets[0][bs].wedge[0][n] =
        dav1d_masks.offsets[0][bs].wedge[1][n] = MASK_OFFSET(masks_444);

        dav1d_masks.offsets[1][bs].wedge[0][n] =
            init_chroma(&masks_422[ sign * sign_stride_422], masks_444, 0, w, h, 0);
        dav1d_masks.offsets[1][bs].wedge[1][n] =
            init_chroma(&masks_422[!sign * sign_stride_422], masks_444, 1, w, h, 0);
        dav1d_masks.offsets[2][bs].wedge[0][n] =
            init_chroma(&masks_420[ sign * sign_stride_420], masks_444, 0, w, h, 1);
        dav1d_masks.offsets[2][bs].wedge[1][n] =
            init_chroma(&masks_420[!sign * sign_stride_420], masks_444, 1, w, h, 1);

        signs >>= 1;
        masks_444 += n_stride_444;
        masks_422 += n_stride_422;
        masks_420 += n_stride_420;

        // since the pointers come from inside, we know that
        // violation of the const is OK here. Any other approach
        // means we would have to duplicate the sign correction
        // logic in two places, which isn't very nice, or mark
        // the table faced externally as non-const, which also sucks
        init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][0][n],
                    dav1d_wedge_masks[bs][0][0][n], 0, w, h, 0);
        init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][1][n],
                    dav1d_wedge_masks[bs][0][0][n], 1, w, h, 0);
        init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][0][n],
                    dav1d_wedge_masks[bs][0][0][n], 0, w, h, 1);
        init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][1][n],
                    dav1d_wedge_masks[bs][0][0][n], 1, w, h, 1);
    }
}

COLD void dav1d_init_wedge_masks(void) {
static COLD void build_nondc_ii_masks(uint8_t *const mask_v, const int w,
                                      const int h, const int step)
{
    static const uint8_t ii_weights_1d[32] = {
        60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
        6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
    };

    uint8_t *const mask_h = &mask_v[w * h];
    uint8_t *const mask_sm = &mask_h[w * h];
    for (int y = 0, off = 0; y < h; y++, off += w) {
        memset(&mask_v[off], ii_weights_1d[y * step], w);
        for (int x = 0; x < w; x++) {
            mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
            mask_h[off + x] = ii_weights_1d[x * step];
        }
    }
}

COLD void dav1d_init_ii_wedge_masks(void) {
    // This function is guaranteed to be called only once

    enum WedgeMasterLineType {

@ -257,9 +238,11 @@ COLD void dav1d_init_wedge_masks(void) {
    hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]);

#define fill(w, h, sz_422, sz_420, hvsw, signs) \
    fill2d_16x2((uint8_t *) wedge_masks_444_##w##x##h, w, h, BS_##w##x##h, \
                master, wedge_codebook_16_##hvsw, wedge_masks_444_##w##x##h, \
                wedge_masks_422_##sz_422, wedge_masks_420_##sz_420, signs)
    fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
                master, wedge_codebook_16_##hvsw, \
                dav1d_masks.wedge_444_##w##x##h, \
                dav1d_masks.wedge_422_##sz_422, \
                dav1d_masks.wedge_420_##sz_420, signs)

    fill(32, 32, 16x32, 16x16, heqw, 0x7bfb);
    fill(32, 16, 16x16, 16x8, hltw, 0x7beb);

@ -271,72 +254,46 @@ COLD void dav1d_init_wedge_masks(void) {
    fill( 8, 16, 4x16, 4x8, hgtw, 0x7beb);
    fill( 8, 8, 4x8, 4x4, heqw, 0x7bfb);
#undef fill
}

#define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1)
static uint8_t ALIGN(ii_dc_mask[32 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_8x8 [N_II_PRED_MODES][ 8 * 8], 64);
static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_4x8 [N_II_PRED_MODES][ 4 * 8], 32);
static uint8_t ALIGN(ii_nondc_mask_4x4 [N_II_PRED_MODES][ 4 * 4], 16);
#undef N_II_PRED_MODES
    memset(dav1d_masks.ii_dc, 32, 32 * 32);
    for (int c = 0; c < 3; c++) {
        dav1d_masks.offsets[c][BS_32x32-BS_32x32].ii[II_DC_PRED] =
        dav1d_masks.offsets[c][BS_32x16-BS_32x32].ii[II_DC_PRED] =
        dav1d_masks.offsets[c][BS_16x32-BS_32x32].ii[II_DC_PRED] =
        dav1d_masks.offsets[c][BS_16x16-BS_32x32].ii[II_DC_PRED] =
        dav1d_masks.offsets[c][BS_16x8 -BS_32x32].ii[II_DC_PRED] =
        dav1d_masks.offsets[c][BS_8x16 -BS_32x32].ii[II_DC_PRED] =
        dav1d_masks.offsets[c][BS_8x8  -BS_32x32].ii[II_DC_PRED] =
            MASK_OFFSET(dav1d_masks.ii_dc);
    }

#define set1(sz) \
    [II_DC_PRED] = ii_dc_mask, \
    [II_VERT_PRED] = ii_nondc_mask_##sz[II_VERT_PRED - 1], \
    [II_HOR_PRED] = ii_nondc_mask_##sz[II_HOR_PRED - 1], \
    [II_SMOOTH_PRED] = ii_nondc_mask_##sz[II_SMOOTH_PRED - 1]
#define set(sz_444, sz_422, sz_420) \
    { { set1(sz_444) }, { set1(sz_422) }, { set1(sz_420) } }
const uint8_t *dav1d_ii_masks[N_BS_SIZES][3][N_INTER_INTRA_PRED_MODES] = {
    [BS_8x8] = set( 8x8, 4x8, 4x4),
    [BS_8x16] = set( 8x16, 4x16, 4x8),
    [BS_16x8] = set(16x16, 8x8, 8x8),
    [BS_16x16] = set(16x16, 8x16, 8x8),
    [BS_16x32] = set(16x32, 8x32, 8x16),
    [BS_32x16] = set(32x32, 16x16, 16x16),
    [BS_32x32] = set(32x32, 16x32, 16x16),
};
#undef set
#undef set1
#define BUILD_NONDC_II_MASKS(w, h, step) \
    build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)

static COLD void build_nondc_ii_masks(uint8_t *const mask_v,
                                      uint8_t *const mask_h,
                                      uint8_t *const mask_sm,
                                      const int w, const int h, const int step)
{
    static const uint8_t ii_weights_1d[] = {
        60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
        6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
    };
#define ASSIGN_NONDC_II_OFFSET(bs, w444, h444, w422, h422, w420, h420) \
    dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
        MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
    dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
        MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
    dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
        MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])

    for (int y = 0, off = 0; y < h; y++, off += w) {
        memset(&mask_v[off], ii_weights_1d[y * step], w);
        for (int x = 0; x < w; x++) {
            mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
            mask_h[off + x] = ii_weights_1d[x * step];
        }
    BUILD_NONDC_II_MASKS(32, 32, 1);
    BUILD_NONDC_II_MASKS(16, 32, 1);
    BUILD_NONDC_II_MASKS(16, 16, 2);
    BUILD_NONDC_II_MASKS( 8, 32, 1);
    BUILD_NONDC_II_MASKS( 8, 16, 2);
    BUILD_NONDC_II_MASKS( 8, 8, 4);
    BUILD_NONDC_II_MASKS( 4, 16, 2);
    BUILD_NONDC_II_MASKS( 4, 8, 4);
    BUILD_NONDC_II_MASKS( 4, 4, 8);
    for (int p = 0; p < 3; p++) {
        ASSIGN_NONDC_II_OFFSET(BS_32x32, 32, 32, 16, 32, 16, 16);
        ASSIGN_NONDC_II_OFFSET(BS_32x16, 32, 32, 16, 16, 16, 16);
        ASSIGN_NONDC_II_OFFSET(BS_16x32, 16, 32, 8, 32, 8, 16);
        ASSIGN_NONDC_II_OFFSET(BS_16x16, 16, 16, 8, 16, 8, 8);
        ASSIGN_NONDC_II_OFFSET(BS_16x8, 16, 16, 8, 8, 8, 8);
        ASSIGN_NONDC_II_OFFSET(BS_8x16, 8, 16, 4, 16, 4, 8);
        ASSIGN_NONDC_II_OFFSET(BS_8x8, 8, 8, 4, 8, 4, 4);
    }
}

COLD void dav1d_init_interintra_masks(void) {
    // This function is guaranteed to be called only once

    memset(ii_dc_mask, 32, 32 * 32);
#define set(a) a[II_VERT_PRED - 1], a[II_HOR_PRED - 1], a[II_SMOOTH_PRED - 1]
    build_nondc_ii_masks(set(ii_nondc_mask_32x32), 32, 32, 1);
    build_nondc_ii_masks(set(ii_nondc_mask_16x32), 16, 32, 1);
    build_nondc_ii_masks(set(ii_nondc_mask_16x16), 16, 16, 2);
    build_nondc_ii_masks(set(ii_nondc_mask_8x32), 8, 32, 1);
    build_nondc_ii_masks(set(ii_nondc_mask_8x16), 8, 16, 2);
    build_nondc_ii_masks(set(ii_nondc_mask_8x8), 8, 8, 4);
    build_nondc_ii_masks(set(ii_nondc_mask_4x16), 4, 16, 2);
    build_nondc_ii_masks(set(ii_nondc_mask_4x8), 4, 8, 4);
    build_nondc_ii_masks(set(ii_nondc_mask_4x4), 4, 4, 8);
#undef set
}
67
third_party/dav1d/src/wedge.h
vendored

@ -30,12 +30,67 @@

#include "src/levels.h"

void dav1d_init_wedge_masks(void);
EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
                                       [2 /* sign */][16 /* wedge_idx */];
typedef struct {
    /* Offsets, in units of 8 bytes, relative to the start of the struct. */
    struct {
        uint16_t wedge[2 /* sign */][16 /* wedge_idx */];
        uint16_t ii[N_INTER_INTRA_PRED_MODES];
    } offsets[3 /* 444, 422, 420 */][BS_8x8 - BS_32x32 + 1];

void dav1d_init_interintra_masks(void);
EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
                                    [N_INTER_INTRA_PRED_MODES];
    uint8_t ALIGN(wedge_444_32x32[ 16 * 32 * 32], 64);
    uint8_t ALIGN(wedge_444_32x16[ 16 * 32 * 16], 64);
    uint8_t ALIGN(wedge_444_32x8 [ 16 * 32 * 8], 64);
    uint8_t ALIGN(wedge_444_16x32[ 16 * 16 * 32], 64);
    uint8_t ALIGN(wedge_444_16x16[ 16 * 16 * 16], 64);
    uint8_t ALIGN(wedge_444_16x8 [ 16 * 16 * 8], 64);
    uint8_t ALIGN(wedge_444_8x32 [ 16 * 8 * 32], 64);
    uint8_t ALIGN(wedge_444_8x16 [ 16 * 8 * 16], 64);
    uint8_t ALIGN(wedge_444_8x8 [ 16 * 8 * 8], 64);

    uint8_t ALIGN(wedge_422_16x32[2 * 16 * 16 * 32], 64);
    uint8_t ALIGN(wedge_422_16x16[2 * 16 * 16 * 16], 64);
    uint8_t ALIGN(wedge_422_16x8 [2 * 16 * 16 * 8], 64);
    uint8_t ALIGN(wedge_422_8x32 [2 * 16 * 8 * 32], 64);
    uint8_t ALIGN(wedge_422_8x16 [2 * 16 * 8 * 16], 64);
    uint8_t ALIGN(wedge_422_8x8 [2 * 16 * 8 * 8], 64);
    uint8_t ALIGN(wedge_422_4x32 [2 * 16 * 4 * 32], 64);
    uint8_t ALIGN(wedge_422_4x16 [2 * 16 * 4 * 16], 64);
    uint8_t ALIGN(wedge_422_4x8 [2 * 16 * 4 * 8], 64);

    uint8_t ALIGN(wedge_420_16x16[2 * 16 * 16 * 16], 64);
    uint8_t ALIGN(wedge_420_16x8 [2 * 16 * 16 * 8], 64);
    uint8_t ALIGN(wedge_420_16x4 [2 * 16 * 16 * 4], 64);
    uint8_t ALIGN(wedge_420_8x16 [2 * 16 * 8 * 16], 64);
    uint8_t ALIGN(wedge_420_8x8 [2 * 16 * 8 * 8], 64);
    uint8_t ALIGN(wedge_420_8x4 [2 * 16 * 8 * 4], 64);
    uint8_t ALIGN(wedge_420_4x16 [2 * 16 * 4 * 16], 64);
    uint8_t ALIGN(wedge_420_4x8 [2 * 16 * 4 * 8], 64);
    uint8_t ALIGN(wedge_420_4x4 [2 * 16 * 4 * 4], 64);

    uint8_t ALIGN(ii_dc [ 32 * 32], 64);
    uint8_t ALIGN(ii_nondc_32x32[3 * 32 * 32], 64);
    uint8_t ALIGN(ii_nondc_16x32[3 * 16 * 32], 64);
    uint8_t ALIGN(ii_nondc_16x16[3 * 16 * 16], 64);
    uint8_t ALIGN(ii_nondc_8x32 [3 * 8 * 32], 64);
    uint8_t ALIGN(ii_nondc_8x16 [3 * 8 * 16], 64);
    uint8_t ALIGN(ii_nondc_8x8 [3 * 8 * 8], 64);
    uint8_t ALIGN(ii_nondc_4x16 [3 * 4 * 16], 64);
    uint8_t ALIGN(ii_nondc_4x8 [3 * 4 * 8], 32);
    uint8_t ALIGN(ii_nondc_4x4 [3 * 4 * 4], 16);
} Dav1dMasks;

#define II_MASK(c, bs, b) \
    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
     (size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \
         dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \
         dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8))

#define WEDGE_MASK(c, bs, sign, idx) \
    ((const uint8_t*)((uintptr_t)&dav1d_masks + \
     (size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8))

EXTERN Dav1dMasks dav1d_masks;

void dav1d_init_ii_wedge_masks(void);

#endif /* DAV1D_SRC_WEDGE_H */
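
Because every mask buffer now lives inside the single dav1d_masks struct and each one is at least 8-byte aligned, a byte offset divided by 8 fits in a uint16_t; II_MASK and WEDGE_MASK reverse MASK_OFFSET from wedge.c by multiplying by 8 and adding the struct base. Replacing pointer tables with 16-bit offsets presumably shrinks the lookup tables and avoids load-time relocations, though the source does not state the motivation. A sketch of the round trip (struct and field names illustrative):

#include <assert.h>
#include <stdint.h>

static struct { uint8_t a[64]; uint8_t b[64]; } masks;
#define OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&masks) >> 3))
#define PTR(off)  ((const uint8_t *)((uintptr_t)&masks + (size_t)(off) * 8))

int main(void) {
    const uint16_t off = OFFSET(masks.b); /* 64 bytes in, so offset 8 */
    assert(off == 8 && PTR(off) == masks.b);
    return 0;
}
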
109
third_party/dav1d/src/x86/ipred16_avx2.asm
vendored

@ -4885,24 +4885,26 @@ cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    jg .w32_wpad
    jmp .w32_hpad

cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    vbroadcasti128 m3, [palq]
cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
    vbroadcasti128 m4, [palq]
    lea r2, [pal_pred_16bpc_avx2_table]
    tzcnt wd, wm
    vbroadcasti128 m4, [pal_pred_shuf]
    vbroadcasti128 m5, [pal_pred_shuf]
    movifnidn hd, hm
    movsxd wq, [r2+wq*4]
    pshufb m3, m4
    punpckhqdq m4, m3, m3
    pshufb m4, m5
    punpckhqdq m5, m4, m4
    add wq, r2
DEFINE_ARGS dst, stride, stride3, idx, w, h
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    mova xm2, [idxq]
    add idxq, 16
    pshufb xm1, xm3, xm2
    pshufb xm2, xm4, xm2
    movq xm0, [idxq]
    add idxq, 8
    psrlw xm1, xm0, 4
    punpcklbw xm0, xm1
    pshufb xm1, xm4, xm0
    pshufb xm2, xm5, xm0
    punpcklbw xm0, xm1, xm2
    punpckhbw xm1, xm2
    movq [dstq+strideq*0], xm0
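
The new .w4 path loads indices already packed two per byte and splits them in registers: psrlw exposes each high nibble and punpcklbw lines up one index per selector byte. The stale upper nibbles are harmless because pshufb only uses the low 4 bits of each selector byte (and bit 7 for zeroing); the two shuffles then fetch the low and high bytes of each 16-bit palette entry. A C model of how the shift lines up the nibbles (values illustrative):

#include <assert.h>
#include <stdint.h>

int main(void) {
    const uint16_t word = 0x4321;          /* four packed 4-bit indices */
    const uint8_t b0 = word & 0xff;        /* selector byte 0           */
    const uint8_t b1 = (word >> 4) & 0xff; /* selector byte 1 (psrlw 4) */
    const uint8_t b2 = (word >> 8) & 0xff, b3 = (word >> 12) & 0xff;
    /* only the low 4 bits of each selector byte matter to pshufb */
    assert((b0 & 15) == 1 && (b1 & 15) == 2);
    assert((b2 & 15) == 3 && (b3 & 15) == 4);
    return 0;
}
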

@ -4914,10 +4916,12 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
    jg .w4
    RET
.w8:
    movu m2, [idxq] ; only 16-byte alignment
    add idxq, 32
    pshufb m1, m3, m2
    pshufb m2, m4, m2
    pmovzxbw m2, [idxq]
    add idxq, 16
    psllw m1, m2, 4
    por m2, m1
    pshufb m1, m4, m2
    pshufb m2, m5, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*0], xm0

@ -4929,19 +4933,22 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
    jg .w8
    RET
.w16:
    vpermq m2, [idxq+ 0], q3120
    vpermq m5, [idxq+32], q3120
    add idxq, 64
    pshufb m1, m3, m2
    pshufb m2, m4, m2
    pshufd m3, [idxq], q3120
    add idxq, 32
    vpermq m3, m3, q3120
    psrlw m1, m3, 4
    punpcklbw m2, m3, m1
    punpckhbw m3, m1
    pshufb m1, m4, m2
    pshufb m2, m5, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    pshufb m1, m3, m5
    pshufb m2, m4, m5
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    pshufb m1, m4, m3
    pshufb m3, m5, m3
    punpcklbw m0, m1, m3
    punpckhbw m1, m3
    mova [dstq+strideq*2], m0
    mova [dstq+stride3q ], m1
    lea dstq, [dstq+strideq*4]

@ -4949,41 +4956,47 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
    jg .w16
    RET
.w32:
    vpermq m2, [idxq+ 0], q3120
    vpermq m5, [idxq+32], q3120
    add idxq, 64
    pshufb m1, m3, m2
    pshufb m2, m4, m2
    pshufd m3, [idxq], q3120
    add idxq, 32
    vpermq m3, m3, q3120
    psrlw m1, m3, 4
    punpcklbw m2, m3, m1
    punpckhbw m3, m1
    pshufb m1, m4, m2
    pshufb m2, m5, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*0+ 0], m0
    mova [dstq+strideq*0+32], m1
    pshufb m1, m3, m5
    pshufb m2, m4, m5
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*1+ 0], m0
    mova [dstq+strideq*1+32], m1
    mova [dstq+ 0], m0
    mova [dstq+32], m1
    pshufb m1, m4, m3
    pshufb m3, m5, m3
    punpcklbw m0, m1, m3
    punpckhbw m1, m3
    mova [dstq+strideq+ 0], m0
    mova [dstq+strideq+32], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32
    RET
.w64:
    vpermq m2, [idxq+ 0], q3120
    vpermq m5, [idxq+32], q3120
    add idxq, 64
    pshufb m1, m3, m2
    pshufb m2, m4, m2
    pshufd m3, [idxq], q3120
    add idxq, 32
    vpermq m3, m3, q3120
    psrlw m1, m3, 4
    punpcklbw m2, m3, m1
    punpckhbw m3, m1
    pshufb m1, m4, m2
    pshufb m2, m5, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+ 0], m0
    mova [dstq+32], m1
    pshufb m1, m3, m5
    pshufb m2, m4, m5
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+64], m0
    mova [dstq+96], m1
    mova [dstq+32*0], m0
    mova [dstq+32*1], m1
    pshufb m1, m4, m3
    pshufb m3, m5, m3
    punpcklbw m0, m1, m3
    punpckhbw m1, m3
    mova [dstq+32*2], m0
    mova [dstq+32*3], m1
    add dstq, strideq
    dec hd
    jg .w64
82
third_party/dav1d/src/x86/ipred16_avx512.asm
vendored

@ -38,10 +38,10 @@ smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
             db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
             db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
             db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
               db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
               db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
               db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
               db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
               db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
               db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
              times 4 db 10, 11, 12, 13, 2, 3, -1, -1
filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7

@ -57,6 +57,8 @@ filter_shift: times 2 dw 6
              dd 0
              times 2 dw 4
              dd 9
pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44
            db 16, 24, 20, 28, 48, 56, 52, 60

%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
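
The new pal_unpack table feeds vpmultishiftqb, which extracts, for every control byte, an unaligned 8-bit field from the corresponding source qword; the table entries are bit offsets of 4-bit indices, so each result byte carries one palette index in its low 4 bits, and the higher bits are ignored by the subsequent vpermw/pshufb lookups. A C model of one qword, following the AVX512-VBMI instruction's documented semantics (the values mirror the first pal_unpack row):

#include <assert.h>
#include <stdint.h>

static uint64_t multishift_qword(const uint64_t ctrl, const uint64_t src) {
    uint64_t out = 0;
    for (int i = 0; i < 8; i++) {
        const unsigned off = (ctrl >> (8 * i)) & 63;
        /* rotate so the field may wrap around the qword boundary */
        const uint64_t rot = (src >> off) | (off ? src << (64 - off) : 0);
        out |= (rot & 0xff) << (8 * i);
    }
    return out;
}

int main(void) {
    const uint64_t ctrl = 0x2c2428200c040800; /* db 0, 8, 4, 12, 32, 40, 36, 44 */
    const uint64_t src  = 0x87654321;         /* packed 4-bit indices           */
    const uint64_t r    = multishift_qword(ctrl, src);
    assert(((r >>  0) & 15) == 1); /* bit offset 0  -> nibble 0 */
    assert(((r >>  8) & 15) == 3); /* bit offset 8  -> nibble 2 */
    assert(((r >> 16) & 15) == 2); /* bit offset 4  -> nibble 1 */
    assert(((r >> 24) & 15) == 4); /* bit offset 12 -> nibble 3 */
    return 0;
}
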

@ -610,20 +612,23 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
    jg .w64_loop
    RET

cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
    lea r6, [pal_pred_16bpc_avx512icl_table]
    tzcnt wd, wm
    mova m2, [pal_pred_perm]
    movsxd wq, [r6+wq*4]
    mova xm3, [palq]
    mova m3, [pal_pred_perm]
    movifnidn hd, hm
    movsxd wq, [r6+wq*4]
    vpbroadcastq m4, [pal_unpack+0]
    vpbroadcastq m5, [pal_unpack+8]
    add wq, r6
    vbroadcasti32x4 m6, [palq]
    lea stride3q, [strideq*3]
    jmp wq
.w4:
    pmovzxbw ym0, [idxq]
    add idxq, 16
    vpermw ym0, ym0, ym3
    pmovzxbd ym0, [idxq]
    add idxq, 8
    vpmultishiftqb ym0, ym4, ym0
    vpermw ym0, ym0, ym6
    vextracti32x4 xm1, ym0, 1
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0

@ -634,9 +639,10 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
    jg .w4
    RET
.w8:
    pmovzxbw m0, [idxq]
    add idxq, 32
    vpermw m0, m0, m3
    pmovzxbd m0, [idxq]
    add idxq, 16
    vpmultishiftqb m0, m4, m0
    vpermw m0, m0, m6
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2

@ -646,11 +652,13 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
    jg .w8
    RET
.w16:
    vpermb m1, m2, [idxq]
    add idxq, 64
    vpermw m0, m1, m3
    movu ym1, [idxq]
    add idxq, 32
    vpermb m1, m3, m1
    vpmultishiftqb m1, m4, m1
    vpermw m0, m1, m6
    psrlw m1, 8
    vpermw m1, m1, m3
    vpermw m1, m1, m6
    mova [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], ym1

@ -660,27 +668,41 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
    jg .w16
    RET
.w32:
    vpermb m1, m2, [idxq]
    vpermb m2, m3, [idxq]
    add idxq, 64
    vpermw m0, m1, m3
    vpmultishiftqb m1, m4, m2
    vpmultishiftqb m2, m5, m2
    vpermw m0, m1, m6
    psrlw m1, 8
    vpermw m1, m1, m3
    vpermw m1, m1, m6
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    vpermw m0, m2, m6
    psrlw m2, 8
    vpermw m1, m2, m6
    mova [dstq+strideq*2], m0
    mova [dstq+stride3q ], m1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w32
    RET
.w64:
    vpermb m1, m2, [idxq]
    vpermb m2, m3, [idxq]
    add idxq, 64
    vpermw m0, m1, m3
    vpmultishiftqb m1, m4, m2
    vpmultishiftqb m2, m5, m2
    vpermw m0, m1, m6
    psrlw m1, 8
    vpermw m1, m1, m3
    mova [dstq+64*0], m0
    mova [dstq+64*1], m1
    add dstq, strideq
    dec hd
    vpermw m1, m1, m6
    mova [dstq+ 0], m0
    mova [dstq+64], m1
    vpermw m0, m2, m6
    psrlw m2, 8
    vpermw m1, m2, m6
    mova [dstq+strideq+ 0], m0
    mova [dstq+strideq+64], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w64
    RET

129
third_party/dav1d/src/x86/ipred16_sse.asm
vendored

@ -3964,25 +3964,27 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    jg .w32_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc

cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
%define hd r2d
%endif
    mova m3, [palq]
    mova m4, [palq]
    LEA r2, pal_pred_16bpc_ssse3_table
    tzcnt wd, wm
    pshufb m3, [base+pal_pred_shuf]
    pshufb m4, [base+pal_pred_shuf]
    movsxd wq, [r2+wq*4]
    pshufd m4, m3, q1032
    pshufd m5, m4, q1032
    add wq, r2
    movifnidn hd, hm
    jmp wq
.w4:
    mova m0, [idxq]
    add idxq, 16
    pshufb m1, m3, m0
    pshufb m2, m4, m0
    movq m0, [idxq]
    add idxq, 8
    psrlw m1, m0, 4
    punpcklbw m0, m1
    pshufb m1, m4, m0
    pshufb m2, m5, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    movq [dstq+strideq*0], m0

@ -3995,77 +3997,102 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
    jg .w4
    RET
.w8:
    mova m0, [idxq]
    movu m3, [idxq]
    add idxq, 16
    pshufb m1, m3, m0
    pshufb m2, m4, m0
    psrlw m1, m3, 4
    punpcklbw m0, m3, m1
    punpckhbw m3, m1
    pshufb m1, m4, m0
    pshufb m2, m5, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    pshufb m1, m4, m3
    pshufb m2, m5, m3
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 4
    jg .w8
    RET
.w16:
    mova m0, [idxq]
    movu m3, [idxq]
    add idxq, 16
    pshufb m1, m3, m0
    pshufb m2, m4, m0
    psrlw m1, m3, 4
    punpcklbw m0, m3, m1
    punpckhbw m3, m1
    pshufb m1, m4, m0
    pshufb m2, m5, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    add dstq, strideq
    dec hd
    mova [dstq+ 0], m0
    mova [dstq+16], m1
    pshufb m1, m4, m3
    pshufb m2, m5, m3
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq+ 0], m0
    mova [dstq+strideq+16], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w16
    RET
.w32:
    mova m0, [idxq+16*0]
    pshufb m1, m3, m0
    pshufb m2, m4, m0
    movu m3, [idxq]
    add idxq, 16
    psrlw m1, m3, 4
    punpcklbw m0, m3, m1
    punpckhbw m3, m1
    pshufb m1, m4, m0
    pshufb m2, m5, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova m2, [idxq+16*1]
    add idxq, 16*2
    mova [dstq+16*0], m0
    pshufb m0, m3, m2
    mova [dstq+16*1], m1
    pshufb m1, m4, m2
    punpcklbw m2, m0, m1
    punpckhbw m0, m1
    mova [dstq+16*2], m2
    mova [dstq+16*3], m0
    pshufb m1, m4, m3
    pshufb m2, m5, m3
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    add dstq, strideq
    dec hd
    jg .w32
    RET
.w64:
    mova m0, [idxq+16*0]
    pshufb m1, m3, m0
    pshufb m2, m4, m0
    movu m3, [idxq+16*0]
    psrlw m1, m3, 4
    punpcklbw m0, m3, m1
    punpckhbw m3, m1
    pshufb m1, m4, m0
    pshufb m2, m5, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova m2, [idxq+16*1]
    mova [dstq+16*0], m0
    pshufb m0, m3, m2
    mova [dstq+16*1], m1
    pshufb m1, m4, m2
    punpcklbw m2, m0, m1
    punpckhbw m0, m1
    mova m1, [idxq+16*2]
    mova [dstq+16*2], m2
    pshufb m2, m3, m1
    mova [dstq+16*3], m0
    pshufb m0, m4, m1
    punpcklbw m1, m2, m0
    punpckhbw m2, m0
    mova m0, [idxq+16*3]
    add idxq, 16*4
    mova [dstq+16*4], m1
    pshufb m1, m3, m0
    mova [dstq+16*5], m2
    pshufb m2, m4, m0
    pshufb m1, m4, m3
    pshufb m2, m5, m3
    movu m3, [idxq+16*1]
    add idxq, 32
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    psrlw m1, m3, 4
    punpcklbw m0, m3, m1
    punpckhbw m3, m1
    pshufb m1, m4, m0
    pshufb m2, m5, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+16*4], m0
    mova [dstq+16*5], m1
    pshufb m1, m4, m3
    pshufb m2, m5, m3
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+16*6], m0
80
third_party/dav1d/src/x86/ipred_avx2.asm
vendored

@ -5307,18 +5307,20 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_
    RET

cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    vbroadcasti128 m4, [palq]
    vpbroadcastq m4, [palq]
    lea r2, [pal_pred_avx2_table]
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r2+wq*4]
    packuswb m4, m4
    add wq, r2
    lea r2, [strideq*3]
    jmp wq
.w4:
    pshufb xm0, xm4, [idxq]
    add idxq, 16
    movq xm0, [idxq]
    add idxq, 8
    psrlw xm1, xm0, 4
    punpcklbw xm0, xm1
    pshufb xm0, xm4, xm0
    movd [dstq+strideq*0], xm0
    pextrd [dstq+strideq*1], xm0, 1
    pextrd [dstq+strideq*2], xm0, 2

@ -5327,11 +5329,14 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    sub hd, 4
    jg .w4
    RET
ALIGN function_align
.w8:
    pshufb xm0, xm4, [idxq+16*0]
    pshufb xm1, xm4, [idxq+16*1]
    add idxq, 16*2
    movu xm2, [idxq]
    add idxq, 16
    pshufb xm1, xm4, xm2
    psrlw xm2, 4
    pshufb xm2, xm4, xm2
    punpcklbw xm0, xm1, xm2
    punpckhbw xm1, xm2
    movq [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq [dstq+strideq*2], xm1

@ -5340,47 +5345,48 @@ ALIGN function_align
    sub hd, 4
    jg .w8
    RET
ALIGN function_align
.w16:
    pshufb m0, m4, [idxq+32*0]
    pshufb m1, m4, [idxq+32*1]
    add idxq, 32*2
    movu m2, [idxq]
    add idxq, 32
    pshufb m1, m4, m2
    psrlw m2, 4
    pshufb m2, m4, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova [dstq+strideq*2], xm1
    mova [dstq+strideq*1], xm1
    vextracti128 [dstq+strideq*2], m0, 1
    vextracti128 [dstq+r2 ], m1, 1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w16
    RET
ALIGN function_align
.w32:
    pshufb m0, m4, [idxq+32*0]
    pshufb m1, m4, [idxq+32*1]
    pshufb m2, m4, [idxq+32*2]
    pshufb m3, m4, [idxq+32*3]
    add idxq, 32*4
    vpermq m2, [idxq], q3120
    add idxq, 32
    pshufb m1, m4, m2
    psrlw m2, 4
    pshufb m2, m4, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r2 ], m3
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w32
    RET
ALIGN function_align
.w64:
    pshufb m0, m4, [idxq+32*0]
    pshufb m1, m4, [idxq+32*1]
    pshufb m2, m4, [idxq+32*2]
    pshufb m3, m4, [idxq+32*3]
    add idxq, 32*4
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w32
    RET
.w64:
    vpermq m2, [idxq], q3120
    add idxq, 32
    pshufb m1, m4, m2
    psrlw m2, 4
    pshufb m2, m4, m2
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+32*0], m0
    mova [dstq+32*1], m1
    add dstq, strideq
    dec hd
    jg .w64
    RET

76
third_party/dav1d/src/x86/ipred_avx512.asm
vendored

@ -95,6 +95,8 @@ smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
             db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
              db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15

pb_127_m127: times 2 db 127, -127
pb_128: times 4 db 128

@ -126,7 +128,6 @@ JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
          s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64

SECTION .text

@ -1111,19 +1112,20 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
    jg .w64_loop
    RET

cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
    lea r6, [pal_pred_8bpc_avx512icl_table]
    tzcnt wd, wm
    vbroadcasti32x4 m4, [palq]
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
    movifnidn wd, wm
    movifnidn hd, hm
    movsxd wq, [r6+wq*4]
    packuswb m4, m4
    add wq, r6
    lea stride3q, [strideq*3]
    jmp wq
    cmp wd, 8
    jg .w32
    movq xmm3, [palq]
    je .w8
.w4:
    pshufb xmm0, xm4, [idxq]
    add idxq, 16
    movq xmm0, [idxq]
    add idxq, 8
    psrlw xmm1, xmm0, 4
    punpcklbw xmm0, xmm1
    pshufb xmm0, xmm3, xmm0
    movd [dstq+strideq*0], xmm0
    pextrd [dstq+strideq*1], xmm0, 1
    pextrd [dstq+strideq*2], xmm0, 2

@ -1133,9 +1135,13 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
    jg .w4
    RET
.w8:
    pshufb xmm0, xm4, [idxq+16*0]
    pshufb xmm1, xm4, [idxq+16*1]
    add idxq, 16*2
    movu xmm2, [idxq]
    add idxq, 16
    pshufb xmm1, xmm3, xmm2
    psrlw xmm2, 4
    pshufb xmm2, xmm3, xmm2
    punpcklbw xmm0, xmm1, xmm2
    punpckhbw xmm1, xmm2
    movq [dstq+strideq*0], xmm0
    movhps [dstq+strideq*1], xmm0
    movq [dstq+strideq*2], xmm1

@ -1145,8 +1151,10 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
    jg .w8
    RET
.w16:
    pshufb m0, m4, [idxq]
    add idxq, 64
    pmovzxdq m0, [idxq]
    add idxq, 32
    vpmultishiftqb m0, m3, m0
    pshufb m0, m5, m0
    mova [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2

@ -1156,29 +1164,39 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w16
|
||||
RET
|
||||
.w32:
|
||||
pshufb m0, m4, [idxq+64*0]
|
||||
pshufb m1, m4, [idxq+64*1]
|
||||
add idxq, 64*2
|
||||
vpbroadcastq m3, [pal_unpack+0]
|
||||
vpbroadcastq m5, [palq]
|
||||
cmp wd, 32
|
||||
jl .w16
|
||||
pmovzxbd m2, [pal_perm]
|
||||
vpbroadcastq m4, [pal_unpack+8]
|
||||
jg .w64
|
||||
.w32_loop:
|
||||
vpermd m1, m2, [idxq]
|
||||
add idxq, 64
|
||||
vpmultishiftqb m0, m3, m1
|
||||
vpmultishiftqb m1, m4, m1
|
||||
pshufb m0, m5, m0
|
||||
pshufb m1, m5, m1
|
||||
mova [dstq+strideq*0], ym0
|
||||
vextracti32x8 [dstq+strideq*1], m0, 1
|
||||
mova [dstq+strideq*2], ym1
|
||||
vextracti32x8 [dstq+stride3q ], m1, 1
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
jg .w32
|
||||
jg .w32_loop
|
||||
RET
|
||||
.w64:
|
||||
pshufb m0, m4, [idxq+64*0]
|
||||
pshufb m1, m4, [idxq+64*1]
|
||||
pshufb m2, m4, [idxq+64*2]
|
||||
pshufb m3, m4, [idxq+64*3]
|
||||
add idxq, 64*4
|
||||
vpermd m1, m2, [idxq]
|
||||
add idxq, 64
|
||||
vpmultishiftqb m0, m3, m1
|
||||
vpmultishiftqb m1, m4, m1
|
||||
pshufb m0, m5, m0
|
||||
pshufb m1, m5, m1
|
||||
mova [dstq+strideq*0], m0
|
||||
mova [dstq+strideq*1], m1
|
||||
mova [dstq+strideq*2], m2
|
||||
mova [dstq+stride3q ], m3
|
||||
lea dstq, [dstq+strideq*4]
|
||||
sub hd, 4
|
||||
lea dstq, [dstq+strideq*2]
|
||||
sub hd, 2
|
||||
jg .w64
|
||||
RET
|
||||
|
||||
|
|
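The rewritten AVX-512 pal_pred above consumes palette indices packed two per
byte instead of one per byte: vpmultishiftqb extracts a byte starting at every
4-bit offset of its control vector (pal_unpack holds the offsets 0, 4, 8, ...),
and the following pshufb translates the low nibble through the palette held in
m5. A rough scalar model of that unpack step (my reading of the asm above, not
code taken from dav1d itself):

#include <stdint.h>

/* Model of one vpmultishiftqb qword lane with the pal_unpack control bytes:
 * out[i] gets the byte starting at bit offset 4*i, i.e. the i-th 4-bit index
 * in its low nibble plus the neighboring index in bits 4..6. AV1 palettes
 * hold at most 8 colors, so bit 7 stays clear and the subsequent pshufb
 * still maps each byte to the correct palette entry. */
static void unpack_nibbles(uint8_t out[8], const uint64_t packed)
{
    for (int i = 0; i < 8; i++)
        out[i] = (uint8_t)(packed >> (4 * i));
}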
third_party/dav1d/src/x86/ipred_sse.asm (vendored, 112 lines changed)
@@ -3479,26 +3479,28 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
    jg .end_transpose_loop
    RET

;---------------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
;                         const uint8_t *idx, const int w, const int h);
;---------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
;                         const uint8_t *idx, int w, int h);
;-------------------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    mova m4, [palq]
    movq m4, [palq]
    LEA r2, pal_pred_ssse3_table
    tzcnt wd, wm
    movifnidn hd, hm
    movsxd wq, [r2+wq*4]
    packuswb m4, m4
    add wq, r2
    lea r2, [strideq*3]
    jmp wq
.w4:
    pshufb m0, m4, [idxq]
    add idxq, 16
    movd [dstq ], m0
    movq m1, [idxq]
    add idxq, 8
    psrlw m0, m1, 4
    punpcklbw m1, m0
    pshufb m0, m4, m1
    movd [dstq+strideq*0], m0
    pshuflw m1, m0, q1032
    movd [dstq+strideq ], m1
    movd [dstq+strideq*1], m1
    punpckhqdq m0, m0
    movd [dstq+strideq*2], m0
    psrlq m0, 32
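For reference, the semantics of pal_pred after this change, with idx now
packing two 4-bit palette indices per byte (low nibble first): a minimal C
sketch based on the asm and the prototype comment above. dav1d's actual C
fallback lives in src/ipred_tmpl.c; this is an illustration, not that code.

#include <stddef.h>
#include <stdint.h>

typedef uint8_t pixel; /* 8bpc */

static void pal_pred_sketch(pixel *dst, const ptrdiff_t stride,
                            const pixel *const pal, const uint8_t *idx,
                            const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x += 2) {
            const int i = *idx++;      /* two pixels per index byte */
            dst[x + 0] = pal[i & 15];  /* low nibble */
            dst[x + 1] = pal[i >> 4];  /* high nibble */
        }
        dst += stride;
    }
}

Halving the index buffer is what lets the .w4 path below fetch a whole 4x4
block's indices with a single movq, where the old code needed a 16-byte load.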
@@ -3507,60 +3509,68 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
    sub hd, 4
    jg .w4
    RET
ALIGN function_align
.w8:
    pshufb m0, m4, [idxq]
    pshufb m1, m4, [idxq+16]
    add idxq, 32
    movq [dstq ], m0
    movhps [dstq+strideq ], m0
    movu m0, [idxq]
    add idxq, 16
    pshufb m1, m4, m0
    psrlw m0, 4
    pshufb m2, m4, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    movq [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    movq [dstq+strideq*2], m1
    movhps [dstq+r2 ], m1
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w8
    RET
ALIGN function_align
.w16:
    pshufb m0, m4, [idxq]
    pshufb m1, m4, [idxq+16]
    pshufb m2, m4, [idxq+32]
    pshufb m3, m4, [idxq+48]
    add idxq, 64
    mova [dstq ], m0
    mova [dstq+strideq ], m1
    mova [dstq+strideq*2], m2
    mova [dstq+r2 ], m3
    lea dstq, [dstq+strideq*4]
    sub hd, 4
    jg .w16
    RET
ALIGN function_align
.w32:
    pshufb m0, m4, [idxq]
    pshufb m1, m4, [idxq+16]
    pshufb m2, m4, [idxq+32]
    pshufb m3, m4, [idxq+48]
    add idxq, 64
    mova [dstq ], m0
    mova [dstq+16 ], m1
    mova [dstq+strideq ], m2
    mova [dstq+strideq+16], m3
    movu m0, [idxq]
    add idxq, 16
    pshufb m1, m4, m0
    psrlw m0, 4
    pshufb m2, m4, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+strideq*0], m0
    mova [dstq+strideq*1], m1
    lea dstq, [dstq+strideq*2]
    sub hd, 2
    jg .w16
    RET
.w32:
    movu m0, [idxq]
    add idxq, 16
    pshufb m1, m4, m0
    psrlw m0, 4
    pshufb m2, m4, m0
    punpcklbw m0, m1, m2
    punpckhbw m1, m2
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    add dstq, strideq
    dec hd
    jg .w32
    RET
ALIGN function_align
.w64:
    pshufb m0, m4, [idxq]
    pshufb m1, m4, [idxq+16]
    pshufb m2, m4, [idxq+32]
    pshufb m3, m4, [idxq+48]
    add idxq, 64
    mova [dstq ], m0
    mova [dstq+16], m1
    mova [dstq+32], m2
    mova [dstq+48], m3
    movu m0, [idxq+16*0]
    movu m2, [idxq+16*1]
    add idxq, 32
    pshufb m1, m4, m0
    psrlw m0, 4
    pshufb m3, m4, m0
    punpcklbw m0, m1, m3
    punpckhbw m1, m3
    mova [dstq+16*0], m0
    mova [dstq+16*1], m1
    pshufb m1, m4, m2
    psrlw m2, 4
    pshufb m3, m4, m2
    punpcklbw m0, m1, m3
    punpckhbw m1, m3
    mova [dstq+16*2], m0
    mova [dstq+16*3], m1
    add dstq, strideq
    sub hd, 1
    jg .w64
third_party/dav1d/src/x86/pal.asm (vendored, new file, 641 lines)
@@ -0,0 +1,641 @@
; Copyright © 2023, VideoLAN and dav1d authors
; Copyright © 2023, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%if ARCH_X86_64
          db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
          db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
          db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
%endif
pal_idx_w8_padh: db 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11

pb_1_16: times 4 db 1, 16
%if ARCH_X86_64
pb_32:   times 4 db 32
%endif

%macro JMP_TABLE 2-*
    %xdefine %1_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1)
    %%table:
    %rep %0 - 1
        dd %%base %+ .w%2 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro

JMP_TABLE pal_idx_finish_ssse3, 4, 8, 16, 32, 64
%if ARCH_X86_64
JMP_TABLE pal_idx_finish_avx2, 4, 8, 16, 32, 64
JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64
%endif

SECTION .text

INIT_XMM ssse3
cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_ssse3_table
    LEA r6, pal_idx_finish_ssse3_table
    tzcnt bwd, bwm
    movifnidn bhd, bhm
    movifnidn wd, wm
    movifnidn hd, hm
    movsxd bwq, [r6+bwq*4]
    movddup m3, [base+pb_1_16]
    add bwq, r6
    sub bhd, hd
    jmp bwq
.w4:
    mova m0, [srcq]
    add srcq, 16
    pmaddubsw m0, m3
    packuswb m0, m0
    movq [dstq], m0
    add dstq, 8
    sub hd, 4
    jg .w4
    test bhd, bhd
    jz .w4_end
    pshuflw m0, m0, q3333
.w4_padv:
    movq [dstq], m0
    add dstq, 8
    sub bhd, 4
    jg .w4_padv
.w4_end:
    RET
.w8_padh:
    pshufb m0, m2
    pshufb m1, m2
    jmp .w8_main
.w8:
    mova m2, [base+pal_idx_w8_padh]
.w8_loop:
    mova m0, [srcq+16*0]
    mova m1, [srcq+16*1]
    cmp wd, 8
    jl .w8_padh
.w8_main:
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    add srcq, 16*2
    packuswb m0, m1
    movu [dstq], m0
    add dstq, 16
    sub hd, 4
    jg .w8_loop
    test bhd, bhd
    jz .w8_end
    pshufd m0, m0, q3333
.w8_padv:
    movu [dstq], m0
    add dstq, 16
    sub bhd, 4
    jg .w8_padv
.w8_end:
    RET
.w16_padh:
    pshufb m0, m4
    pshufb m1, m4
    jmp .w16_main
.w16:
    cmp wd, 16
    je .w16_loop
    call .setup_padh
.w16_loop:
    mova m0, [srcq+16*0]
    mova m1, [srcq+16*1]
    cmp wd, 16
    jl .w16_padh
.w16_main:
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    add srcq, 16*2
    packuswb m0, m1
    movu [dstq], m0
    add dstq, 16
    sub hd, 2
    jg .w16_loop
    test bhd, bhd
    jz .w16_end
    punpckhqdq m0, m0
.w16_padv:
    movu [dstq+16*0], m0
    movu [dstq+16*1], m0
    add dstq, 16*2
    sub bhd, 4
    jg .w16_padv
.w16_end:
    RET
.w32_padh:
    cmp wd, 16
    jg .w32_padh2
    pshufb m1, m0, m5
    pshufb m0, m4
    jmp .w32_main
.w32_padh2:
    pshufb m1, m4
    jmp .w32_main
.w32:
    cmp wd, 32
    je .w32_loop
    call .setup_padh
.w32_loop:
    mova m0, [srcq+16*0]
    mova m1, [srcq+16*1]
    cmp wd, 32
    jl .w32_padh
.w32_main:
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    add srcq, 16*2
    packuswb m0, m1
    movu [dstq], m0
    add dstq, 16
    dec hd
    jg .w32_loop
    test bhd, bhd
    jz .w32_end
.w32_padv:
    movu [dstq+16*0], m0
    movu [dstq+16*1], m0
    movu [dstq+16*2], m0
    movu [dstq+16*3], m0
    add dstq, 16*4
    sub bhd, 4
    jg .w32_padv
.w32_end:
    RET
.w64_padh:
    cmp wd, 16
    jg .w64_padh2
    pshufb m1, m0, m5
    pshufb m0, m4
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    packuswb m0, m1
    packuswb m1, m1
    jmp .w64_main
.w64_padh2:
    pshufb m1, m4
    pmaddubsw m0, m3
    pmaddubsw m2, m1, m3
    pshufb m1, m5
    pmaddubsw m1, m3
    packuswb m0, m2
    packuswb m1, m1
    jmp .w64_main
.w64_padh3:
    cmp wd, 48
    jg .w64_padh4
    pshufb m2, m1, m5
    pshufb m1, m4
    jmp .w64_main2
.w64_padh4:
    pshufb m2, m4
    jmp .w64_main2
.w64:
    cmp wd, 64
    je .w64_loop
    call .setup_padh
.w64_loop:
    mova m0, [srcq+16*0]
    mova m1, [srcq+16*1]
    cmp wd, 32
    jle .w64_padh
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    packuswb m0, m1
    mova m1, [srcq+16*2]
    mova m2, [srcq+16*3]
    cmp wd, 64
    jl .w64_padh3
.w64_main2:
    pmaddubsw m1, m3
    pmaddubsw m2, m3
    packuswb m1, m2
.w64_main:
    add srcq, 16*4
    movu [dstq+16*0], m0
    movu [dstq+16*1], m1
    add dstq, 16*2
    dec hd
    jg .w64_loop
    test bhd, bhd
    jz .w64_end
.w64_padv:
    movu [dstq+16*0], m0
    movu [dstq+16*1], m1
    movu [dstq+16*2], m0
    movu [dstq+16*3], m1
    add dstq, 16*4
    sub bhd, 2
    jg .w64_padv
.w64_end:
    RET
.setup_padh:
    mova m4, [base+pb_0to63]
    lea r6d, [wq-1]
    and r6d, 15
    movd m5, r6d
    pxor m0, m0
    pshufb m5, m0
    pminub m4, m5
    ret
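pal_idx_finish packs a block's one-byte-per-pixel palette indices down to two
4-bit indices per byte while padding the used w x h area out to the full
bw x bh block. The pb_1_16 constant drives the core trick: pmaddubsw with the
weights (1, 16) computes src[2x] + 16*src[2x+1], which for values below 16 is
exactly lo | hi << 4. A hedged C reference of the whole routine, modeled on
the SSSE3 asm above (the canonical C version is in src/pal.c, which is not
quoted here; names and edge handling are my reconstruction):

#include <stdint.h>
#include <string.h>

static void pal_idx_finish_sketch(uint8_t *dst, const uint8_t *src,
                                  const int bw, const int bh, /* padded size */
                                  const int w, const int h)   /* used size */
{
    const int dst_w  = w  / 2; /* packed bytes per used row */
    const int dst_bw = bw / 2; /* packed row stride */

    for (int y = 0; y < h; y++, src += bw, dst += dst_bw) {
        for (int x = 0; x < dst_w; x++)
            dst[x] = src[2 * x] | (src[2 * x + 1] << 4); /* pmaddubsw (1,16) */
        if (dst_w < dst_bw) /* replicate the last index into both nibbles */
            memset(dst + dst_w, src[w - 1] * 0x11, dst_bw - dst_w);
    }
    for (int y = h; y < bh; y++, dst += dst_bw) /* repeat last row downward */
        memcpy(dst, dst - dst_bw, dst_bw);
}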
%if ARCH_X86_64

INIT_YMM avx2
cglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_avx2_table
    lea r6, [pal_idx_finish_avx2_table]
    tzcnt bwd, bwd
    movifnidn wd, wm
    movifnidn hd, hm
    movsxd bwq, [r6+bwq*4]
    vpbroadcastd m2, [base+pb_1_16]
    dec wd
    add bwq, r6
    sub bhd, hd
    jmp bwq
.w4:
    mova xm0, [srcq]
    add srcq, 16
    pmaddubsw xm0, xm2
    packuswb xm0, xm0
    movq [dstq], xm0
    add dstq, 8
    sub hd, 4
    jg .w4
    test bhd, bhd
    jz .w4_end
    pshuflw xm0, xm0, q3333
.w4_padv:
    movq [dstq], xm0
    add dstq, 8
    sub bhd, 4
    jg .w4_padv
.w4_end:
    RET
.w8_padh:
    pshufb xm0, xm3
    pshufb xm1, xm3
    jmp .w8_main
.w8:
    mova xm3, [base+pal_idx_w8_padh]
.w8_loop:
    mova xm0, [srcq+16*0]
    mova xm1, [srcq+16*1]
    cmp wd, 7
    jl .w8_padh
.w8_main:
    pmaddubsw xm0, xm2
    pmaddubsw xm1, xm2
    add srcq, 16*2
    packuswb xm0, xm1
    movu [dstq], xm0
    add dstq, 16
    sub hd, 4
    jg .w8_loop
    test bhd, bhd
    jz .w8_end
    pshufd xm0, xm0, q3333
.w8_padv:
    movu [dstq], xm0
    add dstq, 16
    sub bhd, 4
    jg .w8_padv
.w8_end:
    RET
.w16_padh:
    pshufb m0, m3
    pshufb m1, m3
    jmp .w16_main
.w16:
    cmp wd, 15
    je .w16_loop
    vbroadcasti128 m0, [base+pb_0to63]
    movd xm3, wd
    vpbroadcastb m3, xm3
    pminub m3, m0
.w16_loop:
    mova m0, [srcq+32*0]
    mova m1, [srcq+32*1]
    cmp wd, 15
    jl .w16_padh
.w16_main:
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    add srcq, 32*2
    packuswb m0, m1
    vpermq m1, m0, q3120
    movu [dstq], m1
    add dstq, 32
    sub hd, 4
    jg .w16_loop
    test bhd, bhd
    jz .w16_end
    vpermq m0, m0, q3333
.w16_padv:
    movu [dstq], m0
    add dstq, 32
    sub bhd, 4
    jg .w16_padv
.w16_end:
    RET
.w32_padh:
    cmp wd, 15
    jg .w32_padh2
    vinserti128 m0, xm0, 1
    vinserti128 m1, xm1, 1
.w32_padh2:
    pshufb m0, m3
    pshufb m1, m3
    jmp .w32_main
.w32:
    cmp wd, 31
    je .w32_loop
    movd xm3, wd
    vpbroadcastb m3, xm3
    pminub m3, [base+pb_0to63]
.w32_loop:
    mova m0, [srcq+32*0]
    mova m1, [srcq+32*1]
    cmp wd, 31
    jl .w32_padh
.w32_main:
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    add srcq, 32*2
    packuswb m0, m1
    vpermq m1, m0, q3120
    movu [dstq], m1
    add dstq, 32
    sub hd, 2
    jg .w32_loop
    test bhd, bhd
    jz .w32_end
    vpermq m0, m0, q3131
.w32_padv:
    movu [dstq+32*0], m0
    movu [dstq+32*1], m0
    add dstq, 32*2
    sub bhd, 4
    jg .w32_padv
.w32_end:
    RET
.w64_padh:
    cmp wd, 15
    jg .w64_padh2
    vinserti128 m1, m0, xm0, 1
    pshufb m0, m1, m3
    pshufb m1, m4
    jmp .w64_main
.w64_padh2:
    cmp wd, 31
    jg .w64_padh3
    vperm2i128 m1, m0, m0, 0x11
    pshufb m0, m3
    pshufb m1, m4
    jmp .w64_main
.w64_padh3:
    cmp wd, 47
    jg .w64_padh4
    vinserti128 m1, xm1, 1
.w64_padh4:
    pshufb m1, m3
    jmp .w64_main
.w64:
    cmp wd, 63
    je .w64_loop
    mov r6d, wd
    and r6d, 31
    movd xm4, r6d
    vpbroadcastb m4, xm4
    pminub m3, m4, [pb_0to63]
.w64_loop:
    mova m0, [srcq+32*0]
    mova m1, [srcq+32*1]
    cmp wd, 63
    jl .w64_padh
.w64_main:
    pmaddubsw m0, m2
    pmaddubsw m1, m2
    add srcq, 32*2
    packuswb m0, m1
    vpermq m0, m0, q3120
    movu [dstq], m0
    add dstq, 32
    dec hd
    jg .w64_loop
    test bhd, bhd
    jz .w64_end
.w64_padv:
    movu [dstq+32*0], m0
    movu [dstq+32*1], m0
    movu [dstq+32*2], m0
    movu [dstq+32*3], m0
    add dstq, 32*4
    sub bhd, 4
    jg .w64_padv
.w64_end:
    RET
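All of the padh paths above lean on the same clamp-shuffle idea that
.setup_padh builds in the SSSE3 version: take the identity byte sequence
pb_0to63, clamp it with pminub against the last valid column, and the
resulting pshufb/vpermb mask replicates the rightmost index across the
horizontal padding. In scalar terms, an illustration of the mask construction
for a single 16-byte lane (my reading of the asm, not dav1d code):

#include <stdint.h>

static void setup_padh_sketch(uint8_t mask[16], const int w)
{
    const uint8_t last = (w - 1) & 15; /* last valid byte within the lane */
    for (int i = 0; i < 16; i++)
        mask[i] = i < last ? (uint8_t)i : last; /* pminub(pb_0to63, last) */
}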
INIT_ZMM avx512icl
cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_avx512icl_table
    lea r6, [pal_idx_finish_avx512icl_table]
    tzcnt bwd, bwd
    movifnidn wd, wm
    movifnidn hd, hm
    movsxd bwq, [r6+bwq*4]
    vpbroadcastd m4, [base+pb_1_16]
    dec wd
    add bwq, r6
    sub bhd, hd
    jmp bwq
.w4:
    mova xmm0, [srcq]
    add srcq, 16
    pmaddubsw xmm0, xm4
    packuswb xmm0, xmm0
    movq [dstq], xmm0
    add dstq, 8
    sub hd, 4
    jg .w4
    test bhd, bhd
    jz .w4_end
    pshuflw xmm0, xmm0, q3333
.w4_padv:
    movq [dstq], xmm0
    add dstq, 8
    sub bhd, 4
    jg .w4_padv
.w4_end:
    RET
.w8_padh:
    pshufb xmm0, xmm2
    pshufb xmm1, xmm2
    jmp .w8_main
.w8:
    mova xmm2, [base+pal_idx_w8_padh]
.w8_loop:
    mova xmm0, [srcq+16*0]
    mova xmm1, [srcq+16*1]
    cmp wd, 7
    jl .w8_padh
.w8_main:
    pmaddubsw xmm0, xm4
    pmaddubsw xmm1, xm4
    add srcq, 16*2
    packuswb xmm0, xmm1
    movu [dstq], xmm0
    add dstq, 16
    sub hd, 4
    jg .w8_loop
    test bhd, bhd
    jz .w8_end
    pshufd xmm0, xmm0, q3333
.w8_padv:
    movu [dstq], xmm0
    add dstq, 16
    sub bhd, 4
    jg .w8_padv
.w8_end:
    RET
.w16_padh:
    pshufb m0, m2
    jmp .w16_main
.w16:
    cmp wd, 15
    je .w16_loop
    vbroadcasti32x4 m2, [base+pb_0to63]
    vpbroadcastb m0, wd
    pminub m2, m0
.w16_loop:
    mova m0, [srcq]
    cmp wd, 15
    jl .w16_padh
.w16_main:
    pmaddubsw m0, m4
    add srcq, 64
    vpmovwb ym0, m0
    movu [dstq], ym0
    add dstq, 32
    sub hd, 4
    jg .w16_loop
    test bhd, bhd
    jz .w16_end
    vpermq ym0, ym0, q3333
.w16_padv:
    movu [dstq], ym0
    add dstq, 32
    sub bhd, 4
    jg .w16_padv
.w16_end:
    RET
.w32_padh:
    vpermb m0, m2, m0
    vpermb m1, m2, m1
    jmp .w32_main
.w32:
    mova m2, [base+pb_0to63]
    paddb m3, m2, m2
    cmp wd, 31
    je .w32_loop
    vpbroadcastb m0, wd
    mov r6d, 0xff00
    kmovw k1, r6d
    vpaddd m0{k1}, [pb_32] {1to16}
    pminub m2, m0
.w32_loop:
    mova m0, [srcq+64*0]
    mova m1, [srcq+64*1]
    cmp wd, 31
    jl .w32_padh
.w32_main:
    pmaddubsw m0, m4
    pmaddubsw m1, m4
    add srcq, 64*2
    vpermt2b m0, m3, m1
    movu [dstq], m0
    add dstq, 64
    sub hd, 4
    jg .w32_loop
    test bhd, bhd
    jz .w32_end
    vshufi32x4 m0, m0, q3333
.w32_padv:
    movu [dstq], m0
    add dstq, 64
    sub bhd, 4
    jg .w32_padv
.w32_end:
    RET
.w64_padh:
    REPX {vpermb x, m5, x}, m0, m1, m2, m3
    jmp .w64_main
.w64:
    mova m5, [base+pb_0to63]
    paddb m6, m5, m5
    cmp wd, 63
    je .w64_loop
    vpbroadcastb m0, wd
    pminub m5, m0
.w64_loop:
    mova m0, [srcq+64*0]
    mova m1, [srcq+64*1]
    mova m2, [srcq+64*2]
    mova m3, [srcq+64*3]
    cmp wd, 63
    jl .w64_padh
.w64_main:
    REPX {pmaddubsw x, m4}, m0, m1, m2, m3
    add srcq, 64*4
    vpermt2b m0, m6, m1
    vpermt2b m2, m6, m3
    movu [dstq+64*0], m0
    movu [dstq+64*1], m2
    add dstq, 64*2
    sub hd, 4
    jg .w64_loop
    test bhd, bhd
    jz .w64_end
    vshufi32x4 m2, m2, q3232
.w64_padv:
    movu [dstq+64*0], m2
    movu [dstq+64*1], m2
    add dstq, 64*2
    sub bhd, 4
    jg .w64_padv
.w64_end:
    RET

%endif ; ARCH_X86_64
third_party/dav1d/src/x86/pal.h (vendored, new file, 50 lines)
@@ -0,0 +1,50 @@
/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/cpu.h"

decl_pal_idx_finish_fn(dav1d_pal_idx_finish_ssse3);
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx2);
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx512icl);

static ALWAYS_INLINE void pal_dsp_init_x86(Dav1dPalDSPContext *const c) {
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;

    c->pal_idx_finish = dav1d_pal_idx_finish_ssse3;

#if ARCH_X86_64
    if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;

    c->pal_idx_finish = dav1d_pal_idx_finish_avx2;

    if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;

    c->pal_idx_finish = dav1d_pal_idx_finish_avx512icl;
#endif
}
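This header follows the usual dav1d DSP-init pattern: a generic initializer
installs the portable C fallback and then hands the context to the per-arch
routine above, which upgrades the pointer to the best available SIMD version.
A sketch of how src/pal.c presumably wires it up (an assumption based on the
pattern used by e.g. the refmvs module, not copied from the actual file):

#include "src/pal.h"

COLD void dav1d_pal_dsp_init(Dav1dPalDSPContext *const c) {
    c->pal_idx_finish = pal_idx_finish_c; /* portable C fallback */
#if HAVE_ASM && ARCH_X86
    pal_dsp_init_x86(c); /* upgrade to SSSE3/AVX2/AVX-512 when available */
#endif
}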
third_party/dav1d/tests/meson.build (vendored, 1 line changed)
@@ -35,6 +35,7 @@ if is_asm_enabled
    checkasm_sources = files(
        'checkasm/checkasm.c',
        'checkasm/msac.c',
        'checkasm/pal.c',
        'checkasm/refmvs.c',
    )
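The new checkasm/pal.c entry hooks the packing routine into dav1d's checkasm
harness, which runs the C reference and each asm version on identical inputs
and compares the outputs. A sketch of the kind of test it plausibly contains
(checkasm macro usage mirrors the existing msac/refmvs tests; the buffer
sizes, names, and exact checks here are illustrative assumptions):

#include "tests/checkasm/checkasm.h"
#include "src/pal.h"

void checkasm_check_pal(void) {
    Dav1dPalDSPContext c;
    dav1d_pal_dsp_init(&c);
    if (check_func(c.pal_idx_finish, "pal_idx_finish")) {
        ALIGN_STK_64(uint8_t, src,  64 * 64,); /* 1 byte per index */
        ALIGN_STK_64(uint8_t, dst0, 32 * 64,); /* 2 indices per byte */
        ALIGN_STK_64(uint8_t, dst1, 32 * 64,);
        declare_func(void, uint8_t *dst, const uint8_t *src,
                     int bw, int bh, int w, int h);
        for (int i = 0; i < 64 * 64; i++)
            src[i] = rnd() & 7; /* at most 8 palette colors */
        call_ref(dst0, src, 64, 64, 64, 64);
        call_new(dst1, src, 64, 64, 64, 64);
        checkasm_check(uint8_t, dst0, 32, dst1, 32, 32, 64, "dst");
        bench_new(dst1, src, 64, 64, 64, 64);
    }
    report("pal");
}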