Bug 1846318 update dav1d to e58afe4dd9057591882a01c31382c203e8a61c92 r=chunmin

Depends on D187495

Differential Revision: https://phabricator.services.mozilla.com/D187496
Karl Tomlinson 2023-09-25 19:34:48 +00:00
parent bfae62bdc0
commit 0d02f04be0
40 changed files with 1831 additions and 734 deletions

View file

@ -133,6 +133,7 @@ if CONFIG['CPU_ARCH'] in ('x86', 'x86_64'):
'../../../third_party/dav1d/src/x86/mc16_sse.asm',
'../../../third_party/dav1d/src/x86/mc_sse.asm',
'../../../third_party/dav1d/src/x86/msac.asm',
'../../../third_party/dav1d/src/x86/pal.asm',
'../../../third_party/dav1d/src/x86/refmvs.asm',
]

View file

@ -85,6 +85,7 @@ SOURCES += [
'../../third_party/dav1d/src/mem.c',
'../../third_party/dav1d/src/msac.c',
'../../third_party/dav1d/src/obu.c',
'../../third_party/dav1d/src/pal.c',
'../../third_party/dav1d/src/picture.c',
'../../third_party/dav1d/src/qm.c',
'../../third_party/dav1d/src/ref.c',

View file

@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: 616bfd1506a8a75c6a358e578cbec9ca11931502 (2023-07-01T11:36:39.000+03:00).
release: e58afe4dd9057591882a01c31382c203e8a61c92 (2023-07-25T16:10:07.000+02:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: 616bfd1506a8a75c6a358e578cbec9ca11931502
revision: e58afe4dd9057591882a01c31382c203e8a61c92
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/

View file

@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "616bfd1506a8a75c6a358e578cbec9ca11931502"
#define DAV1D_VERSION "e58afe4dd9057591882a01c31382c203e8a61c92"

View file

@ -34,7 +34,7 @@
#include "common/attributes.h"
#if !defined(BITDEPTH)
typedef void pixel;
typedef uint8_t pixel; /* can't be void due to pointer-to-array usage */
typedef void coef;
#define HIGHBD_DECL_SUFFIX /* nothing */
#define HIGHBD_CALL_SUFFIX /* nothing */

View file

@ -1481,8 +1481,8 @@ function fgy_32x32_8bpc_neon, export=1
calc_offset r6, lr, r6, 0, 0
add_offset r5, r6, lr, r5, r9
add r4, r4, #32 // grain_lut += BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r4, r4, #32 // grain_lut += FG_BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
ldr r10, [sp, #120] // type
adr r11, L(fgy_loop_tbl)
@ -1490,8 +1490,8 @@ function fgy_32x32_8bpc_neon, export=1
tst r10, #1
ldr r10, [r11, r10, lsl #2]
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r8, r8, #32 // grain_lut += BLOCK_SIZE * bx
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r8, r8, #32 // grain_lut += FG_BLOCK_SIZE * bx
add r11, r11, r10
@ -1695,10 +1695,10 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
calc_offset r8, r12, r8, \sx, \sy
add_offset r5, r8, r12, r5, r10
add r4, r4, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r4, r4, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
movrel_local r12, overlap_coeffs_\sx
ldr lr, [sp, #132] // type

View file

@ -1353,8 +1353,8 @@ function fgy_32x32_16bpc_neon, export=1
calc_offset r6, lr, r6, 0, 0
add_offset r5, r6, lr, r5, r9
add r4, r4, #32*2 // grain_lut += BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r4, r4, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
add r6, r11, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
ldr r10, [sp, #120] // type
adr r11, L(fgy_loop_tbl)
@ -1362,8 +1362,8 @@ function fgy_32x32_16bpc_neon, export=1
tst r10, #1
ldr r10, [r11, r10, lsl #2]
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add r8, r8, #32*2 // grain_lut += BLOCK_SIZE * bx
add r8, r8, r9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r8, r8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
add r11, r11, r10
@ -1651,10 +1651,10 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
vmov.16 d31[3], r7 // overlap y [1]
add r4, r4, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add r11, r11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add r4, r4, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
add r8, lr, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add r11, r11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
movrel_local r12, overlap_coeffs_\sx
ldr lr, [sp, #132] // type

View file

@ -1576,17 +1576,17 @@ L(ipred_filter_tbl):
endfunc
// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h);
function pal_pred_8bpc_neon, export=1
push {r4-r5, lr}
ldrd r4, r5, [sp, #12]
vld1.16 {q0}, [r2, :128]
vld1.8 {d0}, [r2, :64]
clz lr, r4
adr r12, L(pal_pred_tbl)
sub lr, lr, #25
vmov.i8 q15, #7
ldr lr, [r12, lr, lsl #2]
vmovn.i16 d0, q0
add r12, r12, lr
add r2, r0, r1
bx r12
@ -1602,8 +1602,11 @@ L(pal_pred_tbl):
40:
lsl r1, r1, #1
4:
vld1.8 {q1}, [r3, :128]!
vld1.8 {d2}, [r3, :64]!
subs r5, r5, #4
vshr.u8 d3, d2, #4
vand.u8 d2, d2, d30
vzip.8 d2, d3
vtbl.8 d2, {d0}, d2
vtbl.8 d3, {d0}, d3
vst1.32 {d2[0]}, [r0, :32], r1
@ -1615,8 +1618,11 @@ L(pal_pred_tbl):
80:
lsl r1, r1, #1
8:
vld1.8 {q1, q2}, [r3, :128]!
vld1.8 {q1}, [r3, :64]!
subs r5, r5, #4
vshr.u8 q2, q1, #4
vand.u8 q1, q1, q15
vzip.8 q1, q2
vtbl.8 d2, {d0}, d2
vtbl.8 d3, {d0}, d3
vst1.8 {d2}, [r0, :64], r1
@ -1630,9 +1636,14 @@ L(pal_pred_tbl):
160:
lsl r1, r1, #1
16:
vld1.8 {q8, q9}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #4
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q8, q10, q15
vshr.u8 q9, q10, #4
vand.u8 q10, q11, q15
vshr.u8 q11, q11, #4
vzip.8 q8, q9
vzip.8 q10, q11
vtbl.8 d16, {d0}, d16
vtbl.8 d17, {d0}, d17
vtbl.8 d18, {d0}, d18
@ -1650,9 +1661,14 @@ L(pal_pred_tbl):
320:
lsl r1, r1, #1
32:
vld1.8 {q8, q9}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #2
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q8, q10, q15
vshr.u8 q9, q10, #4
vand.u8 q10, q11, q15
vshr.u8 q11, q11, #4
vzip.8 q8, q9
vzip.8 q10, q11
vtbl.8 d16, {d0}, d16
vtbl.8 d17, {d0}, d17
vtbl.8 d18, {d0}, d18
@ -1668,9 +1684,14 @@ L(pal_pred_tbl):
640:
sub r1, r1, #32
64:
vld1.8 {q8, q9}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #1
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q8, q10, q15
vshr.u8 q9, q10, #4
vand.u8 q10, q11, q15
vshr.u8 q11, q11, #4
vzip.8 q8, q9
vzip.8 q10, q11
vtbl.8 d16, {d0}, d16
vtbl.8 d17, {d0}, d17
vtbl.8 d18, {d0}, d18

View file

@ -1732,7 +1732,7 @@ function ipred_filter_16bpc_neon, export=1
endfunc
// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h);
function pal_pred_16bpc_neon, export=1
push {r4-r5, lr}
@ -1742,6 +1742,7 @@ function pal_pred_16bpc_neon, export=1
clz lr, r4
adr r12, L(pal_pred_tbl)
sub lr, lr, #25
vmov.i8 q13, #7
ldr lr, [r12, lr, lsl #2]
vmov.i16 q15, #0x100
add r12, r12, lr
@ -1759,8 +1760,11 @@ L(pal_pred_tbl):
40:
lsl r1, r1, #1
4:
vld1.8 {q1}, [r3, :128]!
vld1.8 {d2}, [r3, :64]!
subs r5, r5, #4
vshr.u8 d3, d2, #4
vand.u8 d2, d2, d26
vzip.8 d2, d3
// Restructure q1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
vadd.i8 q0, q1, q1
vadd.i8 q1, q1, q1
@ -1780,8 +1784,11 @@ L(pal_pred_tbl):
80:
lsl r1, r1, #1
8:
vld1.8 {q1, q2}, [r3, :128]!
vld1.8 {q1}, [r3, :64]!
subs r5, r5, #4
vshr.u8 q2, q1, #4
vand.u8 q1, q1, q13
vzip.8 q1, q2
// Prefer doing the adds twice, instead of chaining a vmov after
// the add.
vadd.i8 q0, q1, q1
@ -1811,9 +1818,14 @@ L(pal_pred_tbl):
160:
lsl r1, r1, #1
16:
vld1.8 {q2, q3}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #4
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q2, q10, q13
vshr.u8 q3, q10, #4
vand.u8 q10, q11, q13
vshr.u8 q11, q11, #4
vzip.8 q2, q3
vzip.8 q10, q11
vadd.i8 q0, q2, q2
vadd.i8 q1, q2, q2
vadd.i8 q2, q3, q3
@ -1860,9 +1872,14 @@ L(pal_pred_tbl):
lsl r1, r1, #1
sub r1, r1, #32
32:
vld1.8 {q2, q3}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #2
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q2, q10, q13
vshr.u8 q3, q10, #4
vand.u8 q10, q11, q13
vshr.u8 q11, q11, #4
vzip.8 q2, q3
vzip.8 q10, q11
vadd.i8 q0, q2, q2
vadd.i8 q1, q2, q2
vadd.i8 q2, q3, q3
@ -1908,9 +1925,14 @@ L(pal_pred_tbl):
640:
sub r1, r1, #96
64:
vld1.8 {q2, q3}, [r3, :128]!
vld1.8 {q10, q11}, [r3, :64]!
subs r5, r5, #1
vld1.8 {q10, q11}, [r3, :128]!
vand.u8 q2, q10, q13
vshr.u8 q3, q10, #4
vand.u8 q10, q11, q13
vshr.u8 q11, q11, #4
vzip.8 q2, q3
vzip.8 q10, q11
vadd.i8 q0, q2, q2
vadd.i8 q1, q2, q2
vadd.i8 q2, q3, q3

View file

@ -1409,14 +1409,14 @@ function fgy_32x32_8bpc_neon, export=1
ldr w11, [sp, #24] // type
adr x13, L(fgy_loop_tbl)
add x4, x12, #32 // grain_lut += BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x4, x12, #32 // grain_lut += FG_BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
tst w11, #1
ldrh w11, [x13, w11, uxtw #1]
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x8, x8, #32 // grain_lut += BLOCK_SIZE * bx
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x8, x8, #32 // grain_lut += FG_BLOCK_SIZE * bx
sub x11, x13, w11, uxtw
@ -1638,10 +1638,10 @@ function fguv_32x32_\layout\()_8bpc_neon, export=1
add_offset x17, w16, x17, x5, x10
add_offset x5, w8, x11, x5, x10
add x4, x13, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x11, #(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x4, x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
ldr w13, [sp, #64] // type

View file

@ -1308,14 +1308,14 @@ function fgy_32x32_16bpc_neon, export=1
ldr w11, [sp, #88] // type
adr x13, L(fgy_loop_tbl)
add x4, x12, #32*2 // grain_lut += BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x4, x12, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
add x6, x14, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
tst w11, #1
ldrh w11, [x13, w11, uxtw #1]
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * BLOCK_SIZE * by
add x8, x8, #32*2 // grain_lut += BLOCK_SIZE * bx
add x8, x16, x9, lsl #5 // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x8, x8, #32*2 // grain_lut += FG_BLOCK_SIZE * bx
sub x11, x13, w11, uxtw
@ -1581,10 +1581,10 @@ function fguv_32x32_\layout\()_16bpc_neon, export=1
add_offset x17, w16, x17, x5, x10
add_offset x5, w8, x11, x5, x10
add x4, x13, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * BLOCK_SIZE * by
add x11, x11, #2*(32 >> \sx) // grain_lut += BLOCK_SIZE * bx
add x4, x13, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
add x8, x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
add x11, x11, #2*(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
ldr w13, [sp, #112] // type

View file

@ -3921,23 +3921,26 @@ L(ipred_filter_tbl):
endfunc
// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h);
function pal_pred_8bpc_neon, export=1
ld1 {v0.8h}, [x2]
ld1 {v0.8b}, [x2]
clz w9, w4
adr x6, L(pal_pred_tbl)
sub w9, w9, #25
movi v31.16b, #7
ldrh w9, [x6, w9, uxtw #1]
xtn v0.8b, v0.8h
sub x6, x6, w9, uxtw
add x2, x0, x1
lsl x1, x1, #1
br x6
4:
AARCH64_VALID_JUMP_TARGET
ld1 {v1.16b}, [x3], #16
ld1 {v1.8b}, [x3], #8
subs w5, w5, #4
ushr v3.8b, v1.8b, #4
and v2.8b, v1.8b, v31.8b
zip1 v1.16b, v2.16b, v3.16b
tbl v1.16b, {v0.16b}, v1.16b
st1 {v1.s}[0], [x0], x1
st1 {v1.s}[1], [x2], x1
@ -3947,8 +3950,12 @@ function pal_pred_8bpc_neon, export=1
ret
8:
AARCH64_VALID_JUMP_TARGET
ld1 {v1.16b, v2.16b}, [x3], #32
ld1 {v1.16b}, [x3], #16
subs w5, w5, #4
ushr v4.16b, v1.16b, #4
and v3.16b, v1.16b, v31.16b
zip1 v1.16b, v3.16b, v4.16b
zip2 v2.16b, v3.16b, v4.16b
tbl v1.16b, {v0.16b}, v1.16b
st1 {v1.d}[0], [x0], x1
tbl v2.16b, {v0.16b}, v2.16b
@ -3959,9 +3966,17 @@ function pal_pred_8bpc_neon, export=1
ret
16:
AARCH64_VALID_JUMP_TARGET
ld1 {v1.16b, v2.16b, v3.16b, v4.16b}, [x3], #64
ld1 {v1.16b, v2.16b}, [x3], #32
subs w5, w5, #4
ushr v5.16b, v1.16b, #4
and v4.16b, v1.16b, v31.16b
ushr v7.16b, v2.16b, #4
and v6.16b, v2.16b, v31.16b
zip1 v1.16b, v4.16b, v5.16b
zip2 v2.16b, v4.16b, v5.16b
zip1 v3.16b, v6.16b, v7.16b
tbl v1.16b, {v0.16b}, v1.16b
zip2 v4.16b, v6.16b, v7.16b
tbl v2.16b, {v0.16b}, v2.16b
st1 {v1.16b}, [x0], x1
tbl v3.16b, {v0.16b}, v3.16b
@ -3974,10 +3989,25 @@ function pal_pred_8bpc_neon, export=1
32:
AARCH64_VALID_JUMP_TARGET
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
subs w5, w5, #4
ushr v21.16b, v16.16b, #4
and v20.16b, v16.16b, v31.16b
ushr v23.16b, v17.16b, #4
and v22.16b, v17.16b, v31.16b
ushr v25.16b, v18.16b, #4
and v24.16b, v18.16b, v31.16b
ushr v27.16b, v19.16b, #4
and v26.16b, v19.16b, v31.16b
zip1 v16.16b, v20.16b, v21.16b
zip2 v17.16b, v20.16b, v21.16b
zip1 v18.16b, v22.16b, v23.16b
zip2 v19.16b, v22.16b, v23.16b
zip1 v20.16b, v24.16b, v25.16b
zip2 v21.16b, v24.16b, v25.16b
tbl v16.16b, {v0.16b}, v16.16b
zip1 v22.16b, v26.16b, v27.16b
tbl v17.16b, {v0.16b}, v17.16b
zip2 v23.16b, v26.16b, v27.16b
tbl v18.16b, {v0.16b}, v18.16b
tbl v19.16b, {v0.16b}, v19.16b
tbl v20.16b, {v0.16b}, v20.16b
@ -3993,10 +4023,25 @@ function pal_pred_8bpc_neon, export=1
64:
AARCH64_VALID_JUMP_TARGET
ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64
ld1 {v20.16b, v21.16b, v22.16b, v23.16b}, [x3], #64
subs w5, w5, #2
ushr v21.16b, v16.16b, #4
and v20.16b, v16.16b, v31.16b
ushr v23.16b, v17.16b, #4
and v22.16b, v17.16b, v31.16b
ushr v25.16b, v18.16b, #4
and v24.16b, v18.16b, v31.16b
ushr v27.16b, v19.16b, #4
and v26.16b, v19.16b, v31.16b
zip1 v16.16b, v20.16b, v21.16b
zip2 v17.16b, v20.16b, v21.16b
zip1 v18.16b, v22.16b, v23.16b
zip2 v19.16b, v22.16b, v23.16b
zip1 v20.16b, v24.16b, v25.16b
zip2 v21.16b, v24.16b, v25.16b
tbl v16.16b, {v0.16b}, v16.16b
zip1 v22.16b, v26.16b, v27.16b
tbl v17.16b, {v0.16b}, v17.16b
zip2 v23.16b, v26.16b, v27.16b
tbl v18.16b, {v0.16b}, v18.16b
tbl v19.16b, {v0.16b}, v19.16b
st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1

View file

@ -4179,13 +4179,14 @@ function ipred_filter_16bpc_neon, export=1
endfunc
// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
// const uint16_t *const pal, const uint8_t *idx,
// const pixel *const pal, const uint8_t *idx,
// const int w, const int h);
function pal_pred_16bpc_neon, export=1
ld1 {v30.8h}, [x2]
clz w9, w4
adr x6, L(pal_pred_tbl)
sub w9, w9, #25
movi v29.16b, #7
ldrh w9, [x6, w9, uxtw #1]
movi v31.8h, #1, lsl #8
sub x6, x6, w9, uxtw
@ -4195,8 +4196,11 @@ function pal_pred_16bpc_neon, export=1
add x2, x0, x1
lsl x1, x1, #1
4:
ld1 {v1.16b}, [x3], #16
ld1 {v1.8b}, [x3], #8
subs w5, w5, #4
ushr v3.8b, v1.8b, #4
and v2.8b, v1.8b, v29.8b
zip1 v1.16b, v2.16b, v3.16b
// Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
add v1.16b, v1.16b, v1.16b
zip1 v0.16b, v1.16b, v1.16b
@ -4216,8 +4220,12 @@ function pal_pred_16bpc_neon, export=1
add x2, x0, x1
lsl x1, x1, #1
8:
ld1 {v2.16b, v3.16b}, [x3], #32
ld1 {v2.16b}, [x3], #16
subs w5, w5, #4
ushr v4.16b, v2.16b, #4
and v3.16b, v2.16b, v29.16b
zip1 v2.16b, v3.16b, v4.16b
zip2 v3.16b, v3.16b, v4.16b
add v2.16b, v2.16b, v2.16b
add v3.16b, v3.16b, v3.16b
zip1 v0.16b, v2.16b, v2.16b
@ -4243,8 +4251,16 @@ function pal_pred_16bpc_neon, export=1
add x2, x0, x1
lsl x1, x1, #1
16:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
ld1 {v4.16b, v5.16b}, [x3], #32
subs w5, w5, #4
ushr v7.16b, v4.16b, #4
and v6.16b, v4.16b, v29.16b
ushr v3.16b, v5.16b, #4
and v2.16b, v5.16b, v29.16b
zip1 v4.16b, v6.16b, v7.16b
zip2 v5.16b, v6.16b, v7.16b
zip1 v6.16b, v2.16b, v3.16b
zip2 v7.16b, v2.16b, v3.16b
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b
@ -4284,8 +4300,16 @@ function pal_pred_16bpc_neon, export=1
add x2, x0, x1
lsl x1, x1, #1
32:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
ld1 {v4.16b, v5.16b}, [x3], #32
subs w5, w5, #2
ushr v7.16b, v4.16b, #4
and v6.16b, v4.16b, v29.16b
ushr v3.16b, v5.16b, #4
and v2.16b, v5.16b, v29.16b
zip1 v4.16b, v6.16b, v7.16b
zip2 v5.16b, v6.16b, v7.16b
zip1 v6.16b, v2.16b, v3.16b
zip2 v7.16b, v2.16b, v3.16b
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b
@ -4322,8 +4346,16 @@ function pal_pred_16bpc_neon, export=1
AARCH64_VALID_JUMP_TARGET
add x2, x0, #64
64:
ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [x3], #64
ld1 {v4.16b, v5.16b}, [x3], #32
subs w5, w5, #1
ushr v7.16b, v4.16b, #4
and v6.16b, v4.16b, v29.16b
ushr v3.16b, v5.16b, #4
and v2.16b, v5.16b, v29.16b
zip1 v4.16b, v6.16b, v7.16b
zip2 v5.16b, v6.16b, v7.16b
zip1 v6.16b, v2.16b, v3.16b
zip2 v7.16b, v2.16b, v3.16b
add v4.16b, v4.16b, v4.16b
add v5.16b, v5.16b, v5.16b
add v6.16b, v6.16b, v6.16b

View file

@ -91,8 +91,8 @@ static void fgy_32x32xn_neon(pixel *const dst_row, const pixel *const src_row,
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
// process this row in FG_BLOCK_SIZE^2 blocks
for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
if (data->overlap_flag && bx) {
// shift previous offsets left
@ -155,8 +155,8 @@ fguv_32x32xn_##nm##_neon(pixel *const dst_row, const pixel *const src_row, \
\
int offsets[2 /* col offset */][2 /* row offset */]; \
\
/* process this row in BLOCK_SIZE^2 blocks (subsampled) */ \
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) { \
/* process this row in FG_BLOCK_SIZE^2 blocks (subsampled) */ \
for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { \
if (data->overlap_flag && bx) { \
/* shift previous offsets left */ \
for (int i = 0; i < rows; i++) \

View file

@ -22,7 +22,7 @@ BEGIN
VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
VALUE "InternalName", "dav1d"
VALUE "OriginalFilename", "libdav1d.dll"
VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
VALUE "LegalCopyright", L"Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
END
END
BLOCK "VarFileInfo"

View file

@ -370,142 +370,6 @@ static inline int findoddzero(const uint8_t *buf, int len) {
return 0;
}
static void read_pal_plane(Dav1dTaskContext *const t, Av1Block *const b,
const int pl, const int sz_ctx,
const int bx4, const int by4)
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
uint16_t cache[16], used_cache[8];
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
int n_cache = 0;
// don't reuse above palette outside SB64 boundaries
int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
const uint16_t *l = t->al_pal[1][by4][pl], *a = t->al_pal[0][bx4][pl];
// fill/sort cache
while (l_cache && a_cache) {
if (*l < *a) {
if (!n_cache || cache[n_cache - 1] != *l)
cache[n_cache++] = *l;
l++;
l_cache--;
} else {
if (*a == *l) {
l++;
l_cache--;
}
if (!n_cache || cache[n_cache - 1] != *a)
cache[n_cache++] = *a;
a++;
a_cache--;
}
}
if (l_cache) {
do {
if (!n_cache || cache[n_cache - 1] != *l)
cache[n_cache++] = *l;
l++;
} while (--l_cache > 0);
} else if (a_cache) {
do {
if (!n_cache || cache[n_cache - 1] != *a)
cache[n_cache++] = *a;
a++;
} while (--a_cache > 0);
}
// find reused cache entries
int i = 0;
for (int n = 0; n < n_cache && i < pal_sz; n++)
if (dav1d_msac_decode_bool_equi(&ts->msac))
used_cache[i++] = cache[n];
const int n_used_cache = i;
// parse new entries
uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][pl] : t->scratch.pal[pl];
if (i < pal_sz) {
int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
if (i < pal_sz) {
int bits = f->cur.p.bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
const int max = (1 << f->cur.p.bpc) - 1;
do {
const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
prev = pal[i++] = imin(prev + delta + !pl, max);
if (prev + !pl >= max) {
for (; i < pal_sz; i++)
pal[i] = max;
break;
}
bits = imin(bits, 1 + ulog2(max - prev - !pl));
} while (i < pal_sz);
}
// merge cache+new entries
int n = 0, m = n_used_cache;
for (i = 0; i < pal_sz; i++) {
if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
pal[i] = used_cache[n++];
} else {
assert(m < pal_sz);
pal[i] = pal[m++];
}
}
} else {
memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
for (int n = 0; n < n_cache; n++)
printf("%c%02x", n ? ' ' : '[', cache[n]);
printf("%s, pal=", n_cache ? "]" : "[]");
for (int n = 0; n < pal_sz; n++)
printf("%c%02x", n ? ' ' : '[', pal[n]);
printf("]\n");
}
}
static void read_pal_uv(Dav1dTaskContext *const t, Av1Block *const b,
const int sz_ctx, const int bx4, const int by4)
{
read_pal_plane(t, b, 1, sz_ctx, bx4, by4);
// V pal coding
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][2] : t->scratch.pal[2];
if (dav1d_msac_decode_bool_equi(&ts->msac)) {
const int bits = f->cur.p.bpc - 4 +
dav1d_msac_decode_bools(&ts->msac, 2);
int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
const int max = (1 << f->cur.p.bpc) - 1;
for (int i = 1; i < b->pal_sz[1]; i++) {
int delta = dav1d_msac_decode_bools(&ts->msac, bits);
if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
prev = pal[i] = (prev + delta) & max;
}
} else {
for (int i = 0; i < b->pal_sz[1]; i++)
pal[i] = dav1d_msac_decode_bools(&ts->msac, f->cur.p.bpc);
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
for (int n = 0; n < b->pal_sz[1]; n++)
printf("%c%02x", n ? ' ' : '[', pal[n]);
printf("]\n");
}
}
// meant to be SIMD'able, so that theoretical complexity of this function
// times block size goes from w4*h4 to w4+h4-1
// a and b are previous two lines containing (a) top/left entries or (b)
@ -584,7 +448,8 @@ static void read_pal_indices(Dav1dTaskContext *const t,
Dav1dTileState *const ts = t->ts;
const ptrdiff_t stride = bw4 * 4;
assert(pal_idx);
pal_idx[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
pixel *const pal_tmp = t->scratch.pal_idx_uv;
pal_tmp[0] = dav1d_msac_decode_uniform(&ts->msac, b->pal_sz[pl]);
uint16_t (*const color_map_cdf)[8] =
ts->cdf.m.color_map[pl][b->pal_sz[pl] - 2];
uint8_t (*const order)[8] = t->scratch.pal_order;
@ -593,23 +458,16 @@ static void read_pal_indices(Dav1dTaskContext *const t,
// top/left-to-bottom/right diagonals ("wave-front")
const int first = imin(i, w4 * 4 - 1);
const int last = imax(0, i - h4 * 4 + 1);
order_palette(pal_idx, stride, i, first, last, order, ctx);
order_palette(pal_tmp, stride, i, first, last, order, ctx);
for (int j = first, m = 0; j >= last; j--, m++) {
const int color_idx = dav1d_msac_decode_symbol_adapt8(&ts->msac,
color_map_cdf[ctx[m]], b->pal_sz[pl] - 1);
pal_idx[(i - j) * stride + j] = order[m][color_idx];
pal_tmp[(i - j) * stride + j] = order[m][color_idx];
}
}
// fill invisible edges
if (bw4 > w4)
for (int y = 0; y < 4 * h4; y++)
memset(&pal_idx[y * stride + 4 * w4],
pal_idx[y * stride + 4 * w4 - 1], 4 * (bw4 - w4));
if (h4 < bh4) {
const uint8_t *const src = &pal_idx[stride * (4 * h4 - 1)];
for (int y = h4 * 4; y < bh4 * 4; y++)
memcpy(&pal_idx[y * stride], src, bw4 * 4);
}
t->c->pal_dsp.pal_idx_finish(pal_idx, pal_tmp, bw4 * 4, bh4 * 4,
w4 * 4, h4 * 4);
}
static void read_vartx_tree(Dav1dTaskContext *const t,
@ -1306,7 +1164,7 @@ static int decode_b(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-y_pal[%d]: r=%d\n", use_y_pal, ts->msac.rng);
if (use_y_pal)
read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
f->bd_fn.read_pal_plane(t, b, 0, sz_ctx, bx4, by4);
}
if (has_chroma && b->uv_mode == DC_PRED) {
@ -1316,7 +1174,7 @@ static int decode_b(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-uv_pal[%d]: r=%d\n", use_uv_pal, ts->msac.rng);
if (use_uv_pal) // see aomedia bug 2183 for why we use luma coordinates
read_pal_uv(t, b, sz_ctx, bx4, by4);
f->bd_fn.read_pal_uv(t, b, sz_ctx, bx4, by4);
}
}
@ -1341,9 +1199,9 @@ static int decode_b(Dav1dTaskContext *const t,
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
} else
pal_idx = t->scratch.pal_idx;
pal_idx = t->scratch.pal_idx_y;
read_pal_indices(t, pal_idx, b, 0, w4, h4, bw4, bh4);
if (DEBUG_BLOCK_INFO)
printf("Post-y-pal-indices: r=%d\n", ts->msac.rng);
@ -1355,9 +1213,9 @@ static int decode_b(Dav1dTaskContext *const t,
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
} else
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
pal_idx = t->scratch.pal_idx_uv;
read_pal_indices(t, pal_idx, b, 1, cw4, ch4, cbw4, cbh4);
if (DEBUG_BLOCK_INFO)
printf("Post-uv-pal-indices: r=%d\n", ts->msac.rng);
@ -1430,34 +1288,16 @@ static int decode_b(Dav1dTaskContext *const t,
case_set(bh4, l., 1, by4);
case_set(bw4, a->, 0, bx4);
#undef set_ctx
if (b->pal_sz[0]) {
uint16_t *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
for (int x = 0; x < bw4; x++)
memcpy(t->al_pal[0][bx4 + x][0], pal, 16);
for (int y = 0; y < bh4; y++)
memcpy(t->al_pal[1][by4 + y][0], pal, 16);
}
if (b->pal_sz[0])
f->bd_fn.copy_pal_block_y(t, bx4, by4, bw4, bh4);
if (has_chroma) {
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir uvmode, off, mul * b->uv_mode)
case_set(cbh4, l., 1, cby4);
case_set(cbw4, a->, 0, cbx4);
#undef set_ctx
if (b->pal_sz[1]) {
const uint16_t (*const pal)[8] = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) *
(f->b4_stride >> 1) + ((t->bx >> 1) + (t->by & 1))] :
t->scratch.pal;
// see aomedia bug 2183 for why we use luma coordinates here
for (int pl = 1; pl <= 2; pl++) {
for (int x = 0; x < bw4; x++)
memcpy(t->al_pal[0][bx4 + x][pl], pal[pl], 16);
for (int y = 0; y < bh4; y++)
memcpy(t->al_pal[1][by4 + y][pl], pal[pl], 16);
}
}
if (b->pal_sz[1])
f->bd_fn.copy_pal_block_uv(t, bx4, by4, bw4, bh4);
}
if (IS_INTER_OR_SWITCH(f->frame_hdr) || f->frame_hdr->allow_intrabc)
splat_intraref(f->c, t, bs, bw4, bh4);
@ -2642,7 +2482,10 @@ static void setup_tile(Dav1dTileState *const ts,
const uint8_t *const size_mul = ss_size_mul[f->cur.p.layout];
for (int p = 0; p < 2; p++) {
ts->frame_thread[p].pal_idx = f->frame_thread.pal_idx ?
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 4] :
&f->frame_thread.pal_idx[(size_t)tile_start_off * size_mul[1] / 8] :
NULL;
ts->frame_thread[p].cbi = f->frame_thread.cbi ?
&f->frame_thread.cbi[(size_t)tile_start_off * size_mul[0] / 64] :
NULL;
ts->frame_thread[p].cf = f->frame_thread.cf ?
(uint8_t*)f->frame_thread.cf +
@ -3015,6 +2858,19 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
}
const int cbi_sz = num_sb128 * size_mul[0];
if (cbi_sz != f->frame_thread.cbi_sz) {
dav1d_free_aligned(f->frame_thread.cbi);
f->frame_thread.cbi =
dav1d_alloc_aligned(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
cbi_sz * 32 * 32 / 4, 64);
if (!f->frame_thread.cbi) {
f->frame_thread.cbi_sz = 0;
goto error;
}
f->frame_thread.cbi_sz = cbi_sz;
}
const int cf_sz = (num_sb128 * size_mul[0]) << hbd;
if (cf_sz != f->frame_thread.cf_sz) {
dav1d_free_aligned(f->frame_thread.cf);
@ -3029,16 +2885,17 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
if (f->frame_hdr->allow_screen_content_tools) {
if (num_sb128 != f->frame_thread.pal_sz) {
const int pal_sz = num_sb128 << hbd;
if (pal_sz != f->frame_thread.pal_sz) {
dav1d_free_aligned(f->frame_thread.pal);
f->frame_thread.pal =
dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal) *
num_sb128 * 16 * 16, 64);
pal_sz * 16 * 16, 64);
if (!f->frame_thread.pal) {
f->frame_thread.pal_sz = 0;
goto error;
}
f->frame_thread.pal_sz = num_sb128;
f->frame_thread.pal_sz = pal_sz;
}
const int pal_idx_sz = num_sb128 * size_mul[1];
@ -3046,7 +2903,7 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
dav1d_free_aligned(f->frame_thread.pal_idx);
f->frame_thread.pal_idx =
dav1d_alloc_aligned(ALLOC_PAL, sizeof(*f->frame_thread.pal_idx) *
pal_idx_sz * 128 * 128 / 4, 64);
pal_idx_sz * 128 * 128 / 8, 64);
if (!f->frame_thread.pal_idx) {
f->frame_thread.pal_idx_sz = 0;
goto error;
@ -3171,12 +3028,9 @@ int dav1d_decode_frame_init(Dav1dFrameContext *const f) {
}
if (c->n_fc > 1) {
dav1d_free(f->frame_thread.b);
dav1d_free(f->frame_thread.cbi);
f->frame_thread.b = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.b) *
num_sb128 * 32 * 32);
f->frame_thread.cbi = dav1d_malloc(ALLOC_BLOCK, sizeof(*f->frame_thread.cbi) *
num_sb128 * 32 * 32);
if (!f->frame_thread.b || !f->frame_thread.cbi) {
if (!f->frame_thread.b) {
f->lf.mask_sz = 0;
goto error;
}
@ -3584,7 +3438,11 @@ int dav1d_submit_frame(Dav1dContext *const c) {
f->bd_fn.filter_sbrow_resize = dav1d_filter_sbrow_resize_##bd##bpc; \
f->bd_fn.filter_sbrow_lr = dav1d_filter_sbrow_lr_##bd##bpc; \
f->bd_fn.backup_ipred_edge = dav1d_backup_ipred_edge_##bd##bpc; \
f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc
f->bd_fn.read_coef_blocks = dav1d_read_coef_blocks_##bd##bpc; \
f->bd_fn.copy_pal_block_y = dav1d_copy_pal_block_y_##bd##bpc; \
f->bd_fn.copy_pal_block_uv = dav1d_copy_pal_block_uv_##bd##bpc; \
f->bd_fn.read_pal_plane = dav1d_read_pal_plane_##bd##bpc; \
f->bd_fn.read_pal_uv = dav1d_read_pal_uv_##bd##bpc
if (!f->seq_hdr->hbd) {
#if CONFIG_8BPC
assign_bitdepth_case(8);

View file

@ -172,14 +172,14 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
const int cpw = (out->p.w + ss_x) >> ss_x;
const int is_id = out->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
pixel *const luma_src =
((pixel *) in->data[0]) + row * BLOCK_SIZE * PXSTRIDE(in->stride[0]);
((pixel *) in->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(in->stride[0]);
#if BITDEPTH != 8
const int bitdepth_max = (1 << out->p.bpc) - 1;
#endif
if (data->num_y_points) {
const int bh = imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE);
dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * BLOCK_SIZE * PXSTRIDE(out->stride[0]),
const int bh = imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE);
dsp->fgy_32x32xn(((pixel *) out->data[0]) + row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[0]),
luma_src, out->stride[0], data,
out->p.w, scaling[0], grain_lut[0], bh, row HIGHBD_TAIL_SUFFIX);
}
@ -190,7 +190,7 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
return;
}
const int bh = (imin(out->p.h - row * BLOCK_SIZE, BLOCK_SIZE) + ss_y) >> ss_y;
const int bh = (imin(out->p.h - row * FG_BLOCK_SIZE, FG_BLOCK_SIZE) + ss_y) >> ss_y;
// extend padding pixels
if (out->p.w & ss_x) {
@ -201,7 +201,7 @@ void bitfn(dav1d_apply_grain_row)(const Dav1dFilmGrainDSPContext *const dsp,
}
}
const ptrdiff_t uv_off = row * BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
const ptrdiff_t uv_off = row * FG_BLOCK_SIZE * PXSTRIDE(out->stride[1]) >> ss_y;
if (data->chroma_scaling_from_luma) {
for (int pl = 0; pl < 2; pl++)
dsp->fguv_32x32xn[in->p.layout - 1](((pixel *) out->data[1 + pl]) + uv_off,
@ -232,7 +232,7 @@ void bitfn(dav1d_apply_grain)(const Dav1dFilmGrainDSPContext *const dsp,
#else
uint8_t scaling[3][SCALING_SIZE];
#endif
const int rows = (out->p.h + 31) >> 5;
const int rows = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
bitfn(dav1d_prep_grain)(dsp, out, in, scaling, grain_lut);
for (int row = 0; row < rows; row++)

View file

@ -34,7 +34,7 @@
#define GRAIN_WIDTH 82
#define GRAIN_HEIGHT 73
#define BLOCK_SIZE 32
#define FG_BLOCK_SIZE 32
#if !defined(BITDEPTH) || BITDEPTH == 8
#define SCALING_SIZE 256
typedef int8_t entry;

View file

@ -162,8 +162,8 @@ static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH],
const int randval = offsets[bx][by];
const int offx = 3 + (2 >> subx) * (3 + (randval >> 4));
const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF));
return grain_lut[offy + y + (BLOCK_SIZE >> suby) * by]
[offx + x + (BLOCK_SIZE >> subx) * bx];
return grain_lut[offy + y + (FG_BLOCK_SIZE >> suby) * by]
[offx + x + (FG_BLOCK_SIZE >> subx) * bx];
}
static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
@ -195,13 +195,13 @@ static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE) {
const int bw = imin(BLOCK_SIZE, (int) pw - bx);
// process this row in FG_BLOCK_SIZE^2 blocks
for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) {
const int bw = imin(FG_BLOCK_SIZE, (int) pw - bx);
if (data->overlap_flag && bx) {
// shift previous offsets left
@ -306,13 +306,13 @@ fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row,
seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF);
}
assert(stride % (BLOCK_SIZE * sizeof(pixel)) == 0);
assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0);
int offsets[2 /* col offset */][2 /* row offset */];
// process this row in BLOCK_SIZE^2 blocks (subsampled)
for (unsigned bx = 0; bx < pw; bx += BLOCK_SIZE >> sx) {
const int bw = imin(BLOCK_SIZE >> sx, (int)(pw - bx));
// process this row in FG_BLOCK_SIZE^2 blocks (subsampled)
for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) {
const int bw = imin(FG_BLOCK_SIZE >> sx, (int)(pw - bx));
if (data->overlap_flag && bx) {
// shift previous offsets left
for (int i = 0; i < rows; i++)

View file

@ -53,6 +53,7 @@ typedef struct Dav1dTask Dav1dTask;
#include "src/looprestoration.h"
#include "src/mc.h"
#include "src/msac.h"
#include "src/pal.h"
#include "src/picture.h"
#include "src/recon.h"
#include "src/refmvs.h"
@ -174,6 +175,7 @@ struct Dav1dContext {
CdfThreadContext cdf[8];
Dav1dDSPContext dsp[3 /* 8, 10, 12 bits/component */];
Dav1dPalDSPContext pal_dsp;
Dav1dRefmvsDSPContext refmvs_dsp;
Dav1dPicAllocator allocator;
@ -253,6 +255,10 @@ struct Dav1dFrameContext {
filter_sbrow_fn filter_sbrow_lr;
backup_ipred_edge_fn backup_ipred_edge;
read_coef_blocks_fn read_coef_blocks;
copy_pal_block_fn copy_pal_block_y;
copy_pal_block_fn copy_pal_block_uv;
read_pal_plane_fn read_pal_plane;
read_pal_uv_fn read_pal_uv;
} bd_fn;
int ipred_edge_sz;
@ -274,14 +280,14 @@ struct Dav1dFrameContext {
atomic_uint *frame_progress, *copy_lpf_progress;
// indexed using t->by * f->b4_stride + t->bx
Av1Block *b;
int16_t (*cbi)[3 /* plane */]; /* bits 0-4: txtp, bits 5-15: eob */
int16_t *cbi; /* bits 0-4: txtp, bits 5-15: eob */
// indexed using (t->by >> 1) * (f->b4_stride >> 1) + (t->bx >> 1)
uint16_t (*pal)[3 /* plane */][8 /* idx */];
pixel (*pal)[3 /* plane */][8 /* idx */];
// iterated over inside tile state
uint8_t *pal_idx;
coef *cf;
int prog_sz;
int pal_sz, pal_idx_sz, cf_sz;
int cbi_sz, pal_sz, pal_idx_sz, cf_sz;
// start offsets per tile
int *tile_start_off;
} frame_thread;
@ -358,6 +364,7 @@ struct Dav1dTileState {
atomic_int progress[2 /* 0: reconstruction, 1: entropy */];
struct {
uint8_t *pal_idx;
int16_t *cbi;
coef *cf;
} frame_thread[2 /* 0: reconstruction, 1: entropy */];
@ -387,9 +394,10 @@ struct Dav1dTaskContext {
int16_t cf_8bpc [32 * 32];
int32_t cf_16bpc[32 * 32];
};
// FIXME types can be changed to pixel (and dynamically allocated)
// which would make copy/assign operations slightly faster?
uint16_t al_pal[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
union {
uint8_t al_pal_8bpc [2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
uint16_t al_pal_16bpc[2 /* a/l */][32 /* bx/y4 */][3 /* plane */][8 /* palette_idx */];
};
uint8_t pal_sz_uv[2 /* a/l */][32 /* bx4/by4 */];
ALIGN(union, 64) {
struct {
@ -419,16 +427,18 @@ struct Dav1dTaskContext {
int16_t ac[32 * 32]; // intra-only
uint8_t txtp_map[32 * 32]; // inter-only
};
uint8_t pal_idx[2 * 64 * 64];
uint16_t pal[3 /* plane */][8 /* palette_idx */];
ALIGN(union, 64) {
uint8_t pal_idx_y[32 * 64];
uint8_t pal_idx_uv[64 * 64]; /* also used as pre-pack scratch buffer */
union {
struct {
uint8_t interintra_8bpc[64 * 64];
uint8_t edge_8bpc[257];
ALIGN(uint8_t pal_8bpc[3 /* plane */][8 /* palette_idx */], 8);
};
struct {
uint16_t interintra_16bpc[64 * 64];
uint16_t edge_16bpc[257];
ALIGN(uint16_t pal_16bpc[3 /* plane */][8 /* palette_idx */], 16);
};
};
};

View file

@ -74,7 +74,7 @@ typedef decl_cfl_pred_fn(*cfl_pred_fn);
* - only 16-byte alignment is guaranteed for idx.
*/
#define decl_pal_pred_fn(name) \
void (name)(pixel *dst, ptrdiff_t stride, const uint16_t *pal, \
void (name)(pixel *dst, ptrdiff_t stride, const pixel *pal, \
const uint8_t *idx, int w, int h)
typedef decl_pal_pred_fn(*pal_pred_fn);

View file

@ -715,13 +715,16 @@ cfl_ac_fn(422, 1, 0)
cfl_ac_fn(444, 0, 0)
static void pal_pred_c(pixel *dst, const ptrdiff_t stride,
const uint16_t *const pal, const uint8_t *idx,
const pixel *const pal, const uint8_t *idx,
const int w, const int h)
{
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
dst[x] = (pixel) pal[idx[x]];
idx += w;
for (int x = 0; x < w; x += 2) {
const int i = *idx++;
assert(!(i & 0x88));
dst[x + 0] = pal[i & 7];
dst[x + 1] = pal[i >> 4];
}
dst += PXSTRIDE(stride);
}
}
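(Illustration, not part of the upstream change: with the new packed index format each idx byte carries two palette indices, low nibble first, which is what the assert(!(i & 0x88)) guard and the pal[i & 7] / pal[i >> 4] lookups above rely on. A minimal sketch, assuming 8 bpc and a hypothetical packed byte:)

/* illustration only: expanding one packed index byte */
const uint8_t pal[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
const uint8_t packed = 0x31;             /* low nibble 1, high nibble 3 */
const uint8_t left   = pal[packed & 7];  /* pal[1] == 20 */
const uint8_t right  = pal[packed >> 4]; /* pal[3] == 40 */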

View file

@ -52,11 +52,10 @@
static COLD void init_internal(void) {
dav1d_init_cpu();
dav1d_init_interintra_masks();
dav1d_init_ii_wedge_masks();
dav1d_init_intra_edge_tree();
dav1d_init_qm_tables();
dav1d_init_thread();
dav1d_init_wedge_masks();
}
COLD const char *dav1d_version(void) {
@ -287,6 +286,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
t->task_thread.td.inited = 1;
}
}
dav1d_pal_dsp_init(&c->pal_dsp);
dav1d_refmvs_dsp_init(&c->refmvs_dsp);
pthread_attr_destroy(&thread_attr);
@ -641,11 +641,11 @@ static COLD void close_internal(Dav1dContext **const c_out, int flush) {
if (c->n_fc > 1) {
dav1d_free(f->tile_thread.lowest_pixel_mem);
dav1d_free(f->frame_thread.b);
dav1d_free_aligned(f->frame_thread.cbi);
dav1d_free_aligned(f->frame_thread.pal_idx);
dav1d_free_aligned(f->frame_thread.cf);
dav1d_free(f->frame_thread.tile_start_off);
dav1d_free_aligned(f->frame_thread.pal);
dav1d_free(f->frame_thread.cbi);
}
if (c->n_tc > 1) {
pthread_mutex_destroy(&f->task_thread.pending_tasks.lock);

View file

@ -42,6 +42,7 @@ libdav1d_sources = files(
'mem.c',
'msac.c',
'obu.c',
'pal.c',
'picture.c',
'qm.c',
'ref.c',
@ -167,6 +168,7 @@ if is_asm_enabled
libdav1d_sources_asm = files(
'x86/cpuid.asm',
'x86/msac.asm',
'x86/pal.asm',
'x86/refmvs.asm',
'x86/itx_avx512.asm',
'x86/cdef_avx2.asm',

third_party/dav1d/src/pal.c (new vendored file, 77 lines)
View file

@ -0,0 +1,77 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "config.h"
#include <string.h>
#include "common/attributes.h"
#include "src/pal.h"
// fill invisible edges and pack to 4-bit (2 pixels per byte)
static void pal_idx_finish_c(uint8_t *dst, const uint8_t *src,
const int bw, const int bh,
const int w, const int h)
{
assert(bw >= 4 && bw <= 64 && !(bw & (bw - 1)));
assert(bh >= 4 && bh <= 64 && !(bh & (bh - 1)));
assert(w >= 4 && w <= bw && !(w & 3));
assert(h >= 4 && h <= bh && !(h & 3));
const int dst_w = w / 2;
const int dst_bw = bw / 2;
for (int y = 0; y < h; y++, src += bw, dst += dst_bw) {
for (int x = 0; x < dst_w; x++)
dst[x] = src[x * 2 + 0] | (src[x * 2 + 1] << 4);
if (dst_w < dst_bw)
memset(dst + dst_w, src[w - 1] * 0x11, dst_bw - dst_w);
}
if (h < bh) {
const uint8_t *const last_row = &dst[-dst_bw];
for (int y = h; y < bh; y++, dst += dst_bw)
memcpy(dst, last_row, dst_bw);
}
}
#if HAVE_ASM
#if ARCH_X86
#include "src/x86/pal.h"
#endif
#endif
COLD void dav1d_pal_dsp_init(Dav1dPalDSPContext *const c) {
c->pal_idx_finish = pal_idx_finish_c;
#if HAVE_ASM
#if ARCH_X86
pal_dsp_init_x86(c);
#endif
#endif
}
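(A minimal usage sketch, illustration only and not part of the commit; the example() wrapper and the buffer sizes are hypothetical, chosen to satisfy the asserts above. Callers reach pal_idx_finish through the new DSP context, passing an 8-bit index buffer of bw*bh bytes and getting back the 4-bit packed layout, bw/2 bytes per row, with the invisible right/bottom edges replicated.)

#include <stdint.h>
#include "src/pal.h"

static void example(void) {
    Dav1dPalDSPContext dsp;
    dav1d_pal_dsp_init(&dsp);

    uint8_t src[8 * 8] = { 0 };   /* one 8-bit index per pixel, bw = bh = 8 */
    uint8_t dst[8 * 8 / 2];       /* packed output, two pixels per byte */
    /* visible area is 4x4; pal_idx_finish fills the remaining edges */
    dsp.pal_idx_finish(dst, src, 8 /* bw */, 8 /* bh */, 4 /* w */, 4 /* h */);
}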

third_party/dav1d/src/pal.h (new vendored file, 43 lines)
View file

@ -0,0 +1,43 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DAV1D_SRC_PAL_H
#define DAV1D_SRC_PAL_H
#include <stdint.h>
#define decl_pal_idx_finish_fn(name) \
void (name)(uint8_t *dst, const uint8_t *src, int bw, int bh, int w, int h)
typedef decl_pal_idx_finish_fn(*pal_idx_finish_fn);
typedef struct Dav1dPalDSPContext {
pal_idx_finish_fn pal_idx_finish;
} Dav1dPalDSPContext;
void dav1d_pal_dsp_init(Dav1dPalDSPContext *dsp);
#endif /* DAV1D_SRC_PAL_H */

View file

@ -57,6 +57,18 @@ typedef decl_backup_ipred_edge_fn(*backup_ipred_edge_fn);
void (name)(Dav1dTaskContext *t, enum BlockSize bs, const Av1Block *b)
typedef decl_read_coef_blocks_fn(*read_coef_blocks_fn);
#define decl_copy_pal_block_fn(name) \
void (name)(Dav1dTaskContext *t, int bx4, int by4, int bw4, int bh4)
typedef decl_copy_pal_block_fn(*copy_pal_block_fn);
#define decl_read_pal_plane_fn(name) \
void (name)(Dav1dTaskContext *t, Av1Block *b, int pl, int sz_ctx, int bx4, int by4)
typedef decl_read_pal_plane_fn(*read_pal_plane_fn);
#define decl_read_pal_uv_fn(name) \
void (name)(Dav1dTaskContext *t, Av1Block *b, int sz_ctx, int bx4, int by4)
typedef decl_read_pal_uv_fn(*read_pal_uv_fn);
decl_recon_b_intra_fn(dav1d_recon_b_intra_8bpc);
decl_recon_b_intra_fn(dav1d_recon_b_intra_16bpc);
@ -82,4 +94,13 @@ decl_backup_ipred_edge_fn(dav1d_backup_ipred_edge_16bpc);
decl_read_coef_blocks_fn(dav1d_read_coef_blocks_8bpc);
decl_read_coef_blocks_fn(dav1d_read_coef_blocks_16bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_y_8bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_y_16bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_8bpc);
decl_copy_pal_block_fn(dav1d_copy_pal_block_uv_16bpc);
decl_read_pal_plane_fn(dav1d_read_pal_plane_8bpc);
decl_read_pal_plane_fn(dav1d_read_pal_plane_16bpc);
decl_read_pal_uv_fn(dav1d_read_pal_uv_8bpc);
decl_read_pal_uv_fn(dav1d_read_pal_uv_16bpc);
#endif /* DAV1D_SRC_RECON_H */

View file

@ -770,14 +770,12 @@ static void read_coef_tree(Dav1dTaskContext *const t,
uint8_t cf_ctx;
int eob;
coef *cf;
int16_t *cbi;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].cf);
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
cbi = f->frame_thread.cbi[t->by * f->b4_stride + t->bx];
} else {
cf = bitfn(t->cf);
}
@ -804,10 +802,11 @@ static void read_coef_tree(Dav1dTaskContext *const t,
case_set_upto16(txw,,,);
#undef set_ctx
if (t->frame_thread.pass == 1)
cbi[0] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
} else {
eob = cbi[0] >> 5;
txtp = cbi[0] & 0x1f;
const int cbi = *ts->frame_thread[0].cbi++;
eob = cbi >> 5;
txtp = cbi & 0x1f;
}
if (!(t->frame_thread.pass & 1)) {
assert(dst);
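(Worked example, illustration only: a cbi entry packs txtp into bits 0-4 and eob into bits 5-15, so eob = 12 with txtp = 3 is stored as 12 * 32 + 3 = 387, and 387 >> 5 and 387 & 0x1f recover the two fields.)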
@ -872,8 +871,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
for (y = init_y, t->by += init_y; y < sub_h4;
y += t_dim->h, t->by += t_dim->h, y_off++)
{
int16_t (*const cbi)[3] =
&f->frame_thread.cbi[t->by * f->b4_stride];
int x_off = !!init_x;
for (x = init_x, t->bx += init_x; x < sub_w4;
x += t_dim->w, t->bx += t_dim->w, x_off++)
@ -891,7 +888,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
if (DEBUG_BLOCK_INFO)
printf("Post-y-cf-blk[tx=%d,txtp=%d,eob=%d]: r=%d\n",
b->tx, txtp, eob, ts->msac.rng);
cbi[t->bx][0] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
ts->frame_thread[1].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir lcoef, off, mul * cf_ctx)
@ -917,8 +914,6 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
for (y = init_y >> ss_ver, t->by += init_y; y < sub_ch4;
y += uv_t_dim->h, t->by += uv_t_dim->h << ss_ver)
{
int16_t (*const cbi)[3] =
&f->frame_thread.cbi[t->by * f->b4_stride];
for (x = init_x >> ss_hor, t->bx += init_x; x < sub_cw4;
x += uv_t_dim->w, t->bx += uv_t_dim->w << ss_hor)
{
@ -936,7 +931,7 @@ void bytefn(dav1d_read_coef_blocks)(Dav1dTaskContext *const t,
printf("Post-uv-cf-blk[pl=%d,tx=%d,"
"txtp=%d,eob=%d]: r=%d\n",
pl, b->uvtx, txtp, eob, ts->msac.rng);
cbi[t->bx][pl + 1] = eob * (1 << 5) + txtp;
*ts->frame_thread[1].cbi++ = eob * (1 << 5) + txtp;
ts->frame_thread[1].cf += uv_t_dim->w * uv_t_dim->h * 16;
#define set_ctx(type, dir, diridx, off, mul, rep_macro) \
rep_macro(type, t->dir ccoef[pl], off, mul * cf_ctx)
@ -1236,13 +1231,14 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
const int p = t->frame_thread.pass & 1;
assert(ts->frame_thread[p].pal_idx);
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 16;
ts->frame_thread[p].pal_idx += bw4 * bh4 * 8;
} else {
pal_idx = t->scratch.pal_idx;
pal_idx = t->scratch.pal_idx_y;
}
const uint16_t *const pal = t->frame_thread.pass ?
const pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] : t->scratch.pal[0];
((t->bx >> 1) + (t->by & 1))][0] :
bytefn(t->scratch.pal)[0];
f->dsp->ipred.pal_pred(dst, f->cur.stride[0], pal,
pal_idx, bw4 * 4, bh4 * 4);
if (DEBUG_BLOCK_INFO && DEBUG_B_PIXELS)
@ -1319,10 +1315,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
enum TxfmType txtp;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += imin(t_dim->w, 8) * imin(t_dim->h, 8) * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][0];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
@ -1428,7 +1423,7 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
} else if (b->pal_sz[1]) {
const ptrdiff_t uv_dstoff = 4 * ((t->bx >> ss_hor) +
(t->by >> ss_ver) * PXSTRIDE(f->cur.stride[1]));
const uint16_t (*pal)[8];
const pixel (*pal)[8];
const uint8_t *pal_idx;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
@ -1436,10 +1431,10 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
pal = f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))];
pal_idx = ts->frame_thread[p].pal_idx;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 16;
ts->frame_thread[p].pal_idx += cbw4 * cbh4 * 8;
} else {
pal = t->scratch.pal;
pal_idx = &t->scratch.pal_idx[bw4 * bh4 * 16];
pal = bytefn(t->scratch.pal);
pal_idx = t->scratch.pal_idx_uv;
}
f->dsp->ipred.pal_pred(((pixel *) f->cur.data[1]) + uv_dstoff,
@ -1543,10 +1538,9 @@ void bytefn(dav1d_recon_b_intra)(Dav1dTaskContext *const t, const enum BlockSize
coef *cf;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uv_t_dim->w * uv_t_dim->h * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
@ -1682,12 +1676,8 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
dsp->ipred.intra_pred[m](tmp, 4 * bw4 * sizeof(pixel),
tl_edge, bw4 * 4, bh4 * 4, 0, 0, 0
HIGHBD_CALL_SUFFIX);
const uint8_t *const ii_mask =
b->interintra_type == INTER_INTRA_BLEND ?
dav1d_ii_masks[bs][0][b->interintra_mode] :
dav1d_wedge_masks[bs][0][0][b->wedge_idx];
dsp->mc.blend(dst, f->cur.stride[0], tmp,
bw4 * 4, bh4 * 4, ii_mask);
bw4 * 4, bh4 * 4, II_MASK(0, bs, b));
}
if (!has_chroma) goto skip_inter_chroma_pred;
@ -1790,10 +1780,7 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
// FIXME for 8x32 with 4:2:2 subsampling, this probably does
// the wrong thing since it will select 4x16, not 4x32, as a
// transform size...
const uint8_t *const ii_mask =
b->interintra_type == INTER_INTRA_BLEND ?
dav1d_ii_masks[bs][chr_layout_idx][b->interintra_mode] :
dav1d_wedge_masks[bs][chr_layout_idx][0][b->wedge_idx];
const uint8_t *const ii_mask = II_MASK(chr_layout_idx, bs, b);
for (int pl = 0; pl < 2; pl++) {
pixel *const tmp = bitfn(t->scratch.interintra);
@ -1871,12 +1858,12 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
mask = seg_mask;
break;
case COMP_INTER_WEDGE:
mask = dav1d_wedge_masks[bs][0][0][b->wedge_idx];
mask = WEDGE_MASK(0, bs, 0, b->wedge_idx);
dsp->mc.mask(dst, f->cur.stride[0],
tmp[b->mask_sign], tmp[!b->mask_sign],
bw4 * 4, bh4 * 4, mask HIGHBD_CALL_SUFFIX);
if (has_chroma)
mask = dav1d_wedge_masks[bs][chr_layout_idx][b->mask_sign][b->wedge_idx];
mask = WEDGE_MASK(chr_layout_idx, bs, b->mask_sign, b->wedge_idx);
break;
}
@ -1993,10 +1980,9 @@ int bytefn(dav1d_recon_b_inter)(Dav1dTaskContext *const t, const enum BlockSize
enum TxfmType txtp;
if (t->frame_thread.pass) {
const int p = t->frame_thread.pass & 1;
const int cbi = *ts->frame_thread[p].cbi++;
cf = ts->frame_thread[p].cf;
ts->frame_thread[p].cf += uvtx->w * uvtx->h * 16;
const int cbi =
f->frame_thread.cbi[t->by * f->b4_stride + t->bx][pl + 1];
eob = cbi >> 5;
txtp = cbi & 0x1f;
} else {
@ -2198,3 +2184,178 @@ void bytefn(dav1d_backup_ipred_edge)(Dav1dTaskContext *const t) {
4 * (ts->tiling.col_end - x_off) >> ss_hor);
}
}
void bytefn(dav1d_copy_pal_block_y)(Dav1dTaskContext *const t,
const int bx4, const int by4,
const int bw4, const int bh4)
{
const Dav1dFrameContext *const f = t->f;
pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][0] :
bytefn(t->scratch.pal)[0];
for (int x = 0; x < bw4; x++)
memcpy(bytefn(t->al_pal)[0][bx4 + x][0], pal, 8 * sizeof(pixel));
for (int y = 0; y < bh4; y++)
memcpy(bytefn(t->al_pal)[1][by4 + y][0], pal, 8 * sizeof(pixel));
}
void bytefn(dav1d_copy_pal_block_uv)(Dav1dTaskContext *const t,
const int bx4, const int by4,
const int bw4, const int bh4)
{
const Dav1dFrameContext *const f = t->f;
const pixel (*const pal)[8] = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))] :
bytefn(t->scratch.pal);
// see aomedia bug 2183 for why we use luma coordinates here
for (int pl = 1; pl <= 2; pl++) {
for (int x = 0; x < bw4; x++)
memcpy(bytefn(t->al_pal)[0][bx4 + x][pl], pal[pl], 8 * sizeof(pixel));
for (int y = 0; y < bh4; y++)
memcpy(bytefn(t->al_pal)[1][by4 + y][pl], pal[pl], 8 * sizeof(pixel));
}
}
void bytefn(dav1d_read_pal_plane)(Dav1dTaskContext *const t, Av1Block *const b,
const int pl, const int sz_ctx,
const int bx4, const int by4)
{
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
const int pal_sz = b->pal_sz[pl] = dav1d_msac_decode_symbol_adapt8(&ts->msac,
ts->cdf.m.pal_sz[pl][sz_ctx], 6) + 2;
pixel cache[16], used_cache[8];
int l_cache = pl ? t->pal_sz_uv[1][by4] : t->l.pal_sz[by4];
int n_cache = 0;
// don't reuse above palette outside SB64 boundaries
int a_cache = by4 & 15 ? pl ? t->pal_sz_uv[0][bx4] : t->a->pal_sz[bx4] : 0;
const pixel *l = bytefn(t->al_pal)[1][by4][pl];
const pixel *a = bytefn(t->al_pal)[0][bx4][pl];
// fill/sort cache
while (l_cache && a_cache) {
if (*l < *a) {
if (!n_cache || cache[n_cache - 1] != *l)
cache[n_cache++] = *l;
l++;
l_cache--;
} else {
if (*a == *l) {
l++;
l_cache--;
}
if (!n_cache || cache[n_cache - 1] != *a)
cache[n_cache++] = *a;
a++;
a_cache--;
}
}
if (l_cache) {
do {
if (!n_cache || cache[n_cache - 1] != *l)
cache[n_cache++] = *l;
l++;
} while (--l_cache > 0);
} else if (a_cache) {
do {
if (!n_cache || cache[n_cache - 1] != *a)
cache[n_cache++] = *a;
a++;
} while (--a_cache > 0);
}
// find reused cache entries
int i = 0;
for (int n = 0; n < n_cache && i < pal_sz; n++)
if (dav1d_msac_decode_bool_equi(&ts->msac))
used_cache[i++] = cache[n];
const int n_used_cache = i;
// parse new entries
pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][pl] :
bytefn(t->scratch.pal)[pl];
if (i < pal_sz) {
const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
int prev = pal[i++] = dav1d_msac_decode_bools(&ts->msac, bpc);
if (i < pal_sz) {
int bits = bpc - 3 + dav1d_msac_decode_bools(&ts->msac, 2);
const int max = (1 << bpc) - 1;
do {
const int delta = dav1d_msac_decode_bools(&ts->msac, bits);
prev = pal[i++] = imin(prev + delta + !pl, max);
if (prev + !pl >= max) {
for (; i < pal_sz; i++)
pal[i] = max;
break;
}
bits = imin(bits, 1 + ulog2(max - prev - !pl));
} while (i < pal_sz);
}
// merge cache+new entries
int n = 0, m = n_used_cache;
for (i = 0; i < pal_sz; i++) {
if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m])) {
pal[i] = used_cache[n++];
} else {
assert(m < pal_sz);
pal[i] = pal[m++];
}
}
} else {
memcpy(pal, used_cache, n_used_cache * sizeof(*used_cache));
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=%d,sz=%d,cache_size=%d,used_cache=%d]: r=%d, cache=",
pl, pal_sz, n_cache, n_used_cache, ts->msac.rng);
for (int n = 0; n < n_cache; n++)
printf("%c%02x", n ? ' ' : '[', cache[n]);
printf("%s, pal=", n_cache ? "]" : "[]");
for (int n = 0; n < pal_sz; n++)
printf("%c%02x", n ? ' ' : '[', pal[n]);
printf("]\n");
}
}
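
The cache handling above first merges the above/left palettes into one sorted, de-duplicated candidate list, lets the bitstream reuse entries from it with one boolean each, and then delta-codes the remaining entries in ascending order (the + !pl term keeps luma entries strictly increasing). The final loop interleaves the two sorted lists back into pal[]; a minimal sketch of that step, with an invented example, is:

/* Sketch only, not the decoder code: used_cache[] holds the reused
 * entries and pal[n_used_cache .. pal_sz-1] the newly parsed ones; both
 * are sorted ascending, so a single merge pass yields a sorted palette,
 * e.g. used_cache = {3, 20}, new = {10, 41, 77} -> {3, 10, 20, 41, 77}.
 * The in-place read of pal[m] is safe because i can never overtake m. */
static void merge_pal(pixel *const pal, const pixel *const used_cache,
                      const int n_used_cache, const int pal_sz)
{
    int n = 0, m = n_used_cache;
    for (int i = 0; i < pal_sz; i++) {
        if (n < n_used_cache && (m >= pal_sz || used_cache[n] <= pal[m]))
            pal[i] = used_cache[n++]; /* next reused entry is smaller */
        else
            pal[i] = pal[m++];        /* next new entry is smaller */
    }
}
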
void bytefn(dav1d_read_pal_uv)(Dav1dTaskContext *const t, Av1Block *const b,
const int sz_ctx, const int bx4, const int by4)
{
bytefn(dav1d_read_pal_plane)(t, b, 1, sz_ctx, bx4, by4);
// V pal coding
Dav1dTileState *const ts = t->ts;
const Dav1dFrameContext *const f = t->f;
pixel *const pal = t->frame_thread.pass ?
f->frame_thread.pal[((t->by >> 1) + (t->bx & 1)) * (f->b4_stride >> 1) +
((t->bx >> 1) + (t->by & 1))][2] :
bytefn(t->scratch.pal)[2];
const int bpc = BITDEPTH == 8 ? 8 : f->cur.p.bpc;
if (dav1d_msac_decode_bool_equi(&ts->msac)) {
const int bits = bpc - 4 + dav1d_msac_decode_bools(&ts->msac, 2);
int prev = pal[0] = dav1d_msac_decode_bools(&ts->msac, bpc);
const int max = (1 << bpc) - 1;
for (int i = 1; i < b->pal_sz[1]; i++) {
int delta = dav1d_msac_decode_bools(&ts->msac, bits);
if (delta && dav1d_msac_decode_bool_equi(&ts->msac)) delta = -delta;
prev = pal[i] = (prev + delta) & max;
}
} else {
for (int i = 0; i < b->pal_sz[1]; i++)
pal[i] = dav1d_msac_decode_bools(&ts->msac, bpc);
}
if (DEBUG_BLOCK_INFO) {
printf("Post-pal[pl=2]: r=%d ", ts->msac.rng);
for (int n = 0; n < b->pal_sz[1]; n++)
printf("%c%02x", n ? ' ' : '[', pal[n]);
printf("]\n");
}
}
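
Unlike the U entries read by dav1d_read_pal_plane(), the V palette above can be delta-coded with an explicit sign bit, and (prev + delta) & max wraps modulo 1 << bpc since max is (1 << bpc) - 1. A small sketch of that wraparound, assuming 8 bpc:

/* Sketch, not decoder code: at bpc = 8, prev = 250 with delta = +10
 * gives (250 + 10) & 255 = 4, and prev = 3 with delta = -10 gives
 * (3 - 10) & 255 = 249. */
static int wrap_add(const int prev, const int delta, const int bpc) {
    return (prev + delta) & ((1 << bpc) - 1);
}
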

View file

@ -500,7 +500,7 @@ static inline void delayed_fg_task(const Dav1dContext *const c,
case DAV1D_TASK_TYPE_FG_APPLY:;
int row = atomic_fetch_add(&ttd->delayed_fg.progress[0], 1);
pthread_mutex_unlock(&ttd->lock);
int progmax = (out->p.h + 31) >> 5;
int progmax = (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE;
fg_apply_loop:
if (row + 1 < progmax)
pthread_cond_signal(&ttd->cond);
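
The row count here is now derived from FG_BLOCK_SIZE instead of the hard-coded 32-row shift: (out->p.h + FG_BLOCK_SIZE - 1) / FG_BLOCK_SIZE is the usual ceiling division and gives the same result as the old (out->p.h + 31) >> 5 when FG_BLOCK_SIZE is 32; for a 1080-pixel-tall frame, for example, both yield 34 grain rows.
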

View file

@ -83,37 +83,7 @@ static const wedge_code_type wedge_codebook_16_heqw[16] = {
{ WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
static uint8_t ALIGN(wedge_masks_444_32x32[2 * 16 * 32 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_32x16[2 * 16 * 32 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_32x8 [2 * 16 * 32 * 8], 64);
static uint8_t ALIGN(wedge_masks_444_16x32[2 * 16 * 16 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_444_8x32 [2 * 16 * 8 * 32], 64);
static uint8_t ALIGN(wedge_masks_444_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_444_8x8 [2 * 16 * 8 * 8], 64);
static uint8_t ALIGN(wedge_masks_422_16x32[2 * 16 * 16 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_422_8x32 [2 * 16 * 8 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_8x8 [2 * 16 * 8 * 8], 64);
static uint8_t ALIGN(wedge_masks_422_4x32 [2 * 16 * 4 * 32], 64);
static uint8_t ALIGN(wedge_masks_422_4x16 [2 * 16 * 4 * 16], 64);
static uint8_t ALIGN(wedge_masks_422_4x8 [2 * 16 * 4 * 8], 32);
static uint8_t ALIGN(wedge_masks_420_16x16[2 * 16 * 16 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_16x8 [2 * 16 * 16 * 8], 64);
static uint8_t ALIGN(wedge_masks_420_16x4 [2 * 16 * 16 * 4], 64);
static uint8_t ALIGN(wedge_masks_420_8x16 [2 * 16 * 8 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_8x8 [2 * 16 * 8 * 8], 64);
static uint8_t ALIGN(wedge_masks_420_8x4 [2 * 16 * 8 * 4], 64);
static uint8_t ALIGN(wedge_masks_420_4x16 [2 * 16 * 4 * 16], 64);
static uint8_t ALIGN(wedge_masks_420_4x8 [2 * 16 * 4 * 8], 32);
static uint8_t ALIGN(wedge_masks_420_4x4 [2 * 16 * 4 * 4], 16);
const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3][2][16];
Dav1dMasks dav1d_masks;
static void insert_border(uint8_t *const dst, const uint8_t *const src,
const int ctr)
@ -136,29 +106,33 @@ static void hflip(uint8_t *const dst, const uint8_t *const src) {
dst[y_off + 64 - 1 - x] = src[y_off + x];
}
static void invert(uint8_t *const dst, const uint8_t *const src,
const int w, const int h)
{
for (int y = 0, y_off = 0; y < h; y++, y_off += w)
for (int x = 0; x < w; x++)
dst[y_off + x] = 64 - src[y_off + x];
}
static void copy2d(uint8_t *dst, const uint8_t *src,
static void copy2d(uint8_t *dst, const uint8_t *src, int sign,
const int w, const int h, const int x_off, const int y_off)
{
src += y_off * 64 + x_off;
for (int y = 0; y < h; y++) {
memcpy(dst, src, w);
src += 64;
dst += w;
if (sign) {
for (int y = 0; y < h; y++) {
for (int x = 0; x < w; x++)
dst[x] = 64 - src[x];
src += 64;
dst += w;
}
} else {
for (int y = 0; y < h; y++) {
memcpy(dst, src, w);
src += 64;
dst += w;
}
}
}
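
With the old invert() pass folded into copy2d(), the flipped variant of a wedge mask is produced as 64 - m during the copy, so a mask and its complement always sum to 64. That is the invariant a 6-bit mask blend of the form below relies on; this is a minimal sketch of such a blend, not the dav1d kernel itself:

/* Sketch: weighting two predictions with a 0..64 mask; flipping the
 * mask (m -> 64 - m) is equivalent to swapping the two inputs. */
static inline int blend_px(const int a, const int b, const int m) {
    return (a * m + b * (64 - m) + 32) >> 6;
}
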
static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
const int sign, const int w, const int h,
const int ss_ver)
#define MASK_OFFSET(x) ((uint16_t)(((uintptr_t)(x) - (uintptr_t)&dav1d_masks) >> 3))
static COLD uint16_t init_chroma(uint8_t *chroma, const uint8_t *luma,
const int sign, const int w, const int h,
const int ss_ver)
{
const uint16_t offset = MASK_OFFSET(chroma);
for (int y = 0; y < h; y += 1 + ss_ver) {
for (int x = 0; x < w; x += 2) {
int sum = luma[x] + luma[x + 1] + 1;
@ -168,62 +142,69 @@ static COLD void init_chroma(uint8_t *chroma, const uint8_t *luma,
luma += w << ss_ver;
chroma += w >> 1;
}
return offset;
}
static COLD void fill2d_16x2(uint8_t *dst, const int w, const int h,
const enum BlockSize bs,
static COLD void fill2d_16x2(const int w, const int h, const enum BlockSize bs,
const uint8_t (*const master)[64 * 64],
const wedge_code_type *const cb,
uint8_t *masks_444, uint8_t *masks_422,
uint8_t *masks_420, const unsigned signs)
uint8_t *masks_420, unsigned signs)
{
uint8_t *ptr = dst;
for (int n = 0; n < 16; n++) {
copy2d(ptr, master[cb[n].direction], w, h,
32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
ptr += w * h;
}
for (int n = 0, off = 0; n < 16; n++, off += w * h)
invert(ptr + off, dst + off, w, h);
const int n_stride_444 = (w * h);
const int n_stride_422 = n_stride_444 >> 1;
const int n_stride_420 = n_stride_444 >> 2;
const int sign_stride_444 = 16 * n_stride_444;
const int sign_stride_422 = 16 * n_stride_422;
const int sign_stride_420 = 16 * n_stride_420;
// assign pointers in externally visible array
// assign pointer offsets in lookup table
for (int n = 0; n < 16; n++) {
const int sign = (signs >> n) & 1;
dav1d_wedge_masks[bs][0][0][n] = &masks_444[ sign * sign_stride_444];
const int sign = signs & 1;
copy2d(masks_444, master[cb[n].direction], sign, w, h,
32 - (w * cb[n].x_offset >> 3), 32 - (h * cb[n].y_offset >> 3));
// not using !sign is intentional here, since 444 does not require
// any rounding since no chroma subsampling is applied.
dav1d_wedge_masks[bs][0][1][n] = &masks_444[ sign * sign_stride_444];
dav1d_wedge_masks[bs][1][0][n] = &masks_422[ sign * sign_stride_422];
dav1d_wedge_masks[bs][1][1][n] = &masks_422[!sign * sign_stride_422];
dav1d_wedge_masks[bs][2][0][n] = &masks_420[ sign * sign_stride_420];
dav1d_wedge_masks[bs][2][1][n] = &masks_420[!sign * sign_stride_420];
dav1d_masks.offsets[0][bs].wedge[0][n] =
dav1d_masks.offsets[0][bs].wedge[1][n] = MASK_OFFSET(masks_444);
dav1d_masks.offsets[1][bs].wedge[0][n] =
init_chroma(&masks_422[ sign * sign_stride_422], masks_444, 0, w, h, 0);
dav1d_masks.offsets[1][bs].wedge[1][n] =
init_chroma(&masks_422[!sign * sign_stride_422], masks_444, 1, w, h, 0);
dav1d_masks.offsets[2][bs].wedge[0][n] =
init_chroma(&masks_420[ sign * sign_stride_420], masks_444, 0, w, h, 1);
dav1d_masks.offsets[2][bs].wedge[1][n] =
init_chroma(&masks_420[!sign * sign_stride_420], masks_444, 1, w, h, 1);
signs >>= 1;
masks_444 += n_stride_444;
masks_422 += n_stride_422;
masks_420 += n_stride_420;
// since the pointers come from inside, we know that
// violation of the const is OK here. Any other approach
// means we would have to duplicate the sign correction
// logic in two places, which isn't very nice, or mark
// the table faced externally as non-const, which also sucks
init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][0][n],
dav1d_wedge_masks[bs][0][0][n], 0, w, h, 0);
init_chroma((uint8_t *)dav1d_wedge_masks[bs][1][1][n],
dav1d_wedge_masks[bs][0][0][n], 1, w, h, 0);
init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][0][n],
dav1d_wedge_masks[bs][0][0][n], 0, w, h, 1);
init_chroma((uint8_t *)dav1d_wedge_masks[bs][2][1][n],
dav1d_wedge_masks[bs][0][0][n], 1, w, h, 1);
}
}
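
The per-size signs word passed into fill2d_16x2() carries one flip bit per wedge index and is now consumed LSB-first (signs & 1, then signs >>= 1) rather than indexed with (signs >> n) & 1; both forms visit the same bits. For the BS_32x32 word 0x7bfb, for example, wedge indices 2, 10 and 15 come out with sign 0 and the rest with sign 1.
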
COLD void dav1d_init_wedge_masks(void) {
static COLD void build_nondc_ii_masks(uint8_t *const mask_v, const int w,
const int h, const int step)
{
static const uint8_t ii_weights_1d[32] = {
60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
};
uint8_t *const mask_h = &mask_v[w * h];
uint8_t *const mask_sm = &mask_h[w * h];
for (int y = 0, off = 0; y < h; y++, off += w) {
memset(&mask_v[off], ii_weights_1d[y * step], w);
for (int x = 0; x < w; x++) {
mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
mask_h[off + x] = ii_weights_1d[x * step];
}
}
}
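
build_nondc_ii_masks() fills the vertical, horizontal and smooth inter-intra masks for one block size in a single pass over the shared ii_weights_1d table: row y of the vertical mask gets ii_weights_1d[y * step], column x of the horizontal mask gets ii_weights_1d[x * step], and the smooth mask uses the smaller of the two coordinates. In the 8x8 case (step = 4), for example, mask_v row 2 is filled with ii_weights_1d[8] = 19, mask_h column 5 uses ii_weights_1d[20] = 4, and mask_sm[2][5] = ii_weights_1d[imin(2, 5) * 4] = 19.
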
COLD void dav1d_init_ii_wedge_masks(void) {
// This function is guaranteed to be called only once
enum WedgeMasterLineType {
@ -257,9 +238,11 @@ COLD void dav1d_init_wedge_masks(void) {
hflip(master[WEDGE_OBLIQUE153], master[WEDGE_OBLIQUE27]);
#define fill(w, h, sz_422, sz_420, hvsw, signs) \
fill2d_16x2((uint8_t *) wedge_masks_444_##w##x##h, w, h, BS_##w##x##h, \
master, wedge_codebook_16_##hvsw, wedge_masks_444_##w##x##h, \
wedge_masks_422_##sz_422, wedge_masks_420_##sz_420, signs)
fill2d_16x2(w, h, BS_##w##x##h - BS_32x32, \
master, wedge_codebook_16_##hvsw, \
dav1d_masks.wedge_444_##w##x##h, \
dav1d_masks.wedge_422_##sz_422, \
dav1d_masks.wedge_420_##sz_420, signs)
fill(32, 32, 16x32, 16x16, heqw, 0x7bfb);
fill(32, 16, 16x16, 16x8, hltw, 0x7beb);
@ -271,72 +254,46 @@ COLD void dav1d_init_wedge_masks(void) {
fill( 8, 16, 4x16, 4x8, hgtw, 0x7beb);
fill( 8, 8, 4x8, 4x4, heqw, 0x7bfb);
#undef fill
}
#define N_II_PRED_MODES (N_INTER_INTRA_PRED_MODES - 1)
static uint8_t ALIGN(ii_dc_mask[32 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_32x32[N_II_PRED_MODES][32 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_16x32[N_II_PRED_MODES][16 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_16x16[N_II_PRED_MODES][16 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_8x32 [N_II_PRED_MODES][ 8 * 32], 64);
static uint8_t ALIGN(ii_nondc_mask_8x16 [N_II_PRED_MODES][ 8 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_8x8 [N_II_PRED_MODES][ 8 * 8], 64);
static uint8_t ALIGN(ii_nondc_mask_4x16 [N_II_PRED_MODES][ 4 * 16], 64);
static uint8_t ALIGN(ii_nondc_mask_4x8 [N_II_PRED_MODES][ 4 * 8], 32);
static uint8_t ALIGN(ii_nondc_mask_4x4 [N_II_PRED_MODES][ 4 * 4], 16);
#undef N_II_PRED_MODES
memset(dav1d_masks.ii_dc, 32, 32 * 32);
for (int c = 0; c < 3; c++) {
dav1d_masks.offsets[c][BS_32x32-BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_32x16-BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_16x32-BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_16x16-BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_16x8 -BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_8x16 -BS_32x32].ii[II_DC_PRED] =
dav1d_masks.offsets[c][BS_8x8 -BS_32x32].ii[II_DC_PRED] =
MASK_OFFSET(dav1d_masks.ii_dc);
}
#define set1(sz) \
[II_DC_PRED] = ii_dc_mask, \
[II_VERT_PRED] = ii_nondc_mask_##sz[II_VERT_PRED - 1], \
[II_HOR_PRED] = ii_nondc_mask_##sz[II_HOR_PRED - 1], \
[II_SMOOTH_PRED] = ii_nondc_mask_##sz[II_SMOOTH_PRED - 1]
#define set(sz_444, sz_422, sz_420) \
{ { set1(sz_444) }, { set1(sz_422) }, { set1(sz_420) } }
const uint8_t *dav1d_ii_masks[N_BS_SIZES][3][N_INTER_INTRA_PRED_MODES] = {
[BS_8x8] = set( 8x8, 4x8, 4x4),
[BS_8x16] = set( 8x16, 4x16, 4x8),
[BS_16x8] = set(16x16, 8x8, 8x8),
[BS_16x16] = set(16x16, 8x16, 8x8),
[BS_16x32] = set(16x32, 8x32, 8x16),
[BS_32x16] = set(32x32, 16x16, 16x16),
[BS_32x32] = set(32x32, 16x32, 16x16),
};
#undef set
#undef set1
#define BUILD_NONDC_II_MASKS(w, h, step) \
build_nondc_ii_masks(dav1d_masks.ii_nondc_##w##x##h, w, h, step)
static COLD void build_nondc_ii_masks(uint8_t *const mask_v,
uint8_t *const mask_h,
uint8_t *const mask_sm,
const int w, const int h, const int step)
{
static const uint8_t ii_weights_1d[] = {
60, 52, 45, 39, 34, 30, 26, 22, 19, 17, 15, 13, 11, 10, 8, 7,
6, 6, 5, 4, 4, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1,
};
#define ASSIGN_NONDC_II_OFFSET(bs, w444, h444, w422, h422, w420, h420) \
dav1d_masks.offsets[0][bs-BS_32x32].ii[p + 1] = \
MASK_OFFSET(&dav1d_masks.ii_nondc_##w444##x##h444[p*w444*h444]); \
dav1d_masks.offsets[1][bs-BS_32x32].ii[p + 1] = \
MASK_OFFSET(&dav1d_masks.ii_nondc_##w422##x##h422[p*w422*h422]); \
dav1d_masks.offsets[2][bs-BS_32x32].ii[p + 1] = \
MASK_OFFSET(&dav1d_masks.ii_nondc_##w420##x##h420[p*w420*h420])
for (int y = 0, off = 0; y < h; y++, off += w) {
memset(&mask_v[off], ii_weights_1d[y * step], w);
for (int x = 0; x < w; x++) {
mask_sm[off + x] = ii_weights_1d[imin(x, y) * step];
mask_h[off + x] = ii_weights_1d[x * step];
}
BUILD_NONDC_II_MASKS(32, 32, 1);
BUILD_NONDC_II_MASKS(16, 32, 1);
BUILD_NONDC_II_MASKS(16, 16, 2);
BUILD_NONDC_II_MASKS( 8, 32, 1);
BUILD_NONDC_II_MASKS( 8, 16, 2);
BUILD_NONDC_II_MASKS( 8, 8, 4);
BUILD_NONDC_II_MASKS( 4, 16, 2);
BUILD_NONDC_II_MASKS( 4, 8, 4);
BUILD_NONDC_II_MASKS( 4, 4, 8);
for (int p = 0; p < 3; p++) {
ASSIGN_NONDC_II_OFFSET(BS_32x32, 32, 32, 16, 32, 16, 16);
ASSIGN_NONDC_II_OFFSET(BS_32x16, 32, 32, 16, 16, 16, 16);
ASSIGN_NONDC_II_OFFSET(BS_16x32, 16, 32, 8, 32, 8, 16);
ASSIGN_NONDC_II_OFFSET(BS_16x16, 16, 16, 8, 16, 8, 8);
ASSIGN_NONDC_II_OFFSET(BS_16x8, 16, 16, 8, 8, 8, 8);
ASSIGN_NONDC_II_OFFSET(BS_8x16, 8, 16, 4, 16, 4, 8);
ASSIGN_NONDC_II_OFFSET(BS_8x8, 8, 8, 4, 8, 4, 4);
}
}
COLD void dav1d_init_interintra_masks(void) {
// This function is guaranteed to be called only once
memset(ii_dc_mask, 32, 32 * 32);
#define set(a) a[II_VERT_PRED - 1], a[II_HOR_PRED - 1], a[II_SMOOTH_PRED - 1]
build_nondc_ii_masks(set(ii_nondc_mask_32x32), 32, 32, 1);
build_nondc_ii_masks(set(ii_nondc_mask_16x32), 16, 32, 1);
build_nondc_ii_masks(set(ii_nondc_mask_16x16), 16, 16, 2);
build_nondc_ii_masks(set(ii_nondc_mask_8x32), 8, 32, 1);
build_nondc_ii_masks(set(ii_nondc_mask_8x16), 8, 16, 2);
build_nondc_ii_masks(set(ii_nondc_mask_8x8), 8, 8, 4);
build_nondc_ii_masks(set(ii_nondc_mask_4x16), 4, 16, 2);
build_nondc_ii_masks(set(ii_nondc_mask_4x8), 4, 8, 4);
build_nondc_ii_masks(set(ii_nondc_mask_4x4), 4, 4, 8);
#undef set
}

View file

@ -30,12 +30,67 @@
#include "src/levels.h"
void dav1d_init_wedge_masks(void);
EXTERN const uint8_t *dav1d_wedge_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
[2 /* sign */][16 /* wedge_idx */];
typedef struct {
/* Offsets, in units of 8 bytes, relative to the start of the struct. */
struct {
uint16_t wedge[2 /* sign */][16 /* wedge_idx */];
uint16_t ii[N_INTER_INTRA_PRED_MODES];
} offsets[3 /* 444, 422, 420 */][BS_8x8 - BS_32x32 + 1];
void dav1d_init_interintra_masks(void);
EXTERN const uint8_t *dav1d_ii_masks[N_BS_SIZES][3 /* 444/luma, 422, 420 */]
[N_INTER_INTRA_PRED_MODES];
uint8_t ALIGN(wedge_444_32x32[ 16 * 32 * 32], 64);
uint8_t ALIGN(wedge_444_32x16[ 16 * 32 * 16], 64);
uint8_t ALIGN(wedge_444_32x8 [ 16 * 32 * 8], 64);
uint8_t ALIGN(wedge_444_16x32[ 16 * 16 * 32], 64);
uint8_t ALIGN(wedge_444_16x16[ 16 * 16 * 16], 64);
uint8_t ALIGN(wedge_444_16x8 [ 16 * 16 * 8], 64);
uint8_t ALIGN(wedge_444_8x32 [ 16 * 8 * 32], 64);
uint8_t ALIGN(wedge_444_8x16 [ 16 * 8 * 16], 64);
uint8_t ALIGN(wedge_444_8x8 [ 16 * 8 * 8], 64);
uint8_t ALIGN(wedge_422_16x32[2 * 16 * 16 * 32], 64);
uint8_t ALIGN(wedge_422_16x16[2 * 16 * 16 * 16], 64);
uint8_t ALIGN(wedge_422_16x8 [2 * 16 * 16 * 8], 64);
uint8_t ALIGN(wedge_422_8x32 [2 * 16 * 8 * 32], 64);
uint8_t ALIGN(wedge_422_8x16 [2 * 16 * 8 * 16], 64);
uint8_t ALIGN(wedge_422_8x8 [2 * 16 * 8 * 8], 64);
uint8_t ALIGN(wedge_422_4x32 [2 * 16 * 4 * 32], 64);
uint8_t ALIGN(wedge_422_4x16 [2 * 16 * 4 * 16], 64);
uint8_t ALIGN(wedge_422_4x8 [2 * 16 * 4 * 8], 64);
uint8_t ALIGN(wedge_420_16x16[2 * 16 * 16 * 16], 64);
uint8_t ALIGN(wedge_420_16x8 [2 * 16 * 16 * 8], 64);
uint8_t ALIGN(wedge_420_16x4 [2 * 16 * 16 * 4], 64);
uint8_t ALIGN(wedge_420_8x16 [2 * 16 * 8 * 16], 64);
uint8_t ALIGN(wedge_420_8x8 [2 * 16 * 8 * 8], 64);
uint8_t ALIGN(wedge_420_8x4 [2 * 16 * 8 * 4], 64);
uint8_t ALIGN(wedge_420_4x16 [2 * 16 * 4 * 16], 64);
uint8_t ALIGN(wedge_420_4x8 [2 * 16 * 4 * 8], 64);
uint8_t ALIGN(wedge_420_4x4 [2 * 16 * 4 * 4], 64);
uint8_t ALIGN(ii_dc [ 32 * 32], 64);
uint8_t ALIGN(ii_nondc_32x32[3 * 32 * 32], 64);
uint8_t ALIGN(ii_nondc_16x32[3 * 16 * 32], 64);
uint8_t ALIGN(ii_nondc_16x16[3 * 16 * 16], 64);
uint8_t ALIGN(ii_nondc_8x32 [3 * 8 * 32], 64);
uint8_t ALIGN(ii_nondc_8x16 [3 * 8 * 16], 64);
uint8_t ALIGN(ii_nondc_8x8 [3 * 8 * 8], 64);
uint8_t ALIGN(ii_nondc_4x16 [3 * 4 * 16], 64);
uint8_t ALIGN(ii_nondc_4x8 [3 * 4 * 8], 32);
uint8_t ALIGN(ii_nondc_4x4 [3 * 4 * 4], 16);
} Dav1dMasks;
#define II_MASK(c, bs, b) \
((const uint8_t*)((uintptr_t)&dav1d_masks + \
(size_t)((b)->interintra_type == INTER_INTRA_BLEND ? \
dav1d_masks.offsets[c][(bs)-BS_32x32].ii[(b)->interintra_mode] : \
dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[0][(b)->wedge_idx]) * 8))
#define WEDGE_MASK(c, bs, sign, idx) \
((const uint8_t*)((uintptr_t)&dav1d_masks + \
(size_t)dav1d_masks.offsets[c][(bs)-BS_32x32].wedge[sign][idx] * 8))
EXTERN Dav1dMasks dav1d_masks;
void dav1d_init_ii_wedge_masks(void);
#endif /* DAV1D_SRC_WEDGE_H */
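
The mask tables are now addressed through 16-bit offsets, counted in 8-byte units from the start of the single dav1d_masks struct, instead of full pointers; II_MASK and WEDGE_MASK above simply undo the MASK_OFFSET scaling. A minimal sketch of the round trip, assuming every stored mask starts on an 8-byte boundary inside the struct (so the >> 3 is exact) and the struct stays under 512 KiB (so the scaled offset fits in 16 bits):

/* Sketch of what WEDGE_MASK/II_MASK expand to, not dav1d code. */
static const uint8_t *mask_from_offset(const uint16_t off) {
    return (const uint8_t *)&dav1d_masks + (size_t)off * 8;
}
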

View file

@ -4885,24 +4885,26 @@ cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
jg .w32_wpad
jmp .w32_hpad
cglobal pal_pred_16bpc, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m3, [palq]
cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
lea r2, [pal_pred_16bpc_avx2_table]
tzcnt wd, wm
vbroadcasti128 m4, [pal_pred_shuf]
vbroadcasti128 m5, [pal_pred_shuf]
movifnidn hd, hm
movsxd wq, [r2+wq*4]
pshufb m3, m4
punpckhqdq m4, m3, m3
pshufb m4, m5
punpckhqdq m5, m4, m4
add wq, r2
DEFINE_ARGS dst, stride, stride3, idx, w, h
lea stride3q, [strideq*3]
jmp wq
.w4:
mova xm2, [idxq]
add idxq, 16
pshufb xm1, xm3, xm2
pshufb xm2, xm4, xm2
movq xm0, [idxq]
add idxq, 8
psrlw xm1, xm0, 4
punpcklbw xm0, xm1
pshufb xm1, xm4, xm0
pshufb xm2, xm5, xm0
punpcklbw xm0, xm1, xm2
punpckhbw xm1, xm2
movq [dstq+strideq*0], xm0
@ -4914,10 +4916,12 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w4
RET
.w8:
movu m2, [idxq] ; only 16-byte alignment
add idxq, 32
pshufb m1, m3, m2
pshufb m2, m4, m2
pmovzxbw m2, [idxq]
add idxq, 16
psllw m1, m2, 4
por m2, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], xm0
@ -4929,19 +4933,22 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w8
RET
.w16:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
@ -4949,41 +4956,47 @@ DEFINE_ARGS dst, stride, stride3, idx, w, h
jg .w16
RET
.w32:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0+ 0], m0
mova [dstq+strideq*0+32], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*1+ 0], m0
mova [dstq+strideq*1+32], m1
mova [dstq+ 0], m0
mova [dstq+32], m1
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+32], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32
RET
.w64:
vpermq m2, [idxq+ 0], q3120
vpermq m5, [idxq+32], q3120
add idxq, 64
pshufb m1, m3, m2
pshufb m2, m4, m2
pshufd m3, [idxq], q3120
add idxq, 32
vpermq m3, m3, q3120
psrlw m1, m3, 4
punpcklbw m2, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m2
pshufb m2, m5, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+ 0], m0
mova [dstq+32], m1
pshufb m1, m3, m5
pshufb m2, m4, m5
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+64], m0
mova [dstq+96], m1
mova [dstq+32*0], m0
mova [dstq+32*1], m1
pshufb m1, m4, m3
pshufb m3, m5, m3
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+32*2], m0
mova [dstq+32*3], m1
add dstq, strideq
dec hd
jg .w64
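
The pal_pred changes in this and the following assembly files all stem from the palette index buffer now holding two 4-bit indices per byte, low nibble first (matching the lo + hi * 16 packing done by pal_idx_finish in the new pal.asm later in this commit); the nibbles are expanded back into byte lanes with psrlw/punpcklbw here, or vpmultishiftqb in the AVX-512 versions, before the usual pshufb/vpermw palette lookup. A scalar sketch of the packed lookup, with the exact layout treated as an assumption:

/* Sketch only: w is even, idx holds w/2 packed bytes per row, pal has
 * up to 8 entries, stride is in pixels. */
static void pal_pred_packed(pixel *dst, const ptrdiff_t stride,
                            const pixel *const pal, const uint8_t *idx,
                            const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x += 2) {
            const int b = idx[x >> 1];
            dst[x + 0] = pal[b & 0xf]; /* low nibble = left pixel   */
            dst[x + 1] = pal[b >> 4];  /* high nibble = right pixel */
        }
        idx += w >> 1;
        dst += stride;
    }
}
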

View file

@ -38,10 +38,10 @@ smooth_perm: db 1, 2, 5, 6, 9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
pal_pred_perm: db 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39
db 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47
db 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55
db 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63
pal_pred_perm: db 0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51
db 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55
db 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59
db 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63
filter_permA: times 4 db 6, 7, 8, 9, 14, 15, 4, 5
times 4 db 10, 11, 12, 13, 2, 3, -1, -1
filter_permB: times 4 db 22, 23, 24, 25, 30, 31, 6, 7
@ -57,6 +57,8 @@ filter_shift: times 2 dw 6
dd 0
times 2 dw 4
dd 9
pal_unpack: db 0, 8, 4, 12, 32, 40, 36, 44
db 16, 24, 20, 28, 48, 56, 52, 60
%macro JMP_TABLE 3-*
%xdefine %1_%2_table (%%table - 2*4)
@ -610,20 +612,23 @@ cglobal ipred_smooth_16bpc, 3, 7, 16, dst, stride, tl, w, h, v_weights, stride3
jg .w64_loop
RET
cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
cglobal pal_pred_16bpc, 4, 7, 7, dst, stride, pal, idx, w, h, stride3
lea r6, [pal_pred_16bpc_avx512icl_table]
tzcnt wd, wm
mova m2, [pal_pred_perm]
movsxd wq, [r6+wq*4]
mova xm3, [palq]
mova m3, [pal_pred_perm]
movifnidn hd, hm
movsxd wq, [r6+wq*4]
vpbroadcastq m4, [pal_unpack+0]
vpbroadcastq m5, [pal_unpack+8]
add wq, r6
vbroadcasti32x4 m6, [palq]
lea stride3q, [strideq*3]
jmp wq
.w4:
pmovzxbw ym0, [idxq]
add idxq, 16
vpermw ym0, ym0, ym3
pmovzxbd ym0, [idxq]
add idxq, 8
vpmultishiftqb ym0, ym4, ym0
vpermw ym0, ym0, ym6
vextracti32x4 xm1, ym0, 1
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
@ -634,9 +639,10 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w4
RET
.w8:
pmovzxbw m0, [idxq]
add idxq, 32
vpermw m0, m0, m3
pmovzxbd m0, [idxq]
add idxq, 16
vpmultishiftqb m0, m4, m0
vpermw m0, m0, m6
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
@ -646,11 +652,13 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w8
RET
.w16:
vpermb m1, m2, [idxq]
add idxq, 64
vpermw m0, m1, m3
movu ym1, [idxq]
add idxq, 32
vpermb m1, m3, m1
vpmultishiftqb m1, m4, m1
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
vpermw m1, m1, m6
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
@ -660,27 +668,41 @@ cglobal pal_pred_16bpc, 4, 7, 4, dst, stride, pal, idx, w, h, stride3
jg .w16
RET
.w32:
vpermb m1, m2, [idxq]
vpermb m2, m3, [idxq]
add idxq, 64
vpermw m0, m1, m3
vpmultishiftqb m1, m4, m2
vpmultishiftqb m2, m5, m2
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
vpermw m1, m1, m6
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
vpermw m0, m2, m6
psrlw m2, 8
vpermw m1, m2, m6
mova [dstq+strideq*2], m0
mova [dstq+stride3q ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
RET
.w64:
vpermb m1, m2, [idxq]
vpermb m2, m3, [idxq]
add idxq, 64
vpermw m0, m1, m3
vpmultishiftqb m1, m4, m2
vpmultishiftqb m2, m5, m2
vpermw m0, m1, m6
psrlw m1, 8
vpermw m1, m1, m3
mova [dstq+64*0], m0
mova [dstq+64*1], m1
add dstq, strideq
dec hd
vpermw m1, m1, m6
mova [dstq+ 0], m0
mova [dstq+64], m1
vpermw m0, m2, m6
psrlw m2, 8
vpermw m1, m2, m6
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+64], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w64
RET

View file

@ -3964,25 +3964,27 @@ cglobal ipred_cfl_ac_444_16bpc, 3, 7, 6, ac, ypx, stride, wpad, hpad, w, h
jg .w32_hpad_loop
jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_ssse3).dc
cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
cglobal pal_pred_16bpc, 4, 5, 6, dst, stride, pal, idx, w, h
%define base r2-pal_pred_16bpc_ssse3_table
%if ARCH_X86_32
%define hd r2d
%endif
mova m3, [palq]
mova m4, [palq]
LEA r2, pal_pred_16bpc_ssse3_table
tzcnt wd, wm
pshufb m3, [base+pal_pred_shuf]
pshufb m4, [base+pal_pred_shuf]
movsxd wq, [r2+wq*4]
pshufd m4, m3, q1032
pshufd m5, m4, q1032
add wq, r2
movifnidn hd, hm
jmp wq
.w4:
mova m0, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
movq m0, [idxq]
add idxq, 8
psrlw m1, m0, 4
punpcklbw m0, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
movq [dstq+strideq*0], m0
@ -3995,77 +3997,102 @@ cglobal pal_pred_16bpc, 4, 5, 5, dst, stride, pal, idx, w, h
jg .w4
RET
.w8:
mova m0, [idxq]
movu m3, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 4
jg .w8
RET
.w16:
mova m0, [idxq]
movu m3, [idxq]
add idxq, 16
pshufb m1, m3, m0
pshufb m2, m4, m0
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
mova [dstq+ 0], m0
mova [dstq+16], m1
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq+ 0], m0
mova [dstq+strideq+16], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16
RET
.w32:
mova m0, [idxq+16*0]
pshufb m1, m3, m0
pshufb m2, m4, m0
movu m3, [idxq]
add idxq, 16
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova m2, [idxq+16*1]
add idxq, 16*2
mova [dstq+16*0], m0
pshufb m0, m3, m2
mova [dstq+16*1], m1
pshufb m1, m4, m2
punpcklbw m2, m0, m1
punpckhbw m0, m1
mova [dstq+16*2], m2
mova [dstq+16*3], m0
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*2], m0
mova [dstq+16*3], m1
add dstq, strideq
dec hd
jg .w32
RET
.w64:
mova m0, [idxq+16*0]
pshufb m1, m3, m0
pshufb m2, m4, m0
movu m3, [idxq+16*0]
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova m2, [idxq+16*1]
mova [dstq+16*0], m0
pshufb m0, m3, m2
mova [dstq+16*1], m1
pshufb m1, m4, m2
punpcklbw m2, m0, m1
punpckhbw m0, m1
mova m1, [idxq+16*2]
mova [dstq+16*2], m2
pshufb m2, m3, m1
mova [dstq+16*3], m0
pshufb m0, m4, m1
punpcklbw m1, m2, m0
punpckhbw m2, m0
mova m0, [idxq+16*3]
add idxq, 16*4
mova [dstq+16*4], m1
pshufb m1, m3, m0
mova [dstq+16*5], m2
pshufb m2, m4, m0
pshufb m1, m4, m3
pshufb m2, m5, m3
movu m3, [idxq+16*1]
add idxq, 32
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*2], m0
mova [dstq+16*3], m1
psrlw m1, m3, 4
punpcklbw m0, m3, m1
punpckhbw m3, m1
pshufb m1, m4, m0
pshufb m2, m5, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*4], m0
mova [dstq+16*5], m1
pshufb m1, m4, m3
pshufb m2, m5, m3
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*6], m0

View file

@ -5307,18 +5307,20 @@ cglobal ipred_cfl_ac_444_8bpc, 4, 9, 6, ac, y, stride, wpad, hpad, w, h, sz, ac_
RET
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
vbroadcasti128 m4, [palq]
vpbroadcastq m4, [palq]
lea r2, [pal_pred_avx2_table]
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r2+wq*4]
packuswb m4, m4
add wq, r2
lea r2, [strideq*3]
jmp wq
.w4:
pshufb xm0, xm4, [idxq]
add idxq, 16
movq xm0, [idxq]
add idxq, 8
psrlw xm1, xm0, 4
punpcklbw xm0, xm1
pshufb xm0, xm4, xm0
movd [dstq+strideq*0], xm0
pextrd [dstq+strideq*1], xm0, 1
pextrd [dstq+strideq*2], xm0, 2
@ -5327,11 +5329,14 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
sub hd, 4
jg .w4
RET
ALIGN function_align
.w8:
pshufb xm0, xm4, [idxq+16*0]
pshufb xm1, xm4, [idxq+16*1]
add idxq, 16*2
movu xm2, [idxq]
add idxq, 16
pshufb xm1, xm4, xm2
psrlw xm2, 4
pshufb xm2, xm4, xm2
punpcklbw xm0, xm1, xm2
punpckhbw xm1, xm2
movq [dstq+strideq*0], xm0
movhps [dstq+strideq*1], xm0
movq [dstq+strideq*2], xm1
@ -5340,47 +5345,48 @@ ALIGN function_align
sub hd, 4
jg .w8
RET
ALIGN function_align
.w16:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
add idxq, 32*2
movu m2, [idxq]
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], xm0
vextracti128 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], xm1
mova [dstq+strideq*1], xm1
vextracti128 [dstq+strideq*2], m0, 1
vextracti128 [dstq+r2 ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
ALIGN function_align
.w32:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
pshufb m2, m4, [idxq+32*2]
pshufb m3, m4, [idxq+32*3]
add idxq, 32*4
vpermq m2, [idxq], q3120
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+r2 ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
RET
ALIGN function_align
.w64:
pshufb m0, m4, [idxq+32*0]
pshufb m1, m4, [idxq+32*1]
pshufb m2, m4, [idxq+32*2]
pshufb m3, m4, [idxq+32*3]
add idxq, 32*4
mova [dstq+strideq*0+32*0], m0
mova [dstq+strideq*0+32*1], m1
mova [dstq+strideq*1+32*0], m2
mova [dstq+strideq*1+32*1], m3
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w32
RET
.w64:
vpermq m2, [idxq], q3120
add idxq, 32
pshufb m1, m4, m2
psrlw m2, 4
pshufb m2, m4, m2
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+32*0], m0
mova [dstq+32*1], m1
add dstq, strideq
dec hd
jg .w64
RET

View file

@ -95,6 +95,8 @@ smooth_endB: db 1, 3, 5, 7, 9, 11, 13, 15, 65, 67, 69, 71, 73, 75, 77, 79
db 49, 51, 53, 55, 57, 59, 61, 63,113,115,117,119,121,123,125,127
ipred_h_shuf: db 7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4
db 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0
pal_unpack: db 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60
pal_perm: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
pb_127_m127: times 2 db 127, -127
pb_128: times 4 db 128
@ -126,7 +128,6 @@ JMP_TABLE ipred_smooth_h_8bpc, avx512icl, w4, w8, w16, w32, w64
JMP_TABLE ipred_dc_8bpc, avx512icl, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_8bpc, avx512icl, h4, h8, h16, h32, h64
JMP_TABLE pal_pred_8bpc, avx512icl, w4, w8, w16, w32, w64
SECTION .text
@ -1111,19 +1112,20 @@ cglobal ipred_smooth_8bpc, 4, 7, 16, dst, stride, tl, w, h, v_weights, stride3
jg .w64_loop
RET
cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
lea r6, [pal_pred_8bpc_avx512icl_table]
tzcnt wd, wm
vbroadcasti32x4 m4, [palq]
cglobal pal_pred_8bpc, 4, 7, 6, dst, stride, pal, idx, w, h, stride3
movifnidn wd, wm
movifnidn hd, hm
movsxd wq, [r6+wq*4]
packuswb m4, m4
add wq, r6
lea stride3q, [strideq*3]
jmp wq
cmp wd, 8
jg .w32
movq xmm3, [palq]
je .w8
.w4:
pshufb xmm0, xm4, [idxq]
add idxq, 16
movq xmm0, [idxq]
add idxq, 8
psrlw xmm1, xmm0, 4
punpcklbw xmm0, xmm1
pshufb xmm0, xmm3, xmm0
movd [dstq+strideq*0], xmm0
pextrd [dstq+strideq*1], xmm0, 1
pextrd [dstq+strideq*2], xmm0, 2
@ -1133,9 +1135,13 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w4
RET
.w8:
pshufb xmm0, xm4, [idxq+16*0]
pshufb xmm1, xm4, [idxq+16*1]
add idxq, 16*2
movu xmm2, [idxq]
add idxq, 16
pshufb xmm1, xmm3, xmm2
psrlw xmm2, 4
pshufb xmm2, xmm3, xmm2
punpcklbw xmm0, xmm1, xmm2
punpckhbw xmm1, xmm2
movq [dstq+strideq*0], xmm0
movhps [dstq+strideq*1], xmm0
movq [dstq+strideq*2], xmm1
@ -1145,8 +1151,10 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w8
RET
.w16:
pshufb m0, m4, [idxq]
add idxq, 64
pmovzxdq m0, [idxq]
add idxq, 32
vpmultishiftqb m0, m3, m0
pshufb m0, m5, m0
mova [dstq+strideq*0], xm0
vextracti32x4 [dstq+strideq*1], ym0, 1
vextracti32x4 [dstq+strideq*2], m0, 2
@ -1156,29 +1164,39 @@ cglobal pal_pred_8bpc, 4, 7, 5, dst, stride, pal, idx, w, h, stride3
jg .w16
RET
.w32:
pshufb m0, m4, [idxq+64*0]
pshufb m1, m4, [idxq+64*1]
add idxq, 64*2
vpbroadcastq m3, [pal_unpack+0]
vpbroadcastq m5, [palq]
cmp wd, 32
jl .w16
pmovzxbd m2, [pal_perm]
vpbroadcastq m4, [pal_unpack+8]
jg .w64
.w32_loop:
vpermd m1, m2, [idxq]
add idxq, 64
vpmultishiftqb m0, m3, m1
vpmultishiftqb m1, m4, m1
pshufb m0, m5, m0
pshufb m1, m5, m1
mova [dstq+strideq*0], ym0
vextracti32x8 [dstq+strideq*1], m0, 1
mova [dstq+strideq*2], ym1
vextracti32x8 [dstq+stride3q ], m1, 1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w32
jg .w32_loop
RET
.w64:
pshufb m0, m4, [idxq+64*0]
pshufb m1, m4, [idxq+64*1]
pshufb m2, m4, [idxq+64*2]
pshufb m3, m4, [idxq+64*3]
add idxq, 64*4
vpermd m1, m2, [idxq]
add idxq, 64
vpmultishiftqb m0, m3, m1
vpmultishiftqb m1, m4, m1
pshufb m0, m5, m0
pshufb m1, m5, m1
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
mova [dstq+strideq*2], m2
mova [dstq+stride3q ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w64
RET

View file

@ -3479,26 +3479,28 @@ cglobal ipred_z3_8bpc, 4, 7, 8, -16*10, dst, stride, tl, w, h, angle, dy
jg .end_transpose_loop
RET
;---------------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, const ptrdiff_t stride, const uint16_t *const pal,
; const uint8_t *idx, const int w, const int h);
;---------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------
;int dav1d_pal_pred_ssse3(pixel *dst, ptrdiff_t stride, const pixel *pal,
; const uint8_t *idx, int w, int h);
;-------------------------------------------------------------------------------
cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
mova m4, [palq]
movq m4, [palq]
LEA r2, pal_pred_ssse3_table
tzcnt wd, wm
movifnidn hd, hm
movsxd wq, [r2+wq*4]
packuswb m4, m4
add wq, r2
lea r2, [strideq*3]
jmp wq
.w4:
pshufb m0, m4, [idxq]
add idxq, 16
movd [dstq ], m0
movq m1, [idxq]
add idxq, 8
psrlw m0, m1, 4
punpcklbw m1, m0
pshufb m0, m4, m1
movd [dstq+strideq*0], m0
pshuflw m1, m0, q1032
movd [dstq+strideq ], m1
movd [dstq+strideq*1], m1
punpckhqdq m0, m0
movd [dstq+strideq*2], m0
psrlq m0, 32
@ -3507,60 +3509,68 @@ cglobal pal_pred_8bpc, 4, 6, 5, dst, stride, pal, idx, w, h
sub hd, 4
jg .w4
RET
ALIGN function_align
.w8:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
add idxq, 32
movq [dstq ], m0
movhps [dstq+strideq ], m0
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
movq [dstq+strideq*0], m0
movhps [dstq+strideq*1], m0
movq [dstq+strideq*2], m1
movhps [dstq+r2 ], m1
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w8
RET
ALIGN function_align
.w16:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+strideq ], m1
mova [dstq+strideq*2], m2
mova [dstq+r2 ], m3
lea dstq, [dstq+strideq*4]
sub hd, 4
jg .w16
RET
ALIGN function_align
.w32:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+16 ], m1
mova [dstq+strideq ], m2
mova [dstq+strideq+16], m3
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+strideq*0], m0
mova [dstq+strideq*1], m1
lea dstq, [dstq+strideq*2]
sub hd, 2
jg .w16
RET
.w32:
movu m0, [idxq]
add idxq, 16
pshufb m1, m4, m0
psrlw m0, 4
pshufb m2, m4, m0
punpcklbw m0, m1, m2
punpckhbw m1, m2
mova [dstq+16*0], m0
mova [dstq+16*1], m1
add dstq, strideq
dec hd
jg .w32
RET
ALIGN function_align
.w64:
pshufb m0, m4, [idxq]
pshufb m1, m4, [idxq+16]
pshufb m2, m4, [idxq+32]
pshufb m3, m4, [idxq+48]
add idxq, 64
mova [dstq ], m0
mova [dstq+16], m1
mova [dstq+32], m2
mova [dstq+48], m3
movu m0, [idxq+16*0]
movu m2, [idxq+16*1]
add idxq, 32
pshufb m1, m4, m0
psrlw m0, 4
pshufb m3, m4, m0
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+16*0], m0
mova [dstq+16*1], m1
pshufb m1, m4, m2
psrlw m2, 4
pshufb m3, m4, m2
punpcklbw m0, m1, m3
punpckhbw m1, m3
mova [dstq+16*2], m0
mova [dstq+16*3], m1
add dstq, strideq
sub hd, 1
jg .w64

641 third_party/dav1d/src/x86/pal.asm vendored Normal file
View file

@ -0,0 +1,641 @@
; Copyright © 2023, VideoLAN and dav1d authors
; Copyright © 2023, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
; list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
; this list of conditions and the following disclaimer in the documentation
; and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
%include "config.asm"
%include "ext/x86/x86inc.asm"
SECTION_RODATA 64
pb_0to63: db 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%if ARCH_X86_64
db 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
db 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47
db 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63
%endif
pal_idx_w8_padh: db 0, 1, 2, 3, 3, 3, 3, 3, 8, 9, 10, 11, 11, 11, 11, 11
pb_1_16: times 4 db 1, 16
%if ARCH_X86_64
pb_32: times 4 db 32
%endif
%macro JMP_TABLE 2-*
%xdefine %1_table (%%table - 2*4)
%xdefine %%base mangle(private_prefix %+ _%1)
%%table:
%rep %0 - 1
dd %%base %+ .w%2 - (%%table - 2*4)
%rotate 1
%endrep
%endmacro
JMP_TABLE pal_idx_finish_ssse3, 4, 8, 16, 32, 64
%if ARCH_X86_64
JMP_TABLE pal_idx_finish_avx2, 4, 8, 16, 32, 64
JMP_TABLE pal_idx_finish_avx512icl, 4, 8, 16, 32, 64
%endif
SECTION .text
INIT_XMM ssse3
cglobal pal_idx_finish, 2, 7, 6, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_ssse3_table
LEA r6, pal_idx_finish_ssse3_table
tzcnt bwd, bwm
movifnidn bhd, bhm
movifnidn wd, wm
movifnidn hd, hm
movsxd bwq, [r6+bwq*4]
movddup m3, [base+pb_1_16]
add bwq, r6
sub bhd, hd
jmp bwq
.w4:
mova m0, [srcq]
add srcq, 16
pmaddubsw m0, m3
packuswb m0, m0
movq [dstq], m0
add dstq, 8
sub hd, 4
jg .w4
test bhd, bhd
jz .w4_end
pshuflw m0, m0, q3333
.w4_padv:
movq [dstq], m0
add dstq, 8
sub bhd, 4
jg .w4_padv
.w4_end:
RET
.w8_padh:
pshufb m0, m2
pshufb m1, m2
jmp .w8_main
.w8:
mova m2, [base+pal_idx_w8_padh]
.w8_loop:
mova m0, [srcq+16*0]
mova m1, [srcq+16*1]
cmp wd, 8
jl .w8_padh
.w8_main:
pmaddubsw m0, m3
pmaddubsw m1, m3
add srcq, 16*2
packuswb m0, m1
movu [dstq], m0
add dstq, 16
sub hd, 4
jg .w8_loop
test bhd, bhd
jz .w8_end
pshufd m0, m0, q3333
.w8_padv:
movu [dstq], m0
add dstq, 16
sub bhd, 4
jg .w8_padv
.w8_end:
RET
.w16_padh:
pshufb m0, m4
pshufb m1, m4
jmp .w16_main
.w16:
cmp wd, 16
je .w16_loop
call .setup_padh
.w16_loop:
mova m0, [srcq+16*0]
mova m1, [srcq+16*1]
cmp wd, 16
jl .w16_padh
.w16_main:
pmaddubsw m0, m3
pmaddubsw m1, m3
add srcq, 16*2
packuswb m0, m1
movu [dstq], m0
add dstq, 16
sub hd, 2
jg .w16_loop
test bhd, bhd
jz .w16_end
punpckhqdq m0, m0
.w16_padv:
movu [dstq+16*0], m0
movu [dstq+16*1], m0
add dstq, 16*2
sub bhd, 4
jg .w16_padv
.w16_end:
RET
.w32_padh:
cmp wd, 16
jg .w32_padh2
pshufb m1, m0, m5
pshufb m0, m4
jmp .w32_main
.w32_padh2:
pshufb m1, m4
jmp .w32_main
.w32:
cmp wd, 32
je .w32_loop
call .setup_padh
.w32_loop:
mova m0, [srcq+16*0]
mova m1, [srcq+16*1]
cmp wd, 32
jl .w32_padh
.w32_main:
pmaddubsw m0, m3
pmaddubsw m1, m3
add srcq, 16*2
packuswb m0, m1
movu [dstq], m0
add dstq, 16
dec hd
jg .w32_loop
test bhd, bhd
jz .w32_end
.w32_padv:
movu [dstq+16*0], m0
movu [dstq+16*1], m0
movu [dstq+16*2], m0
movu [dstq+16*3], m0
add dstq, 16*4
sub bhd, 4
jg .w32_padv
.w32_end:
RET
.w64_padh:
cmp wd, 16
jg .w64_padh2
pshufb m1, m0, m5
pshufb m0, m4
pmaddubsw m0, m3
pmaddubsw m1, m3
packuswb m0, m1
packuswb m1, m1
jmp .w64_main
.w64_padh2:
pshufb m1, m4
pmaddubsw m0, m3
pmaddubsw m2, m1, m3
pshufb m1, m5
pmaddubsw m1, m3
packuswb m0, m2
packuswb m1, m1
jmp .w64_main
.w64_padh3:
cmp wd, 48
jg .w64_padh4
pshufb m2, m1, m5
pshufb m1, m4
jmp .w64_main2
.w64_padh4:
pshufb m2, m4
jmp .w64_main2
.w64:
cmp wd, 64
je .w64_loop
call .setup_padh
.w64_loop:
mova m0, [srcq+16*0]
mova m1, [srcq+16*1]
cmp wd, 32
jle .w64_padh
pmaddubsw m0, m3
pmaddubsw m1, m3
packuswb m0, m1
mova m1, [srcq+16*2]
mova m2, [srcq+16*3]
cmp wd, 64
jl .w64_padh3
.w64_main2:
pmaddubsw m1, m3
pmaddubsw m2, m3
packuswb m1, m2
.w64_main:
add srcq, 16*4
movu [dstq+16*0], m0
movu [dstq+16*1], m1
add dstq, 16*2
dec hd
jg .w64_loop
test bhd, bhd
jz .w64_end
.w64_padv:
movu [dstq+16*0], m0
movu [dstq+16*1], m1
movu [dstq+16*2], m0
movu [dstq+16*3], m1
add dstq, 16*4
sub bhd, 2
jg .w64_padv
.w64_end:
RET
.setup_padh:
mova m4, [base+pb_0to63]
lea r6d, [wq-1]
and r6d, 15
movd m5, r6d
pxor m0, m0
pshufb m5, m0
pminub m4, m5
ret
%if ARCH_X86_64
INIT_YMM avx2
cglobal pal_idx_finish, 4, 7, 5, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_avx2_table
lea r6, [pal_idx_finish_avx2_table]
tzcnt bwd, bwd
movifnidn wd, wm
movifnidn hd, hm
movsxd bwq, [r6+bwq*4]
vpbroadcastd m2, [base+pb_1_16]
dec wd
add bwq, r6
sub bhd, hd
jmp bwq
.w4:
mova xm0, [srcq]
add srcq, 16
pmaddubsw xm0, xm2
packuswb xm0, xm0
movq [dstq], xm0
add dstq, 8
sub hd, 4
jg .w4
test bhd, bhd
jz .w4_end
pshuflw xm0, xm0, q3333
.w4_padv:
movq [dstq], xm0
add dstq, 8
sub bhd, 4
jg .w4_padv
.w4_end:
RET
.w8_padh:
pshufb xm0, xm3
pshufb xm1, xm3
jmp .w8_main
.w8:
mova xm3, [base+pal_idx_w8_padh]
.w8_loop:
mova xm0, [srcq+16*0]
mova xm1, [srcq+16*1]
cmp wd, 7
jl .w8_padh
.w8_main:
pmaddubsw xm0, xm2
pmaddubsw xm1, xm2
add srcq, 16*2
packuswb xm0, xm1
movu [dstq], xm0
add dstq, 16
sub hd, 4
jg .w8_loop
test bhd, bhd
jz .w8_end
pshufd xm0, xm0, q3333
.w8_padv:
movu [dstq], xm0
add dstq, 16
sub bhd, 4
jg .w8_padv
.w8_end:
RET
.w16_padh:
pshufb m0, m3
pshufb m1, m3
jmp .w16_main
.w16:
cmp wd, 15
je .w16_loop
vbroadcasti128 m0, [base+pb_0to63]
movd xm3, wd
vpbroadcastb m3, xm3
pminub m3, m0
.w16_loop:
mova m0, [srcq+32*0]
mova m1, [srcq+32*1]
cmp wd, 15
jl .w16_padh
.w16_main:
pmaddubsw m0, m2
pmaddubsw m1, m2
add srcq, 32*2
packuswb m0, m1
vpermq m1, m0, q3120
movu [dstq], m1
add dstq, 32
sub hd, 4
jg .w16_loop
test bhd, bhd
jz .w16_end
vpermq m0, m0, q3333
.w16_padv:
movu [dstq], m0
add dstq, 32
sub bhd, 4
jg .w16_padv
.w16_end:
RET
.w32_padh:
cmp wd, 15
jg .w32_padh2
vinserti128 m0, xm0, 1
vinserti128 m1, xm1, 1
.w32_padh2:
pshufb m0, m3
pshufb m1, m3
jmp .w32_main
.w32:
cmp wd, 31
je .w32_loop
movd xm3, wd
vpbroadcastb m3, xm3
pminub m3, [base+pb_0to63]
.w32_loop:
mova m0, [srcq+32*0]
mova m1, [srcq+32*1]
cmp wd, 31
jl .w32_padh
.w32_main:
pmaddubsw m0, m2
pmaddubsw m1, m2
add srcq, 32*2
packuswb m0, m1
vpermq m1, m0, q3120
movu [dstq], m1
add dstq, 32
sub hd, 2
jg .w32_loop
test bhd, bhd
jz .w32_end
vpermq m0, m0, q3131
.w32_padv:
movu [dstq+32*0], m0
movu [dstq+32*1], m0
add dstq, 32*2
sub bhd, 4
jg .w32_padv
.w32_end:
RET
.w64_padh:
cmp wd, 15
jg .w64_padh2
vinserti128 m1, m0, xm0, 1
pshufb m0, m1, m3
pshufb m1, m4
jmp .w64_main
.w64_padh2:
cmp wd, 31
jg .w64_padh3
vperm2i128 m1, m0, m0, 0x11
pshufb m0, m3
pshufb m1, m4
jmp .w64_main
.w64_padh3:
cmp wd, 47
jg .w64_padh4
vinserti128 m1, xm1, 1
.w64_padh4:
pshufb m1, m3
jmp .w64_main
.w64:
cmp wd, 63
je .w64_loop
mov r6d, wd
and r6d, 31
movd xm4, r6d
vpbroadcastb m4, xm4
pminub m3, m4, [pb_0to63]
.w64_loop:
mova m0, [srcq+32*0]
mova m1, [srcq+32*1]
cmp wd, 63
jl .w64_padh
.w64_main:
pmaddubsw m0, m2
pmaddubsw m1, m2
add srcq, 32*2
packuswb m0, m1
vpermq m0, m0, q3120
movu [dstq], m0
add dstq, 32
dec hd
jg .w64_loop
test bhd, bhd
jz .w64_end
.w64_padv:
movu [dstq+32*0], m0
movu [dstq+32*1], m0
movu [dstq+32*2], m0
movu [dstq+32*3], m0
add dstq, 32*4
sub bhd, 4
jg .w64_padv
.w64_end:
RET
INIT_ZMM avx512icl
cglobal pal_idx_finish, 4, 7, 7, dst, src, bw, bh, w, h
%define base r6-pal_idx_finish_avx512icl_table
lea r6, [pal_idx_finish_avx512icl_table]
tzcnt bwd, bwd
movifnidn wd, wm
movifnidn hd, hm
movsxd bwq, [r6+bwq*4]
vpbroadcastd m4, [base+pb_1_16]
dec wd
add bwq, r6
sub bhd, hd
jmp bwq
.w4:
mova xmm0, [srcq]
add srcq, 16
pmaddubsw xmm0, xm4
packuswb xmm0, xmm0
movq [dstq], xmm0
add dstq, 8
sub hd, 4
jg .w4
test bhd, bhd
jz .w4_end
pshuflw xmm0, xmm0, q3333
.w4_padv:
movq [dstq], xmm0
add dstq, 8
sub bhd, 4
jg .w4_padv
.w4_end:
RET
.w8_padh:
pshufb xmm0, xmm2
pshufb xmm1, xmm2
jmp .w8_main
.w8:
mova xmm2, [base+pal_idx_w8_padh]
.w8_loop:
mova xmm0, [srcq+16*0]
mova xmm1, [srcq+16*1]
cmp wd, 7
jl .w8_padh
.w8_main:
pmaddubsw xmm0, xm4
pmaddubsw xmm1, xm4
add srcq, 16*2
packuswb xmm0, xmm1
movu [dstq], xmm0
add dstq, 16
sub hd, 4
jg .w8_loop
test bhd, bhd
jz .w8_end
pshufd xmm0, xmm0, q3333
.w8_padv:
movu [dstq], xmm0
add dstq, 16
sub bhd, 4
jg .w8_padv
.w8_end:
RET
.w16_padh:
pshufb m0, m2
jmp .w16_main
.w16:
cmp wd, 15
je .w16_loop
vbroadcasti32x4 m2, [base+pb_0to63]
vpbroadcastb m0, wd
pminub m2, m0
.w16_loop:
mova m0, [srcq]
cmp wd, 15
jl .w16_padh
.w16_main:
pmaddubsw m0, m4
add srcq, 64
vpmovwb ym0, m0
movu [dstq], ym0
add dstq, 32
sub hd, 4
jg .w16_loop
test bhd, bhd
jz .w16_end
vpermq ym0, ym0, q3333
.w16_padv:
movu [dstq], ym0
add dstq, 32
sub bhd, 4
jg .w16_padv
.w16_end:
RET
.w32_padh:
vpermb m0, m2, m0
vpermb m1, m2, m1
jmp .w32_main
.w32:
mova m2, [base+pb_0to63]
paddb m3, m2, m2
cmp wd, 31
je .w32_loop
vpbroadcastb m0, wd
mov r6d, 0xff00
kmovw k1, r6d
vpaddd m0{k1}, [pb_32] {1to16}
pminub m2, m0
.w32_loop:
mova m0, [srcq+64*0]
mova m1, [srcq+64*1]
cmp wd, 31
jl .w32_padh
.w32_main:
pmaddubsw m0, m4
pmaddubsw m1, m4
add srcq, 64*2
vpermt2b m0, m3, m1
movu [dstq], m0
add dstq, 64
sub hd, 4
jg .w32_loop
test bhd, bhd
jz .w32_end
vshufi32x4 m0, m0, q3333
.w32_padv:
movu [dstq], m0
add dstq, 64
sub bhd, 4
jg .w32_padv
.w32_end:
RET
.w64_padh:
REPX {vpermb x, m5, x}, m0, m1, m2, m3
jmp .w64_main
.w64:
mova m5, [base+pb_0to63]
paddb m6, m5, m5
cmp wd, 63
je .w64_loop
vpbroadcastb m0, wd
pminub m5, m0
.w64_loop:
mova m0, [srcq+64*0]
mova m1, [srcq+64*1]
mova m2, [srcq+64*2]
mova m3, [srcq+64*3]
cmp wd, 63
jl .w64_padh
.w64_main:
REPX {pmaddubsw x, m4}, m0, m1, m2, m3
add srcq, 64*4
vpermt2b m0, m6, m1
vpermt2b m2, m6, m3
movu [dstq+64*0], m0
movu [dstq+64*1], m2
add dstq, 64*2
sub hd, 4
jg .w64_loop
test bhd, bhd
jz .w64_end
vshufi32x4 m2, m2, q3232
.w64_padv:
movu [dstq+64*0], m2
movu [dstq+64*1], m2
add dstq, 64*2
sub bhd, 4
jg .w64_padv
.w64_end:
RET
%endif ; ARCH_X86_64
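
pal_idx_finish packs the per-pixel palette indices two to a byte and pads the result out to the full bw x bh block: the pmaddubsw against the {1, 16} pattern computes src[2i] + src[2i+1] * 16 for each pair, the *_padh paths replicate the last real column, and the *_padv loops replicate the last packed row. A rough C model, with parameter meanings inferred from the assembly rather than taken from the real prototype:

/* Hedged sketch, not the dav1d C reference: src rows are bw bytes
 * apart, dst rows bw/2 bytes apart, w x h is the coded area and
 * bw x bh the padded block; imin() is dav1d's integer min helper. */
static void pal_idx_finish_sketch(uint8_t *dst, const uint8_t *src,
                                  const int bw, const int bh,
                                  const int w, const int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < bw; x += 2) {
            const int i0 = src[imin(x,     w - 1)];
            const int i1 = src[imin(x + 1, w - 1)];
            dst[x >> 1] = i0 | (i1 << 4); /* lo + hi * 16 */
        }
        src += bw;
        dst += bw >> 1;
    }
    for (int y = h; y < bh; y++, dst += bw >> 1) /* vertical padding */
        memcpy(dst, dst - (bw >> 1), bw >> 1);
}
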

50 third_party/dav1d/src/x86/pal.h vendored Normal file
View file

@ -0,0 +1,50 @@
/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/cpu.h"
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_ssse3);
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx2);
decl_pal_idx_finish_fn(dav1d_pal_idx_finish_avx512icl);
static ALWAYS_INLINE void pal_dsp_init_x86(Dav1dPalDSPContext *const c) {
const unsigned flags = dav1d_get_cpu_flags();
if (!(flags & DAV1D_X86_CPU_FLAG_SSSE3)) return;
c->pal_idx_finish = dav1d_pal_idx_finish_ssse3;
#if ARCH_X86_64
if (!(flags & DAV1D_X86_CPU_FLAG_AVX2)) return;
c->pal_idx_finish = dav1d_pal_idx_finish_avx2;
if (!(flags & DAV1D_X86_CPU_FLAG_AVX512ICL)) return;
c->pal_idx_finish = dav1d_pal_idx_finish_avx512icl;
#endif
}
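
Each check above returns as soon as a required CPU flag is missing, so the last assignment that runs selects the widest SIMD level available. The bitdepth-independent src/pal.c presumably installs the C fallback first and then calls this; a purely hypothetical sketch of such a caller (names assumed, not taken from the source):

/* Hypothetical caller; the real init in src/pal.c may differ. */
static void pal_dsp_init(Dav1dPalDSPContext *const c) {
    c->pal_idx_finish = pal_idx_finish_c; /* assumed C fallback name */
#if HAVE_ASM && ARCH_X86
    pal_dsp_init_x86(c);                  /* upgrades to SSSE3/AVX2/AVX-512 */
#endif
}
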

View file

@ -35,6 +35,7 @@ if is_asm_enabled
checkasm_sources = files(
'checkasm/checkasm.c',
'checkasm/msac.c',
'checkasm/pal.c',
'checkasm/refmvs.c',
)