Bug 1891459 - Update dav1d to 5b5399911dd24703de641d65eda5b7f1e845d060 r=chunmin

Differential Revision: https://phabricator.services.mozilla.com/D207425
2024-04-16 16:40:31 +00:00 · 2024-04-16 16:40:31 +00:00 · 16eb058401
commit 16eb058401
parent be484e5383
25 changed files with 6745 additions and 2663 deletions
--- a/media/libdav1d/asm/moz.build
+++ b/media/libdav1d/asm/moz.build
@@ -211,6 +211,7 @@ elif CONFIG['TARGET_CPU'] == 'arm' or CONFIG['TARGET_CPU'] == 'aarch64':
            '../../../third_party/dav1d/src/arm/64/looprestoration_tmpl.S',
            '../../../third_party/dav1d/src/arm/64/mc.S',
            '../../../third_party/dav1d/src/arm/64/mc16.S',
+            '../../../third_party/dav1d/src/arm/64/mc_dotprod.S',
            '../../../third_party/dav1d/src/arm/64/msac.S',
            '../../../third_party/dav1d/src/arm/64/refmvs.S',
        ]
--- a/media/libdav1d/moz.yaml
+++ b/media/libdav1d/moz.yaml
@@ -20,11 +20,11 @@ origin:

  # Human-readable identifier for this version/release
  # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: 8e08426468a76d8a667e8a79d92bafd85d7411ac (2024-03-18T20:50:37.000+00:00).
+  release: 5b5399911dd24703de641d65eda5b7f1e845d060 (2024-04-15T13:19:42.000+02:00).

  # Revision to pull in
  # Must be a long or short commit SHA (long preferred)
-  revision: 8e08426468a76d8a667e8a79d92bafd85d7411ac
+  revision: 5b5399911dd24703de641d65eda5b7f1e845d060

  # The package's license, where possible using the mnemonic from
  # https://spdx.org/licenses/
--- a/media/libdav1d/vcs_version.h
+++ b/media/libdav1d/vcs_version.h
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "8e08426468a76d8a667e8a79d92bafd85d7411ac"
+#define DAV1D_VERSION "5b5399911dd24703de641d65eda5b7f1e845d060"
--- a/third_party/dav1d/meson.build
+++ b/third_party/dav1d/meson.build
@@ -81,6 +81,8 @@ cdata.set10('TRIM_DSP_FUNCTIONS', get_option('trim_dsp') == 'true' or
 # Logging option
 cdata.set10('CONFIG_LOG', get_option('logging'))

+cdata.set10('CONFIG_MACOS_KPERF', get_option('macos_kperf'))
+
 #
 # OS/Compiler checks and defines
 #
--- a/third_party/dav1d/meson_options.txt
+++ b/third_party/dav1d/meson_options.txt
@@ -68,3 +68,8 @@ option('trim_dsp',
    choices: ['true', 'false', 'if-release'],
    value: 'if-release',
    description: 'Eliminate redundant DSP functions where possible')
+
+option('macos_kperf',
+    type: 'boolean',
+    value: false,
+    description: 'Use the private macOS kperf API for benchmarking')
--- a/third_party/dav1d/src/arm/64/mc.S
+++ b/third_party/dav1d/src/arm/64/mc.S
@@ -837,7 +837,7 @@ endfunc

 // This has got the same signature as the put_8tap functions,
 // and assumes that x8 is set to (clz(w)-24).
-function put_neon
+function put_neon, export=1
        adr             x9,  L(put_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
@@ -939,7 +939,7 @@ endfunc

 // This has got the same signature as the prep_8tap functions,
 // and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
-function prep_neon
+function prep_neon, export=1
        adr             x9,  L(prep_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
--- a/third_party/dav1d/src/arm/64/mc_dotprod.S
+++ b/third_party/dav1d/src/arm/64/mc_dotprod.S
--- a/third_party/dav1d/src/arm/64/msac.S
+++ b/third_party/dav1d/src/arm/64/msac.S
@@ -288,10 +288,8 @@ function msac_decode_hi_tok_neon, export=1
        mvni            v30.4h, #0x3f             // 0xffc0
        ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
        ld1r            {v3.4h},  [x16]           // rng
-        movrel          x16, bits
        ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
        add             x17, x0,  #DIF + 6
-        ld1             {v16.8h}, [x16]
        mov             w13, #-24
        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
@@ -305,30 +303,27 @@ function msac_decode_hi_tok_neon, export=1
        add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
        add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
        str             h3,  [sp, #14]            // store original u = s->rng
-        cmhs            v2.8h,   v1.8h,   v4.8h   // c >= v
+        cmhs            v2.4h,   v1.4h,   v4.4h   // c >= v
        str             q4,  [sp, #16]            // store v values to allow indexed access
-        and             v6.16b,  v2.16b,  v16.16b // One bit per halfword set in the mask
-        addv            h6,  v6.8h                // Aggregate mask bits
-        umov            w3,  v6.h[0]
+        addv            h6,  v2.4h                // -4 + ret
        add             w13, w13, #5
-        rbit            w3,  w3
+        smov            w15, v6.h[0]
        add             x8,  sp,  #16
-        clz             w15, w3                   // ret
+        add             w15, w15, #4              // ret

        cbz             w10, 2f
        // update_cdf
-        movi            v5.8b, #0xff
+        sub             v5.4h,   v0.4h,   v2.4h   // cdf[i] + (i >= val ? 1 : 0)
        mov             w4,  #-5
-        urhadd          v4.4h,   v5.4h,   v2.4h   // i >= val ? -1 : 32768
+        orr             v2.4h, #0x80, lsl #8      // i >= val ? -1 : 32768
        sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
-        sub             v4.4h,   v4.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
+        sub             v4.4h,   v2.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
        dup             v6.4h,    w4              // -rate

        sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
-        sub             v0.4h,   v0.4h,   v2.4h   // cdf + (i >= val ? 1 : 0)
        sshl            v4.4h,   v4.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
        add             w9,  w9,  #1              // count + (count < 32)
-        add             v0.4h,   v0.4h,   v4.4h   // cdf + (32768 - cdf[i]) >> rate
+        add             v0.4h,   v5.4h,   v4.4h   // cdf[i] + (32768 - cdf[i]) >> rate
        st1             {v0.4h},  [x1]
        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
        strh            w9,  [x1, #6]
--- a/third_party/dav1d/src/arm/itx.h
+++ b/third_party/dav1d/src/arm/itx.h
@@ -28,34 +28,6 @@
 #include "src/cpu.h"
 #include "src/itx.h"

-#define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
-
-#define decl_itx12_fns(w, h, opt) \
-decl_itx2_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
-
-#define decl_itx16_fns(w, h, opt) \
-decl_itx12_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
-
-#define decl_itx17_fns(w, h, opt) \
-decl_itx16_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
-
 decl_itx17_fns( 4,  4, neon);
 decl_itx16_fns( 4,  8, neon);
 decl_itx16_fns( 4, 16, neon);
@@ -78,41 +50,6 @@ decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x32, neon));
 decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_64x64, neon));

 static ALWAYS_INLINE void itx_dsp_init_arm(Dav1dInvTxfmDSPContext *const c, int bpc) {
-#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
-    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
-        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
-
-#define assign_itx1_fn(pfx, w, h, ext) \
-    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
-
-#define assign_itx2_fn(pfx, w, h, ext) \
-    assign_itx1_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
-
-#define assign_itx12_fn(pfx, w, h, ext) \
-    assign_itx2_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
-    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
-    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
-    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
-    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
-    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
-    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
-    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
-    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
-    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
-
-#define assign_itx16_fn(pfx, w, h, ext) \
-    assign_itx12_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
-    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
-    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
-    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
-
-#define assign_itx17_fn(pfx, w, h, ext) \
-    assign_itx16_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
-
    const unsigned flags = dav1d_get_cpu_flags();

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
--- a/third_party/dav1d/src/arm/mc.h
+++ b/third_party/dav1d/src/arm/mc.h
@@ -30,26 +30,40 @@
 #include "src/mc.h"
 #include "src/cpu.h"

-decl_mc_fn(BF(dav1d_put_8tap_regular, neon));
-decl_mc_fn(BF(dav1d_put_8tap_regular_smooth, neon));
-decl_mc_fn(BF(dav1d_put_8tap_regular_sharp, neon));
-decl_mc_fn(BF(dav1d_put_8tap_smooth, neon));
-decl_mc_fn(BF(dav1d_put_8tap_smooth_regular, neon));
-decl_mc_fn(BF(dav1d_put_8tap_smooth_sharp, neon));
-decl_mc_fn(BF(dav1d_put_8tap_sharp, neon));
-decl_mc_fn(BF(dav1d_put_8tap_sharp_regular, neon));
-decl_mc_fn(BF(dav1d_put_8tap_sharp_smooth, neon));
-decl_mc_fn(BF(dav1d_put_bilin, neon));
+#define decl_8tap_gen(decl_name, fn_name, opt) \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular,        opt)); \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_smooth, opt)); \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_regular_sharp,  opt)); \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_regular, opt)); \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth,         opt)); \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_smooth_sharp,   opt)); \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_regular,  opt)); \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp_smooth,   opt)); \
+    decl_##decl_name##_fn(BF(dav1d_##fn_name##_8tap_sharp,          opt))

-decl_mct_fn(BF(dav1d_prep_8tap_regular, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_regular_smooth, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_regular_sharp, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_smooth, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_smooth_regular, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_smooth_sharp, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_sharp, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_sharp_regular, neon));
-decl_mct_fn(BF(dav1d_prep_8tap_sharp_smooth, neon));
+#define decl_8tap_fns(opt) \
+    decl_8tap_gen(mc,  put,  opt); \
+    decl_8tap_gen(mct, prep, opt)
+
+#define init_8tap_gen(name, opt) \
+    init_##name##_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        opt); \
+    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, opt); \
+    init_##name##_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  opt); \
+    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, opt); \
+    init_##name##_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         opt); \
+    init_##name##_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   opt); \
+    init_##name##_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  opt); \
+    init_##name##_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   opt); \
+    init_##name##_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          opt)
+
+#define init_8tap_fns(opt) \
+    init_8tap_gen(mc,  opt); \
+    init_8tap_gen(mct, opt)
+
+decl_8tap_fns(neon);
+decl_8tap_fns(neon_dotprod);
+
+decl_mc_fn(BF(dav1d_put_bilin, neon));
 decl_mct_fn(BF(dav1d_prep_bilin, neon));

 decl_avg_fn(BF(dav1d_avg, neon));
@@ -77,27 +91,10 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {

    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

-    init_mc_fn (FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
-    init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
-    init_mc_fn (FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
-    init_mc_fn (FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
-    init_mc_fn (FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
-    init_mc_fn (FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
-    init_mc_fn (FILTER_2D_BILINEAR,            bilin,               neon);
+    init_8tap_fns(neon);

-    init_mct_fn(FILTER_2D_8TAP_REGULAR,        8tap_regular,        neon);
-    init_mct_fn(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
-    init_mct_fn(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp,  neon);
-    init_mct_fn(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular, neon);
-    init_mct_fn(FILTER_2D_8TAP_SMOOTH,         8tap_smooth,         neon);
-    init_mct_fn(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp,   neon);
-    init_mct_fn(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular,  neon);
-    init_mct_fn(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth,   neon);
-    init_mct_fn(FILTER_2D_8TAP_SHARP,          8tap_sharp,          neon);
-    init_mct_fn(FILTER_2D_BILINEAR,            bilin,               neon);
+    init_mc_fn (FILTER_2D_BILINEAR, bilin, neon);
+    init_mct_fn(FILTER_2D_BILINEAR, bilin, neon);

    c->avg = BF(dav1d_avg, neon);
    c->w_avg = BF(dav1d_w_avg, neon);
@@ -111,4 +108,12 @@ static ALWAYS_INLINE void mc_dsp_init_arm(Dav1dMCDSPContext *const c) {
    c->warp8x8 = BF(dav1d_warp_affine_8x8, neon);
    c->warp8x8t = BF(dav1d_warp_affine_8x8t, neon);
    c->emu_edge = BF(dav1d_emu_edge, neon);
+
+#if ARCH_AARCH64
+#if HAVE_DOTPROD && BITDEPTH == 8
+    if (!(flags & DAV1D_ARM_CPU_FLAG_DOTPROD)) return;
+
+    init_8tap_fns(neon_dotprod);
+#endif  // HAVE_DOTPROD && BITDEPTH == 8
+#endif  // ARCH_AARCH64
 }
--- a/third_party/dav1d/src/cdf.c
+++ b/third_party/dav1d/src/cdf.c
--- a/third_party/dav1d/src/cdf.h
+++ b/third_party/dav1d/src/cdf.h
@@ -34,12 +34,10 @@
 #include "src/ref.h"
 #include "src/thread_data.h"

-/* Buffers padded to [8] or [16] for SIMD where needed. */
+/* Buffers padded to [4]/[8]/[16] for SIMD where needed. */

 typedef struct CdfModeContext {
-    ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
    ALIGN(uint16_t uv_mode[2][N_INTRA_PRED_MODES][N_UV_INTRA_PRED_MODES + 2], 32);
-    ALIGN(uint16_t wedge_idx[9][16], 32);
    ALIGN(uint16_t partition[N_BL_LEVELS][4][N_PARTITIONS + 6], 32);
    ALIGN(uint16_t cfl_alpha[6][16], 32);
    ALIGN(uint16_t txtp_inter1[2][16], 32);
@@ -49,23 +47,33 @@ typedef struct CdfModeContext {
    ALIGN(uint16_t cfl_sign[8], 16);
    ALIGN(uint16_t angle_delta[8][8], 16);
    ALIGN(uint16_t filter_intra[5 + 3], 16);
-    ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
    ALIGN(uint16_t seg_id[3][DAV1D_MAX_SEGMENTS], 16);
    ALIGN(uint16_t pal_sz[2][7][7 + 1], 16);
    ALIGN(uint16_t color_map[2][7][5][8], 16);
-    ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
    ALIGN(uint16_t txsz[N_TX_SIZES - 1][3][4], 8);
-    ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
    ALIGN(uint16_t delta_q[4], 8);
    ALIGN(uint16_t delta_lf[5][4], 8);
-    ALIGN(uint16_t interintra_mode[4][4], 8);
    ALIGN(uint16_t restore_switchable[3 + 1], 8);
    ALIGN(uint16_t restore_wiener[2], 4);
    ALIGN(uint16_t restore_sgrproj[2], 4);
-    ALIGN(uint16_t interintra[7][2], 4);
-    ALIGN(uint16_t interintra_wedge[7][2], 4);
    ALIGN(uint16_t txtp_inter3[4][2], 4);
    ALIGN(uint16_t use_filter_intra[N_BS_SIZES][2], 4);
+    ALIGN(uint16_t txpart[7][3][2], 4);
+    ALIGN(uint16_t skip[3][2], 4);
+    ALIGN(uint16_t pal_y[7][3][2], 4);
+    ALIGN(uint16_t pal_uv[2][2], 4);
+
+    /* key/intra */
+    ALIGN(uint16_t intrabc[2], 4);
+
+    /* inter/switch */
+    ALIGN(uint16_t y_mode[4][N_INTRA_PRED_MODES + 3], 32);
+    ALIGN(uint16_t wedge_idx[9][16], 32);
+    ALIGN(uint16_t comp_inter_mode[8][N_COMP_INTER_PRED_MODES], 16);
+    ALIGN(uint16_t filter[2][8][DAV1D_N_SWITCHABLE_FILTERS + 1], 8);
+    ALIGN(uint16_t interintra_mode[4][4], 8);
+    ALIGN(uint16_t motion_mode[N_BS_SIZES][3 + 1], 8);
+    ALIGN(uint16_t skip_mode[3][2], 4);
    ALIGN(uint16_t newmv_mode[6][2], 4);
    ALIGN(uint16_t globalmv_mode[2][2], 4);
    ALIGN(uint16_t refmv_mode[6][2], 4);
@@ -80,14 +88,10 @@ typedef struct CdfModeContext {
    ALIGN(uint16_t comp_fwd_ref[3][3][2], 4);
    ALIGN(uint16_t comp_bwd_ref[2][3][2], 4);
    ALIGN(uint16_t comp_uni_ref[3][3][2], 4);
-    ALIGN(uint16_t txpart[7][3][2], 4);
-    ALIGN(uint16_t skip[3][2], 4);
-    ALIGN(uint16_t skip_mode[3][2], 4);
    ALIGN(uint16_t seg_pred[3][2], 4);
+    ALIGN(uint16_t interintra[7][2], 4);
+    ALIGN(uint16_t interintra_wedge[7][2], 4);
    ALIGN(uint16_t obmc[N_BS_SIZES][2], 4);
-    ALIGN(uint16_t pal_y[7][3][2], 4);
-    ALIGN(uint16_t pal_uv[2][2], 4);
-    ALIGN(uint16_t intrabc[2], 4);
 } CdfModeContext;

 typedef struct CdfCoefContext {
@@ -108,13 +112,13 @@ typedef struct CdfCoefContext {

 typedef struct CdfMvComponent {
    ALIGN(uint16_t classes[11 + 5], 32);
-    ALIGN(uint16_t class0_fp[2][4], 8);
-    ALIGN(uint16_t classN_fp[4], 8);
-    ALIGN(uint16_t class0_hp[2], 4);
-    ALIGN(uint16_t classN_hp[2], 4);
-    ALIGN(uint16_t class0[2], 4);
-    ALIGN(uint16_t classN[10][2], 4);
    ALIGN(uint16_t sign[2], 4);
+    ALIGN(uint16_t class0[2], 4);
+    ALIGN(uint16_t class0_fp[2][4], 8);
+    ALIGN(uint16_t class0_hp[2], 4);
+    ALIGN(uint16_t classN[10][2], 4);
+    ALIGN(uint16_t classN_fp[4], 8);
+    ALIGN(uint16_t classN_hp[2], 4);
 } CdfMvComponent;

 typedef struct CdfMvContext {
@@ -123,10 +127,10 @@ typedef struct CdfMvContext {
 } CdfMvContext;

 typedef struct CdfContext {
-    CdfModeContext m;
-    ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
    CdfCoefContext coef;
-    CdfMvContext mv, dmv;
+    CdfModeContext m;
+    CdfMvContext mv;
+    ALIGN(uint16_t kfym[5][5][N_INTRA_PRED_MODES + 3], 32);
 } CdfContext;

 typedef struct CdfThreadContext {
@@ -138,7 +142,7 @@ typedef struct CdfThreadContext {
    atomic_uint *progress;
 } CdfThreadContext;

-void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, int qidx);
+void dav1d_cdf_thread_init_static(CdfThreadContext *cdf, unsigned qidx);
 int dav1d_cdf_thread_alloc(Dav1dContext *c, CdfThreadContext *cdf,
                           const int have_frame_mt);
 void dav1d_cdf_thread_copy(CdfContext *dst, const CdfThreadContext *src);
--- a/third_party/dav1d/src/decode.c
+++ b/third_party/dav1d/src/decode.c
@@ -73,42 +73,29 @@ static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
    }
 }

-static int read_mv_component_diff(Dav1dTaskContext *const t,
+static int read_mv_component_diff(MsacContext *const msac,
                                  CdfMvComponent *const mv_comp,
-                                  const int have_fp)
+                                  const int mv_prec)
 {
-    Dav1dTileState *const ts = t->ts;
-    const Dav1dFrameContext *const f = t->f;
-    const int have_hp = f->frame_hdr->hp;
-    const int sign = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->sign);
-    const int cl = dav1d_msac_decode_symbol_adapt16(&ts->msac,
-                                                    mv_comp->classes, 10);
-    int up, fp, hp;
+    const int sign = dav1d_msac_decode_bool_adapt(msac, mv_comp->sign);
+    const int cl = dav1d_msac_decode_symbol_adapt16(msac, mv_comp->classes, 10);
+    int up, fp = 3, hp = 1;

    if (!cl) {
-        up = dav1d_msac_decode_bool_adapt(&ts->msac, mv_comp->class0);
-        if (have_fp) {
-            fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
-                                                 mv_comp->class0_fp[up], 3);
-            hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
-                                                        mv_comp->class0_hp) : 1;
-        } else {
-            fp = 3;
-            hp = 1;
+        up = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0);
+        if (mv_prec >= 0) {  // !force_integer_mv
+            fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->class0_fp[up], 3);
+            if (mv_prec > 0) // allow_high_precision_mv
+                hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->class0_hp);
        }
    } else {
        up = 1 << cl;
        for (int n = 0; n < cl; n++)
-            up |= dav1d_msac_decode_bool_adapt(&ts->msac,
-                                               mv_comp->classN[n]) << n;
-        if (have_fp) {
-            fp = dav1d_msac_decode_symbol_adapt4(&ts->msac,
-                                                 mv_comp->classN_fp, 3);
-            hp = have_hp ? dav1d_msac_decode_bool_adapt(&ts->msac,
-                                                        mv_comp->classN_hp) : 1;
-        } else {
-            fp = 3;
-            hp = 1;
+            up |= dav1d_msac_decode_bool_adapt(msac, mv_comp->classN[n]) << n;
+        if (mv_prec >= 0) {  // !force_integer_mv
+            fp = dav1d_msac_decode_symbol_adapt4(msac, mv_comp->classN_fp, 3);
+            if (mv_prec > 0) // allow_high_precision_mv
+                hp = dav1d_msac_decode_bool_adapt(msac, mv_comp->classN_hp);
        }
    }

@@ -117,25 +104,16 @@ static int read_mv_component_diff(Dav1dTaskContext *const t,
    return sign ? -diff : diff;
 }

-static void read_mv_residual(Dav1dTaskContext *const t, mv *const ref_mv,
-                             CdfMvContext *const mv_cdf, const int have_fp)
+static void read_mv_residual(Dav1dTileState *const ts, mv *const ref_mv,
+                             const int mv_prec)
 {
-    switch (dav1d_msac_decode_symbol_adapt4(&t->ts->msac, t->ts->cdf.mv.joint,
-                                            N_MV_JOINTS - 1))
-    {
-    case MV_JOINT_HV:
-        ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
-        ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
-        break;
-    case MV_JOINT_H:
-        ref_mv->x += read_mv_component_diff(t, &mv_cdf->comp[1], have_fp);
-        break;
-    case MV_JOINT_V:
-        ref_mv->y += read_mv_component_diff(t, &mv_cdf->comp[0], have_fp);
-        break;
-    default:
-        break;
-    }
+    MsacContext *const msac = &ts->msac;
+    const enum MVJoint mv_joint =
+        dav1d_msac_decode_symbol_adapt4(msac, ts->cdf.mv.joint, N_MV_JOINTS - 1);
+    if (mv_joint & MV_JOINT_V)
+        ref_mv->y += read_mv_component_diff(msac, &ts->cdf.mv.comp[0], mv_prec);
+    if (mv_joint & MV_JOINT_H)
+        ref_mv->x += read_mv_component_diff(msac, &ts->cdf.mv.comp[1], mv_prec);
 }

 static void read_tx_tree(Dav1dTaskContext *const t,
@@ -1001,8 +979,7 @@ static int decode_b(Dav1dTaskContext *const t,
        const int have_delta_q = f->frame_hdr->delta.q.present &&
            (bs != (f->seq_hdr->sb128 ? BS_128x128 : BS_64x64) || !b->skip);

-        int8_t prev_delta_lf[4];
-        memcpy(prev_delta_lf, ts->last_delta_lf, 4);
+        uint32_t prev_delta_lf = ts->last_delta_lf.u32;

        if (have_delta_q) {
            int delta_q = dav1d_msac_decode_symbol_adapt4(&ts->msac,
@@ -1038,8 +1015,8 @@ static int decode_b(Dav1dTaskContext *const t,
                            delta_lf = -delta_lf;
                        delta_lf *= 1 << f->frame_hdr->delta.lf.res_log2;
                    }
-                    ts->last_delta_lf[i] =
-                        iclip(ts->last_delta_lf[i] + delta_lf, -63, 63);
+                    ts->last_delta_lf.i8[i] =
+                        iclip(ts->last_delta_lf.i8[i] + delta_lf, -63, 63);
                    if (have_delta_q && DEBUG_BLOCK_INFO)
                        printf("Post-delta_lf[%d:%d]: r=%d\n", i, delta_lf,
                               ts->msac.rng);
@@ -1054,13 +1031,13 @@ static int decode_b(Dav1dTaskContext *const t,
            init_quant_tables(f->seq_hdr, f->frame_hdr, ts->last_qidx, ts->dqmem);
            ts->dq = ts->dqmem;
        }
-        if (!memcmp(ts->last_delta_lf, (int8_t[4]) { 0, 0, 0, 0 }, 4)) {
+        if (!ts->last_delta_lf.u32) {
            // assign frame-wide lf values to this sb
            ts->lflvl = f->lf.lvl;
-        } else if (memcmp(ts->last_delta_lf, prev_delta_lf, 4)) {
+        } else if (ts->last_delta_lf.u32 != prev_delta_lf) {
            // find sb-specific lf lvl parameters
-            dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf);
            ts->lflvl = ts->lflvlmem;
+            dav1d_calc_lf_values(ts->lflvlmem, f->frame_hdr, ts->last_delta_lf.i8);
        }
    }

@@ -1324,7 +1301,7 @@ static int decode_b(Dav1dTaskContext *const t,
        }

        const union mv ref = b->mv[0];
-        read_mv_residual(t, &b->mv[0], &ts->cdf.dmv, 0);
+        read_mv_residual(ts, &b->mv[0], -1);

        // clip intrabc motion vector to decoded parts of current tile
        int border_left = ts->tiling.col_start * 4;
@@ -1586,8 +1563,8 @@ static int decode_b(Dav1dTaskContext *const t,
                break; \
            case NEWMV: \
                b->mv[idx] = mvstack[b->drl_idx].mv.mv[idx]; \
-                read_mv_residual(t, &b->mv[idx], &ts->cdf.mv, \
-                                 !f->frame_hdr->force_integer_mv); \
+                const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv; \
+                read_mv_residual(ts, &b->mv[idx], mv_prec); \
                break; \
            }
            has_subpel_filter = imin(bw4, bh4) == 1 ||
@@ -1775,8 +1752,8 @@ static int decode_b(Dav1dTaskContext *const t,
                if (DEBUG_BLOCK_INFO)
                    printf("Post-intermode[%d,drl=%d]: r=%d\n",
                           b->inter_mode, b->drl_idx, ts->msac.rng);
-                read_mv_residual(t, &b->mv[0], &ts->cdf.mv,
-                                 !f->frame_hdr->force_integer_mv);
+                const int mv_prec = f->frame_hdr->hp - f->frame_hdr->force_integer_mv;
+                read_mv_residual(ts, &b->mv[0], mv_prec);
                if (DEBUG_BLOCK_INFO)
                    printf("Post-residualmv[mv=y:%d,x:%d]: r=%d\n",
                           b->mv[0].y, b->mv[0].x, ts->msac.rng);
@@ -2495,7 +2472,7 @@ static void setup_tile(Dav1dTileState *const ts,

    dav1d_cdf_thread_copy(&ts->cdf, &f->in_cdf);
    ts->last_qidx = f->frame_hdr->quant.yac;
-    memset(ts->last_delta_lf, 0, sizeof(ts->last_delta_lf));
+    ts->last_delta_lf.u32 = 0;

    dav1d_msac_init(&ts->msac, data, sz, f->frame_hdr->disable_cdf_update);

--- a/third_party/dav1d/src/internal.h
+++ b/third_party/dav1d/src/internal.h
@@ -303,8 +303,8 @@ struct Dav1dFrameContext {
        int lr_buf_plane_sz[2]; /* (stride*sbh*4) << sb128 if n_tc > 1, else stride*4 */
        int re_sz /* h */;
        ALIGN(Av1FilterLUT lim_lut, 16);
+        ALIGN(uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */], 16);
        int last_sharpness;
-        uint8_t lvl[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
        uint8_t *tx_lpf_right_edge[2];
        uint8_t *cdef_line_buf, *lr_line_buf;
        pixel *cdef_line[2 /* pre, post */][3 /* plane */];
@@ -376,8 +376,11 @@ struct Dav1dTileState {
    const uint16_t (*dq)[3][2];
    int last_qidx;

-    int8_t last_delta_lf[4];
-    uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */];
+    union {
+        int8_t i8[4];
+        uint32_t u32;
+    } last_delta_lf;
+    ALIGN(uint8_t lflvlmem[8 /* seg_id */][4 /* dir */][8 /* ref */][2 /* is_gmv */], 16);
    const uint8_t (*lflvl)[4][8][2];

    Av1RestorationUnit *lr_ref[3];
--- a/third_party/dav1d/src/itx.h
+++ b/third_party/dav1d/src/itx.h
@@ -39,10 +39,73 @@ void (name)(pixel *dst, ptrdiff_t dst_stride, coef *coeff, int eob \
            HIGHBD_DECL_SUFFIX)
 typedef decl_itx_fn(*itxfm_fn);

+#define decl_itx2_fns(w, h, opt) \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
+
+#define decl_itx12_fns(w, h, opt) \
+decl_itx2_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
+
+#define decl_itx16_fns(w, h, opt) \
+decl_itx12_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
+
+#define decl_itx17_fns(w, h, opt) \
+decl_itx16_fns(w, h, opt); \
+decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
+
 typedef struct Dav1dInvTxfmDSPContext {
    itxfm_fn itxfm_add[N_RECT_TX_SIZES][N_TX_TYPES_PLUS_LL];
 } Dav1dInvTxfmDSPContext;

 bitfn_decls(void dav1d_itx_dsp_init, Dav1dInvTxfmDSPContext *c, int bpc);

+#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
+    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
+        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
+
+#define assign_itx1_fn(pfx, w, h, ext) \
+    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
+
+#define assign_itx2_fn(pfx, w, h, ext) \
+    assign_itx1_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
+
+#define assign_itx12_fn(pfx, w, h, ext) \
+    assign_itx2_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
+    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
+    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
+    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
+    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
+    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
+    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
+    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
+    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
+
+#define assign_itx16_fn(pfx, w, h, ext) \
+    assign_itx12_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
+    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
+    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
+
+#define assign_itx17_fn(pfx, w, h, ext) \
+    assign_itx16_fn(pfx, w, h, ext); \
+    assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
+
 #endif /* DAV1D_SRC_ITX_H */
--- a/third_party/dav1d/src/lf_mask.c
+++ b/third_party/dav1d/src/lf_mask.c
@ -436,7 +436,7 @@ static void calc_lf_value(uint8_t (*const lflvl_values)[2],
    const int base = iclip(iclip(base_lvl + lf_delta, 0, 63) + seg_delta, 0, 63);

    if (!mr_delta) {
-        memset(lflvl_values, base, 8 * 2);
+        memset(lflvl_values, base, sizeof(*lflvl_values) * 8);
    } else {
        const int sh = base >= 32;
        lflvl_values[0][0] = lflvl_values[0][1] =
@ -457,7 +457,7 @@ static inline void calc_lf_value_chroma(uint8_t (*const lflvl_values)[2],
                                        const Dav1dLoopfilterModeRefDeltas *const mr_delta)
 {
    if (!base_lvl)
-        memset(lflvl_values, 0, 8 * 2);
+        memset(lflvl_values, 0, sizeof(*lflvl_values) * 8);
    else
        calc_lf_value(lflvl_values, base_lvl, lf_delta, seg_delta, mr_delta);
 }
@ -469,7 +469,7 @@ void dav1d_calc_lf_values(uint8_t (*const lflvl_values)[4][8][2],
    const int n_seg = hdr->segmentation.enabled ? 8 : 1;

    if (!hdr->loopfilter.level_y[0] && !hdr->loopfilter.level_y[1]) {
-        memset(lflvl_values, 0, 8 * 4 * 2 * n_seg);
+        memset(lflvl_values, 0, sizeof(*lflvl_values) * n_seg);
        return;
    }

--- a/third_party/dav1d/src/meson.build
+++ b/third_party/dav1d/src/meson.build
@ -106,6 +106,7 @@ if is_asm_enabled
                    'arm/64/loopfilter.S',
                    'arm/64/looprestoration.S',
                    'arm/64/mc.S',
+                    'arm/64/mc_dotprod.S',
                )
            endif

--- a/third_party/dav1d/src/refmvs.c
+++ b/third_party/dav1d/src/refmvs.c
@ -817,7 +817,9 @@ int dav1d_refmvs_init_frame(refmvs_frame *const rf,
    if (r_stride != rf->r_stride || n_tile_rows != rf->n_tile_rows) {
        if (rf->r) dav1d_freep_aligned(&rf->r);
        const int uses_2pass = n_tile_threads > 1 && n_frame_threads > 1;
-        rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass), 64);
+        /* sizeof(refmvs_block) == 12 but it's accessed using 16-byte loads in asm,
+         * so add 4 bytes of padding to avoid buffer overreads. */
+        rf->r = dav1d_alloc_aligned(ALLOC_REFMVS, sizeof(*rf->r) * 35 * r_stride * n_tile_rows * (1 + uses_2pass) + 4, 64);
        if (!rf->r) return DAV1D_ERR(ENOMEM);
        rf->r_stride = r_stride;
    }
--- a/third_party/dav1d/src/riscv/itx.h
+++ b/third_party/dav1d/src/riscv/itx.h
@ -28,34 +28,6 @@
 #include "src/cpu.h"
 #include "src/itx.h"

-#define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
-
-#define decl_itx12_fns(w, h, opt) \
-decl_itx2_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
-
-#define decl_itx16_fns(w, h, opt) \
-decl_itx12_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
-
-#define decl_itx17_fns(w, h, opt) \
-decl_itx16_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
-
 #define decl_itx_fns(ext) \
 decl_itx17_fns( 4,  4, ext); \
 decl_itx16_fns( 4,  8, ext); \
@ -70,41 +42,6 @@ decl_itx16_fns(16, 16, ext)
 decl_itx_fns(rvv);

 static ALWAYS_INLINE void itx_dsp_init_riscv(Dav1dInvTxfmDSPContext *const c, int const bpc) {
-#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
-    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
-        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
-
-#define assign_itx1_fn(pfx, w, h, ext) \
-    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
-
-#define assign_itx2_fn(pfx, w, h, ext) \
-    assign_itx1_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
-
-#define assign_itx12_fn(pfx, w, h, ext) \
-    assign_itx2_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
-    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
-    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
-    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
-    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
-    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
-    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
-    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
-    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
-    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
-
-#define assign_itx16_fn(pfx, w, h, ext) \
-    assign_itx12_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
-    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
-    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
-    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
-
-#define assign_itx17_fn(pfx, w, h, ext) \
-    assign_itx16_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
-
  const unsigned flags = dav1d_get_cpu_flags();

  if (!(flags & DAV1D_RISCV_CPU_FLAG_V)) return;
--- a/third_party/dav1d/src/x86/ipred_avx2.asm
+++ b/third_party/dav1d/src/x86/ipred_avx2.asm
@ -66,7 +66,8 @@ z_filter_wh:  db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
 z_filter_k:   db  0, 16,  0, 16,  0, 20,  0, 20,  8, 16,  8, 16
              db 32, 16, 32, 16, 24, 20, 24, 20, 16, 16, 16, 16
              db  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  8,  0
-z_filter_s:   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
+const \
+z_filter_s,   db  0,  0,  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7
              db  7,  8,  8,  9,  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15
              db 15, 15, 15, 15, 15, 15, 15, 15 ; should be in one cache line
 pb_128:       times 4 db 128 ; those are just placed here for alignment.
--- a/third_party/dav1d/src/x86/itx.h
+++ b/third_party/dav1d/src/x86/itx.h
@ -30,34 +30,6 @@

 #define BF_BPC(x, bits, suffix) x##_##bits##bpc_##suffix

-#define decl_itx2_fns(w, h, opt) \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_identity_##w##x##h, opt))
-
-#define decl_itx12_fns(w, h, opt) \
-decl_itx2_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_dct_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_dct_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_flipadst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_dct_##w##x##h, opt))
-
-#define decl_itx16_fns(w, h, opt) \
-decl_itx12_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_adst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_flipadst_identity_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_adst_##w##x##h, opt)); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_identity_flipadst_##w##x##h, opt))
-
-#define decl_itx17_fns(w, h, opt) \
-decl_itx16_fns(w, h, opt); \
-decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_##w##x##h, opt))
-
 #define decl_itx_fns(ext) \
 decl_itx17_fns( 4,  4, ext); \
 decl_itx16_fns( 4,  8, ext); \
@ -136,42 +108,6 @@ decl_itx_fn(dav1d_inv_txfm_add_wht_wht_4x4_16bpc_avx2);
 decl_itx_fn(BF(dav1d_inv_txfm_add_wht_wht_4x4, sse2));

 static ALWAYS_INLINE void itx_dsp_init_x86(Dav1dInvTxfmDSPContext *const c, const int bpc) {
-#define assign_itx_fn(pfx, w, h, type, type_enum, ext) \
-    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
-        BF(dav1d_inv_txfm_add_##type##_##w##x##h, ext)
-
-#define assign_itx1_fn(pfx, w, h, ext) \
-    assign_itx_fn(pfx, w, h, dct_dct,           DCT_DCT,           ext)
-
-#define assign_itx2_fn(pfx, w, h, ext) \
-    assign_itx1_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, identity_identity, IDTX,              ext)
-
-#define assign_itx12_fn(pfx, w, h, ext) \
-    assign_itx2_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, dct_adst,          ADST_DCT,          ext); \
-    assign_itx_fn(pfx, w, h, dct_flipadst,      FLIPADST_DCT,      ext); \
-    assign_itx_fn(pfx, w, h, dct_identity,      H_DCT,             ext); \
-    assign_itx_fn(pfx, w, h, adst_dct,          DCT_ADST,          ext); \
-    assign_itx_fn(pfx, w, h, adst_adst,         ADST_ADST,         ext); \
-    assign_itx_fn(pfx, w, h, adst_flipadst,     FLIPADST_ADST,     ext); \
-    assign_itx_fn(pfx, w, h, flipadst_dct,      DCT_FLIPADST,      ext); \
-    assign_itx_fn(pfx, w, h, flipadst_adst,     ADST_FLIPADST,     ext); \
-    assign_itx_fn(pfx, w, h, flipadst_flipadst, FLIPADST_FLIPADST, ext); \
-    assign_itx_fn(pfx, w, h, identity_dct,      V_DCT,             ext)
-
-#define assign_itx16_fn(pfx, w, h, ext) \
-    assign_itx12_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, adst_identity,     H_ADST,            ext); \
-    assign_itx_fn(pfx, w, h, flipadst_identity, H_FLIPADST,        ext); \
-    assign_itx_fn(pfx, w, h, identity_adst,     V_ADST,            ext); \
-    assign_itx_fn(pfx, w, h, identity_flipadst, V_FLIPADST,        ext)
-
-#define assign_itx17_fn(pfx, w, h, ext) \
-    assign_itx16_fn(pfx, w, h, ext); \
-    assign_itx_fn(pfx, w, h, wht_wht,           WHT_WHT,           ext)
-
-
 #define assign_itx_bpc_fn(pfx, w, h, type, type_enum, bpc, ext) \
    c->itxfm_add[pfx##TX_##w##X##h][type_enum] = \
        BF_BPC(dav1d_inv_txfm_add_##type##_##w##x##h, bpc, ext)
--- a/third_party/dav1d/src/x86/mc16_avx2.asm
+++ b/third_party/dav1d/src/x86/mc16_avx2.asm
--- a/third_party/dav1d/src/x86/mc_avx2.asm
+++ b/third_party/dav1d/src/x86/mc_avx2.asm
--- a/third_party/dav1d/src/x86/mc_avx512.asm
+++ b/third_party/dav1d/src/x86/mc_avx512.asm
--- a/third_party/dav1d/tests/meson.build
+++ b/third_party/dav1d/tests/meson.build
@ -100,7 +100,7 @@ if is_asm_enabled
            ],
        )

-    test('checkasm', checkasm, suite: 'checkasm', timeout: 180, is_parallel: false)
+    test('checkasm', checkasm, suite: 'checkasm', timeout: 180)
    benchmark('checkasm', checkasm, suite: 'checkasm', timeout: 3600, args: '--bench')
 endif