Bug 1661093 - Update libdav1d to 0243c3ff for Firefox 82. r=mjf

Differential Revision: https://phabricator.services.mozilla.com/D92534
Jon Bauman 2020-10-06 15:53:50 +00:00
parent 8452a57539
commit 067cafe63f
36 changed files with 4671 additions and 1491 deletions


@@ -186,7 +186,9 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
'../../../third_party/dav1d/src/arm/32/itx.S',
'../../../third_party/dav1d/src/arm/32/loopfilter.S',
'../../../third_party/dav1d/src/arm/32/looprestoration.S',
'../../../third_party/dav1d/src/arm/32/looprestoration16.S',
'../../../third_party/dav1d/src/arm/32/mc.S',
'../../../third_party/dav1d/src/arm/32/mc16.S',
'../../../third_party/dav1d/src/arm/32/msac.S',
]


@@ -20,11 +20,11 @@ origin:
# Human-readable identifier for this version/release
# Generally "version NNN", "tag SSS", "bookmark SSS"
release: commit d0e50cacead63e9904dde184580ce9a746374bd5 (2020-08-21T15:13:49.000+02:00).
release: commit 0243c3ffb644e61848b82f24f5e4a7324669d76e (2020-09-27T15:38:45.000+02:00).
# Revision to pull in
# Must be a long or short commit SHA (long preferred)
revision: d0e50cacead63e9904dde184580ce9a746374bd5
revision: 0243c3ffb644e61848b82f24f5e4a7324669d76e
# The package's license, where possible using the mnemonic from
# https://spdx.org/licenses/


@@ -1,2 +1,2 @@
/* auto-generated, do not edit */
#define DAV1D_VERSION "0.7.1-49-gd0e50ca"
#define DAV1D_VERSION "0.7.1-81-g0243c3f"


@@ -27,8 +27,8 @@
#ifndef DAV1D_VERSION_H
#define DAV1D_VERSION_H
#define DAV1D_API_VERSION_MAJOR 4
#define DAV1D_API_VERSION_MAJOR 5
#define DAV1D_API_VERSION_MINOR 0
#define DAV1D_API_VERSION_PATCH 2
#define DAV1D_API_VERSION_PATCH 0
#endif /* DAV1D_VERSION_H */


@@ -12,7 +12,7 @@ The todo list can be found [on the wiki](https://code.videolan.org/videolan/dav1
The codebase is developed with the following assumptions:
For the library:
- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extension,
- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extensions. Anonymous structures and unions are the only allowed compiler extensions for internal code.
- x86 asm in .asm files, using the NASM syntax,
- arm/arm64 in .S files, using the GAS syntax limited to subset llvm 5.0's internal assembler supports,
- no C++ is allowed, whatever the version.


@@ -65,9 +65,9 @@ typedef struct Dav1dSettings {
int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
int all_layers; ///< output all spatial layers of a scalable AV1 biststream
unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
uint8_t reserved[32]; ///< reserved for future use
Dav1dPicAllocator allocator; ///< Picture allocator callback.
Dav1dLogger logger; ///< Logger callback.
uint8_t reserved[32]; ///< reserved for future use
} Dav1dSettings;
/**
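In the Dav1dSettings hunk above, reserved[32] moves from before allocator to the end of the struct, shifting the offsets of the allocator and logger fields. That is an ABI-breaking layout change, consistent with this commit bumping dav1d's soname from 4.0.2 to 5.0.0 and DAV1D_API_VERSION_MAJOR from 4 to 5.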


@@ -28,6 +28,7 @@
#ifndef DAV1D_HEADERS_H
#define DAV1D_HEADERS_H
#include <stdint.h>
#include <stddef.h>
// Constants from Section 3. "Symbols and abbreviated terms"
@@ -95,9 +96,9 @@ typedef struct Dav1dWarpedMotionParams {
union {
struct {
int16_t alpha, beta, gamma, delta;
};
} p;
int16_t abcd[4];
};
} u;
} Dav1dWarpedMotionParams;
enum Dav1dPixelLayout {
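The Dav1dWarpedMotionParams hunk above names the previously anonymous union and struct (u and p). Anonymous members are a C11 feature, and this commit pins dav1d's public headers to plain C99 (see the CONTRIBUTING.md change above and the per-header compile test added to tests/meson.build below). A minimal sketch of what the renaming means for callers, with the struct trimmed to just the fields in this hunk, so illustrative only:

#include <assert.h>
#include <stdint.h>

/* Trimmed stand-in for Dav1dWarpedMotionParams; the real struct in
 * dav1d/headers.h carries more fields. */
typedef struct {
    union {
        struct {
            int16_t alpha, beta, gamma, delta;
        } p;             /* now spelled wm.u.p.alpha, wm.u.p.beta, ... */
        int16_t abcd[4]; /* aliased array view: wm.u.abcd[0..3] */
    } u;
} WarpedMotionParams;

int main(void) {
    WarpedMotionParams wm = {0};
    wm.u.p.beta = 42;           /* was wm.beta with anonymous members */
    assert(wm.u.abcd[1] == 42); /* p and abcd share the same storage */
    return 0;
}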


@@ -31,11 +31,15 @@ version_h_target = configure_file(input: 'version.h.in',
output: 'version.h',
configuration: version_h_data)
dav1d_api_headers = [
'common.h',
'data.h',
'dav1d.h',
'headers.h',
'picture.h',
]
# install headers
install_headers('common.h',
'data.h',
'dav1d.h',
'headers.h',
'picture.h',
install_headers(dav1d_api_headers,
version_h_target,
subdir : 'dav1d')


@@ -28,9 +28,9 @@ project('dav1d', ['c'],
'warning_level=2',
'buildtype=release',
'b_ndebug=if-release'],
meson_version: '>= 0.47.0')
meson_version: '>= 0.49.0')
dav1d_soname_version = '4.0.2'
dav1d_soname_version = '5.0.0'
dav1d_api_version_array = dav1d_soname_version.split('.')
dav1d_api_version_major = dav1d_api_version_array[0]
dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -118,6 +118,17 @@ if host_machine.system() == 'windows'
thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
rt_dependency = []
rc_version_array = meson.project_version().split('.')
winmod = import('windows')
rc_data = configuration_data()
rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
rc_data.set('COPYRIGHT_YEARS', '2020')
else
thread_dependency = dependency('threads')
thread_compat_dep = []
@@ -227,7 +238,7 @@ endif
# Compiler flags that should be set
# But when the compiler does not supports them
# it is not an error and silently tolerated
if cc.get_id() != 'msvc'
if cc.get_argument_syntax() != 'msvc'
optional_arguments += [
'-Wundef',
'-Werror=vla',
@@ -426,6 +437,28 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
])
endif
use_gaspp = false
if (is_asm_enabled and
(host_machine.cpu_family() == 'aarch64' or
host_machine.cpu_family().startswith('arm')) and
cc.get_argument_syntax() == 'msvc')
gaspp = find_program('gas-preprocessor.pl')
use_gaspp = true
gaspp_gen = generator(gaspp,
output: '@BASENAME@.obj',
arguments: [
'-as-type', 'armasm',
'-arch', host_machine.cpu_family(),
'--',
host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
'-nologo',
'-I@0@'.format(dav1d_src_root),
'-I@0@/'.format(meson.current_build_dir()),
'@INPUT@',
'-c',
'-o', '@OUTPUT@'
])
endif
# Generate config.h
config_h_target = configure_file(output: 'config.h', configuration: cdata)


@@ -40,8 +40,8 @@ function wiener_filter_h_8bpc_neon, export=1
mov r8, r5
vld1.16 {q0}, [r4]
movw r9, #(1 << 14) - (1 << 2)
vdup.16 q14, r9
vmov.s16 q15, #2048
vdup.16 q14, r9
vmov.s16 q15, #2048
// Calculate mid_stride
add r10, r5, #7
bic r10, r10, #7
@@ -108,8 +108,8 @@ function wiener_filter_h_8bpc_neon, export=1
0:
// !LR_HAVE_LEFT, fill q1 with the leftmost byte
// and shift q2 to have 3x the first byte at the front.
vdup.8 q1, d4[0]
vdup.8 q8, d18[0]
vdup.8 q1, d4[0]
vdup.8 q8, d18[0]
// Move r2 back to account for the last 3 bytes we loaded before,
// which we shifted out.
sub r2, r2, #3
@@ -127,7 +127,7 @@ function wiener_filter_h_8bpc_neon, export=1
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub r9, r5, #14
sub r9, r5, #14
ldrb r11, [r2, r9]
ldrb r9, [lr, r9]
// Fill q12/q13 with the right padding pixel
@@ -144,7 +144,6 @@ function wiener_filter_h_8bpc_neon, export=1
b 6f
4: // Loop horizontally
.macro filter_8
// This is tuned as some sort of compromise between Cortex A7, A8,
// A9 and A53.
vmul.s16 q3, q1, d0[0]
@@ -187,8 +186,6 @@ function wiener_filter_h_8bpc_neon, export=1
vshr.s16 q10, q10, #3
vadd.s16 q3, q3, q15
vadd.s16 q10, q10, q15
.endm
filter_8
vst1.16 {q3}, [r0, :128]!
vst1.16 {q10}, [r12, :128]!
@@ -206,50 +203,43 @@ function wiener_filter_h_8bpc_neon, export=1
5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
vext.8 d20, d2, d3, #2
vext.8 d21, d2, d3, #4
vext.8 d22, d2, d3, #6
vext.8 d23, d3, d4, #2
vext.8 d8, d3, d4, #4
vmul.s16 d6, d2, d0[0]
vext.8 q10, q1, q2, #2
vext.8 q11, q1, q2, #4
vmla.s16 d6, d20, d0[1]
vmla.s16 d6, d22, d0[2]
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vmla.s16 d6, d20, d0[3]
vmla.s16 d6, d22, d1[0]
vext.8 q10, q1, q2, #10
vext.8 q11, q1, q2, #12
vmla.s16 d6, d20, d1[1]
vmla.s16 d6, d22, d1[2]
vmla.s16 d6, d21, d0[2]
vmla.s16 d6, d22, d0[3]
vmla.s16 d6, d3, d1[0]
vmla.s16 d6, d23, d1[1]
vmla.s16 d6, d8, d1[2]
vmul.s16 d20, d16, d0[0]
vext.8 q11, q8, q9, #2
vext.8 q4, q8, q9, #4
vmla.s16 d20, d22, d0[1]
vmla.s16 d20, d8, d0[2]
vext.8 q11, q8, q9, #6
vext.8 q4, q8, q9, #8
vmla.s16 d20, d22, d0[3]
vmla.s16 d20, d8, d1[0]
vext.8 q11, q8, q9, #10
vext.8 q4, q8, q9, #12
vmla.s16 d20, d22, d1[1]
vmla.s16 d20, d8, d1[2]
vext.8 d20, d16, d17, #2
vext.8 d21, d16, d17, #4
vext.8 d22, d16, d17, #6
vext.8 d23, d17, d18, #2
vext.8 d8, d17, d18, #4
vmul.s16 d7, d16, d0[0]
vmla.s16 d7, d20, d0[1]
vmla.s16 d7, d21, d0[2]
vmla.s16 d7, d22, d0[3]
vmla.s16 d7, d17, d1[0]
vmla.s16 d7, d23, d1[1]
vmla.s16 d7, d8, d1[2]
vext.8 q11, q1, q2, #6
vshl.s16 d22, d22, #7
vsub.s16 d22, d22, d28
vqadd.s16 d6, d6, d22
vext.8 q11, q8, q9, #6
vshl.s16 d22, d22, #7
vsub.s16 d22, d22, d28
vqadd.s16 d20, d20, d22
vshr.s16 d6, d6, #3
vshr.s16 d20, d20, #3
vadd.s16 d6, d6, d30
vadd.s16 d20, d20, d30
vext.8 d22, d2, d3, #6
vext.8 d23, d16, d17, #6
vshl.s16 q11, q11, #7
vsub.s16 q11, q11, q14
vqadd.s16 q3, q3, q11
vshr.s16 q3, q3, #3
vadd.s16 q3, q3, q15
.endm
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d20}, [r12, :64]!
vst1.16 {d7}, [r12, :64]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q1, q1, q2, #8
@@ -323,7 +313,7 @@ L(variable_shift_tbl):
// w >= 4, filter 4 pixels
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d20}, [r12, :64]!
vst1.16 {d7}, [r12, :64]!
subs r5, r5, #4 // 0 <= w < 4
vext.8 q1, q1, q2, #8
vext.8 q8, q8, q9, #8
@@ -338,11 +328,11 @@ L(variable_shift_tbl):
vdup.16 d25, d16[3]
vpadd.s16 d6, d6, d6
vtrn.16 d24, d25
vshl.s16 d24, d24, #7
vsub.s16 d24, d24, d28
vqadd.s16 d6, d6, d24
vshr.s16 d6, d6, #3
vadd.s16 d6, d6, d30
vshl.s16 d24, d24, #7
vsub.s16 d24, d24, d28
vqadd.s16 d6, d6, d24
vshr.s16 d6, d6, #3
vadd.s16 d6, d6, d30
vst1.s16 {d6[0]}, [r0, :16]!
vst1.s16 {d6[1]}, [r12, :16]!
subs r5, r5, #1
@@ -363,7 +353,6 @@ L(variable_shift_tbl):
0:
vpop {q4}
pop {r4-r11,pc}
.purgem filter_8
.purgem filter_4
endfunc
@@ -422,22 +411,22 @@ function wiener_filter_v_8bpc_neon, export=1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
vmull.s16 q2, d16, d0[0]
vmlal.s16 q2, d18, d0[1]
vmlal.s16 q2, d20, d0[2]
vmlal.s16 q2, d22, d0[3]
vmlal.s16 q2, d24, d1[0]
vmlal.s16 q2, d26, d1[1]
vmlal.s16 q2, d28, d1[2]
vmull.s16 q3, d17, d0[0]
vmlal.s16 q3, d19, d0[1]
vmlal.s16 q3, d21, d0[2]
vmlal.s16 q3, d23, d0[3]
vmlal.s16 q3, d25, d1[0]
vmlal.s16 q3, d27, d1[1]
vmlal.s16 q3, d29, d1[2]
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vmull.s16 q2, d16, d0[0]
vmlal.s16 q2, d18, d0[1]
vmlal.s16 q2, d20, d0[2]
vmlal.s16 q2, d22, d0[3]
vmlal.s16 q2, d24, d1[0]
vmlal.s16 q2, d26, d1[1]
vmlal.s16 q2, d28, d1[2]
vmull.s16 q3, d17, d0[0]
vmlal.s16 q3, d19, d0[1]
vmlal.s16 q3, d21, d0[2]
vmlal.s16 q3, d23, d0[3]
vmlal.s16 q3, d25, d1[0]
vmlal.s16 q3, d27, d1[1]
vmlal.s16 q3, d29, d1[2]
vqrshrun.s32 d4, q2, #11
vqrshrun.s32 d5, q3, #11
vqmovun.s16 d4, q2
vst1.8 {d4}, [r0], r1
.if \compare
@@ -473,7 +462,7 @@ function wiener_filter_v_8bpc_neon, export=1
52: // 2 rows in total, q11 already loaded, load q12 with content data
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vmov q15, q14
vmov q15, q14
b 8f
53:
// 3 rows in total, q11 already loaded, load q12 and q13 with content
@@ -615,8 +604,8 @@ L(copy_narrow_tbl):
asr r1, r1, #1
22:
subs r4, r4, #1
vld1.16 {d0[]}, [r2]!
vst1.16 {d0[0]}, [r0], r1
vld1.16 {d0[]}, [r2, :16]!
vst1.16 {d0[0]}, [r0, :16], r1
bgt 22b
0:
pop {r4,pc}
@@ -644,8 +633,8 @@ L(copy_narrow_tbl):
ble 0f
b 42b
41:
vld1.32 {d0[]}, [r2]
vst1.32 {d0[0]}, [r0]
vld1.32 {d0[]}, [r2, :32]
vst1.32 {d0[0]}, [r0, :32]
0:
pop {r4,pc}
@@ -785,7 +774,7 @@ function sgr_box3_h_8bpc_neon, export=1
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub lr, r5, #(2 + 16 - 2 + 1)
sub lr, r5, #(2 + 16 - 2 + 1)
ldrb r11, [r3, lr]
ldrb lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
@@ -1058,7 +1047,7 @@ function sgr_box5_h_8bpc_neon, export=1
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub lr, r5, #(2 + 16 - 3 + 1)
sub lr, r5, #(2 + 16 - 3 + 1)
ldrb r11, [r3, lr]
ldrb lr, [r12, lr]
// Fill q14/q15 with the right padding pixel
@@ -1100,7 +1089,7 @@ function sgr_box5_h_8bpc_neon, export=1
vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
vaddw_u16_n q12, q13, d22, d23, \w
vadd_i32_n q12, q13, q8, q9, \w
vadd_i32_n q12, q13, q8, q9, \w
vext.8 q8, q5, q6, #2
vext.8 q9, q5, q6, #4
vext.8 q10, q5, q6, #6
@@ -1152,7 +1141,7 @@ function sgr_box5_h_8bpc_neon, export=1
6: // Pad the right edge and produce the last few pixels.
// w < 7, w+1 pixels valid in q0/q4
sub lr, r5, #1
sub lr, r5, #1
// lr = pixels valid - 2
adr r11, L(box5_variable_shift_tbl)
ldr lr, [r11, lr, lsl #2]


@@ -0,0 +1,720 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2020, Martin Storsjo
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "src/arm/asm.S"
#include "util.S"
// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
// const pixel *src, ptrdiff_t stride,
// const int16_t fh[7], const intptr_t w,
// int h, enum LrEdgeFlags edges,
// const int bitdepth_max);
function wiener_filter_h_16bpc_neon, export=1
push {r4-r11,lr}
vpush {q4-q7}
ldrd r4, r5, [sp, #100]
ldrd r6, r7, [sp, #108]
ldr r8, [sp, #116] // bitdepth_max
vld1.16 {q0}, [r4]
clz r8, r8
vmov.i32 q14, #1
sub r9, r8, #38 // -(bitdepth + 6)
sub r8, r8, #25 // -round_bits_h
neg r9, r9 // bitdepth + 6
vdup.32 q1, r9
vdup.32 q13, r8 // -round_bits_h
vmov.i16 q15, #8192
vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
mov r8, r5
// Calculate mid_stride
add r10, r5, #7
bic r10, r10, #7
lsl r10, r10, #1
// Clear the last unused element of q0, to allow filtering a single
// pixel with one plain vmul+vpadd.
mov r12, #0
vmov.16 d1[3], r12
// Set up pointers for reading/writing alternate rows
add r12, r0, r10
lsl r10, r10, #1
add lr, r2, r3
lsl r3, r3, #1
// Subtract the width from mid_stride
sub r10, r10, r5, lsl #1
// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
cmp r5, #8
add r11, r5, #13
bic r11, r11, #7
bge 1f
mov r11, #16
1:
sub r3, r3, r11, lsl #1
// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
tst r7, #1 // LR_HAVE_LEFT
beq 2f
// LR_HAVE_LEFT
cmp r1, #0
bne 0f
// left == NULL
sub r2, r2, #6
sub lr, lr, #6
b 1f
0: // LR_HAVE_LEFT, left != NULL
2: // !LR_HAVE_LEFT, increase the stride.
// For this case we don't read the left 3 pixels from the src pointer,
// but shift it as if we had done that.
add r3, r3, #6
1: // Loop vertically
vld1.16 {q2, q3}, [r2]!
vld1.16 {q4, q5}, [lr]!
tst r7, #1 // LR_HAVE_LEFT
beq 0f
cmp r1, #0
beq 2f
// LR_HAVE_LEFT, left != NULL
vld1.16 {d3}, [r1]!
// Move r2/lr back to account for the last 3 pixels we loaded earlier,
// which we'll shift out.
sub r2, r2, #6
sub lr, lr, #6
vld1.16 {d13}, [r1]!
vext.8 q3, q2, q3, #10
vext.8 q2, q1, q2, #10
vext.8 q5, q4, q5, #10
vext.8 q4, q6, q4, #10
b 2f
0:
// !LR_HAVE_LEFT, fill q1 with the leftmost pixel
// and shift q2/q3 to have 3x the first pixel at the front.
vdup.16 q1, d4[0]
vdup.16 q6, d8[0]
// Move r2 back to account for the last 3 pixels we loaded before,
// which we shifted out.
sub r2, r2, #6
sub lr, lr, #6
vext.8 q3, q2, q3, #10
vext.8 q2, q1, q2, #10
vext.8 q5, q4, q5, #10
vext.8 q4, q6, q4, #10
2:
tst r7, #2 // LR_HAVE_RIGHT
bne 4f
// If we'll need to pad the right edge, load that byte to pad with
// here since we can find it pretty easily from here.
sub r9, r5, #14
lsl r9, r9, #1
ldrh r11, [r2, r9]
ldrh r9, [lr, r9]
// Fill q11/q12 with the right padding pixel
vdup.16 q11, r11
vdup.16 q12, r9
3: // !LR_HAVE_RIGHT
// If we'll have to pad the right edge we need to quit early here.
cmp r5, #11
bge 4f // If w >= 11, all used input pixels are valid
cmp r5, #7
bge 5f // If w >= 7, we can filter 4 pixels
b 6f
4: // Loop horizontally
vext.8 q10, q2, q3, #6
vext.8 q8, q2, q3, #2
vext.8 q9, q2, q3, #4
vshll.u16 q6, d20, #7
vshll.u16 q7, d21, #7
vmlal.s16 q6, d4, d0[0]
vmlal.s16 q6, d16, d0[1]
vmlal.s16 q6, d18, d0[2]
vmlal.s16 q6, d20, d0[3]
vmlal.s16 q7, d5, d0[0]
vmlal.s16 q7, d17, d0[1]
vmlal.s16 q7, d19, d0[2]
vmlal.s16 q7, d21, d0[3]
vext.8 q8, q2, q3, #8
vext.8 q9, q2, q3, #10
vext.8 q10, q2, q3, #12
vmlal.s16 q6, d16, d1[0]
vmlal.s16 q6, d18, d1[1]
vmlal.s16 q6, d20, d1[2]
vmlal.s16 q7, d17, d1[0]
vmlal.s16 q7, d19, d1[1]
vmlal.s16 q7, d21, d1[2]
vext.8 q10, q4, q5, #6
vext.8 q2, q4, q5, #2
vshll.u16 q8, d20, #7
vshll.u16 q9, d21, #7
vmlal.s16 q8, d8, d0[0]
vmlal.s16 q8, d4, d0[1]
vmlal.s16 q8, d20, d0[3]
vmlal.s16 q9, d9, d0[0]
vmlal.s16 q9, d5, d0[1]
vmlal.s16 q9, d21, d0[3]
vext.8 q2, q4, q5, #4
vext.8 q10, q4, q5, #8
vmlal.s16 q8, d4, d0[2]
vmlal.s16 q8, d20, d1[0]
vmlal.s16 q9, d5, d0[2]
vmlal.s16 q9, d21, d1[0]
vext.8 q2, q4, q5, #10
vext.8 q10, q4, q5, #12
vmlal.s16 q8, d4, d1[1]
vmlal.s16 q8, d20, d1[2]
vmlal.s16 q9, d5, d1[1]
vmlal.s16 q9, d21, d1[2]
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 q6, q6, q14
vadd.i32 q7, q7, q14
vadd.i32 q8, q8, q14
vadd.i32 q9, q9, q14
vrshl.s32 q6, q6, q13
vrshl.s32 q7, q7, q13
vrshl.s32 q8, q8, q13
vrshl.s32 q9, q9, q13
vqmovun.s32 d12, q6
vqmovun.s32 d13, q7
vqmovun.s32 d14, q8
vqmovun.s32 d15, q9
vmin.u16 q6, q6, q10
vmin.u16 q7, q7, q10
vsub.i16 q6, q6, q15
vsub.i16 q7, q7, q15
vst1.16 {q6}, [r0, :128]!
vst1.16 {q7}, [r12, :128]!
subs r5, r5, #8
ble 9f
tst r7, #2 // LR_HAVE_RIGHT
vmov q2, q3
vmov q4, q5
vld1.16 {q3}, [r2]!
vld1.16 {q5}, [lr]!
bne 4b // If we don't need to pad, just keep filtering.
b 3b // If we need to pad, check how many pixels we have left.
5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
vext.8 d18, d4, d5, #6
vext.8 d16, d4, d5, #2
vext.8 d17, d4, d5, #4
vext.8 d19, d5, d6, #2
vext.8 d20, d5, d6, #4
vshll.u16 q6, d18, #7
vmlal.s16 q6, d4, d0[0]
vmlal.s16 q6, d16, d0[1]
vmlal.s16 q6, d17, d0[2]
vmlal.s16 q6, d18, d0[3]
vmlal.s16 q6, d5, d1[0]
vmlal.s16 q6, d19, d1[1]
vmlal.s16 q6, d20, d1[2]
vext.8 d18, d8, d9, #6
vext.8 d16, d8, d9, #2
vext.8 d17, d8, d9, #4
vext.8 d19, d9, d10, #2
vext.8 d20, d9, d10, #4
vshll.u16 q7, d18, #7
vmlal.s16 q7, d8, d0[0]
vmlal.s16 q7, d16, d0[1]
vmlal.s16 q7, d17, d0[2]
vmlal.s16 q7, d18, d0[3]
vmlal.s16 q7, d9, d1[0]
vmlal.s16 q7, d19, d1[1]
vmlal.s16 q7, d20, d1[2]
vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 q6, q6, q14
vadd.i32 q7, q7, q14
vrshl.s32 q6, q6, q13
vrshl.s32 q7, q7, q13
vqmovun.s32 d12, q6
vqmovun.s32 d13, q7
vmin.u16 q6, q6, q10
vsub.i16 q6, q6, q15
.endm
filter_4
vst1.16 {d12}, [r0, :64]!
vst1.16 {d13}, [r12, :64]!
subs r5, r5, #4 // 3 <= w < 7
vext.8 q2, q2, q3, #8
vext.8 q3, q3, q3, #8
vext.8 q4, q4, q5, #8
vext.8 q5, q5, q5, #8
6: // Pad the right edge and filter the last few pixels.
// w < 7, w+3 pixels valid in q2-q3
cmp r5, #5
blt 7f
bgt 8f
// w == 5, 8 pixels valid in q2, q3 invalid
vmov q3, q11
vmov q5, q12
b 88f
7: // 1 <= w < 5, 4-7 pixels valid in q2
sub r9, r5, #1
// r9 = (pixels valid - 4)
adr r11, L(variable_shift_tbl)
ldr r9, [r11, r9, lsl #2]
add r11, r11, r9
vmov q3, q11
vmov q5, q12
bx r11
.align 2
L(variable_shift_tbl):
.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
44: // 4 pixels valid in q2/q4, fill the high half with padding.
vmov d5, d6
vmov d9, d10
b 88f
// Shift q2 right, shifting out invalid pixels,
// shift q2 left to the original offset, shifting in padding pixels.
55: // 5 pixels valid
vext.8 q2, q2, q2, #10
vext.8 q2, q2, q3, #6
vext.8 q4, q4, q4, #10
vext.8 q4, q4, q5, #6
b 88f
66: // 6 pixels valid
vext.8 q2, q2, q2, #12
vext.8 q2, q2, q3, #4
vext.8 q4, q4, q4, #12
vext.8 q4, q4, q5, #4
b 88f
77: // 7 pixels valid
vext.8 q2, q2, q2, #14
vext.8 q2, q2, q3, #2
vext.8 q4, q4, q4, #14
vext.8 q4, q4, q5, #2
b 88f
8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3
vext.8 q3, q3, q3, #2
vext.8 q3, q3, q11, #14
vext.8 q5, q5, q5, #2
vext.8 q5, q5, q12, #14
88:
// w < 7, q2-q3 padded properly
cmp r5, #4
blt 888f
// w >= 4, filter 4 pixels
filter_4
vst1.16 {d12}, [r0, :64]!
vst1.16 {d13}, [r12, :64]!
subs r5, r5, #4 // 0 <= w < 4
vext.8 q2, q2, q3, #8
vext.8 q4, q4, q5, #8
beq 9f
888: // 1 <= w < 4, filter 1 pixel at a time
vmull.s16 q6, d4, d0
vmull.s16 q7, d5, d1
vmull.s16 q8, d8, d0
vmull.s16 q9, d9, d1
vadd.i32 q6, q7
vadd.i32 q8, q9
vpadd.i32 d12, d12, d13
vpadd.i32 d13, d16, d17
vdup.16 d14, d4[3]
vdup.16 d15, d8[3]
vpadd.i32 d12, d12, d13
vtrn.16 d14, d15
vadd.i32 d12, d12, d28
vshll.u16 q7, d14, #7
vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1
vadd.i32 d12, d12, d14
vrshl.s32 d12, d12, d26
vqmovun.s32 d12, q6
vmin.u16 d12, d12, d20
vsub.i16 d12, d12, d30
vst1.16 {d12[0]}, [r0, :16]!
vst1.16 {d12[1]}, [r12, :16]!
subs r5, r5, #1
vext.8 q2, q2, q3, #2
vext.8 q4, q4, q5, #2
bgt 888b
9:
subs r6, r6, #2
ble 0f
// Jump to the next row and loop horizontally
add r0, r0, r10
add r12, r12, r10
add r2, r2, r3
add lr, lr, r3
mov r5, r8
b 1b
0:
vpop {q4-q7}
pop {r4-r11,pc}
.purgem filter_4
endfunc
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
// const int16_t *mid, int w, int h,
// const int16_t fv[7], enum LrEdgeFlags edges,
// ptrdiff_t mid_stride, const int bitdepth_max);
function wiener_filter_v_16bpc_neon, export=1
push {r4-r7,lr}
vpush {q4-q5}
ldrd r4, r5, [sp, #52]
ldrd r6, r7, [sp, #60]
ldr lr, [sp, #68] // bitdepth_max
vmov.i16 q1, #0
mov r12, #128
vld1.16 {q0}, [r5]
vdup.16 q5, lr
clz lr, lr
vmov.i16 d2[3], r12
sub lr, lr, #11 // round_bits_v
vadd.i16 q0, q0, q1
vdup.32 q4, lr
mov lr, r4
vneg.s32 q4, q4 // -round_bits_v
// Calculate the number of rows to move back when looping vertically
mov r12, r4
tst r6, #4 // LR_HAVE_TOP
beq 0f
sub r2, r2, r7, lsl #1
add r12, r12, #2
0:
tst r6, #8 // LR_HAVE_BOTTOM
beq 1f
add r12, r12, #2
1: // Start of horizontal loop; start one vertical filter slice.
// Load rows into q8-q11 and pad properly.
tst r6, #4 // LR_HAVE_TOP
vld1.16 {q8}, [r2, :128], r7
beq 2f
// LR_HAVE_TOP
vld1.16 {q10}, [r2, :128], r7
vmov q9, q8
vld1.16 {q11}, [r2, :128], r7
b 3f
2: // !LR_HAVE_TOP
vmov q9, q8
vmov q10, q8
vmov q11, q8
3:
cmp r4, #4
blt 5f
// Start filtering normally; fill in q12-q14 with unique rows.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
vld1.16 {q14}, [r2, :128], r7
4:
.macro filter compare
subs r4, r4, #1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
vmull.s16 q2, d16, d0[0]
vmlal.s16 q2, d18, d0[1]
vmlal.s16 q2, d20, d0[2]
vmlal.s16 q2, d22, d0[3]
vmlal.s16 q2, d24, d1[0]
vmlal.s16 q2, d26, d1[1]
vmlal.s16 q2, d28, d1[2]
vmull.s16 q3, d17, d0[0]
vmlal.s16 q3, d19, d0[1]
vmlal.s16 q3, d21, d0[2]
vmlal.s16 q3, d23, d0[3]
vmlal.s16 q3, d25, d1[0]
vmlal.s16 q3, d27, d1[1]
vmlal.s16 q3, d29, d1[2]
vrshl.s32 q2, q2, q4 // round_bits_v
vrshl.s32 q3, q3, q4
vqmovun.s32 d4, q2
vqmovun.s32 d5, q3
vmin.u16 q2, q2, q5 // bitdepth_max
vst1.16 {q2}, [r0], r1
.if \compare
cmp r4, #4
.else
ble 9f
.endif
vmov q8, q9
vmov q9, q10
vmov q10, q11
vmov q11, q12
vmov q12, q13
vmov q13, q14
.endm
filter 1
blt 7f
vld1.16 {q14}, [r2, :128], r7
b 4b
5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
tst r6, #8 // LR_HAVE_BOTTOM
beq 6f
// LR_HAVE_BOTTOM
cmp r4, #2
// We load at least 2 rows in all cases.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
bgt 53f // 3 rows in total
beq 52f // 2 rows in total
51: // 1 row in total, q11 already loaded, load edge into q12-q14.
vmov q13, q12
b 8f
52: // 2 rows in total, q11 already loaded, load q12 with content data
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vmov q15, q14
b 8f
53:
// 3 rows in total, q11 already loaded, load q12 and q13 with content
// and 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vld1.16 {q15}, [r2, :128], r7
vmov q1, q15
b 8f
6:
// !LR_HAVE_BOTTOM
cmp r4, #2
bgt 63f // 3 rows in total
beq 62f // 2 rows in total
61: // 1 row in total, q11 already loaded, pad that into q12-q14.
vmov q12, q11
vmov q13, q11
vmov q14, q11
b 8f
62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
vld1.16 {q12}, [r2, :128], r7
vmov q13, q12
vmov q14, q12
vmov q15, q12
b 8f
63:
// 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
vld1.16 {q12}, [r2, :128], r7
vld1.16 {q13}, [r2, :128], r7
vmov q14, q13
vmov q15, q13
vmov q1, q13
b 8f
7:
// All registers up to q13 are filled already, 3 valid rows left.
// < 4 valid rows left; fill in padding and filter the last
// few rows.
tst r6, #8 // LR_HAVE_BOTTOM
beq 71f
// LR_HAVE_BOTTOM; load 2 rows of edge.
vld1.16 {q14}, [r2, :128], r7
vld1.16 {q15}, [r2, :128], r7
vmov q1, q15
b 8f
71:
// !LR_HAVE_BOTTOM, pad 3 rows
vmov q14, q13
vmov q15, q13
vmov q1, q13
8: // At this point, all registers up to q14-q15,q1 are loaded with
// edge/padding (depending on how many rows are left).
filter 0 // This branches to 9f when done
vmov q14, q15
vmov q15, q1
b 8b
9: // End of one vertical slice.
subs r3, r3, #8
ble 0f
// Move pointers back up to the top and loop horizontally.
mls r0, r1, lr, r0
mls r2, r7, r12, r2
add r0, r0, #16
add r2, r2, #16
mov r4, lr
b 1b
0:
vpop {q4-q5}
pop {r4-r7,pc}
.purgem filter
endfunc
// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
// const pixel *src, int w, int h);
function copy_narrow_16bpc_neon, export=1
push {r4,lr}
ldr r4, [sp, #8]
adr r12, L(copy_narrow_tbl)
ldr r3, [r12, r3, lsl #2]
add r12, r12, r3
bx r12
.align 2
L(copy_narrow_tbl):
.word 0
.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
10:
add r3, r0, r1
lsl r1, r1, #1
18:
subs r4, r4, #8
blt 110f
vld1.16 {q0}, [r2, :128]!
vst1.16 {d0[0]}, [r0, :16], r1
vst1.16 {d0[1]}, [r3, :16], r1
vst1.16 {d0[2]}, [r0, :16], r1
vst1.16 {d0[3]}, [r3, :16], r1
vst1.16 {d1[0]}, [r0, :16], r1
vst1.16 {d1[1]}, [r3, :16], r1
vst1.16 {d1[2]}, [r0, :16], r1
vst1.16 {d1[3]}, [r3, :16], r1
ble 0f
b 18b
110:
add r4, r4, #8
asr r1, r1, #1
11:
subs r4, r4, #1
vld1.16 {d0[]}, [r2]!
vst1.16 {d0[0]}, [r0], r1
bgt 11b
0:
pop {r4,pc}
20:
add r3, r0, r1
lsl r1, r1, #1
24:
subs r4, r4, #4
blt 210f
vld1.32 {q0}, [r2, :128]!
vst1.32 {d0[0]}, [r0, :32], r1
vst1.32 {d0[1]}, [r3, :32], r1
vst1.32 {d1[0]}, [r0, :32], r1
vst1.32 {d1[1]}, [r3, :32], r1
ble 0f
b 24b
210:
add r4, r4, #4
asr r1, r1, #1
22:
subs r4, r4, #1
vld1.32 {d0[]}, [r2, :32]!
vst1.32 {d0[0]}, [r0, :32], r1
bgt 22b
0:
pop {r4,pc}
30:
ldr r3, [r2]
ldrh r12, [r2, #4]
add r2, r2, #6
subs r4, r4, #1
str r3, [r0]
strh r12, [r0, #4]
add r0, r0, r1
bgt 30b
pop {r4,pc}
40:
add r3, r0, r1
lsl r1, r1, #1
42:
subs r4, r4, #2
blt 41f
vld1.16 {q0}, [r2, :128]!
vst1.16 {d0}, [r0, :64], r1
vst1.16 {d1}, [r3, :64], r1
ble 0f
b 42b
41:
vld1.16 {d0}, [r2, :64]
vst1.16 {d0}, [r0, :64]
0:
pop {r4,pc}
50:
vld1.16 {d0}, [r2]
ldrh r12, [r2, #8]
add r2, r2, #10
subs r4, r4, #1
vst1.16 {d0}, [r0]
strh r12, [r0, #8]
add r0, r0, r1
bgt 50b
pop {r4,pc}
60:
vld1.16 {d0}, [r2]
ldr r12, [r2, #8]
add r2, r2, #12
subs r4, r4, #1
vst1.16 {d0}, [r0]
str r12, [r0, #8]
add r0, r0, r1
bgt 60b
pop {r4,pc}
70:
vld1.16 {d0}, [r2]
ldr r12, [r2, #8]
ldrh lr, [r2, #12]
add r2, r2, #14
subs r4, r4, #1
vst1.16 {d0}, [r0]
str r12, [r0, #8]
strh lr, [r0, #12]
add r0, r0, r1
bgt 70b
pop {r4,pc}
endfunc
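As a cross-check on the constant setup at the top of wiener_filter_h_16bpc_neon above, where clz of bitdepth_max yields the -(bitdepth + 6) and -round_bits_h values, here is the same arithmetic in C for the two high-bitdepth cases (my reading of the asm comments; __builtin_clz is the GCC/Clang intrinsic):

#include <assert.h>

int main(void) {
    /* 10 bpc: bitdepth_max = 0x3ff, so clz = 32 - 10 = 22 */
    assert(__builtin_clz(0x3ffu) - 38 == -(10 + 6)); /* r9: -(bitdepth + 6) */
    assert(__builtin_clz(0x3ffu) - 25 == -3);        /* r8: round_bits_h = 3 */
    /* 12 bpc: bitdepth_max = 0xfff, so clz = 32 - 12 = 20 */
    assert(__builtin_clz(0xfffu) - 38 == -(12 + 6));
    assert(__builtin_clz(0xfffu) - 25 == -5);        /* round_bits_h = 5 */
    return 0;
}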


@@ -1403,12 +1403,12 @@ L(\type\()_8tap_h_tbl):
vld1.8 {d24}, [\sr2], \s_strd
vmovl.u8 q8, d16
vmovl.u8 q12, d24
vext.8 q9, q8, q8, #2
vext.8 q10, q8, q8, #4
vext.8 q11, q8, q8, #6
vext.8 q13, q12, q12, #2
vext.8 q14, q12, q12, #4
vext.8 q15, q12, q12, #6
vext.8 d18, d16, d17, #2
vext.8 d20, d16, d17, #4
vext.8 d22, d16, d17, #6
vext.8 d26, d24, d25, #2
vext.8 d28, d24, d25, #4
vext.8 d30, d24, d25, #6
subs \h, \h, #2
vmul.s16 d4, d16, d0[0]
vmla.s16 d4, d18, d0[1]
@@ -1431,7 +1431,7 @@ L(\type\()_8tap_h_tbl):
pop {r4-r11,pc}
80: // 8xN h
vld1.8 {d0}, [\mx]
vld1.8 {d0}, [\mx, :64]
sub \src, \src, #3
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1482,7 +1482,7 @@ L(\type\()_8tap_h_tbl):
// one temporary for vext in the loop. That's slower on A7 and A53,
// (but surprisingly, marginally faster on A8 and A73).
vpush {q4-q6}
vld1.8 {d0}, [\mx]
vld1.8 {d0}, [\mx, :64]
sub \src, \src, #3
add \ds2, \dst, \d_strd
add \sr2, \src, \s_strd
@@ -1629,7 +1629,7 @@ L(\type\()_8tap_v_tbl):
28: // 2x8, 2x16 v
vpush {q4-q7}
vld1.8 {d0}, [\my]
vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
@@ -1709,7 +1709,7 @@ L(\type\()_8tap_v_tbl):
480: // 4x8, 4x16 v
vpush {q4}
vld1.8 {d0}, [\my]
vld1.8 {d0}, [\my, :64]
sub \sr2, \src, \s_strd, lsl #1
add \ds2, \dst, \d_strd
sub \src, \sr2, \s_strd
@@ -1782,7 +1782,7 @@ L(\type\()_8tap_v_tbl):
640:
1280:
vpush {q4}
vld1.8 {d0}, [\my]
vld1.8 {d0}, [\my, :64]
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1
vmovl.s8 q0, d0
@@ -1951,11 +1951,10 @@ L(\type\()_8tap_hv_tbl):
bl L(\type\()_8tap_filter_2)
vext.8 d18, d17, d26, #4
vmov d19, d26
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
vmlal.s16 q2, d19, d2[3]
vmlal.s16 q2, d26, d2[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqmovun.s16 d4, q2
@@ -1964,11 +1963,11 @@ L(\type\()_8tap_hv_tbl):
vst1.16 {d4[1]}, [\ds2, :16], \d_strd
ble 0f
vmov d16, d18
vmov d17, d19
vmov d17, d26
b 2b
280: // 2x8, 2x16, 2x32 hv
vld1.8 {d2}, [\my]
vld1.8 {d2}, [\my, :64]
sub \src, \src, #1
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
@@ -2001,7 +2000,6 @@ L(\type\()_8tap_hv_tbl):
28:
bl L(\type\()_8tap_filter_2)
vext.8 d22, d21, d26, #4
vmov d23, d26
vmull.s16 q2, d16, d2[0]
vmlal.s16 q2, d17, d2[1]
vmlal.s16 q2, d18, d2[2]
@@ -2009,7 +2007,7 @@ L(\type\()_8tap_hv_tbl):
vmlal.s16 q2, d20, d3[0]
vmlal.s16 q2, d21, d3[1]
vmlal.s16 q2, d22, d3[2]
vmlal.s16 q2, d23, d3[3]
vmlal.s16 q2, d26, d3[3]
vqrshrn.s32 d4, q2, #\shift_hv
vqmovun.s16 d4, q2
@@ -2022,7 +2020,7 @@ L(\type\()_8tap_hv_tbl):
vmov d18, d20
vmov d19, d21
vmov d20, d22
vmov d21, d23
vmov d21, d26
b 28b
0:
@@ -2108,7 +2106,7 @@ L(\type\()_8tap_filter_2):
b 4b
480: // 4x8, 4x16, 4x32 hv
vld1.8 {d2}, [\my]
vld1.8 {d2}, [\my, :64]
sub \src, \src, #1
sub \sr2, \src, \s_strd, lsl #1
sub \src, \sr2, \s_strd
@@ -2211,7 +2209,7 @@ L(\type\()_8tap_filter_4):
bgt 880f
vpush {q4-q7}
add \my, \my, #2
vld1.8 {d0}, [\mx]
vld1.8 {d0}, [\mx, :64]
vld1.32 {d2[]}, [\my]
sub \src, \src, #3
sub \src, \src, \s_strd
@@ -2301,8 +2299,8 @@ L(\type\()_8tap_filter_4):
640:
1280:
vpush {q4-q7}
vld1.8 {d0}, [\mx]
vld1.8 {d2}, [\my]
vld1.8 {d0}, [\mx, :64]
vld1.8 {d2}, [\my, :64]
sub \src, \src, #3
sub \src, \src, \s_strd
sub \src, \src, \s_strd, lsl #1

third_party/dav1d/src/arm/32/mc16.S (vendored, new file, 2429 lines)
File diff suppressed because it is too large.

@@ -172,13 +172,13 @@ function wiener_filter_h_16bpc_neon, export=1
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
ext v18.16b, v2.16b, v3.16b, #6
ext v16.16b, v2.16b, v3.16b, #2
ext v17.16b, v2.16b, v3.16b, #4
ext v18.16b, v2.16b, v3.16b, #6
ext v19.16b, v2.16b, v3.16b, #8
ext v20.16b, v2.16b, v3.16b, #10
ext v21.16b, v2.16b, v3.16b, #12
ushll_sz v6, v7, v18, #7, \wd
ext v21.16b, v2.16b, v3.16b, #12
smlal v6.4s, v2.4h, v0.h[0]
smlal v6.4s, v16.4h, v0.h[1]
smlal v6.4s, v17.4h, v0.h[2]
@@ -195,13 +195,13 @@ function wiener_filter_h_16bpc_neon, export=1
smlal2 v7.4s, v20.8h, v0.h[5]
smlal2 v7.4s, v21.8h, v0.h[6]
.endif
ext v21.16b, v4.16b, v5.16b, #6
ext v19.16b, v4.16b, v5.16b, #2
ext v20.16b, v4.16b, v5.16b, #4
ext v21.16b, v4.16b, v5.16b, #6
ext v22.16b, v4.16b, v5.16b, #8
ext v23.16b, v4.16b, v5.16b, #10
ext v24.16b, v4.16b, v5.16b, #12
ushll_sz v16, v17, v21, #7, \wd
ext v24.16b, v4.16b, v5.16b, #12
smlal v16.4s, v4.4h, v0.h[0]
smlal v16.4s, v19.4h, v0.h[1]
smlal v16.4s, v20.4h, v0.h[2]
@@ -334,9 +334,9 @@ L(variable_shift_tbl):
ins v6.s[1], v7.s[0]
mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
ushll v16.4s, v16.4h, #7
add v6.4s, v6.4s, v30.4s
add v6.4s, v6.4s, v16.4s
srshl v6.4s, v6.4s, v29.4s
add v6.2s, v6.2s, v30.2s
add v6.2s, v6.2s, v16.2s
srshl v6.2s, v6.2s, v29.2s
sqxtun v6.4h, v6.4s
umin v6.4h, v6.4h, v24.4h
sub v6.4h, v6.4h, v31.4h


@@ -1906,11 +1906,10 @@ L(\type\()_8tap_hv):
bl L(\type\()_8tap_filter_2)
ext v18.8b, v17.8b, v28.8b, #4
mov v19.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v2.4s, v19.4h, v1.h[3]
smlal v2.4s, v28.4h, v1.h[3]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -1919,7 +1918,7 @@ L(\type\()_8tap_hv):
st1 {v2.h}[1], [\ds2], \d_strd
b.le 0f
mov v16.8b, v18.8b
mov v17.8b, v19.8b
mov v17.8b, v28.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -1956,7 +1955,6 @@ L(\type\()_8tap_hv):
28:
bl L(\type\()_8tap_filter_2)
ext v22.8b, v21.8b, v28.8b, #4
mov v23.8b, v28.8b
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
@@ -1964,7 +1962,7 @@ L(\type\()_8tap_hv):
smlal v2.4s, v20.4h, v1.h[4]
smlal v2.4s, v21.4h, v1.h[5]
smlal v2.4s, v22.4h, v1.h[6]
smlal v2.4s, v23.4h, v1.h[7]
smlal v2.4s, v28.4h, v1.h[7]
sqrshrn v2.4h, v2.4s, #\shift_hv
sqxtun v2.8b, v2.8h
@@ -1977,7 +1975,7 @@ L(\type\()_8tap_hv):
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
mov v21.8b, v23.8b
mov v21.8b, v28.8b
b 28b
0:


@@ -1004,11 +1004,11 @@ function put_neon
b.gt 2b
ret
4:
ld1 {v0.8b}, [x2], x3
ld1 {v1.8b}, [x2], x3
ld1 {v0.4h}, [x2], x3
ld1 {v1.4h}, [x2], x3
subs w5, w5, #2
st1 {v0.8b}, [x0], x1
st1 {v1.8b}, [x0], x1
st1 {v0.4h}, [x0], x1
st1 {v1.4h}, [x0], x1
b.gt 4b
ret
80:
@@ -1017,11 +1017,11 @@ function put_neon
add x9, x2, x3
lsl x3, x3, #1
8:
ld1 {v0.16b}, [x2], x3
ld1 {v1.16b}, [x9], x3
ld1 {v0.8h}, [x2], x3
ld1 {v1.8h}, [x9], x3
subs w5, w5, #2
st1 {v0.16b}, [x0], x1
st1 {v1.16b}, [x8], x1
st1 {v0.8h}, [x0], x1
st1 {v1.8h}, [x8], x1
b.gt 8b
ret
16:
@@ -2039,7 +2039,6 @@ L(\type\()_8tap_hv):
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
sxtl v1.4s, v1.4h
ld1 {v27.8h}, [\src], \s_strd
ext v28.16b, v27.16b, v27.16b, #2
@@ -2049,19 +2048,23 @@ L(\type\()_8tap_hv):
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
bl L(\type\()_8tap_filter_2)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
xtn v16.4h, v16.4s
trn1 v16.2d, v16.2d, v24.2d
mov v17.16b, v24.16b
trn1 v16.2s, v16.2s, v24.2s
mov v17.8b, v24.8b
2:
bl L(\type\()_8tap_filter_2)
ext v18.16b, v17.16b, v24.16b, #8
mov v19.16b, v24.16b
mul v2.4s, v16.4s, v1.s[0]
mla v2.4s, v17.4s, v1.s[1]
mla v2.4s, v18.4s, v1.s[2]
mla v2.4s, v19.4s, v1.s[3]
ext v18.8b, v17.8b, v24.8b, #4
smull v2.4s, v16.4h, v1.h[0]
smlal v2.4s, v17.4h, v1.h[1]
smlal v2.4s, v18.4h, v1.h[2]
smlal v2.4s, v24.4h, v1.h[3]
srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
sqxtun v2.4h, v2.4s
@@ -2070,8 +2073,8 @@ L(\type\()_8tap_hv):
st1 {v2.s}[0], [\dst], \d_strd
st1 {v2.s}[1], [\ds2], \d_strd
b.le 0f
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v16.8b, v18.8b
mov v17.8b, v24.8b
b 2b
280: // 2x8, 2x16, 2x32 hv
@@ -2085,8 +2088,6 @@ L(\type\()_8tap_hv):
sxtl v0.8h, v0.8b
sxtl v1.8h, v1.8b
mov x15, x30
sxtl2 v2.4s, v1.8h
sxtl v1.4s, v1.4h
ld1 {v27.8h}, [\src], \s_strd
ext v28.16b, v27.16b, v27.16b, #2
@@ -2095,29 +2096,33 @@ L(\type\()_8tap_hv):
addp v27.4s, v27.4s, v28.4s
addp v16.4s, v27.4s, v27.4s
srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
// The intermediates from the horizontal pass fit in 16 bit without
// any bias; we could just as well keep them as .4s, but narrowing
// them to .4h gives a significant speedup on out of order cores
// (at the cost of a smaller slowdown on in-order cores such as A53).
bl L(\type\()_8tap_filter_2)
trn1 v16.2d, v16.2d, v24.2d
mov v17.16b, v24.16b
xtn v16.4h, v16.4s
trn1 v16.2s, v16.2s, v24.2s
mov v17.8b, v24.8b
bl L(\type\()_8tap_filter_2)
ext v18.16b, v17.16b, v24.16b, #8
mov v19.16b, v24.16b
ext v18.8b, v17.8b, v24.8b, #4
mov v19.8b, v24.8b
bl L(\type\()_8tap_filter_2)
ext v20.16b, v19.16b, v24.16b, #8
mov v21.16b, v24.16b
ext v20.8b, v19.8b, v24.8b, #4
mov v21.8b, v24.8b
28:
bl L(\type\()_8tap_filter_2)
ext v22.16b, v21.16b, v24.16b, #8
mov v23.16b, v24.16b
mul v3.4s, v16.4s, v1.s[0]
mla v3.4s, v17.4s, v1.s[1]
mla v3.4s, v18.4s, v1.s[2]
mla v3.4s, v19.4s, v1.s[3]
mla v3.4s, v20.4s, v2.s[0]
mla v3.4s, v21.4s, v2.s[1]
mla v3.4s, v22.4s, v2.s[2]
mla v3.4s, v23.4s, v2.s[3]
ext v22.8b, v21.8b, v24.8b, #4
smull v3.4s, v16.4h, v1.h[0]
smlal v3.4s, v17.4h, v1.h[1]
smlal v3.4s, v18.4h, v1.h[2]
smlal v3.4s, v19.4h, v1.h[3]
smlal v3.4s, v20.4h, v1.h[4]
smlal v3.4s, v21.4h, v1.h[5]
smlal v3.4s, v22.4h, v1.h[6]
smlal v3.4s, v24.4h, v1.h[7]
srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
sqxtun v3.4h, v3.4s
@@ -2126,12 +2131,12 @@ L(\type\()_8tap_hv):
st1 {v3.s}[0], [\dst], \d_strd
st1 {v3.s}[1], [\ds2], \d_strd
b.le 0f
mov v16.16b, v18.16b
mov v17.16b, v19.16b
mov v18.16b, v20.16b
mov v19.16b, v21.16b
mov v20.16b, v22.16b
mov v21.16b, v23.16b
mov v16.8b, v18.8b
mov v17.8b, v19.8b
mov v18.8b, v20.8b
mov v19.8b, v21.8b
mov v20.8b, v22.8b
mov v21.8b, v24.8b
b 28b
0:
@@ -2151,6 +2156,7 @@ L(\type\()_8tap_filter_2):
smlal v24.4s, v27.4h, v0.h[2]
smlal v24.4s, v28.4h, v0.h[3]
srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
xtn v24.4h, v24.4s
ret
.endif


@@ -29,7 +29,6 @@
#include "src/looprestoration.h"
#include "src/tables.h"
#if BITDEPTH == 8 || ARCH_AARCH64
// The 8bpc version calculates things slightly differently than the reference
// C version. That version calculates roughly this:
// int16_t sum = 0;
@@ -105,6 +104,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
}
}
#if BITDEPTH == 8 || ARCH_AARCH64
void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
const pixel (*left)[4],
const pixel *src, const ptrdiff_t stride,
@@ -290,8 +290,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPCont
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 || ARCH_AARCH64
c->wiener = wiener_filter_neon;
#if BITDEPTH == 8 || ARCH_AARCH64
if (bpc <= 10)
c->selfguided = sgr_filter_neon;
#endif
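The effect of relocating the BITDEPTH/ARCH_AARCH64 guard above is that wiener_filter_neon is now registered for 16 bpc on 32-bit ARM as well, backed by the new arm/32/looprestoration16.S in this commit, while the self-guided (sgr) path stays limited to 8 bpc or AArch64.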


@@ -77,7 +77,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
#if BITDEPTH == 8 || ARCH_AARCH64
init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);
@@ -103,6 +102,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
c->avg = BF(dav1d_avg, neon);
c->w_avg = BF(dav1d_w_avg, neon);
c->mask = BF(dav1d_mask, neon);
#if BITDEPTH == 8 || ARCH_AARCH64
c->blend = BF(dav1d_blend, neon);
c->blend_h = BF(dav1d_blend_h, neon);
c->blend_v = BF(dav1d_blend_v, neon);


@@ -773,10 +773,10 @@ static int decode_b(Dav1dTileContext *const t,
signabs(t->warpmv.matrix[3]),
signabs(t->warpmv.matrix[4]),
signabs(t->warpmv.matrix[5]),
signabs(t->warpmv.alpha),
signabs(t->warpmv.beta),
signabs(t->warpmv.gamma),
signabs(t->warpmv.delta),
signabs(t->warpmv.u.p.alpha),
signabs(t->warpmv.u.p.beta),
signabs(t->warpmv.u.p.gamma),
signabs(t->warpmv.u.p.delta),
b->mv2d.y, b->mv2d.x);
#undef signabs
}
@@ -1843,10 +1843,10 @@ static int decode_b(Dav1dTileContext *const t,
signabs(t->warpmv.matrix[3]),
signabs(t->warpmv.matrix[4]),
signabs(t->warpmv.matrix[5]),
signabs(t->warpmv.alpha),
signabs(t->warpmv.beta),
signabs(t->warpmv.gamma),
signabs(t->warpmv.delta),
signabs(t->warpmv.u.p.alpha),
signabs(t->warpmv.u.p.beta),
signabs(t->warpmv.u.p.gamma),
signabs(t->warpmv.u.p.delta),
b->mv[0].y, b->mv[0].x);
#undef signabs
if (f->frame_thread.pass) {


@@ -82,7 +82,7 @@ libdav1d_entrypoints_sources = files(
)
# ASM specific sources
libdav1d_nasm_objs = []
libdav1d_asm_objs = []
# Arch-specific flags
arch_flags = []
if is_asm_enabled
@@ -102,7 +102,7 @@ if is_asm_enabled
)
if (host_machine.cpu_family() == 'aarch64' or
host_machine.cpu() == 'arm64')
libdav1d_sources += files(
libdav1d_sources_asm = files(
# itx.S is used for both 8 and 16 bpc.
'arm/64/itx.S',
'arm/64/looprestoration_common.S',
@@ -110,7 +110,7 @@ if is_asm_enabled
)
if dav1d_bitdepths.contains('8')
libdav1d_sources += files(
libdav1d_sources_asm += files(
'arm/64/cdef.S',
'arm/64/ipred.S',
'arm/64/loopfilter.S',
@@ -120,7 +120,7 @@ if is_asm_enabled
endif
if dav1d_bitdepths.contains('16')
libdav1d_sources += files(
libdav1d_sources_asm += files(
'arm/64/cdef16.S',
'arm/64/ipred16.S',
'arm/64/itx16.S',
@@ -130,12 +130,12 @@ if is_asm_enabled
)
endif
elif host_machine.cpu_family().startswith('arm')
libdav1d_sources += files(
libdav1d_sources_asm = files(
'arm/32/msac.S',
)
if dav1d_bitdepths.contains('8')
libdav1d_sources += files(
libdav1d_sources_asm += files(
'arm/32/cdef.S',
'arm/32/ipred.S',
'arm/32/itx.S',
@@ -146,10 +146,18 @@ if is_asm_enabled
endif
if dav1d_bitdepths.contains('16')
libdav1d_sources += files(
libdav1d_sources_asm += files(
'arm/32/looprestoration16.S',
'arm/32/mc16.S',
)
endif
endif
if use_gaspp
libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm)
else
libdav1d_sources += libdav1d_sources_asm
endif
elif host_machine.cpu_family().startswith('x86')
libdav1d_sources += files(
@@ -200,7 +208,7 @@ if is_asm_enabled
endif
# Compile the ASM sources with NASM
libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
elif host_machine.cpu() == 'ppc64le'
arch_flags = ['-maltivec', '-mvsx']
libdav1d_sources += files(
@@ -222,17 +230,6 @@ api_export_flags = []
#
if host_machine.system() == 'windows' and get_option('default_library') != 'static'
rc_version_array = meson.project_version().split('.')
winmod = import('windows')
rc_data = configuration_data()
rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
rc_data.set('COPYRIGHT_YEARS', '2019')
rc_file = configure_file(
input : 'dav1d.rc.in',
output : 'dav1d.rc',
@@ -301,7 +298,7 @@ endif
libdav1d = library('dav1d',
libdav1d_sources,
libdav1d_nasm_objs,
libdav1d_asm_objs,
libdav1d_rc_obj,
objects : [


@@ -112,6 +112,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
struct Dav1dSequenceHeaderOperatingPoint *const op =
&hdr->operating_points[i];
op->idc = dav1d_get_bits(gb, 12);
if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00)))
goto error;
op->major_level = 2 + dav1d_get_bits(gb, 3);
op->minor_level = dav1d_get_bits(gb, 2);
op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
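The added idc check follows the AV1 semantics of operating_point_idc: bits 0-7 select temporal layers and bits 8-11 spatial layers, so any nonzero value must set at least one bit in each group. A small illustration, wrapping the same condition in a hypothetical helper:

#include <assert.h>

static int idc_valid(const unsigned idc) { /* same condition as the hunk */
    return !idc || ((idc & 0xff) && (idc & 0xf00));
}

int main(void) {
    assert(idc_valid(0x000));  /* zero means decode all layers */
    assert(idc_valid(0x101));  /* spatial layer 0 plus temporal layer 0 */
    assert(!idc_valid(0x001)); /* temporal layers only: rejected */
    assert(!idc_valid(0x100)); /* spatial layers only: rejected */
    return 0;
}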


@@ -1082,11 +1082,11 @@ static int warp_affine(Dav1dTileContext *const t,
const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
const int dx = (int) (mvx >> 16) - 4;
const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
wmp->beta * 7) & ~0x3f;
const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
wmp->u.p.beta * 7) & ~0x3f;
const int dy = (int) (mvy >> 16) - 4;
const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
wmp->delta * 4) & ~0x3f;
const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
wmp->u.p.delta * 4) & ~0x3f;
const pixel *ref_ptr;
ptrdiff_t ref_stride = refp->p.stride[!!pl];
@@ -1108,10 +1108,10 @@ }
}
if (dst16 != NULL)
dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
else
dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
}
if (dst8) dst8 += 8 * PXSTRIDE(dstride);
else dst16 += 8 * dstride;


@@ -391,10 +391,10 @@ const Dav1dWarpedMotionParams dav1d_default_wm_params = {
0, 0, 1 << 16,
0, 0, 1 << 16,
},
.alpha = 0,
.beta = 0,
.gamma = 0,
.delta = 0,
.u.p.alpha = 0,
.u.p.beta = 0,
.u.p.gamma = 0,
.u.p.delta = 0,
};
const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {


@@ -82,21 +82,21 @@ int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) {
if (mat[2] <= 0) return 1;
wm->alpha = iclip_wmp(mat[2] - 0x10000);
wm->beta = iclip_wmp(mat[3]);
wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000);
wm->u.p.beta = iclip_wmp(mat[3]);
int shift;
const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
const int rnd = (1 << shift) >> 1;
wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
wm->delta = iclip_wmp(mat[5] -
wm->u.p.delta = iclip_wmp(mat[5] -
apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
0x10000);
return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) ||
(4 * abs(wm->gamma) + 4 * abs(wm->delta) >= 0x10000);
return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) ||
(4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000);
}
static int resolve_divisor_64(const uint64_t d, int *const shift) {

File diff suppressed because it is too large.

File diff suppressed because it is too large.

third_party/dav1d/tests/header_test.c (vendored, new file, 33 lines)

@@ -0,0 +1,33 @@
/*
* Copyright © 2018, VideoLAN and dav1d authors
* Copyright © 2018, Two Orioles, LLC
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include DAV1D_TEST_HEADER
int main()
{
return 0;
}


@@ -31,6 +31,7 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <dav1d/dav1d.h>
#include "src/cpu.h"
@@ -38,8 +39,6 @@
#ifdef DAV1D_ALLOC_FAIL
#include <stdlib.h>
#include "alloc_fail.h"
static unsigned djb_xor(const uint8_t * c, size_t len) {
@@ -56,6 +55,39 @@ static unsigned r32le(const uint8_t *const p) {
#define DAV1D_FUZZ_MAX_SIZE 4096 * 4096
// search for "--cpumask xxx" in argv and remove both parameters
int LLVMFuzzerInitialize(int *argc, char ***argv) {
int i = 1;
for (; i < *argc; i++) {
if (!strcmp((*argv)[i], "--cpumask")) {
const char * cpumask = (*argv)[i+1];
if (cpumask) {
char *end;
unsigned res;
if (!strncmp(cpumask, "0x", 2)) {
cpumask += 2;
res = (unsigned) strtoul(cpumask, &end, 16);
} else {
res = (unsigned) strtoul(cpumask, &end, 0);
}
if (end != cpumask && !end[0]) {
dav1d_set_cpu_flags_mask(res);
}
}
break;
}
}
for (; i < *argc - 2; i++) {
(*argv)[i] = (*argv)[i + 2];
}
*argc = i;
return 0;
}
// expects ivf input
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
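This lets a fuzzer run pin the CPU feature set before decoding: the value parses with strtoul (base 16 after a 0x prefix, base-0 auto-detection otherwise) and both arguments are spliced out of argv before use. A hypothetical invocation such as dav1d_fuzzer --cpumask 0 case.ivf would mask out every SIMD flag and exercise the plain C paths.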


@@ -31,6 +31,7 @@
#include <stddef.h>
#include <stdint.h>
int LLVMFuzzerInitialize(int *argc, char ***argv);
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
#endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */


@@ -40,7 +40,7 @@
// expects ivf input
int main(const int argc, char *const *const argv) {
int main(int argc, char *argv[]) {
int ret = -1;
FILE *f = NULL;
int64_t fsize;
@@ -48,6 +48,10 @@ int main(const int argc, char *const *const argv) {
uint8_t *data = NULL;
size_t size = 0;
if (LLVMFuzzerInitialize(&argc, &argv)) {
return 1;
}
if (argc != 2) {
fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]);
return -1;


@@ -31,8 +31,6 @@ if not get_option('enable_tests')
subdir_done()
endif
libdav1d_nasm_objs_if_needed = []
if is_asm_enabled
checkasm_sources = files(
'checkasm/checkasm.c',
@@ -62,25 +60,27 @@ if is_asm_enabled
checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects()
endforeach
checkasm_nasm_objs = []
checkasm_asm_objs = []
checkasm_asm_sources = []
if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64'
checkasm_sources += files('checkasm/arm/checkasm_64.S')
checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
elif host_machine.cpu_family().startswith('arm')
checkasm_sources += files('checkasm/arm/checkasm_32.S')
checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
elif host_machine.cpu_family().startswith('x86')
checkasm_nasm_objs = nasm_gen.process(files('checkasm/x86/checkasm.asm'))
checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
endif
if use_gaspp
checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources)
else
checkasm_sources += checkasm_asm_sources
endif
m_lib = cc.find_library('m', required: false)
if meson.version().version_compare('< 0.48.999')
libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs
endif
checkasm = executable('checkasm',
checkasm_sources,
checkasm_nasm_objs,
libdav1d_nasm_objs_if_needed,
checkasm_asm_objs,
objects: [
checkasm_bitdepth_objs,
@@ -101,10 +101,30 @@ if is_asm_enabled
test('checkasm', checkasm, is_parallel: false)
endif
c99_extension_flag = cc.first_supported_argument(
'-Werror=c11-extensions',
'-Werror=c99-c11-compat',
'-Wc11-extensions',
'-Wc99-c11-compat',
)
# dav1d_api_headers
foreach header : dav1d_api_headers
target = header + '_test'
header_test_exe = executable(target,
'header_test.c',
include_directories: dav1d_inc_dirs,
c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag],
build_by_default: true
)
test(target, header_test_exe)
endforeach
# fuzzing binaries
if meson.version().version_compare('>=0.49')
subdir('libfuzzer')
endif
subdir('libfuzzer')
# Include dav1d test data repository with additional tests
if get_option('testdata_tests')
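The new foreach builds one tiny executable per installed header, compiling header_test.c (added above) with -DDAV1D_TEST_HEADER set to that header and the strictest available C99-extension warning promoted to an error; this is the mechanism that forces the anonymous-union renaming in headers.h earlier in this commit. The version_compare guard around subdir('libfuzzer') can be dropped because the top-level meson_version floor was raised to 0.49.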


@@ -124,11 +124,15 @@ static void print_stats(const int istty, const unsigned n, const unsigned num,
else
b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
n, num, 100.0 * n / num);
if (i_fps && b < end) {
if (b < end) {
const double d_fps = 1e9 * n / elapsed;
const double speed = d_fps / i_fps;
b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
d_fps, i_fps, speed);
if (i_fps) {
const double speed = d_fps / i_fps;
b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
d_fps, i_fps, speed);
} else {
b += snprintf(b, end - b, " - %.2lf fps", d_fps);
}
}
if (!istty)
strcpy(b > end - 2 ? end - 2 : b, "\n");

third_party/dav1d/tools/dav1d.manifest (vendored, new file, 10 lines)

@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
<assemblyIdentity type="win32" name="VideoLAN.dav1d" version="1.0.0.0"/>
<application xmlns="urn:schemas-microsoft-com:asm.v3">
<windowsSettings>
<longPathAware xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">true</longPathAware>
<activeCodePage xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage>
</windowsSettings>
</application>
</assembly>

third_party/dav1d/tools/dav1d.rc.in (vendored, new file, 33 lines)

@@ -0,0 +1,33 @@
#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
#include <windows.h>
1 RT_MANIFEST "dav1d.manifest"
1 VERSIONINFO
FILETYPE VFT_APP
FILEOS VOS_NT_WINDOWS32
PRODUCTVERSION PROJECT_VERSION_NUMBER
FILEVERSION API_VERSION_NUMBER
BEGIN
BLOCK "StringFileInfo"
BEGIN
BLOCK "040904E4"
BEGIN
VALUE "CompanyName", "VideoLAN"
VALUE "ProductName", "dav1d"
VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
VALUE "FileVersion", API_VERSION_NUMBER_STR
VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
VALUE "InternalName", "dav1d"
VALUE "OriginalFilename", "dav1d.exe"
VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
END
END
BLOCK "VarFileInfo"
BEGIN
VALUE "Translation", 0x409, 1252
END
END


@@ -77,8 +77,24 @@ dav1d_sources = files(
'dav1d_cli_parse.c',
)
if host_machine.system() == 'windows'
rc_file = configure_file(
input : 'dav1d.rc.in',
output : 'dav1d.rc',
configuration : rc_data
)
dav1d_rc_obj = winmod.compile_resources(rc_file,
depend_files : files('dav1d.manifest'),
include_directories : include_directories('.')
)
else
dav1d_rc_obj = []
endif
dav1d = executable('dav1d',
dav1d_sources,
dav1d_rc_obj,
rev_target, cli_config_h_target,
link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],


@@ -28,6 +28,7 @@
#include "config.h"
#include <errno.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@@ -77,8 +78,17 @@ static int write_header(Y4m2OutputContext *const c, const Dav1dPicture *const p)
chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] :
ss_names[p->p.layout][p->seq_hdr->hbd];
fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n",
p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name);
const unsigned fw = p->p.w;
const unsigned fh = p->p.h;
uint64_t aw = (uint64_t)fh * p->frame_hdr->render_width;
uint64_t ah = (uint64_t)fw * p->frame_hdr->render_height;
uint64_t gcd = ah;
for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
aw /= gcd;
ah /= gcd;
fprintf(c->f, "YUV4MPEG2 W%u H%u F%u:%u Ip A%"PRIu64":%"PRIu64" C%s\n",
fw, fh, c->fps[0], c->fps[1], aw, ah, ss_name);
return 0;
}
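The rewritten Y4M header also emits the A (pixel aspect ratio) parameter, derived as fh * render_width : fw * render_height and reduced with the one-line Euclid loop. A worked example with assumed dimensions, not values from the diff:

#include <inttypes.h>
#include <stdio.h>

int main(void) {
    /* Assumed: a 1280x720 coded frame meant to display 1440 pixels wide. */
    const unsigned fw = 1280, fh = 720, rw = 1440, rh = 720;
    uint64_t aw = (uint64_t)fh * rw;  /*  720 * 1440 = 1036800 */
    uint64_t ah = (uint64_t)fw * rh;  /* 1280 *  720 =  921600 */
    uint64_t gcd = ah;                /* Euclid, as in the hunk above */
    for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
    printf("A%" PRIu64 ":%" PRIu64 "\n", aw / gcd, ah / gcd); /* prints A9:8 */
    return 0;
}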