Bug 1661093 - Update libdav1d to 0243c3ff for Firefox 82. r=mjf
Differential Revision: https://phabricator.services.mozilla.com/D92534
This commit is contained in: parent 8452a57539, commit 067cafe63f
36 changed files with 4671 additions and 1491 deletions
@@ -186,7 +186,9 @@ elif CONFIG['CPU_ARCH'] == 'arm' or CONFIG['CPU_ARCH'] == 'aarch64':
     '../../../third_party/dav1d/src/arm/32/itx.S',
     '../../../third_party/dav1d/src/arm/32/loopfilter.S',
     '../../../third_party/dav1d/src/arm/32/looprestoration.S',
+    '../../../third_party/dav1d/src/arm/32/looprestoration16.S',
     '../../../third_party/dav1d/src/arm/32/mc.S',
+    '../../../third_party/dav1d/src/arm/32/mc16.S',
     '../../../third_party/dav1d/src/arm/32/msac.S',
 ]
@@ -20,11 +20,11 @@ origin:

   # Human-readable identifier for this version/release
   # Generally "version NNN", "tag SSS", "bookmark SSS"
-  release: commit d0e50cacead63e9904dde184580ce9a746374bd5 (2020-08-21T15:13:49.000+02:00).
+  release: commit 0243c3ffb644e61848b82f24f5e4a7324669d76e (2020-09-27T15:38:45.000+02:00).

   # Revision to pull in
   # Must be a long or short commit SHA (long preferred)
-  revision: d0e50cacead63e9904dde184580ce9a746374bd5
+  revision: 0243c3ffb644e61848b82f24f5e4a7324669d76e

   # The package's license, where possible using the mnemonic from
   # https://spdx.org/licenses/
@@ -1,2 +1,2 @@
 /* auto-generated, do not edit */
-#define DAV1D_VERSION "0.7.1-49-gd0e50ca"
+#define DAV1D_VERSION "0.7.1-81-g0243c3f"
@@ -27,8 +27,8 @@
 #ifndef DAV1D_VERSION_H
 #define DAV1D_VERSION_H

-#define DAV1D_API_VERSION_MAJOR 4
+#define DAV1D_API_VERSION_MAJOR 5
 #define DAV1D_API_VERSION_MINOR 0
-#define DAV1D_API_VERSION_PATCH 2
+#define DAV1D_API_VERSION_PATCH 0

 #endif /* DAV1D_VERSION_H */
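
The major-version bump above marks an ABI break. A minimal sketch of a compile-time guard an embedder could add, using only the DAV1D_API_VERSION_* macros shown in this hunk:

#include <dav1d/version.h>

/* Refuse to build against pre-5.x headers, whose Dav1dSettings layout
 * differs (see the dav1d.h hunk later in this commit). */
#if DAV1D_API_VERSION_MAJOR < 5
#error "dav1d API 5.0.0 or newer is required"
#endif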

third_party/dav1d/CONTRIBUTING.md (vendored, 2 changes)
@@ -12,7 +12,7 @@ The todo list can be found [on the wiki](https://code.videolan.org/videolan/dav1
 The codebase is developed with the following assumptions:

 For the library:
-- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extension,
+- C language with C99 version, without the VLA or the Complex (*\_\_STDC_NO_COMPLEX__*) features, and without compiler extensions. Anonymous structures and unions are the only allowed compiler extensions for internal code.
 - x86 asm in .asm files, using the NASM syntax,
 - arm/arm64 in .S files, using the GAS syntax limited to subset llvm 5.0's internal assembler supports,
 - no C++ is allowed, whatever the version.

third_party/dav1d/include/dav1d/dav1d.h (vendored, 2 changes)
@@ -65,9 +65,9 @@ typedef struct Dav1dSettings {
     int operating_point; ///< select an operating point for scalable AV1 bitstreams (0 - 31)
     int all_layers; ///< output all spatial layers of a scalable AV1 biststream
     unsigned frame_size_limit; ///< maximum frame size, in pixels (0 = unlimited)
-    uint8_t reserved[32]; ///< reserved for future use
     Dav1dPicAllocator allocator; ///< Picture allocator callback.
     Dav1dLogger logger; ///< Logger callback.
+    uint8_t reserved[32]; ///< reserved for future use
 } Dav1dSettings;

 /**
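
Because reserved[32] moved, the offsets of allocator and logger changed, which is why the API major version is bumped elsewhere in this commit. A minimal usage sketch, assuming dav1d's public dav1d_default_settings()/dav1d_open() entry points, that avoids depending on the field order:

#include <stddef.h>
#include <dav1d/dav1d.h>

static Dav1dContext *open_decoder(void) {
    Dav1dSettings settings;
    /* Let the library initialize every field, including any that were
     * added or moved between releases, instead of hand-rolling an
     * initializer tied to one struct layout. */
    dav1d_default_settings(&settings);
    settings.all_layers = 0; /* decode only the highest spatial layer */

    Dav1dContext *ctx = NULL;
    if (dav1d_open(&ctx, &settings) < 0)
        return NULL;
    return ctx;
}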

third_party/dav1d/include/dav1d/headers.h (vendored, 5 changes)
@@ -28,6 +28,7 @@
 #ifndef DAV1D_HEADERS_H
 #define DAV1D_HEADERS_H

 #include <stdint.h>
+#include <stddef.h>

 // Constants from Section 3. "Symbols and abbreviated terms"
@@ -95,9 +96,9 @@ typedef struct Dav1dWarpedMotionParams {
     union {
         struct {
             int16_t alpha, beta, gamma, delta;
-        };
+        } p;
         int16_t abcd[4];
-    };
+    } u;
 } Dav1dWarpedMotionParams;

 enum Dav1dPixelLayout {
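
Naming the previously anonymous union (u) and struct (p) removes a compiler extension from the public header, at the cost of a source-level rename for callers. A small sketch of the migration, using only the fields shown above:

#include <dav1d/headers.h>

/* 4.x: wm->alpha; 5.x: wm->u.p.alpha (or the whole block via wm->u.abcd). */
static int16_t warp_alpha(const Dav1dWarpedMotionParams *wm) {
    return wm->u.p.alpha;
}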

third_party/dav1d/include/dav1d/meson.build (vendored, 14 changes)
@@ -31,11 +31,15 @@ version_h_target = configure_file(input: 'version.h.in',
                                   output: 'version.h',
                                   configuration: version_h_data)

+dav1d_api_headers = [
+    'common.h',
+    'data.h',
+    'dav1d.h',
+    'headers.h',
+    'picture.h',
+]
+
 # install headers
-install_headers('common.h',
-                'data.h',
-                'dav1d.h',
-                'headers.h',
-                'picture.h',
+install_headers(dav1d_api_headers,
                 version_h_target,
                 subdir : 'dav1d')

third_party/dav1d/meson.build (vendored, 39 changes)
@@ -28,9 +28,9 @@ project('dav1d', ['c'],
     'warning_level=2',
     'buildtype=release',
     'b_ndebug=if-release'],
-    meson_version: '>= 0.47.0')
+    meson_version: '>= 0.49.0')

-dav1d_soname_version = '4.0.2'
+dav1d_soname_version = '5.0.0'
 dav1d_api_version_array = dav1d_soname_version.split('.')
 dav1d_api_version_major = dav1d_api_version_array[0]
 dav1d_api_version_minor = dav1d_api_version_array[1]
@@ -118,6 +118,17 @@ if host_machine.system() == 'windows'
     thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))

     rt_dependency = []
+
+    rc_version_array = meson.project_version().split('.')
+    winmod = import('windows')
+    rc_data = configuration_data()
+    rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
+    rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
+    rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
+    rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
+    rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
+    rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
+    rc_data.set('COPYRIGHT_YEARS', '2020')
 else
     thread_dependency = dependency('threads')
     thread_compat_dep = []
@@ -227,7 +238,7 @@ endif
 # Compiler flags that should be set
 # But when the compiler does not supports them
 # it is not an error and silently tolerated
-if cc.get_id() != 'msvc'
+if cc.get_argument_syntax() != 'msvc'
     optional_arguments += [
         '-Wundef',
         '-Werror=vla',
@@ -426,6 +437,28 @@ if is_asm_enabled and host_machine.cpu_family().startswith('x86')
     ])
 endif

+use_gaspp = false
+if (is_asm_enabled and
+    (host_machine.cpu_family() == 'aarch64' or
+     host_machine.cpu_family().startswith('arm')) and
+    cc.get_argument_syntax() == 'msvc')
+    gaspp = find_program('gas-preprocessor.pl')
+    use_gaspp = true
+    gaspp_gen = generator(gaspp,
+        output: '@BASENAME@.obj',
+        arguments: [
+            '-as-type', 'armasm',
+            '-arch', host_machine.cpu_family(),
+            '--',
+            host_machine.cpu_family() == 'aarch64' ? 'armasm64' : 'armasm',
+            '-nologo',
+            '-I@0@'.format(dav1d_src_root),
+            '-I@0@/'.format(meson.current_build_dir()),
+            '@INPUT@',
+            '-c',
+            '-o', '@OUTPUT@'
+        ])
+endif
+
 # Generate config.h
 config_h_target = configure_file(output: 'config.h', configuration: cdata)

third_party/dav1d/src/arm/32/looprestoration.S (vendored, 143 changes)
@@ -40,8 +40,8 @@ function wiener_filter_h_8bpc_neon, export=1
 mov r8, r5
 vld1.16 {q0}, [r4]
 movw r9, #(1 << 14) - (1 << 2)
-vdup.16 q14, r9
-vmov.s16 q15, #2048
+vdup.16 q14, r9
+vmov.s16 q15, #2048
 // Calculate mid_stride
 add r10, r5, #7
 bic r10, r10, #7

@@ -108,8 +108,8 @@ function wiener_filter_h_8bpc_neon, export=1
 0:
 // !LR_HAVE_LEFT, fill q1 with the leftmost byte
 // and shift q2 to have 3x the first byte at the front.
-vdup.8 q1, d4[0]
-vdup.8 q8, d18[0]
+vdup.8 q1, d4[0]
+vdup.8 q8, d18[0]
 // Move r2 back to account for the last 3 bytes we loaded before,
 // which we shifted out.
 sub r2, r2, #3

@@ -127,7 +127,7 @@ function wiener_filter_h_8bpc_neon, export=1
 bne 4f
 // If we'll need to pad the right edge, load that byte to pad with
 // here since we can find it pretty easily from here.
-sub r9, r5, #14
+sub r9, r5, #14
 ldrb r11, [r2, r9]
 ldrb r9, [lr, r9]
 // Fill q12/q13 with the right padding pixel

@@ -144,7 +144,6 @@ function wiener_filter_h_8bpc_neon, export=1
 b 6f

 4: // Loop horizontally
-.macro filter_8
 // This is tuned as some sort of compromise between Cortex A7, A8,
 // A9 and A53.
 vmul.s16 q3, q1, d0[0]

@@ -187,8 +186,6 @@ function wiener_filter_h_8bpc_neon, export=1
 vshr.s16 q10, q10, #3
 vadd.s16 q3, q3, q15
 vadd.s16 q10, q10, q15
-.endm
-filter_8
 vst1.16 {q3}, [r0, :128]!
 vst1.16 {q10}, [r12, :128]!

@@ -206,50 +203,43 @@ function wiener_filter_h_8bpc_neon, export=1

5: // Filter 4 pixels, 7 <= w < 11
.macro filter_4
vext.8 d20, d2, d3, #2
vext.8 d21, d2, d3, #4
vext.8 d22, d2, d3, #6
vext.8 d23, d3, d4, #2
vext.8 d8, d3, d4, #4
vmul.s16 d6, d2, d0[0]
vext.8 q10, q1, q2, #2
vext.8 q11, q1, q2, #4
vmla.s16 d6, d20, d0[1]
vmla.s16 d6, d22, d0[2]
vext.8 q10, q1, q2, #6
vext.8 q11, q1, q2, #8
vmla.s16 d6, d20, d0[3]
vmla.s16 d6, d22, d1[0]
vext.8 q10, q1, q2, #10
vext.8 q11, q1, q2, #12
vmla.s16 d6, d20, d1[1]
vmla.s16 d6, d22, d1[2]
vmla.s16 d6, d21, d0[2]
vmla.s16 d6, d22, d0[3]
vmla.s16 d6, d3, d1[0]
vmla.s16 d6, d23, d1[1]
vmla.s16 d6, d8, d1[2]

vmul.s16 d20, d16, d0[0]
vext.8 q11, q8, q9, #2
vext.8 q4, q8, q9, #4
vmla.s16 d20, d22, d0[1]
vmla.s16 d20, d8, d0[2]
vext.8 q11, q8, q9, #6
vext.8 q4, q8, q9, #8
vmla.s16 d20, d22, d0[3]
vmla.s16 d20, d8, d1[0]
vext.8 q11, q8, q9, #10
vext.8 q4, q8, q9, #12
vmla.s16 d20, d22, d1[1]
vmla.s16 d20, d8, d1[2]
vext.8 d20, d16, d17, #2
vext.8 d21, d16, d17, #4
vext.8 d22, d16, d17, #6
vext.8 d23, d17, d18, #2
vext.8 d8, d17, d18, #4
vmul.s16 d7, d16, d0[0]
vmla.s16 d7, d20, d0[1]
vmla.s16 d7, d21, d0[2]
vmla.s16 d7, d22, d0[3]
vmla.s16 d7, d17, d1[0]
vmla.s16 d7, d23, d1[1]
vmla.s16 d7, d8, d1[2]

vext.8 q11, q1, q2, #6
vshl.s16 d22, d22, #7
vsub.s16 d22, d22, d28
vqadd.s16 d6, d6, d22
vext.8 q11, q8, q9, #6
vshl.s16 d22, d22, #7
vsub.s16 d22, d22, d28
vqadd.s16 d20, d20, d22
vshr.s16 d6, d6, #3
vshr.s16 d20, d20, #3
vadd.s16 d6, d6, d30
vadd.s16 d20, d20, d30
vext.8 d22, d2, d3, #6
vext.8 d23, d16, d17, #6
vshl.s16 q11, q11, #7
vsub.s16 q11, q11, q14
vqadd.s16 q3, q3, q11
vshr.s16 q3, q3, #3
vadd.s16 q3, q3, q15
.endm
filter_4
vst1.16 {d6}, [r0, :64]!
vst1.16 {d20}, [r12, :64]!
vst1.16 {d7}, [r12, :64]!

subs r5, r5, #4 // 3 <= w < 7
vext.8 q1, q1, q2, #8

@@ -323,7 +313,7 @@ L(variable_shift_tbl):
 // w >= 4, filter 4 pixels
 filter_4
 vst1.16 {d6}, [r0, :64]!
-vst1.16 {d20}, [r12, :64]!
+vst1.16 {d7}, [r12, :64]!
 subs r5, r5, #4 // 0 <= w < 4
 vext.8 q1, q1, q2, #8
 vext.8 q8, q8, q9, #8

@@ -338,11 +328,11 @@ L(variable_shift_tbl):
 vdup.16 d25, d16[3]
 vpadd.s16 d6, d6, d6
 vtrn.16 d24, d25
-vshl.s16 d24, d24, #7
-vsub.s16 d24, d24, d28
-vqadd.s16 d6, d6, d24
-vshr.s16 d6, d6, #3
-vadd.s16 d6, d6, d30
+vshl.s16 d24, d24, #7
+vsub.s16 d24, d24, d28
+vqadd.s16 d6, d6, d24
+vshr.s16 d6, d6, #3
+vadd.s16 d6, d6, d30
 vst1.s16 {d6[0]}, [r0, :16]!
 vst1.s16 {d6[1]}, [r12, :16]!
 subs r5, r5, #1

@@ -363,7 +353,6 @@ L(variable_shift_tbl):
 0:
 vpop {q4}
 pop {r4-r11,pc}
-.purgem filter_8
 .purgem filter_4
 endfunc

@@ -422,22 +411,22 @@ function wiener_filter_v_8bpc_neon, export=1
 // Interleaving the mul/mla chains actually hurts performance
 // significantly on Cortex A53, thus keeping mul/mla tightly
 // chained like this.
-vmull.s16 q2, d16, d0[0]
-vmlal.s16 q2, d18, d0[1]
-vmlal.s16 q2, d20, d0[2]
-vmlal.s16 q2, d22, d0[3]
-vmlal.s16 q2, d24, d1[0]
-vmlal.s16 q2, d26, d1[1]
-vmlal.s16 q2, d28, d1[2]
-vmull.s16 q3, d17, d0[0]
-vmlal.s16 q3, d19, d0[1]
-vmlal.s16 q3, d21, d0[2]
-vmlal.s16 q3, d23, d0[3]
-vmlal.s16 q3, d25, d1[0]
-vmlal.s16 q3, d27, d1[1]
-vmlal.s16 q3, d29, d1[2]
-vqrshrun.s32 d4, q2, #11
-vqrshrun.s32 d5, q3, #11
+vmull.s16 q2, d16, d0[0]
+vmlal.s16 q2, d18, d0[1]
+vmlal.s16 q2, d20, d0[2]
+vmlal.s16 q2, d22, d0[3]
+vmlal.s16 q2, d24, d1[0]
+vmlal.s16 q2, d26, d1[1]
+vmlal.s16 q2, d28, d1[2]
+vmull.s16 q3, d17, d0[0]
+vmlal.s16 q3, d19, d0[1]
+vmlal.s16 q3, d21, d0[2]
+vmlal.s16 q3, d23, d0[3]
+vmlal.s16 q3, d25, d1[0]
+vmlal.s16 q3, d27, d1[1]
+vmlal.s16 q3, d29, d1[2]
+vqrshrun.s32 d4, q2, #11
+vqrshrun.s32 d5, q3, #11
 vqmovun.s16 d4, q2
 vst1.8 {d4}, [r0], r1
 .if \compare

@@ -473,7 +462,7 @@ function wiener_filter_v_8bpc_neon, export=1
 52: // 2 rows in total, q11 already loaded, load q12 with content data
 // and 2 rows of edge.
 vld1.16 {q14}, [r2, :128], r7
-vmov q15, q14
+vmov q15, q14
 b 8f
 53:
 // 3 rows in total, q11 already loaded, load q12 and q13 with content

@@ -615,8 +604,8 @@ L(copy_narrow_tbl):
 asr r1, r1, #1
 22:
 subs r4, r4, #1
-vld1.16 {d0[]}, [r2]!
-vst1.16 {d0[0]}, [r0], r1
+vld1.16 {d0[]}, [r2, :16]!
+vst1.16 {d0[0]}, [r0, :16], r1
 bgt 22b
 0:
 pop {r4,pc}

@@ -644,8 +633,8 @@ L(copy_narrow_tbl):
 ble 0f
 b 42b
 41:
-vld1.32 {d0[]}, [r2]
-vst1.32 {d0[0]}, [r0]
+vld1.32 {d0[]}, [r2, :32]
+vst1.32 {d0[0]}, [r0, :32]
 0:
 pop {r4,pc}

@@ -785,7 +774,7 @@ function sgr_box3_h_8bpc_neon, export=1
 bne 4f
 // If we'll need to pad the right edge, load that byte to pad with
 // here since we can find it pretty easily from here.
-sub lr, r5, #(2 + 16 - 2 + 1)
+sub lr, r5, #(2 + 16 - 2 + 1)
 ldrb r11, [r3, lr]
 ldrb lr, [r12, lr]
 // Fill q14/q15 with the right padding pixel

@@ -1058,7 +1047,7 @@ function sgr_box5_h_8bpc_neon, export=1
 bne 4f
 // If we'll need to pad the right edge, load that byte to pad with
 // here since we can find it pretty easily from here.
-sub lr, r5, #(2 + 16 - 3 + 1)
+sub lr, r5, #(2 + 16 - 3 + 1)
 ldrb r11, [r3, lr]
 ldrb lr, [r12, lr]
 // Fill q14/q15 with the right padding pixel

@@ -1100,7 +1089,7 @@ function sgr_box5_h_8bpc_neon, export=1
 vaddl_u16_n q12, q13, d2, d3, d16, d17, \w
 vaddl_u16_n q8, q9, d18, d19, d20, d21, \w
 vaddw_u16_n q12, q13, d22, d23, \w
-vadd_i32_n q12, q13, q8, q9, \w
+vadd_i32_n q12, q13, q8, q9, \w
 vext.8 q8, q5, q6, #2
 vext.8 q9, q5, q6, #4
 vext.8 q10, q5, q6, #6

@@ -1152,7 +1141,7 @@ function sgr_box5_h_8bpc_neon, export=1

 6: // Pad the right edge and produce the last few pixels.
 // w < 7, w+1 pixels valid in q0/q4
-sub lr, r5, #1
+sub lr, r5, #1
 // lr = pixels valid - 2
 adr r11, L(box5_variable_shift_tbl)
 ldr lr, [r11, lr, lsl #2]
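
A hedged C model of the rounding scheme visible in wiener_filter_h_8bpc_neon above (the q14/q15 constants): the centre pixel is scaled by 128 and biased by (1 << 14) - (1 << 2) so that the saturating 16-bit add cannot overflow, then the sum is shifted and re-centred around 2048. The helper name and the non-saturating arithmetic are illustrative, not the actual implementation:

#include <stdint.h>

static int16_t wiener_h_px_8bpc(const uint8_t px[7], const int16_t fh[7]) {
    int sum = 0;
    for (int i = 0; i < 7; i++)                   /* the vmul/vmla chain */
        sum += fh[i] * px[i];
    sum += (px[3] << 7) - ((1 << 14) - (1 << 2)); /* q14 bias on the centre tap */
    return (int16_t)((sum >> 3) + 2048);          /* vshr #3, then q15 = #2048 */
}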

third_party/dav1d/src/arm/32/looprestoration16.S (vendored, new file, 720 lines)
@@ -0,0 +1,720 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2020, Martin Storsjo
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "src/arm/asm.S"
+#include "util.S"
+
+// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
+//                                       const pixel *src, ptrdiff_t stride,
+//                                       const int16_t fh[7], const intptr_t w,
+//                                       int h, enum LrEdgeFlags edges,
+//                                       const int bitdepth_max);
+function wiener_filter_h_16bpc_neon, export=1
+push {r4-r11,lr}
+vpush {q4-q7}
+ldrd r4, r5, [sp, #100]
+ldrd r6, r7, [sp, #108]
+ldr r8, [sp, #116] // bitdepth_max
+vld1.16 {q0}, [r4]
+clz r8, r8
+vmov.i32 q14, #1
+sub r9, r8, #38 // -(bitdepth + 6)
+sub r8, r8, #25 // -round_bits_h
+neg r9, r9 // bitdepth + 6
+vdup.32 q1, r9
+vdup.32 q13, r8 // -round_bits_h
+vmov.i16 q15, #8192
+vshl.u32 q14, q14, q1 // 1 << (bitdepth + 6)
+mov r8, r5
+// Calculate mid_stride
+add r10, r5, #7
+bic r10, r10, #7
+lsl r10, r10, #1
+
+// Clear the last unused element of q0, to allow filtering a single
+// pixel with one plain vmul+vpadd.
+mov r12, #0
+vmov.16 d1[3], r12
+
+// Set up pointers for reading/writing alternate rows
+add r12, r0, r10
+lsl r10, r10, #1
+add lr, r2, r3
+lsl r3, r3, #1
+
+// Subtract the width from mid_stride
+sub r10, r10, r5, lsl #1
+
+// For w >= 8, we read (w+5)&~7+8 pixels, for w < 8 we read 16 pixels.
+cmp r5, #8
+add r11, r5, #13
+bic r11, r11, #7
+bge 1f
+mov r11, #16
+1:
+sub r3, r3, r11, lsl #1
+
+// Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
+tst r7, #1 // LR_HAVE_LEFT
+beq 2f
+// LR_HAVE_LEFT
+cmp r1, #0
+bne 0f
+// left == NULL
+sub r2, r2, #6
+sub lr, lr, #6
+b 1f
+0: // LR_HAVE_LEFT, left != NULL
+2: // !LR_HAVE_LEFT, increase the stride.
+// For this case we don't read the left 3 pixels from the src pointer,
+// but shift it as if we had done that.
+add r3, r3, #6
+
+
+1: // Loop vertically
+vld1.16 {q2, q3}, [r2]!
+vld1.16 {q4, q5}, [lr]!
+
+tst r7, #1 // LR_HAVE_LEFT
+beq 0f
+cmp r1, #0
+beq 2f
+// LR_HAVE_LEFT, left != NULL
+vld1.16 {d3}, [r1]!
+// Move r2/lr back to account for the last 3 pixels we loaded earlier,
+// which we'll shift out.
+sub r2, r2, #6
+sub lr, lr, #6
+vld1.16 {d13}, [r1]!
+vext.8 q3, q2, q3, #10
+vext.8 q2, q1, q2, #10
+vext.8 q5, q4, q5, #10
+vext.8 q4, q6, q4, #10
+b 2f
+0:
+// !LR_HAVE_LEFT, fill q1 with the leftmost pixel
+// and shift q2/q3 to have 3x the first pixel at the front.
+vdup.16 q1, d4[0]
+vdup.16 q6, d8[0]
+// Move r2 back to account for the last 3 pixels we loaded before,
+// which we shifted out.
+sub r2, r2, #6
+sub lr, lr, #6
+vext.8 q3, q2, q3, #10
+vext.8 q2, q1, q2, #10
+vext.8 q5, q4, q5, #10
+vext.8 q4, q6, q4, #10
+
+2:
+
+tst r7, #2 // LR_HAVE_RIGHT
+bne 4f
+// If we'll need to pad the right edge, load that byte to pad with
+// here since we can find it pretty easily from here.
+sub r9, r5, #14
+lsl r9, r9, #1
+ldrh r11, [r2, r9]
+ldrh r9, [lr, r9]
+// Fill q11/q12 with the right padding pixel
+vdup.16 q11, r11
+vdup.16 q12, r9
+3: // !LR_HAVE_RIGHT
+// If we'll have to pad the right edge we need to quit early here.
+cmp r5, #11
+bge 4f // If w >= 11, all used input pixels are valid
+cmp r5, #7
+bge 5f // If w >= 7, we can filter 4 pixels
+b 6f
+
+4: // Loop horizontally
+vext.8 q10, q2, q3, #6
+vext.8 q8, q2, q3, #2
+vext.8 q9, q2, q3, #4
+vshll.u16 q6, d20, #7
+vshll.u16 q7, d21, #7
+vmlal.s16 q6, d4, d0[0]
+vmlal.s16 q6, d16, d0[1]
+vmlal.s16 q6, d18, d0[2]
+vmlal.s16 q6, d20, d0[3]
+vmlal.s16 q7, d5, d0[0]
+vmlal.s16 q7, d17, d0[1]
+vmlal.s16 q7, d19, d0[2]
+vmlal.s16 q7, d21, d0[3]
+vext.8 q8, q2, q3, #8
+vext.8 q9, q2, q3, #10
+vext.8 q10, q2, q3, #12
+vmlal.s16 q6, d16, d1[0]
+vmlal.s16 q6, d18, d1[1]
+vmlal.s16 q6, d20, d1[2]
+vmlal.s16 q7, d17, d1[0]
+vmlal.s16 q7, d19, d1[1]
+vmlal.s16 q7, d21, d1[2]
+vext.8 q10, q4, q5, #6
+vext.8 q2, q4, q5, #2
+vshll.u16 q8, d20, #7
+vshll.u16 q9, d21, #7
+vmlal.s16 q8, d8, d0[0]
+vmlal.s16 q8, d4, d0[1]
+vmlal.s16 q8, d20, d0[3]
+vmlal.s16 q9, d9, d0[0]
+vmlal.s16 q9, d5, d0[1]
+vmlal.s16 q9, d21, d0[3]
+vext.8 q2, q4, q5, #4
+vext.8 q10, q4, q5, #8
+vmlal.s16 q8, d4, d0[2]
+vmlal.s16 q8, d20, d1[0]
+vmlal.s16 q9, d5, d0[2]
+vmlal.s16 q9, d21, d1[0]
+vext.8 q2, q4, q5, #10
+vext.8 q10, q4, q5, #12
+vmlal.s16 q8, d4, d1[1]
+vmlal.s16 q8, d20, d1[2]
+vmlal.s16 q9, d5, d1[1]
+vmlal.s16 q9, d21, d1[2]
+
+vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
+vadd.i32 q6, q6, q14
+vadd.i32 q7, q7, q14
+vadd.i32 q8, q8, q14
+vadd.i32 q9, q9, q14
+vrshl.s32 q6, q6, q13
+vrshl.s32 q7, q7, q13
+vrshl.s32 q8, q8, q13
+vrshl.s32 q9, q9, q13
+vqmovun.s32 d12, q6
+vqmovun.s32 d13, q7
+vqmovun.s32 d14, q8
+vqmovun.s32 d15, q9
+vmin.u16 q6, q6, q10
+vmin.u16 q7, q7, q10
+vsub.i16 q6, q6, q15
+vsub.i16 q7, q7, q15
+vst1.16 {q6}, [r0, :128]!
+vst1.16 {q7}, [r12, :128]!
+
+subs r5, r5, #8
+ble 9f
+tst r7, #2 // LR_HAVE_RIGHT
+vmov q2, q3
+vmov q4, q5
+vld1.16 {q3}, [r2]!
+vld1.16 {q5}, [lr]!
+bne 4b // If we don't need to pad, just keep filtering.
+b 3b // If we need to pad, check how many pixels we have left.
+
+5: // Filter 4 pixels, 7 <= w < 11
+.macro filter_4
+vext.8 d18, d4, d5, #6
+vext.8 d16, d4, d5, #2
+vext.8 d17, d4, d5, #4
+vext.8 d19, d5, d6, #2
+vext.8 d20, d5, d6, #4
+vshll.u16 q6, d18, #7
+vmlal.s16 q6, d4, d0[0]
+vmlal.s16 q6, d16, d0[1]
+vmlal.s16 q6, d17, d0[2]
+vmlal.s16 q6, d18, d0[3]
+vmlal.s16 q6, d5, d1[0]
+vmlal.s16 q6, d19, d1[1]
+vmlal.s16 q6, d20, d1[2]
+
+vext.8 d18, d8, d9, #6
+vext.8 d16, d8, d9, #2
+vext.8 d17, d8, d9, #4
+vext.8 d19, d9, d10, #2
+vext.8 d20, d9, d10, #4
+vshll.u16 q7, d18, #7
+vmlal.s16 q7, d8, d0[0]
+vmlal.s16 q7, d16, d0[1]
+vmlal.s16 q7, d17, d0[2]
+vmlal.s16 q7, d18, d0[3]
+vmlal.s16 q7, d9, d1[0]
+vmlal.s16 q7, d19, d1[1]
+vmlal.s16 q7, d20, d1[2]
+
+vmvn.i16 q10, #0x8000 // 0x7fff = (1 << 15) - 1
+vadd.i32 q6, q6, q14
+vadd.i32 q7, q7, q14
+vrshl.s32 q6, q6, q13
+vrshl.s32 q7, q7, q13
+vqmovun.s32 d12, q6
+vqmovun.s32 d13, q7
+vmin.u16 q6, q6, q10
+vsub.i16 q6, q6, q15
+.endm
+filter_4
+vst1.16 {d12}, [r0, :64]!
+vst1.16 {d13}, [r12, :64]!
+
+subs r5, r5, #4 // 3 <= w < 7
+vext.8 q2, q2, q3, #8
+vext.8 q3, q3, q3, #8
+vext.8 q4, q4, q5, #8
+vext.8 q5, q5, q5, #8
+
+6: // Pad the right edge and filter the last few pixels.
+// w < 7, w+3 pixels valid in q2-q3
+cmp r5, #5
+blt 7f
+bgt 8f
+// w == 5, 8 pixels valid in q2, q3 invalid
+vmov q3, q11
+vmov q5, q12
+b 88f
+
+7: // 1 <= w < 5, 4-7 pixels valid in q2
+sub r9, r5, #1
+// r9 = (pixels valid - 4)
+adr r11, L(variable_shift_tbl)
+ldr r9, [r11, r9, lsl #2]
+add r11, r11, r9
+vmov q3, q11
+vmov q5, q12
+bx r11
+
+.align 2
+L(variable_shift_tbl):
+.word 44f - L(variable_shift_tbl) + CONFIG_THUMB
+.word 55f - L(variable_shift_tbl) + CONFIG_THUMB
+.word 66f - L(variable_shift_tbl) + CONFIG_THUMB
+.word 77f - L(variable_shift_tbl) + CONFIG_THUMB
+
+44: // 4 pixels valid in q2/q4, fill the high half with padding.
+vmov d5, d6
+vmov d9, d10
+b 88f
+// Shift q2 right, shifting out invalid pixels,
+// shift q2 left to the original offset, shifting in padding pixels.
+55: // 5 pixels valid
+vext.8 q2, q2, q2, #10
+vext.8 q2, q2, q3, #6
+vext.8 q4, q4, q4, #10
+vext.8 q4, q4, q5, #6
+b 88f
+66: // 6 pixels valid
+vext.8 q2, q2, q2, #12
+vext.8 q2, q2, q3, #4
+vext.8 q4, q4, q4, #12
+vext.8 q4, q4, q5, #4
+b 88f
+77: // 7 pixels valid
+vext.8 q2, q2, q2, #14
+vext.8 q2, q2, q3, #2
+vext.8 q4, q4, q4, #14
+vext.8 q4, q4, q5, #2
+b 88f
+
+8: // w > 5, w == 6, 9 pixels valid in q2-q3, 1 pixel valid in q3
+vext.8 q3, q3, q3, #2
+vext.8 q3, q3, q11, #14
+vext.8 q5, q5, q5, #2
+vext.8 q5, q5, q12, #14
+
+88:
+// w < 7, q2-q3 padded properly
+cmp r5, #4
+blt 888f
+
+// w >= 4, filter 4 pixels
+filter_4
+vst1.16 {d12}, [r0, :64]!
+vst1.16 {d13}, [r12, :64]!
+subs r5, r5, #4 // 0 <= w < 4
+vext.8 q2, q2, q3, #8
+vext.8 q4, q4, q5, #8
+beq 9f
+888: // 1 <= w < 4, filter 1 pixel at a time
+vmull.s16 q6, d4, d0
+vmull.s16 q7, d5, d1
+vmull.s16 q8, d8, d0
+vmull.s16 q9, d9, d1
+vadd.i32 q6, q7
+vadd.i32 q8, q9
+vpadd.i32 d12, d12, d13
+vpadd.i32 d13, d16, d17
+vdup.16 d14, d4[3]
+vdup.16 d15, d8[3]
+vpadd.i32 d12, d12, d13
+vtrn.16 d14, d15
+vadd.i32 d12, d12, d28
+vshll.u16 q7, d14, #7
+vmvn.i16 d20, #0x8000 // 0x7fff = (1 << 15) - 1
+vadd.i32 d12, d12, d14
+vrshl.s32 d12, d12, d26
+vqmovun.s32 d12, q6
+vmin.u16 d12, d12, d20
+vsub.i16 d12, d12, d30
+vst1.16 {d12[0]}, [r0, :16]!
+vst1.16 {d12[1]}, [r12, :16]!
+subs r5, r5, #1
+vext.8 q2, q2, q3, #2
+vext.8 q4, q4, q5, #2
+bgt 888b
+
+9:
+subs r6, r6, #2
+ble 0f
+// Jump to the next row and loop horizontally
+add r0, r0, r10
+add r12, r12, r10
+add r2, r2, r3
+add lr, lr, r3
+mov r5, r8
+b 1b
+0:
+vpop {q4-q7}
+pop {r4-r11,pc}
+.purgem filter_4
+endfunc
+
+// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                       const int16_t *mid, int w, int h,
+//                                       const int16_t fv[7], enum LrEdgeFlags edges,
+//                                       ptrdiff_t mid_stride, const int bitdepth_max);
+function wiener_filter_v_16bpc_neon, export=1
+push {r4-r7,lr}
+vpush {q4-q5}
+ldrd r4, r5, [sp, #52]
+ldrd r6, r7, [sp, #60]
+ldr lr, [sp, #68] // bitdepth_max
+vmov.i16 q1, #0
+mov r12, #128
+vld1.16 {q0}, [r5]
+vdup.16 q5, lr
+clz lr, lr
+vmov.i16 d2[3], r12
+sub lr, lr, #11 // round_bits_v
+vadd.i16 q0, q0, q1
+vdup.32 q4, lr
+mov lr, r4
+vneg.s32 q4, q4 // -round_bits_v
+
+// Calculate the number of rows to move back when looping vertically
+mov r12, r4
+tst r6, #4 // LR_HAVE_TOP
+beq 0f
+sub r2, r2, r7, lsl #1
+add r12, r12, #2
+0:
+tst r6, #8 // LR_HAVE_BOTTOM
+beq 1f
+add r12, r12, #2
+
+1: // Start of horizontal loop; start one vertical filter slice.
+// Load rows into q8-q11 and pad properly.
+tst r6, #4 // LR_HAVE_TOP
+vld1.16 {q8}, [r2, :128], r7
+beq 2f
+// LR_HAVE_TOP
+vld1.16 {q10}, [r2, :128], r7
+vmov q9, q8
+vld1.16 {q11}, [r2, :128], r7
+b 3f
+2: // !LR_HAVE_TOP
+vmov q9, q8
+vmov q10, q8
+vmov q11, q8
+
+3:
+cmp r4, #4
+blt 5f
+// Start filtering normally; fill in q12-q14 with unique rows.
+vld1.16 {q12}, [r2, :128], r7
+vld1.16 {q13}, [r2, :128], r7
+vld1.16 {q14}, [r2, :128], r7
+
+4:
+.macro filter compare
+subs r4, r4, #1
+// Interleaving the mul/mla chains actually hurts performance
+// significantly on Cortex A53, thus keeping mul/mla tightly
+// chained like this.
+vmull.s16 q2, d16, d0[0]
+vmlal.s16 q2, d18, d0[1]
+vmlal.s16 q2, d20, d0[2]
+vmlal.s16 q2, d22, d0[3]
+vmlal.s16 q2, d24, d1[0]
+vmlal.s16 q2, d26, d1[1]
+vmlal.s16 q2, d28, d1[2]
+vmull.s16 q3, d17, d0[0]
+vmlal.s16 q3, d19, d0[1]
+vmlal.s16 q3, d21, d0[2]
+vmlal.s16 q3, d23, d0[3]
+vmlal.s16 q3, d25, d1[0]
+vmlal.s16 q3, d27, d1[1]
+vmlal.s16 q3, d29, d1[2]
+vrshl.s32 q2, q2, q4 // round_bits_v
+vrshl.s32 q3, q3, q4
+vqmovun.s32 d4, q2
+vqmovun.s32 d5, q3
+vmin.u16 q2, q2, q5 // bitdepth_max
+vst1.16 {q2}, [r0], r1
+.if \compare
+cmp r4, #4
+.else
+ble 9f
+.endif
+vmov q8, q9
+vmov q9, q10
+vmov q10, q11
+vmov q11, q12
+vmov q12, q13
+vmov q13, q14
+.endm
+filter 1
+blt 7f
+vld1.16 {q14}, [r2, :128], r7
+b 4b
+
+5: // Less than 4 rows in total; not all of q12-q13 are filled yet.
+tst r6, #8 // LR_HAVE_BOTTOM
+beq 6f
+// LR_HAVE_BOTTOM
+cmp r4, #2
+// We load at least 2 rows in all cases.
+vld1.16 {q12}, [r2, :128], r7
+vld1.16 {q13}, [r2, :128], r7
+bgt 53f // 3 rows in total
+beq 52f // 2 rows in total
+51: // 1 row in total, q11 already loaded, load edge into q12-q14.
+vmov q13, q12
+b 8f
+52: // 2 rows in total, q11 already loaded, load q12 with content data
+// and 2 rows of edge.
+vld1.16 {q14}, [r2, :128], r7
+vmov q15, q14
+b 8f
+53:
+// 3 rows in total, q11 already loaded, load q12 and q13 with content
+// and 2 rows of edge.
+vld1.16 {q14}, [r2, :128], r7
+vld1.16 {q15}, [r2, :128], r7
+vmov q1, q15
+b 8f
+
+6:
+// !LR_HAVE_BOTTOM
+cmp r4, #2
+bgt 63f // 3 rows in total
+beq 62f // 2 rows in total
+61: // 1 row in total, q11 already loaded, pad that into q12-q14.
+vmov q12, q11
+vmov q13, q11
+vmov q14, q11
+b 8f
+62: // 2 rows in total, q11 already loaded, load q12 and pad that into q12-q15.
+vld1.16 {q12}, [r2, :128], r7
+vmov q13, q12
+vmov q14, q12
+vmov q15, q12
+b 8f
+63:
+// 3 rows in total, q11 already loaded, load q12 and q13 and pad q13 into q14-q15,q1.
+vld1.16 {q12}, [r2, :128], r7
+vld1.16 {q13}, [r2, :128], r7
+vmov q14, q13
+vmov q15, q13
+vmov q1, q13
+b 8f
+
+7:
+// All registers up to q13 are filled already, 3 valid rows left.
+// < 4 valid rows left; fill in padding and filter the last
+// few rows.
+tst r6, #8 // LR_HAVE_BOTTOM
+beq 71f
+// LR_HAVE_BOTTOM; load 2 rows of edge.
+vld1.16 {q14}, [r2, :128], r7
+vld1.16 {q15}, [r2, :128], r7
+vmov q1, q15
+b 8f
+71:
+// !LR_HAVE_BOTTOM, pad 3 rows
+vmov q14, q13
+vmov q15, q13
+vmov q1, q13
+
+8: // At this point, all registers up to q14-q15,q1 are loaded with
+// edge/padding (depending on how many rows are left).
+filter 0 // This branches to 9f when done
+vmov q14, q15
+vmov q15, q1
+b 8b
+
+9: // End of one vertical slice.
+subs r3, r3, #8
+ble 0f
+// Move pointers back up to the top and loop horizontally.
+mls r0, r1, lr, r0
+mls r2, r7, r12, r2
+add r0, r0, #16
+add r2, r2, #16
+mov r4, lr
+b 1b
+
+0:
+vpop {q4-q5}
+pop {r4-r7,pc}
+.purgem filter
+endfunc
+
+// void dav1d_copy_narrow_16bpc_neon(pixel *dst, ptrdiff_t stride,
+//                                   const pixel *src, int w, int h);
+function copy_narrow_16bpc_neon, export=1
+push {r4,lr}
+ldr r4, [sp, #8]
+adr r12, L(copy_narrow_tbl)
+ldr r3, [r12, r3, lsl #2]
+add r12, r12, r3
+bx r12
+
+.align 2
+L(copy_narrow_tbl):
+.word 0
+.word 10f - L(copy_narrow_tbl) + CONFIG_THUMB
+.word 20f - L(copy_narrow_tbl) + CONFIG_THUMB
+.word 30f - L(copy_narrow_tbl) + CONFIG_THUMB
+.word 40f - L(copy_narrow_tbl) + CONFIG_THUMB
+.word 50f - L(copy_narrow_tbl) + CONFIG_THUMB
+.word 60f - L(copy_narrow_tbl) + CONFIG_THUMB
+.word 70f - L(copy_narrow_tbl) + CONFIG_THUMB
+
+10:
+add r3, r0, r1
+lsl r1, r1, #1
+18:
+subs r4, r4, #8
+blt 110f
+vld1.16 {q0}, [r2, :128]!
+vst1.16 {d0[0]}, [r0, :16], r1
+vst1.16 {d0[1]}, [r3, :16], r1
+vst1.16 {d0[2]}, [r0, :16], r1
+vst1.16 {d0[3]}, [r3, :16], r1
+vst1.16 {d1[0]}, [r0, :16], r1
+vst1.16 {d1[1]}, [r3, :16], r1
+vst1.16 {d1[2]}, [r0, :16], r1
+vst1.16 {d1[3]}, [r3, :16], r1
+ble 0f
+b 18b
+110:
+add r4, r4, #8
+asr r1, r1, #1
+11:
+subs r4, r4, #1
+vld1.16 {d0[]}, [r2]!
+vst1.16 {d0[0]}, [r0], r1
+bgt 11b
+0:
+pop {r4,pc}
+
+20:
+add r3, r0, r1
+lsl r1, r1, #1
+24:
+subs r4, r4, #4
+blt 210f
+vld1.32 {q0}, [r2, :128]!
+vst1.32 {d0[0]}, [r0, :32], r1
+vst1.32 {d0[1]}, [r3, :32], r1
+vst1.32 {d1[0]}, [r0, :32], r1
+vst1.32 {d1[1]}, [r3, :32], r1
+ble 0f
+b 24b
+210:
+add r4, r4, #4
+asr r1, r1, #1
+22:
+subs r4, r4, #1
+vld1.32 {d0[]}, [r2, :32]!
+vst1.32 {d0[0]}, [r0, :32], r1
+bgt 22b
+0:
+pop {r4,pc}
+
+30:
+ldr r3, [r2]
+ldrh r12, [r2, #4]
+add r2, r2, #6
+subs r4, r4, #1
+str r3, [r0]
+strh r12, [r0, #4]
+add r0, r0, r1
+bgt 30b
+pop {r4,pc}
+
+40:
+add r3, r0, r1
+lsl r1, r1, #1
+42:
+subs r4, r4, #2
+blt 41f
+vld1.16 {q0}, [r2, :128]!
+vst1.16 {d0}, [r0, :64], r1
+vst1.16 {d1}, [r3, :64], r1
+ble 0f
+b 42b
+41:
+vld1.16 {d0}, [r2, :64]
+vst1.16 {d0}, [r0, :64]
+0:
+pop {r4,pc}
+
+50:
+vld1.16 {d0}, [r2]
+ldrh r12, [r2, #8]
+add r2, r2, #10
+subs r4, r4, #1
+vst1.16 {d0}, [r0]
+strh r12, [r0, #8]
+add r0, r0, r1
+bgt 50b
+pop {r4,pc}
+
+60:
+vld1.16 {d0}, [r2]
+ldr r12, [r2, #8]
+add r2, r2, #12
+subs r4, r4, #1
+vst1.16 {d0}, [r0]
+str r12, [r0, #8]
+add r0, r0, r1
+bgt 60b
+pop {r4,pc}
+
+70:
+vld1.16 {d0}, [r2]
+ldr r12, [r2, #8]
+ldrh lr, [r2, #12]
+add r2, r2, #14
+subs r4, r4, #1
+vst1.16 {d0}, [r0]
+str r12, [r0, #8]
+strh lr, [r0, #12]
+add r0, r0, r1
+bgt 70b
+pop {r4,pc}
+endfunc
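
The clz-based setup at the top of wiener_filter_h_16bpc_neon can be read back into C. A sketch of the constants it derives from bitdepth_max, which is (1 << bitdepth) - 1; the helper name is an assumption:

static void wiener_h_constants_16bpc(unsigned bitdepth_max,
                                     int *round_bits_h,
                                     unsigned *rounding_off) {
    const int leading = __builtin_clz(bitdepth_max); /* 22 for 10 bpc, 20 for 12 bpc */
    *round_bits_h = 25 - leading;                    /* the asm keeps the negated value */
    *rounding_off = 1u << (38 - leading);            /* 1 << (bitdepth + 6) */
}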

third_party/dav1d/src/arm/32/mc.S (vendored, 42 changes)
@@ -1403,12 +1403,12 @@ L(\type\()_8tap_h_tbl):
 vld1.8 {d24}, [\sr2], \s_strd
 vmovl.u8 q8, d16
 vmovl.u8 q12, d24
-vext.8 q9, q8, q8, #2
-vext.8 q10, q8, q8, #4
-vext.8 q11, q8, q8, #6
-vext.8 q13, q12, q12, #2
-vext.8 q14, q12, q12, #4
-vext.8 q15, q12, q12, #6
+vext.8 d18, d16, d17, #2
+vext.8 d20, d16, d17, #4
+vext.8 d22, d16, d17, #6
+vext.8 d26, d24, d25, #2
+vext.8 d28, d24, d25, #4
+vext.8 d30, d24, d25, #6
 subs \h, \h, #2
 vmul.s16 d4, d16, d0[0]
 vmla.s16 d4, d18, d0[1]

@@ -1431,7 +1431,7 @@ L(\type\()_8tap_h_tbl):
 pop {r4-r11,pc}

 80: // 8xN h
-vld1.8 {d0}, [\mx]
+vld1.8 {d0}, [\mx, :64]
 sub \src, \src, #3
 add \ds2, \dst, \d_strd
 add \sr2, \src, \s_strd

@@ -1482,7 +1482,7 @@ L(\type\()_8tap_h_tbl):
 // one temporary for vext in the loop. That's slower on A7 and A53,
 // (but surprisingly, marginally faster on A8 and A73).
 vpush {q4-q6}
-vld1.8 {d0}, [\mx]
+vld1.8 {d0}, [\mx, :64]
 sub \src, \src, #3
 add \ds2, \dst, \d_strd
 add \sr2, \src, \s_strd

@@ -1629,7 +1629,7 @@ L(\type\()_8tap_v_tbl):

 28: // 2x8, 2x16 v
 vpush {q4-q7}
-vld1.8 {d0}, [\my]
+vld1.8 {d0}, [\my, :64]
 sub \sr2, \src, \s_strd, lsl #1
 add \ds2, \dst, \d_strd
 sub \src, \sr2, \s_strd

@@ -1709,7 +1709,7 @@ L(\type\()_8tap_v_tbl):

 480: // 4x8, 4x16 v
 vpush {q4}
-vld1.8 {d0}, [\my]
+vld1.8 {d0}, [\my, :64]
 sub \sr2, \src, \s_strd, lsl #1
 add \ds2, \dst, \d_strd
 sub \src, \sr2, \s_strd

@@ -1782,7 +1782,7 @@ L(\type\()_8tap_v_tbl):
 640:
 1280:
 vpush {q4}
-vld1.8 {d0}, [\my]
+vld1.8 {d0}, [\my, :64]
 sub \src, \src, \s_strd
 sub \src, \src, \s_strd, lsl #1
 vmovl.s8 q0, d0

@@ -1951,11 +1951,10 @@ L(\type\()_8tap_hv_tbl):
 bl L(\type\()_8tap_filter_2)

 vext.8 d18, d17, d26, #4
-vmov d19, d26
 vmull.s16 q2, d16, d2[0]
 vmlal.s16 q2, d17, d2[1]
 vmlal.s16 q2, d18, d2[2]
-vmlal.s16 q2, d19, d2[3]
+vmlal.s16 q2, d26, d2[3]

 vqrshrn.s32 d4, q2, #\shift_hv
 vqmovun.s16 d4, q2

@@ -1964,11 +1963,11 @@ L(\type\()_8tap_hv_tbl):
 vst1.16 {d4[1]}, [\ds2, :16], \d_strd
 ble 0f
 vmov d16, d18
-vmov d17, d19
+vmov d17, d26
 b 2b

 280: // 2x8, 2x16, 2x32 hv
-vld1.8 {d2}, [\my]
+vld1.8 {d2}, [\my, :64]
 sub \src, \src, #1
 sub \sr2, \src, \s_strd, lsl #1
 sub \src, \sr2, \s_strd

@@ -2001,7 +2000,6 @@ L(\type\()_8tap_hv_tbl):
 28:
 bl L(\type\()_8tap_filter_2)
 vext.8 d22, d21, d26, #4
-vmov d23, d26
 vmull.s16 q2, d16, d2[0]
 vmlal.s16 q2, d17, d2[1]
 vmlal.s16 q2, d18, d2[2]

@@ -2009,7 +2007,7 @@ L(\type\()_8tap_hv_tbl):
 vmlal.s16 q2, d20, d3[0]
 vmlal.s16 q2, d21, d3[1]
 vmlal.s16 q2, d22, d3[2]
-vmlal.s16 q2, d23, d3[3]
+vmlal.s16 q2, d26, d3[3]

 vqrshrn.s32 d4, q2, #\shift_hv
 vqmovun.s16 d4, q2

@@ -2022,7 +2020,7 @@ L(\type\()_8tap_hv_tbl):
 vmov d18, d20
 vmov d19, d21
 vmov d20, d22
-vmov d21, d23
+vmov d21, d26
 b 28b

 0:

@@ -2108,7 +2106,7 @@ L(\type\()_8tap_filter_2):
 b 4b

 480: // 4x8, 4x16, 4x32 hv
-vld1.8 {d2}, [\my]
+vld1.8 {d2}, [\my, :64]
 sub \src, \src, #1
 sub \sr2, \src, \s_strd, lsl #1
 sub \src, \sr2, \s_strd

@@ -2211,7 +2209,7 @@ L(\type\()_8tap_filter_4):
 bgt 880f
 vpush {q4-q7}
 add \my, \my, #2
-vld1.8 {d0}, [\mx]
+vld1.8 {d0}, [\mx, :64]
 vld1.32 {d2[]}, [\my]
 sub \src, \src, #3
 sub \src, \src, \s_strd

@@ -2301,8 +2299,8 @@ L(\type\()_8tap_filter_4):
 640:
 1280:
 vpush {q4-q7}
-vld1.8 {d0}, [\mx]
-vld1.8 {d2}, [\my]
+vld1.8 {d0}, [\mx, :64]
+vld1.8 {d2}, [\my, :64]
 sub \src, \src, #3
 sub \src, \src, \s_strd
 sub \src, \src, \s_strd, lsl #1

third_party/dav1d/src/arm/32/mc16.S (vendored, new file, 2429 lines)
File diff suppressed because it is too large.

third_party/dav1d/src/arm/64/looprestoration16.S (vendored, 14 changes)
@@ -172,13 +172,13 @@ function wiener_filter_h_16bpc_neon, export=1
 // Interleaving the mul/mla chains actually hurts performance
 // significantly on Cortex A53, thus keeping mul/mla tightly
 // chained like this.
+ext v18.16b, v2.16b, v3.16b, #6
 ext v16.16b, v2.16b, v3.16b, #2
 ext v17.16b, v2.16b, v3.16b, #4
-ext v18.16b, v2.16b, v3.16b, #6
 ext v19.16b, v2.16b, v3.16b, #8
 ext v20.16b, v2.16b, v3.16b, #10
-ext v21.16b, v2.16b, v3.16b, #12
 ushll_sz v6, v7, v18, #7, \wd
+ext v21.16b, v2.16b, v3.16b, #12
 smlal v6.4s, v2.4h, v0.h[0]
 smlal v6.4s, v16.4h, v0.h[1]
 smlal v6.4s, v17.4h, v0.h[2]

@@ -195,13 +195,13 @@ function wiener_filter_h_16bpc_neon, export=1
 smlal2 v7.4s, v20.8h, v0.h[5]
 smlal2 v7.4s, v21.8h, v0.h[6]
 .endif
+ext v21.16b, v4.16b, v5.16b, #6
 ext v19.16b, v4.16b, v5.16b, #2
 ext v20.16b, v4.16b, v5.16b, #4
-ext v21.16b, v4.16b, v5.16b, #6
 ext v22.16b, v4.16b, v5.16b, #8
 ext v23.16b, v4.16b, v5.16b, #10
-ext v24.16b, v4.16b, v5.16b, #12
 ushll_sz v16, v17, v21, #7, \wd
+ext v24.16b, v4.16b, v5.16b, #12
 smlal v16.4s, v4.4h, v0.h[0]
 smlal v16.4s, v19.4h, v0.h[1]
 smlal v16.4s, v20.4h, v0.h[2]

@@ -334,9 +334,9 @@ L(variable_shift_tbl):
 ins v6.s[1], v7.s[0]
 mvni v24.4h, #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
 ushll v16.4s, v16.4h, #7
-add v6.4s, v6.4s, v30.4s
-add v6.4s, v6.4s, v16.4s
-srshl v6.4s, v6.4s, v29.4s
+add v6.2s, v6.2s, v30.2s
+add v6.2s, v6.2s, v16.2s
+srshl v6.2s, v6.2s, v29.2s
 sqxtun v6.4h, v6.4s
 umin v6.4h, v6.4h, v24.4h
 sub v6.4h, v6.4h, v31.4h

third_party/dav1d/src/arm/64/mc.S (vendored, 10 changes)
@@ -1906,11 +1906,10 @@ L(\type\()_8tap_hv):
 bl L(\type\()_8tap_filter_2)

 ext v18.8b, v17.8b, v28.8b, #4
-mov v19.8b, v28.8b
 smull v2.4s, v16.4h, v1.h[0]
 smlal v2.4s, v17.4h, v1.h[1]
 smlal v2.4s, v18.4h, v1.h[2]
-smlal v2.4s, v19.4h, v1.h[3]
+smlal v2.4s, v28.4h, v1.h[3]

 sqrshrn v2.4h, v2.4s, #\shift_hv
 sqxtun v2.8b, v2.8h

@@ -1919,7 +1918,7 @@ L(\type\()_8tap_hv):
 st1 {v2.h}[1], [\ds2], \d_strd
 b.le 0f
 mov v16.8b, v18.8b
-mov v17.8b, v19.8b
+mov v17.8b, v28.8b
 b 2b

 280: // 2x8, 2x16, 2x32 hv

@@ -1956,7 +1955,6 @@ L(\type\()_8tap_hv):
 28:
 bl L(\type\()_8tap_filter_2)
 ext v22.8b, v21.8b, v28.8b, #4
-mov v23.8b, v28.8b
 smull v2.4s, v16.4h, v1.h[0]
 smlal v2.4s, v17.4h, v1.h[1]
 smlal v2.4s, v18.4h, v1.h[2]

@@ -1964,7 +1962,7 @@ L(\type\()_8tap_hv):
 smlal v2.4s, v20.4h, v1.h[4]
 smlal v2.4s, v21.4h, v1.h[5]
 smlal v2.4s, v22.4h, v1.h[6]
-smlal v2.4s, v23.4h, v1.h[7]
+smlal v2.4s, v28.4h, v1.h[7]

 sqrshrn v2.4h, v2.4s, #\shift_hv
 sqxtun v2.8b, v2.8h

@@ -1977,7 +1975,7 @@ L(\type\()_8tap_hv):
 mov v18.8b, v20.8b
 mov v19.8b, v21.8b
 mov v20.8b, v22.8b
-mov v21.8b, v23.8b
+mov v21.8b, v28.8b
 b 28b

 0:

third_party/dav1d/src/arm/64/mc16.S (vendored, 92 changes)
@@ -1004,11 +1004,11 @@ function put_neon
 b.gt 2b
 ret
 4:
-ld1 {v0.8b}, [x2], x3
-ld1 {v1.8b}, [x2], x3
+ld1 {v0.4h}, [x2], x3
+ld1 {v1.4h}, [x2], x3
 subs w5, w5, #2
-st1 {v0.8b}, [x0], x1
-st1 {v1.8b}, [x0], x1
+st1 {v0.4h}, [x0], x1
+st1 {v1.4h}, [x0], x1
 b.gt 4b
 ret
 80:

@@ -1017,11 +1017,11 @@ function put_neon
 add x9, x2, x3
 lsl x3, x3, #1
 8:
-ld1 {v0.16b}, [x2], x3
-ld1 {v1.16b}, [x9], x3
+ld1 {v0.8h}, [x2], x3
+ld1 {v1.8h}, [x9], x3
 subs w5, w5, #2
-st1 {v0.16b}, [x0], x1
-st1 {v1.16b}, [x8], x1
+st1 {v0.8h}, [x0], x1
+st1 {v1.8h}, [x8], x1
 b.gt 8b
 ret
 16:

@@ -2039,7 +2039,6 @@ L(\type\()_8tap_hv):
 sxtl v0.8h, v0.8b
 sxtl v1.8h, v1.8b
 mov x15, x30
-sxtl v1.4s, v1.4h

 ld1 {v27.8h}, [\src], \s_strd
 ext v28.16b, v27.16b, v27.16b, #2

@@ -2049,19 +2048,23 @@ L(\type\()_8tap_hv):
 addp v16.4s, v27.4s, v27.4s
 srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
 bl L(\type\()_8tap_filter_2)
+// The intermediates from the horizontal pass fit in 16 bit without
+// any bias; we could just as well keep them as .4s, but narrowing
+// them to .4h gives a significant speedup on out of order cores
+// (at the cost of a smaller slowdown on in-order cores such as A53).
+xtn v16.4h, v16.4s

-trn1 v16.2d, v16.2d, v24.2d
-mov v17.16b, v24.16b
+trn1 v16.2s, v16.2s, v24.2s
+mov v17.8b, v24.8b

 2:
 bl L(\type\()_8tap_filter_2)

-ext v18.16b, v17.16b, v24.16b, #8
-mov v19.16b, v24.16b
-mul v2.4s, v16.4s, v1.s[0]
-mla v2.4s, v17.4s, v1.s[1]
-mla v2.4s, v18.4s, v1.s[2]
-mla v2.4s, v19.4s, v1.s[3]
+ext v18.8b, v17.8b, v24.8b, #4
+smull v2.4s, v16.4h, v1.h[0]
+smlal v2.4s, v17.4h, v1.h[1]
+smlal v2.4s, v18.4h, v1.h[2]
+smlal v2.4s, v24.4h, v1.h[3]

 srshl v2.4s, v2.4s, v29.4s // -(6+intermediate_bits)
 sqxtun v2.4h, v2.4s

@@ -2070,8 +2073,8 @@ L(\type\()_8tap_hv):
 st1 {v2.s}[0], [\dst], \d_strd
 st1 {v2.s}[1], [\ds2], \d_strd
 b.le 0f
-mov v16.16b, v18.16b
-mov v17.16b, v19.16b
+mov v16.8b, v18.8b
+mov v17.8b, v24.8b
 b 2b

 280: // 2x8, 2x16, 2x32 hv

@@ -2085,8 +2088,6 @@ L(\type\()_8tap_hv):
 sxtl v0.8h, v0.8b
 sxtl v1.8h, v1.8b
 mov x15, x30
-sxtl2 v2.4s, v1.8h
-sxtl v1.4s, v1.4h

 ld1 {v27.8h}, [\src], \s_strd
 ext v28.16b, v27.16b, v27.16b, #2

@@ -2095,29 +2096,33 @@ L(\type\()_8tap_hv):
 addp v27.4s, v27.4s, v28.4s
 addp v16.4s, v27.4s, v27.4s
 srshl v16.2s, v16.2s, v30.2s // -(6-intermediate_bits)
+// The intermediates from the horizontal pass fit in 16 bit without
+// any bias; we could just as well keep them as .4s, but narrowing
+// them to .4h gives a significant speedup on out of order cores
+// (at the cost of a smaller slowdown on in-order cores such as A53).

 bl L(\type\()_8tap_filter_2)
-trn1 v16.2d, v16.2d, v24.2d
-mov v17.16b, v24.16b
+xtn v16.4h, v16.4s
+trn1 v16.2s, v16.2s, v24.2s
+mov v17.8b, v24.8b
 bl L(\type\()_8tap_filter_2)
-ext v18.16b, v17.16b, v24.16b, #8
-mov v19.16b, v24.16b
+ext v18.8b, v17.8b, v24.8b, #4
+mov v19.8b, v24.8b
 bl L(\type\()_8tap_filter_2)
-ext v20.16b, v19.16b, v24.16b, #8
-mov v21.16b, v24.16b
+ext v20.8b, v19.8b, v24.8b, #4
+mov v21.8b, v24.8b

 28:
 bl L(\type\()_8tap_filter_2)
-ext v22.16b, v21.16b, v24.16b, #8
-mov v23.16b, v24.16b
-mul v3.4s, v16.4s, v1.s[0]
-mla v3.4s, v17.4s, v1.s[1]
-mla v3.4s, v18.4s, v1.s[2]
-mla v3.4s, v19.4s, v1.s[3]
-mla v3.4s, v20.4s, v2.s[0]
-mla v3.4s, v21.4s, v2.s[1]
-mla v3.4s, v22.4s, v2.s[2]
-mla v3.4s, v23.4s, v2.s[3]
+ext v22.8b, v21.8b, v24.8b, #4
+smull v3.4s, v16.4h, v1.h[0]
+smlal v3.4s, v17.4h, v1.h[1]
+smlal v3.4s, v18.4h, v1.h[2]
+smlal v3.4s, v19.4h, v1.h[3]
+smlal v3.4s, v20.4h, v1.h[4]
+smlal v3.4s, v21.4h, v1.h[5]
+smlal v3.4s, v22.4h, v1.h[6]
+smlal v3.4s, v24.4h, v1.h[7]

 srshl v3.4s, v3.4s, v29.4s // -(6+intermediate_bits)
 sqxtun v3.4h, v3.4s

@@ -2126,12 +2131,12 @@ L(\type\()_8tap_hv):
 st1 {v3.s}[0], [\dst], \d_strd
 st1 {v3.s}[1], [\ds2], \d_strd
 b.le 0f
-mov v16.16b, v18.16b
-mov v17.16b, v19.16b
-mov v18.16b, v20.16b
-mov v19.16b, v21.16b
-mov v20.16b, v22.16b
-mov v21.16b, v23.16b
+mov v16.8b, v18.8b
+mov v17.8b, v19.8b
+mov v18.8b, v20.8b
+mov v19.8b, v21.8b
+mov v20.8b, v22.8b
+mov v21.8b, v24.8b
 b 28b

 0:

@@ -2151,6 +2156,7 @@ L(\type\()_8tap_filter_2):
 smlal v24.4s, v27.4h, v0.h[2]
 smlal v24.4s, v28.4h, v0.h[3]
 srshl v24.4s, v24.4s, v30.4s // -(6-intermediate_bits)
+xtn v24.4h, v24.4s
 ret
 .endif
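
The mul/mla to smull/smlal rewrite in the hv hunks above keeps the horizontal intermediates as 16-bit values and widens them inside the multiply-accumulate. A C analogue of the accumulation it performs (names assumed):

#include <stdint.h>

static int32_t vert_8tap_acc(const int16_t mid[8], const int16_t fv[8]) {
    int32_t acc = 0;
    for (int i = 0; i < 8; i++)
        acc += (int32_t)mid[i] * fv[i]; /* smull/smlal: widen during the MAC */
    return acc;
}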
@@ -29,7 +29,6 @@
 #include "src/looprestoration.h"
 #include "src/tables.h"

-#if BITDEPTH == 8 || ARCH_AARCH64
 // The 8bpc version calculates things slightly differently than the reference
 // C version. That version calculates roughly this:
 // int16_t sum = 0;

@@ -105,6 +104,7 @@ static void wiener_filter_neon(pixel *const dst, const ptrdiff_t dst_stride,
     }
 }

+#if BITDEPTH == 8 || ARCH_AARCH64
 void BF(dav1d_sgr_box3_h, neon)(int32_t *sumsq, int16_t *sum,
                                 const pixel (*left)[4],
                                 const pixel *src, const ptrdiff_t stride,

@@ -290,8 +290,8 @@ COLD void bitfn(dav1d_loop_restoration_dsp_init_arm)(Dav1dLoopRestorationDSPContext

 if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

-#if BITDEPTH == 8 || ARCH_AARCH64
 c->wiener = wiener_filter_neon;
+#if BITDEPTH == 8 || ARCH_AARCH64
 if (bpc <= 10)
     c->selfguided = sgr_filter_neon;
 #endif

third_party/dav1d/src/arm/mc_init_tmpl.c (vendored, 2 changes)
@@ -77,7 +77,6 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {

 if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;

-#if BITDEPTH == 8 || ARCH_AARCH64
 init_mc_fn (FILTER_2D_8TAP_REGULAR, 8tap_regular, neon);
 init_mc_fn (FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth, neon);
 init_mc_fn (FILTER_2D_8TAP_REGULAR_SHARP, 8tap_regular_sharp, neon);

@@ -103,6 +102,7 @@ void bitfn(dav1d_mc_dsp_init_arm)(Dav1dMCDSPContext *const c) {
 c->avg = BF(dav1d_avg, neon);
 c->w_avg = BF(dav1d_w_avg, neon);
 c->mask = BF(dav1d_mask, neon);
+#if BITDEPTH == 8 || ARCH_AARCH64
 c->blend = BF(dav1d_blend, neon);
 c->blend_h = BF(dav1d_blend_h, neon);
 c->blend_v = BF(dav1d_blend_v, neon);
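
The #if moves above work because this file is a bitdepth template: it is compiled once per bitdepth, and BF() pastes the suffix into each symbol name. A hedged sketch of that convention (the exact macro lives in dav1d's template headers, not shown here):

/* Illustrative only: BF(dav1d_avg, neon) resolves to something like
 * dav1d_avg_8bpc_neon or dav1d_avg_16bpc_neon depending on BITDEPTH. */
#if BITDEPTH == 8
#define BF(name, suffix) name##_8bpc_##suffix
#else
#define BF(name, suffix) name##_16bpc_##suffix
#endif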

third_party/dav1d/src/decode.c (vendored, 16 changes)
@@ -773,10 +773,10 @@ static int decode_b(Dav1dTileContext *const t,
                    signabs(t->warpmv.matrix[3]),
                    signabs(t->warpmv.matrix[4]),
                    signabs(t->warpmv.matrix[5]),
-                   signabs(t->warpmv.alpha),
-                   signabs(t->warpmv.beta),
-                   signabs(t->warpmv.gamma),
-                   signabs(t->warpmv.delta),
+                   signabs(t->warpmv.u.p.alpha),
+                   signabs(t->warpmv.u.p.beta),
+                   signabs(t->warpmv.u.p.gamma),
+                   signabs(t->warpmv.u.p.delta),
                    b->mv2d.y, b->mv2d.x);
 #undef signabs
 }

@@ -1843,10 +1843,10 @@ static int decode_b(Dav1dTileContext *const t,
                    signabs(t->warpmv.matrix[3]),
                    signabs(t->warpmv.matrix[4]),
                    signabs(t->warpmv.matrix[5]),
-                   signabs(t->warpmv.alpha),
-                   signabs(t->warpmv.beta),
-                   signabs(t->warpmv.gamma),
-                   signabs(t->warpmv.delta),
+                   signabs(t->warpmv.u.p.alpha),
+                   signabs(t->warpmv.u.p.beta),
+                   signabs(t->warpmv.u.p.gamma),
+                   signabs(t->warpmv.u.p.delta),
                    b->mv[0].y, b->mv[0].x);
 #undef signabs
 if (f->frame_thread.pass) {
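
The signabs() pairs above feed debug printfs that print a sign character and a magnitude. A hedged sketch of what such a macro expands to (the real definition sits earlier in decode.c and may differ):

#include <stdlib.h>

/* Expands to two printf arguments, matching a "%c%d" conversion pair. */
#define signabs(v) ((v) < 0 ? '-' : ' '), abs(v)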

third_party/dav1d/src/meson.build (vendored, 37 changes)
@@ -82,7 +82,7 @@ libdav1d_entrypoints_sources = files(
 )

 # ASM specific sources
-libdav1d_nasm_objs = []
+libdav1d_asm_objs = []
 # Arch-specific flags
 arch_flags = []
 if is_asm_enabled

@@ -102,7 +102,7 @@ if is_asm_enabled
     )
     if (host_machine.cpu_family() == 'aarch64' or
         host_machine.cpu() == 'arm64')
-        libdav1d_sources += files(
+        libdav1d_sources_asm = files(
             # itx.S is used for both 8 and 16 bpc.
             'arm/64/itx.S',
             'arm/64/looprestoration_common.S',

@@ -110,7 +110,7 @@ if is_asm_enabled
     )

     if dav1d_bitdepths.contains('8')
-        libdav1d_sources += files(
+        libdav1d_sources_asm += files(
             'arm/64/cdef.S',
             'arm/64/ipred.S',
             'arm/64/loopfilter.S',

@@ -120,7 +120,7 @@ if is_asm_enabled
     endif

     if dav1d_bitdepths.contains('16')
-        libdav1d_sources += files(
+        libdav1d_sources_asm += files(
             'arm/64/cdef16.S',
             'arm/64/ipred16.S',
             'arm/64/itx16.S',

@@ -130,12 +130,12 @@ if is_asm_enabled
         )
     endif
 elif host_machine.cpu_family().startswith('arm')
-    libdav1d_sources += files(
+    libdav1d_sources_asm = files(
         'arm/32/msac.S',
     )

     if dav1d_bitdepths.contains('8')
-        libdav1d_sources += files(
+        libdav1d_sources_asm += files(
             'arm/32/cdef.S',
             'arm/32/ipred.S',
             'arm/32/itx.S',

@@ -146,10 +146,18 @@ if is_asm_enabled
     endif

     if dav1d_bitdepths.contains('16')
-        libdav1d_sources += files(
+        libdav1d_sources_asm += files(
+            'arm/32/looprestoration16.S',
+            'arm/32/mc16.S',
         )
     endif
+
+    if use_gaspp
+        libdav1d_asm_objs = gaspp_gen.process(libdav1d_sources_asm)
+    else
+        libdav1d_sources += libdav1d_sources_asm
+    endif
 elif host_machine.cpu_family().startswith('x86')

     libdav1d_sources += files(

@@ -200,7 +208,7 @@ if is_asm_enabled
 endif

 # Compile the ASM sources with NASM
-libdav1d_nasm_objs = nasm_gen.process(libdav1d_sources_asm)
+libdav1d_asm_objs = nasm_gen.process(libdav1d_sources_asm)
 elif host_machine.cpu() == 'ppc64le'
 arch_flags = ['-maltivec', '-mvsx']
 libdav1d_sources += files(

@@ -222,17 +230,6 @@ api_export_flags = []
 #

 if host_machine.system() == 'windows' and get_option('default_library') != 'static'
-    rc_version_array = meson.project_version().split('.')
-    winmod = import('windows')
-    rc_data = configuration_data()
-    rc_data.set('PROJECT_VERSION_MAJOR', rc_version_array[0])
-    rc_data.set('PROJECT_VERSION_MINOR', rc_version_array[1])
-    rc_data.set('PROJECT_VERSION_REVISION', rc_version_array[2])
-    rc_data.set('API_VERSION_MAJOR', dav1d_api_version_major)
-    rc_data.set('API_VERSION_MINOR', dav1d_api_version_minor)
-    rc_data.set('API_VERSION_REVISION', dav1d_api_version_revision)
-    rc_data.set('COPYRIGHT_YEARS', '2019')
-
     rc_file = configure_file(
         input : 'dav1d.rc.in',
         output : 'dav1d.rc',

@@ -301,7 +298,7 @@ endif

 libdav1d = library('dav1d',
     libdav1d_sources,
-    libdav1d_nasm_objs,
+    libdav1d_asm_objs,
     libdav1d_rc_obj,

     objects : [
third_party/dav1d/src/obu.c (vendored, 2 changes)

@@ -112,6 +112,8 @@ static int parse_seq_hdr(Dav1dContext *const c, GetBits *const gb,
        struct Dav1dSequenceHeaderOperatingPoint *const op =
            &hdr->operating_points[i];
        op->idc = dav1d_get_bits(gb, 12);
+        if (op->idc && (!(op->idc & 0xff) || !(op->idc & 0xf00)))
+            goto error;
        op->major_level = 2 + dav1d_get_bits(gb, 3);
        op->minor_level = dav1d_get_bits(gb, 2);
        op->tier = op->major_level > 3 ? dav1d_get_bits(gb, 1) : 0;
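The added check enforces the AV1 constraint on operating_point_idc: in a nonzero idc, the low 8 bits are a temporal-layer mask and bits 8-11 a spatial-layer mask, and each mask must name at least one layer. A small standalone illustration (op_idc_valid is a hypothetical helper, not part of obu.c):

    #include <assert.h>

    /* Mirror of the condition rejected above: a nonzero idc needs at
     * least one temporal layer (bits 0-7) and one spatial layer
     * (bits 8-11). */
    static int op_idc_valid(unsigned idc) {
        return !idc || ((idc & 0xff) && (idc & 0xf00));
    }

    int main(void) {
        assert(op_idc_valid(0x000));  /* 0 = applies to all layers */
        assert(op_idc_valid(0x101));  /* spatial layer 0 + temporal layer 0 */
        assert(!op_idc_valid(0x001)); /* temporal layers only: malformed */
        assert(!op_idc_valid(0x200)); /* spatial layers only: malformed */
        return 0;
    }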
third_party/dav1d/src/recon_tmpl.c (vendored, 12 changes)

@@ -1082,11 +1082,11 @@ static int warp_affine(Dav1dTileContext *const t,
        const int64_t mvy = ((int64_t) mat[4] * src_x + mat5_y) >> ss_ver;
 
        const int dx = (int) (mvx >> 16) - 4;
-        const int mx = (((int) mvx & 0xffff) - wmp->alpha * 4 -
-                        wmp->beta * 7) & ~0x3f;
+        const int mx = (((int) mvx & 0xffff) - wmp->u.p.alpha * 4 -
+                        wmp->u.p.beta * 7) & ~0x3f;
        const int dy = (int) (mvy >> 16) - 4;
-        const int my = (((int) mvy & 0xffff) - wmp->gamma * 4 -
-                        wmp->delta * 4) & ~0x3f;
+        const int my = (((int) mvy & 0xffff) - wmp->u.p.gamma * 4 -
+                        wmp->u.p.delta * 4) & ~0x3f;
 
        const pixel *ref_ptr;
        ptrdiff_t ref_stride = refp->p.stride[!!pl];
@@ -1108,10 +1108,10 @@ static int warp_affine(Dav1dTileContext *const t,
        }
        if (dst16 != NULL)
            dsp->mc.warp8x8t(&dst16[x], dstride, ref_ptr, ref_stride,
-                             wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
+                             wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
        else
            dsp->mc.warp8x8(&dst8[x], dstride, ref_ptr, ref_stride,
-                            wmp->abcd, mx, my HIGHBD_CALL_SUFFIX);
+                            wmp->u.abcd, mx, my HIGHBD_CALL_SUFFIX);
    }
    if (dst8) dst8 += 8 * PXSTRIDE(dstride);
    else dst16 += 8 * dstride;
third_party/dav1d/src/tables.c (vendored, 8 changes)

@@ -391,10 +391,10 @@ const Dav1dWarpedMotionParams dav1d_default_wm_params = {
        0, 0, 1 << 16,
        0, 0, 1 << 16,
    },
-    .alpha = 0,
-    .beta = 0,
-    .gamma = 0,
-    .delta = 0,
+    .u.p.alpha = 0,
+    .u.p.beta = 0,
+    .u.p.gamma = 0,
+    .u.p.delta = 0,
 };
 
 const int8_t dav1d_cdef_directions[2 + 8 + 2 /* dir */][2 /* pass */] = {
third_party/dav1d/src/warpmv.c (vendored, 12 changes)

@@ -82,21 +82,21 @@ int dav1d_get_shear_params(Dav1dWarpedMotionParams *const wm) {
 
    if (mat[2] <= 0) return 1;
 
-    wm->alpha = iclip_wmp(mat[2] - 0x10000);
-    wm->beta = iclip_wmp(mat[3]);
+    wm->u.p.alpha = iclip_wmp(mat[2] - 0x10000);
+    wm->u.p.beta = iclip_wmp(mat[3]);
 
    int shift;
    const int y = apply_sign(resolve_divisor_32(abs(mat[2]), &shift), mat[2]);
    const int64_t v1 = ((int64_t) mat[4] * 0x10000) * y;
    const int rnd = (1 << shift) >> 1;
-    wm->gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
+    wm->u.p.gamma = iclip_wmp(apply_sign64((int) ((llabs(v1) + rnd) >> shift), v1));
    const int64_t v2 = ((int64_t) mat[3] * mat[4]) * y;
-    wm->delta = iclip_wmp(mat[5] -
+    wm->u.p.delta = iclip_wmp(mat[5] -
        apply_sign64((int) ((llabs(v2) + rnd) >> shift), v2) -
        0x10000);
 
-    return (4 * abs(wm->alpha) + 7 * abs(wm->beta) >= 0x10000) ||
-           (4 * abs(wm->gamma) + 4 * abs(wm->delta) >= 0x10000);
+    return (4 * abs(wm->u.p.alpha) + 7 * abs(wm->u.p.beta) >= 0x10000) ||
+           (4 * abs(wm->u.p.gamma) + 4 * abs(wm->u.p.delta) >= 0x10000);
 }
 
 static int resolve_divisor_64(const uint64_t d, int *const shift) {
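For context, dav1d_get_shear_params derives the shear terms from the 16.16 fixed-point affine matrix; resolve_divisor_32 supplies a fixed-point reciprocal of mat[2] so both divisions become multiply-and-shift with rounding, and iclip_wmp clamps each result into int16_t range. The underlying arithmetic, written in plain floating point with illustrative values (not dav1d code):

    #include <stdio.h>

    int main(void) {
        /* Illustrative 16.16 fixed-point matrix entries (1.0 == 0x10000). */
        const double mat2 = 0x10200, mat3 = 0x80, mat4 = -0x40, mat5 = 0xff00;

        /* What the fixed-point code above approximates: */
        const double alpha = mat2 - 0x10000;
        const double beta  = mat3;
        const double gamma = 65536.0 * mat4 / mat2;
        const double delta = mat5 - mat3 * mat4 / mat2 - 0x10000;

        printf("alpha=%.0f beta=%.0f gamma=%.2f delta=%.2f\n",
               alpha, beta, gamma, delta);
        /* The return value then flags shears too strong for the 8x8 warp
         * filter: invalid if 4|alpha| + 7|beta| >= 2^16 or
         * 4|gamma| + 4|delta| >= 2^16. */
        return 0;
    }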
third_party/dav1d/src/x86/mc_avx2.asm (vendored, 734 changes)
Diff suppressed because it is too large.

third_party/dav1d/src/x86/mc_sse.asm (vendored, 1600 changes)
Diff suppressed because it is too large.
third_party/dav1d/tests/header_test.c (vendored, new file, 33 lines)

@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2018, VideoLAN and dav1d authors
+ * Copyright © 2018, Two Orioles, LLC
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include DAV1D_TEST_HEADER
+
+int main()
+{
+    return 0;
+}
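This stub is compiled once per public API header (see the tests/meson.build hunk below), with the header injected through the DAV1D_TEST_HEADER macro; any header that is not self-contained, or that needs more than C99, fails the build. After preprocessing, one instantiation is equivalent to the following (header name illustrative):

    #include "dav1d/dav1d.h"

    int main()
    {
        return 0;
    }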
third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.c (vendored, 36 changes)

@@ -31,6 +31,7 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <string.h>
+#include <stdlib.h>
 
 #include <dav1d/dav1d.h>
 #include "src/cpu.h"
@@ -38,8 +39,6 @@
 
 #ifdef DAV1D_ALLOC_FAIL
 
-#include <stdlib.h>
-
 #include "alloc_fail.h"
 
 static unsigned djb_xor(const uint8_t * c, size_t len) {
@@ -56,6 +55,39 @@ static unsigned r32le(const uint8_t *const p) {
 
 #define DAV1D_FUZZ_MAX_SIZE 4096 * 4096
 
+// search for "--cpumask xxx" in argv and remove both parameters
+int LLVMFuzzerInitialize(int *argc, char ***argv) {
+    int i = 1;
+    for (; i < *argc; i++) {
+        if (!strcmp((*argv)[i], "--cpumask")) {
+            const char * cpumask = (*argv)[i+1];
+            if (cpumask) {
+                char *end;
+                unsigned res;
+                if (!strncmp(cpumask, "0x", 2)) {
+                    cpumask += 2;
+                    res = (unsigned) strtoul(cpumask, &end, 16);
+                } else {
+                    res = (unsigned) strtoul(cpumask, &end, 0);
+                }
+                if (end != cpumask && !end[0]) {
+                    dav1d_set_cpu_flags_mask(res);
+                }
+            }
+            break;
+        }
+    }
+
+    for (; i < *argc - 2; i++) {
+        (*argv)[i] = (*argv)[i + 2];
+    }
+
+    *argc = i;
+
+    return 0;
+}
+
+
 // expects ivf input
 
 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size)
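Note the accept condition end != cpumask && !end[0]: the whole token after an optional 0x prefix must parse as a number, otherwise the mask is ignored rather than applied half-parsed. A standalone sketch of the same rule (parse_mask is a hypothetical helper, not part of the fuzzer):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Returns 0 and stores the mask on a full parse, -1 otherwise. */
    static int parse_mask(const char *cpumask, unsigned *out) {
        char *end;
        unsigned res;
        if (!strncmp(cpumask, "0x", 2)) {
            cpumask += 2;
            res = (unsigned) strtoul(cpumask, &end, 16);
        } else {
            res = (unsigned) strtoul(cpumask, &end, 0);
        }
        if (end == cpumask || end[0]) return -1; /* empty or trailing garbage */
        *out = res;
        return 0;
    }

    int main(void) {
        unsigned m = 0;
        printf("%d %u\n", parse_mask("0x3", &m), m); /* 0 3 */
        printf("%d\n", parse_mask("3x", &m));        /* -1 */
        return 0;
    }

With this in place the harness can, for example, be launched as dav1d_fuzzer --cpumask 0 case.ivf to mask out all CPU flags and exercise only the C code paths while decoding the testcase.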
third_party/dav1d/tests/libfuzzer/dav1d_fuzzer.h (vendored)

@@ -31,6 +31,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
+int LLVMFuzzerInitialize(int *argc, char ***argv);
 int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size);
 
 #endif /* DAV1D_TESTS_LIBFUZZER_DAV1D_FUZZER_H */
third_party/dav1d/tests/libfuzzer/main.c (vendored, 6 changes)

@@ -40,7 +40,7 @@
 
 // expects ivf input
 
-int main(const int argc, char *const *const argv) {
+int main(int argc, char *argv[]) {
    int ret = -1;
    FILE *f = NULL;
    int64_t fsize;
@@ -48,6 +48,10 @@ int main(const int argc, char *const *const argv) {
    uint8_t *data = NULL;
    size_t size = 0;
 
+    if (LLVMFuzzerInitialize(&argc, &argv)) {
+        return 1;
+    }
+
    if (argc != 2) {
        fprintf(stdout, "Usage:\n%s fuzzing_testcase.ivf\n", argv[0]);
        return -1;
third_party/dav1d/tests/meson.build (vendored, 50 changes)

@@ -31,8 +31,6 @@ if not get_option('enable_tests')
    subdir_done()
 endif
 
-libdav1d_nasm_objs_if_needed = []
-
 if is_asm_enabled
    checkasm_sources = files(
        'checkasm/checkasm.c',
@@ -62,25 +60,27 @@ if is_asm_enabled
        checkasm_bitdepth_objs += checkasm_bitdepth_lib.extract_all_objects()
    endforeach
 
-    checkasm_nasm_objs = []
+    checkasm_asm_objs = []
+    checkasm_asm_sources = []
    if host_machine.cpu_family() == 'aarch64' or host_machine.cpu() == 'arm64'
-        checkasm_sources += files('checkasm/arm/checkasm_64.S')
+        checkasm_asm_sources += files('checkasm/arm/checkasm_64.S')
    elif host_machine.cpu_family().startswith('arm')
-        checkasm_sources += files('checkasm/arm/checkasm_32.S')
+        checkasm_asm_sources += files('checkasm/arm/checkasm_32.S')
    elif host_machine.cpu_family().startswith('x86')
-        checkasm_nasm_objs = nasm_gen.process(files('checkasm/x86/checkasm.asm'))
+        checkasm_asm_objs += nasm_gen.process(files('checkasm/x86/checkasm.asm'))
    endif
 
+    if use_gaspp
+        checkasm_asm_objs += gaspp_gen.process(checkasm_asm_sources)
+    else
+        checkasm_sources += checkasm_asm_sources
+    endif
+
    m_lib = cc.find_library('m', required: false)
 
-    if meson.version().version_compare('< 0.48.999')
-        libdav1d_nasm_objs_if_needed = libdav1d_nasm_objs
-    endif
-
    checkasm = executable('checkasm',
        checkasm_sources,
-        checkasm_nasm_objs,
-        libdav1d_nasm_objs_if_needed,
+        checkasm_asm_objs,
 
        objects: [
            checkasm_bitdepth_objs,
@@ -101,10 +101,30 @@ if is_asm_enabled
    test('checkasm', checkasm, is_parallel: false)
 endif
 
+c99_extension_flag = cc.first_supported_argument(
+    '-Werror=c11-extensions',
+    '-Werror=c99-c11-compat',
+    '-Wc11-extensions',
+    '-Wc99-c11-compat',
+)
+
+# dav1d_api_headers
+foreach header : dav1d_api_headers
+    target = header + '_test'
+
+    header_test_exe = executable(target,
+        'header_test.c',
+        include_directories: dav1d_inc_dirs,
+        c_args: ['-DDAV1D_TEST_HEADER="@0@"'.format(header), c99_extension_flag],
+        build_by_default: true
+    )
+
+    test(target, header_test_exe)
+endforeach
+
+
 # fuzzing binaries
-if meson.version().version_compare('>=0.49')
-    subdir('libfuzzer')
-endif
+subdir('libfuzzer')
 
 # Include dav1d test data repository with additional tests
 if get_option('testdata_tests')
third_party/dav1d/tools/dav1d.c (vendored, 12 changes)

@@ -124,11 +124,15 @@ static void print_stats(const int istty, const unsigned n, const unsigned num,
    else
        b += snprintf(b, end - b, "Decoded %u/%u frames (%.1lf%%)",
                      n, num, 100.0 * n / num);
-    if (i_fps && b < end) {
+    if (b < end) {
        const double d_fps = 1e9 * n / elapsed;
-        const double speed = d_fps / i_fps;
-        b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
-                      d_fps, i_fps, speed);
+        if (i_fps) {
+            const double speed = d_fps / i_fps;
+            b += snprintf(b, end - b, " - %.2lf/%.2lf fps (%.2lfx)",
+                          d_fps, i_fps, speed);
+        } else {
+            b += snprintf(b, end - b, " - %.2lf fps", d_fps);
+        }
    }
    if (!istty)
        strcpy(b > end - 2 ? end - 2 : b, "\n");
third_party/dav1d/tools/dav1d.manifest (vendored, new file, 10 lines)

@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8" standalone="yes"?>
+<assembly xmlns="urn:schemas-microsoft-com:asm.v1" manifestVersion="1.0">
+  <assemblyIdentity type="win32" name="VideoLAN.dav1d" version="1.0.0.0"/>
+  <application xmlns="urn:schemas-microsoft-com:asm.v3">
+    <windowsSettings>
+      <longPathAware xmlns="http://schemas.microsoft.com/SMI/2016/WindowsSettings">true</longPathAware>
+      <activeCodePage xmlns="http://schemas.microsoft.com/SMI/2019/WindowsSettings">UTF-8</activeCodePage>
+    </windowsSettings>
+  </application>
+</assembly>
third_party/dav1d/tools/dav1d.rc.in (vendored, new file, 33 lines)

@@ -0,0 +1,33 @@
+#define API_VERSION_NUMBER @API_VERSION_MAJOR@,@API_VERSION_MINOR@,@API_VERSION_REVISION@,0
+#define API_VERSION_NUMBER_STR "@API_VERSION_MAJOR@.@API_VERSION_MINOR@.@API_VERSION_REVISION@"
+#define PROJECT_VERSION_NUMBER @PROJECT_VERSION_MAJOR@,@PROJECT_VERSION_MINOR@,@PROJECT_VERSION_REVISION@,0
+#define PROJECT_VERSION_NUMBER_STR "@PROJECT_VERSION_MAJOR@.@PROJECT_VERSION_MINOR@.@PROJECT_VERSION_REVISION@"
+
+#include <windows.h>
+
+1 RT_MANIFEST "dav1d.manifest"
+1 VERSIONINFO
+FILETYPE VFT_APP
+FILEOS VOS_NT_WINDOWS32
+PRODUCTVERSION PROJECT_VERSION_NUMBER
+FILEVERSION API_VERSION_NUMBER
+BEGIN
+  BLOCK "StringFileInfo"
+  BEGIN
+    BLOCK "040904E4"
+    BEGIN
+      VALUE "CompanyName", "VideoLAN"
+      VALUE "ProductName", "dav1d"
+      VALUE "ProductVersion", PROJECT_VERSION_NUMBER_STR
+      VALUE "FileVersion", API_VERSION_NUMBER_STR
+      VALUE "FileDescription", "dav1d " PROJECT_VERSION_NUMBER_STR " - AV1 decoder"
+      VALUE "InternalName", "dav1d"
+      VALUE "OriginalFilename", "dav1d.exe"
+      VALUE "LegalCopyright", "Copyright \251 @COPYRIGHT_YEARS@ VideoLAN and dav1d Authors"
+    END
+  END
+  BLOCK "VarFileInfo"
+  BEGIN
+    VALUE "Translation", 0x409, 1252
+  END
+END
third_party/dav1d/tools/meson.build (vendored, 16 changes)

@@ -77,8 +77,24 @@ dav1d_sources = files(
    'dav1d_cli_parse.c',
 )
 
+if host_machine.system() == 'windows'
+    rc_file = configure_file(
+        input : 'dav1d.rc.in',
+        output : 'dav1d.rc',
+        configuration : rc_data
+    )
+
+    dav1d_rc_obj = winmod.compile_resources(rc_file,
+        depend_files : files('dav1d.manifest'),
+        include_directories : include_directories('.')
+    )
+else
+    dav1d_rc_obj = []
+endif
+
 dav1d = executable('dav1d',
    dav1d_sources,
+    dav1d_rc_obj,
    rev_target, cli_config_h_target,
 
    link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
third_party/dav1d/tools/output/y4m2.c (vendored, 14 changes)

@@ -28,6 +28,7 @@
 #include "config.h"
 
 #include <errno.h>
+#include <inttypes.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -77,8 +78,17 @@ static int write_header(Y4m2OutputContext *const c, const Dav1dPicture *const p)
        chr_names_8bpc_i420[p->seq_hdr->chr > 2 ? DAV1D_CHR_UNKNOWN : p->seq_hdr->chr] :
        ss_names[p->p.layout][p->seq_hdr->hbd];
 
-    fprintf(c->f, "YUV4MPEG2 W%d H%d F%d:%d Ip C%s\n",
-            p->p.w, p->p.h, c->fps[0], c->fps[1], ss_name);
+    const unsigned fw = p->p.w;
+    const unsigned fh = p->p.h;
+    uint64_t aw = (uint64_t)fh * p->frame_hdr->render_width;
+    uint64_t ah = (uint64_t)fw * p->frame_hdr->render_height;
+    uint64_t gcd = ah;
+    for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
+    aw /= gcd;
+    ah /= gcd;
+
+    fprintf(c->f, "YUV4MPEG2 W%u H%u F%u:%u Ip A%"PRIu64":%"PRIu64" C%s\n",
+            fw, fh, c->fps[0], c->fps[1], aw, ah, ss_name);
 
    return 0;
 }
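The rewritten header now emits the Y4M pixel-aspect-ratio tag: since render_width x render_height describes the intended display shape of a fw x fh frame, the pixel aspect ratio is (fh * render_width) : (fw * render_height), reduced with Euclid's algorithm. A worked example with illustrative dimensions (not taken from the sources):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* A 1440x1080 coded frame meant to display as 1920x1080. */
        const unsigned fw = 1440, fh = 1080;
        const unsigned render_w = 1920, render_h = 1080;

        uint64_t aw = (uint64_t)fh * render_w;   /* 1080 * 1920 = 2073600 */
        uint64_t ah = (uint64_t)fw * render_h;   /* 1440 * 1080 = 1555200 */
        uint64_t gcd = ah;
        for (uint64_t a = aw, b; (b = a % gcd); a = gcd, gcd = b);
        printf("A%" PRIu64 ":%" PRIu64 "\n", aw / gcd, ah / gcd); /* A4:3 */
        return 0;
    }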