forked from mirrors/gecko-dev
Bug 1878645 - Upgrade gemmology to latest version r=marco
Differential Revision: https://phabricator.services.mozilla.com/D200625
This commit is contained in:
parent
2a53966e2a
commit
88ddeef1c4
2 changed files with 34 additions and 43 deletions
73
third_party/gemmology/gemmology.h
vendored
73
third_party/gemmology/gemmology.h
vendored
|
|
@ -209,6 +209,25 @@ maddw(xsimd::batch<uint8_t, Arch> x, xsimd::batch<int8_t, Arch> y,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef __AVX512VNNI__
|
||||||
|
|
||||||
|
template <class Arch>
|
||||||
|
inline xsimd::batch<int32_t, Arch>
|
||||||
|
maddw(xsimd::batch<uint8_t, Arch> x, xsimd::batch<int8_t, Arch> y,
|
||||||
|
xsimd::batch<int32_t, Arch> z,
|
||||||
|
xsimd::kernel::requires_arch<xsimd::avx512vnni<xsimd::avx512bw>>) {
|
||||||
|
return _mm512_dpbusd_epi32(z, x, y);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <class Arch>
|
||||||
|
inline xsimd::batch<int32_t, Arch>
|
||||||
|
maddw(xsimd::batch<uint8_t, Arch> x, xsimd::batch<int8_t, Arch> y,
|
||||||
|
xsimd::batch<int32_t, Arch> z,
|
||||||
|
xsimd::kernel::requires_arch<xsimd::avx512vnni<xsimd::avx512vbmi>>) {
|
||||||
|
return _mm512_dpbusd_epi32(z, x, y);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef __SSSE3__
|
#ifdef __SSSE3__
|
||||||
|
|
@ -233,7 +252,7 @@ template <class Arch>
|
||||||
std::tuple<xsimd::batch<int8_t, Arch>, xsimd::batch<int8_t, Arch>>
|
std::tuple<xsimd::batch<int8_t, Arch>, xsimd::batch<int8_t, Arch>>
|
||||||
interleave(xsimd::batch<int8_t, Arch> first, xsimd::batch<int8_t, Arch> second,
|
interleave(xsimd::batch<int8_t, Arch> first, xsimd::batch<int8_t, Arch> second,
|
||||||
xsimd::kernel::requires_arch<xsimd::sse2>) {
|
xsimd::kernel::requires_arch<xsimd::sse2>) {
|
||||||
return {_mm_unpacklo_epi8(first, second), _mm_unpackhi_epi8(first, second)};
|
return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
@ -241,7 +260,7 @@ std::tuple<xsimd::batch<int16_t, Arch>, xsimd::batch<int16_t, Arch>>
|
||||||
interleave(xsimd::batch<int16_t, Arch> first,
|
interleave(xsimd::batch<int16_t, Arch> first,
|
||||||
xsimd::batch<int16_t, Arch> second,
|
xsimd::batch<int16_t, Arch> second,
|
||||||
xsimd::kernel::requires_arch<xsimd::sse2>) {
|
xsimd::kernel::requires_arch<xsimd::sse2>) {
|
||||||
return {_mm_unpacklo_epi16(first, second), _mm_unpackhi_epi16(first, second)};
|
return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
@ -249,7 +268,7 @@ std::tuple<xsimd::batch<int32_t, Arch>, xsimd::batch<int32_t, Arch>>
|
||||||
interleave(xsimd::batch<int32_t, Arch> first,
|
interleave(xsimd::batch<int32_t, Arch> first,
|
||||||
xsimd::batch<int32_t, Arch> second,
|
xsimd::batch<int32_t, Arch> second,
|
||||||
xsimd::kernel::requires_arch<xsimd::sse2>) {
|
xsimd::kernel::requires_arch<xsimd::sse2>) {
|
||||||
return {_mm_unpacklo_epi32(first, second), _mm_unpackhi_epi32(first, second)};
|
return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
@ -257,7 +276,7 @@ std::tuple<xsimd::batch<int64_t, Arch>, xsimd::batch<int64_t, Arch>>
|
||||||
interleave(xsimd::batch<int64_t, Arch> first,
|
interleave(xsimd::batch<int64_t, Arch> first,
|
||||||
xsimd::batch<int64_t, Arch> second,
|
xsimd::batch<int64_t, Arch> second,
|
||||||
xsimd::kernel::requires_arch<xsimd::sse2>) {
|
xsimd::kernel::requires_arch<xsimd::sse2>) {
|
||||||
return {_mm_unpacklo_epi64(first, second), _mm_unpackhi_epi64(first, second)};
|
return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)};
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
@ -362,14 +381,7 @@ template <class Arch>
|
||||||
std::tuple<xsimd::batch<int8_t, Arch>, xsimd::batch<int8_t, Arch>>
|
std::tuple<xsimd::batch<int8_t, Arch>, xsimd::batch<int8_t, Arch>>
|
||||||
interleave(xsimd::batch<int8_t, Arch> first, xsimd::batch<int8_t, Arch> second,
|
interleave(xsimd::batch<int8_t, Arch> first, xsimd::batch<int8_t, Arch> second,
|
||||||
xsimd::kernel::requires_arch<xsimd::neon>) {
|
xsimd::kernel::requires_arch<xsimd::neon>) {
|
||||||
int8x8_t first_lo = vget_low_s8(first);
|
return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)};
|
||||||
int8x8_t second_lo = vget_low_s8(second);
|
|
||||||
int8x8x2_t result_lo = vzip_s8(first_lo, second_lo);
|
|
||||||
int8x8_t first_hi = vget_high_s8(first);
|
|
||||||
int8x8_t second_hi = vget_high_s8(second);
|
|
||||||
int8x8x2_t result_hi = vzip_s8(first_hi, second_hi);
|
|
||||||
return {vcombine_s8(result_lo.val[0], result_lo.val[1]),
|
|
||||||
vcombine_s8(result_hi.val[0], result_hi.val[1])};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
@ -377,14 +389,7 @@ std::tuple<xsimd::batch<int16_t, Arch>, xsimd::batch<int16_t, Arch>>
|
||||||
interleave(xsimd::batch<int16_t, Arch> first,
|
interleave(xsimd::batch<int16_t, Arch> first,
|
||||||
xsimd::batch<int16_t, Arch> second,
|
xsimd::batch<int16_t, Arch> second,
|
||||||
xsimd::kernel::requires_arch<xsimd::neon>) {
|
xsimd::kernel::requires_arch<xsimd::neon>) {
|
||||||
int16x4_t first_lo = vget_low_s16(first);
|
return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)};
|
||||||
int16x4_t second_lo = vget_low_s16(second);
|
|
||||||
int16x4x2_t result_lo = vzip_s16(first_lo, second_lo);
|
|
||||||
int16x4_t first_hi = vget_high_s16(first);
|
|
||||||
int16x4_t second_hi = vget_high_s16(second);
|
|
||||||
int16x4x2_t result_hi = vzip_s16(first_hi, second_hi);
|
|
||||||
return {vcombine_s16(result_lo.val[0], result_lo.val[1]),
|
|
||||||
vcombine_s16(result_hi.val[0], result_hi.val[1])};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
@ -392,14 +397,7 @@ std::tuple<xsimd::batch<int32_t, Arch>, xsimd::batch<int32_t, Arch>>
|
||||||
interleave(xsimd::batch<int32_t, Arch> first,
|
interleave(xsimd::batch<int32_t, Arch> first,
|
||||||
xsimd::batch<int32_t, Arch> second,
|
xsimd::batch<int32_t, Arch> second,
|
||||||
xsimd::kernel::requires_arch<xsimd::neon>) {
|
xsimd::kernel::requires_arch<xsimd::neon>) {
|
||||||
int32x2_t first_lo = vget_low_s32(first);
|
return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)};
|
||||||
int32x2_t second_lo = vget_low_s32(second);
|
|
||||||
int32x2x2_t result_lo = vzip_s32(first_lo, second_lo);
|
|
||||||
int32x2_t first_hi = vget_high_s32(first);
|
|
||||||
int32x2_t second_hi = vget_high_s32(second);
|
|
||||||
int32x2x2_t result_hi = vzip_s32(first_hi, second_hi);
|
|
||||||
return {vcombine_s32(result_lo.val[0], result_lo.val[1]),
|
|
||||||
vcombine_s32(result_hi.val[0], result_hi.val[1])};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
@ -407,11 +405,7 @@ std::tuple<xsimd::batch<int64_t, Arch>, xsimd::batch<int64_t, Arch>>
|
||||||
interleave(xsimd::batch<int64_t, Arch> first,
|
interleave(xsimd::batch<int64_t, Arch> first,
|
||||||
xsimd::batch<int64_t, Arch> second,
|
xsimd::batch<int64_t, Arch> second,
|
||||||
xsimd::kernel::requires_arch<xsimd::neon>) {
|
xsimd::kernel::requires_arch<xsimd::neon>) {
|
||||||
int64x1_t first_lo = vget_low_s64(first);
|
return {xsimd::zip_lo(first, second), xsimd::zip_hi(first, second)};
|
||||||
int64x1_t second_lo = vget_low_s64(second);
|
|
||||||
int64x1_t first_hi = vget_high_s64(first);
|
|
||||||
int64x1_t second_hi = vget_high_s64(second);
|
|
||||||
return {vcombine_s64(first_lo, second_lo), vcombine_s64(first_hi, second_hi)};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
@ -554,10 +548,9 @@ inline xsimd::batch<int16_t, Arch>
|
||||||
madd(xsimd::batch<uint8_t, Arch> x, xsimd::batch<int8_t, Arch> y,
|
madd(xsimd::batch<uint8_t, Arch> x, xsimd::batch<int8_t, Arch> y,
|
||||||
xsimd::kernel::requires_arch<xsimd::neon64>) {
|
xsimd::kernel::requires_arch<xsimd::neon64>) {
|
||||||
|
|
||||||
int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(x))),
|
int16x8_t tl = vmull_s8(vreinterpret_s8_u8(vget_low_u8(x)),
|
||||||
vmovl_s8(vget_low_s8(y)));
|
vget_low_s8(y));
|
||||||
int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(x))),
|
int16x8_t th = vmull_high_s8(vreinterpretq_s8_u8(x), y);
|
||||||
vmovl_s8(vget_high_s8(y)));
|
|
||||||
return vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th));
|
return vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -566,14 +559,12 @@ inline xsimd::batch<int32_t, Arch>
|
||||||
maddw(xsimd::batch<uint8_t, Arch> x, xsimd::batch<int8_t, Arch> y,
|
maddw(xsimd::batch<uint8_t, Arch> x, xsimd::batch<int8_t, Arch> y,
|
||||||
xsimd::batch<int32_t, Arch> z,
|
xsimd::batch<int32_t, Arch> z,
|
||||||
xsimd::kernel::requires_arch<xsimd::neon64>) {
|
xsimd::kernel::requires_arch<xsimd::neon64>) {
|
||||||
|
|
||||||
int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(x))),
|
int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(x))),
|
||||||
vmovl_s8(vget_low_s8(y)));
|
vmovl_s8(vget_low_s8(y)));
|
||||||
int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(x))),
|
int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(x))),
|
||||||
vmovl_s8(vget_high_s8(y)));
|
vmovl_s8(vget_high_s8(y)));
|
||||||
int32x4_t pl = vpaddlq_s16(tl);
|
return vpadalq_s16(vpadalq_s16(z, tl), th);
|
||||||
int32x4_t ph = vpaddlq_s16(th);
|
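// TODO: investigate using vdotq_s32 (note: it takes two signed int8 operands; since x is uint8 and y is int8 here, the mixed-sign vusdotq_s32 may be the closer fit)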
|
||||||
return vpaddq_s32(z, vpaddq_s32(pl, ph));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <class Arch>
|
template <class Arch>
|
||||||
|
|
|
||||||
4
third_party/gemmology/moz.yaml
vendored
4
third_party/gemmology/moz.yaml
vendored
|
|
@ -10,8 +10,8 @@ origin:
|
||||||
|
|
||||||
url: https://github.com/mozilla/gemmology
|
url: https://github.com/mozilla/gemmology
|
||||||
|
|
||||||
release: c04bacb101e020d9e6b51f20c92d7f63af50dd01 (2023-12-18T13:47:06Z).
|
release: ec535e87d0ab9d1457ff6d2af247cc8113e74694 (2024-02-05T09:05:20Z).
|
||||||
revision: c04bacb101e020d9e6b51f20c92d7f63af50dd01
|
revision: ec535e87d0ab9d1457ff6d2af247cc8113e74694
|
||||||
|
|
||||||
license: MIT
|
license: MIT
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue