Backed out 2 changesets (bug 1890935, bug 1882209) for causing Android build bustages. CLOSED TREE

Backed out changeset 05cb55554fc0 (bug 1882209)
Backed out changeset 67b0625e2cfa (bug 1890935)
This commit is contained in:
Stanca Serban 2024-04-19 08:07:52 +03:00
parent 1619fc55f6
commit 4d62b8911e
233 changed files with 18455 additions and 782 deletions

25
Cargo.lock generated
View file

@ -91,15 +91,6 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46" checksum = "b84bf0a05bbb2a83e5eb6fa36bb6e87baa08193c35ff52bbf6b38d8af2890e46"
[[package]]
name = "any_all_workaround"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88fea40735f2cc320a5133ce772d39c571bd6c9b0d4c1a326926eecdd5af2e86"
dependencies = [
"cfg-if 1.0.0",
]
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.69" version = "1.0.69"
@ -1583,12 +1574,12 @@ dependencies = [
[[package]] [[package]]
name = "encoding_rs" name = "encoding_rs"
version = "0.8.34" version = "0.8.33"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b45de904aa0b010bce2ab45264d0631681847fa7b6f2eaa7dab7619943bc4f59" checksum = "7268b386296a025e474d5140678f75d6de9493ae55a5d709eeb9dd08149945e1"
dependencies = [ dependencies = [
"any_all_workaround",
"cfg-if 1.0.0", "cfg-if 1.0.0",
"packed_simd",
] ]
[[package]] [[package]]
@ -4329,6 +4320,16 @@ dependencies = [
"oxilangtag", "oxilangtag",
] ]
[[package]]
name = "packed_simd"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f9f08af0c877571712e2e3e686ad79efad9657dbf0f7c3c8ba943ff6c38932d"
dependencies = [
"cfg-if 1.0.0",
"num-traits",
]
[[package]] [[package]]
name = "parking_lot" name = "parking_lot"
version = "0.12.1" version = "0.12.1"

View file

@ -265,7 +265,7 @@ endif
ifndef RUSTC_BOOTSTRAP ifndef RUSTC_BOOTSTRAP
RUSTC_BOOTSTRAP := mozglue_static,qcms RUSTC_BOOTSTRAP := mozglue_static,qcms
ifdef MOZ_RUST_SIMD ifdef MOZ_RUST_SIMD
RUSTC_BOOTSTRAP := $(RUSTC_BOOTSTRAP),encoding_rs,any_all_workaround RUSTC_BOOTSTRAP := $(RUSTC_BOOTSTRAP),encoding_rs,packed_simd
endif endif
export RUSTC_BOOTSTRAP export RUSTC_BOOTSTRAP
endif endif

View file

@ -152,9 +152,9 @@ Here are the Rust versions for each Firefox version.
| Firefox 123 | Rust 1.75.0 | 1.70.0 | 2023 December 28 | 2024 January 11 | 2024 February 20 | Firefox 123 | Rust 1.75.0 | 1.70.0 | 2023 December 28 | 2024 January 11 | 2024 February 20
| Firefox 124 | Rust 1.76.0 | 1.70.0 | 2024 February 8 | 2024 February 15 | 2024 March 19 | Firefox 124 | Rust 1.76.0 | 1.70.0 | 2024 February 8 | 2024 February 15 | 2024 March 19
| Firefox 125 | Rust 1.76.0 | 1.74.0 | 2024 February 8 | 2024 March 14 | 2024 April 16 | Firefox 125 | Rust 1.76.0 | 1.74.0 | 2024 February 8 | 2024 March 14 | 2024 April 16
| Firefox 126 | Rust 1.77.2 | 1.74.0 | 2024 March 28 | 2024 April 11 | 2024 May 14
| **Estimated** | | **Estimated** |
| Firefox 127 | Rust 1.78.0 | 1.76.0 | 2024 May 2 | 2024 May 9 | 2024 June 11 | Firefox 126 | Rust 1.77.2 | ? | 2024 March 28 | 2024 April 11 | 2024 May 14
| Firefox 127 | Rust 1.78.0 | ? | 2024 May 2 | 2024 May 9 | 2024 June 11
| Firefox 128 | Rust 1.78.0 | ? | 2024 May 2 | 2024 June 6 | 2024 July 9 | Firefox 128 | Rust 1.78.0 | ? | 2024 May 2 | 2024 June 6 | 2024 July 9
| Firefox 129 | Rust 1.79.0 | ? | 2024 June 13 | 2024 July 4 | 2024 August 6 | Firefox 129 | Rust 1.79.0 | ? | 2024 June 13 | 2024 July 4 | 2024 August 6
| Firefox 130 | Rust 1.80.0 | ? | 2024 July 25 | 2024 August 1 | 2024 September 3 | Firefox 130 | Rust 1.80.0 | ? | 2024 July 25 | 2024 August 1 | 2024 September 3

View file

@ -11,7 +11,7 @@ import certifi
from mach.site import PythonVirtualenv from mach.site import PythonVirtualenv
from mach.util import get_state_dir from mach.util import get_state_dir
MINIMUM_RUST_VERSION = "1.76.0" MINIMUM_RUST_VERSION = "1.74.0"
def get_tools_dir(srcdir=False): def get_tools_dir(srcdir=False):

View file

@ -588,12 +588,6 @@ who = "Mike Hommey <mh+mozilla@glandium.org>"
criteria = "safe-to-deploy" criteria = "safe-to-deploy"
delta = "0.1.4 -> 0.1.5" delta = "0.1.4 -> 0.1.5"
[[audits.any_all_workaround]]
who = "Henri Sivonen <hsivonen@hsivonen.fi>"
criteria = "safe-to-deploy"
version = "0.1.0"
notes = "The little code that is in this crate I reviewed and modified from packed_simd (which has previously been vendored in full instead of just this small part)."
[[audits.anyhow]] [[audits.anyhow]]
who = "Mike Hommey <mh+mozilla@glandium.org>" who = "Mike Hommey <mh+mozilla@glandium.org>"
criteria = "safe-to-deploy" criteria = "safe-to-deploy"

View file

@ -190,13 +190,6 @@ user-id = 4484
user-login = "hsivonen" user-login = "hsivonen"
user-name = "Henri Sivonen" user-name = "Henri Sivonen"
[[publisher.encoding_rs]]
version = "0.8.34"
when = "2024-04-10"
user-id = 4484
user-login = "hsivonen"
user-name = "Henri Sivonen"
[[publisher.errno]] [[publisher.errno]]
version = "0.3.8" version = "0.3.8"
when = "2023-11-28" when = "2023-11-28"

View file

@ -11,12 +11,12 @@ job-defaults:
script: repack_rust.py script: repack_rust.py
toolchain-artifact: public/build/rustc.tar.zst toolchain-artifact: public/build/rustc.tar.zst
linux64-rust-1.76: linux64-rust-1.74:
treeherder: treeherder:
symbol: TL(rust-1.76) symbol: TL(rust-1.74)
run: run:
arguments: [ arguments: [
'--channel', '1.76.0', '--channel', '1.74.0',
'--host', 'x86_64-unknown-linux-gnu', '--host', 'x86_64-unknown-linux-gnu',
'--target', 'x86_64-unknown-linux-gnu', '--target', 'x86_64-unknown-linux-gnu',
'--target', 'i686-unknown-linux-gnu', '--target', 'i686-unknown-linux-gnu',

View file

@ -1 +0,0 @@
{"files":{"Cargo.toml":"c38be4bc8ef1c4df398b3eae589681d1bbb54a2577c71d592e12db0af757c472","LICENSE-APACHE":"a60eea817514531668d7e00765731449fe14d059d3249e0bc93b36de45f759f2","LICENSE-MIT":"6485b8ed310d3f0340bf1ad1f47645069ce4069dcc6bb46c7d5c6faf41de1fdb","README.md":"abebbd2620f915c70a873dd8221d99eadd8d017b7b194c22f3e0051f1fde193f","src/lib.rs":"e8a36b888f0f20accd4e7bfb2db9196e42b4be2d1014cb675981543d1372c610"},"package":"88fea40735f2cc320a5133ce772d39c571bd6c9b0d4c1a326926eecdd5af2e86"}

View file

@ -1,25 +0,0 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2021"
name = "any_all_workaround"
version = "0.1.0"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
description = "Workaround for bad LLVM codegen for boolean reductions on 32-bit ARM"
homepage = "https://docs.rs/any_all_workaround/"
documentation = "https://docs.rs/any_all_workaround/"
readme = "README.md"
license = "MIT OR Apache-2.0"
repository = "https://github.com/hsivonen/any_all_workaround"
[dependencies.cfg-if]
version = "1.0"

View file

@ -1,13 +0,0 @@
# any_all_workaround
This is a workaround for bad codegen ([Rust bug](https://github.com/rust-lang/portable-simd/issues/146), [LLVM bug](https://github.com/llvm/llvm-project/issues/50466)) for the `any()` and `all()` reductions for NEON-backed SIMD vectors on 32-bit ARM. On other platforms these delegate to `any()` and `all()` in `core::simd`.
The plan is to abandon this crate once the LLVM bug is fixed or `core::simd` works around the LLVM bug.
The code is forked from the [`packed_simd` crate](https://raw.githubusercontent.com/hsivonen/packed_simd/d938e39bee9bc5c222f5f2f2a0df9e53b5ce36ae/src/codegen/reductions/mask/arm.rs).
This crate requires Nightly Rust as it depends on the `portable_simd` feature.
# License
`MIT OR Apache-2.0`, since that's how `packed_simd` is licensed.

View file

@ -1,99 +0,0 @@
// This code began as a fork of
// https://raw.githubusercontent.com/rust-lang/packed_simd/d938e39bee9bc5c222f5f2f2a0df9e53b5ce36ae/src/codegen/reductions/mask/arm.rs
// which didn't have a license header on the file, but Cargo.toml said "MIT OR Apache-2.0".
// See LICENSE-MIT and LICENSE-APACHE.
#![no_std]
#![feature(portable_simd)]
#![cfg_attr(
all(
target_arch = "arm",
target_endian = "little",
target_feature = "neon",
target_feature = "v7"
),
feature(stdarch_arm_neon_intrinsics)
)]
use cfg_if::cfg_if;
use core::simd::mask16x8;
use core::simd::mask32x4;
use core::simd::mask8x16;
// Pick the definition of `arm_128_v7_neon_impl!` at compile time: an
// intrinsic-based version for little-endian 32-bit ARM with the `neon` and
// `v7` target features, and a plain delegation to the mask's own `all()` /
// `any()` methods on every other target.
cfg_if! {
if #[cfg(all(target_arch = "arm", target_endian = "little", target_feature = "neon", target_feature = "v7"))] {
// Mask types with half as many lanes; only the ARM code path needs them.
use core::simd::mask8x8;
use core::simd::mask16x4;
use core::simd::mask32x2;
// Generates two public functions, `$all` and `$any`, that reduce a mask of
// type `$id` to a single `bool`.
//
// * `$all` / `$any` - names of the generated functions.
// * `$id`           - the full-width mask type taken as the argument.
// * `$half`         - the mask type holding one half of `$id`.
// * `$vpmin`/`$vpmax` - names of the `core::arch::arm` intrinsics used to
//   combine the two halves before the final reduction.
macro_rules! arm_128_v7_neon_impl {
($all:ident, $any:ident, $id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
#[inline]
pub fn $all(s: $id) -> bool {
use core::arch::arm::$vpmin;
use core::mem::transmute;
unsafe {
// Reinterpret the full mask as a pair of `$half` values.
// NOTE(review): this assumes `($half, $half)` is laid out exactly
// like `$id` -- confirm, as tuple layout is not otherwise spelled
// out here.
union U {
halves: ($half, $half),
vec: $id,
}
let halves = U { vec: s }.halves;
// Fold the two halves together with `$vpmin`, then let the
// half-width mask's own `all()` produce the final answer.
let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));
h.all()
}
}
#[inline]
pub fn $any(s: $id) -> bool {
use core::arch::arm::$vpmax;
use core::mem::transmute;
unsafe {
// Same half-splitting trick as in `$all` (same layout assumption).
union U {
halves: ($half, $half),
vec: $id,
}
let halves = U { vec: s }.halves;
// Fold the two halves together with `$vpmax`, then let the
// half-width mask's own `any()` produce the final answer.
let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));
h.any()
}
}
}
}
} else {
// Fallback for all other targets: same macro signature so the call sites
// are identical, but `$half`, `$vpmin` and `$vpmax` are ignored and the
// generated functions simply forward to `s.all()` / `s.any()`.
macro_rules! arm_128_v7_neon_impl {
($all:ident, $any:ident, $id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
#[inline(always)]
pub fn $all(s: $id) -> bool {
s.all()
}
#[inline(always)]
pub fn $any(s: $id) -> bool {
s.any()
}
}
}
}
}
// Defines `all_mask8x16` and `any_mask8x16` for `mask8x16`, using `mask8x8`
// as the half type and the `vpmin_u8` / `vpmax_u8` intrinsics.
arm_128_v7_neon_impl!(
all_mask8x16,
any_mask8x16,
mask8x16,
mask8x8,
vpmin_u8,
vpmax_u8
);
// Defines `all_mask16x8` and `any_mask16x8` for `mask16x8`, using `mask16x4`
// as the half type and the `vpmin_u16` / `vpmax_u16` intrinsics.
arm_128_v7_neon_impl!(
all_mask16x8,
any_mask16x8,
mask16x8,
mask16x4,
vpmin_u16,
vpmax_u16
);
// Defines `all_mask32x4` and `any_mask32x4` for `mask32x4`, using `mask32x2`
// as the half type and the `vpmin_u32` / `vpmax_u32` intrinsics.
arm_128_v7_neon_impl!(
all_mask32x4,
any_mask32x4,
mask32x4,
mask32x2,
vpmin_u32,
vpmax_u32
);

File diff suppressed because one or more lines are too long

View file

@ -11,9 +11,8 @@
[package] [package]
edition = "2018" edition = "2018"
rust-version = "1.36"
name = "encoding_rs" name = "encoding_rs"
version = "0.8.34" version = "0.8.33"
authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"] authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
description = "A Gecko-oriented implementation of the Encoding Standard" description = "A Gecko-oriented implementation of the Encoding Standard"
homepage = "https://docs.rs/encoding_rs/" homepage = "https://docs.rs/encoding_rs/"
@ -37,13 +36,13 @@ repository = "https://github.com/hsivonen/encoding_rs"
[profile.release] [profile.release]
lto = true lto = true
[dependencies.any_all_workaround]
version = "0.1.0"
optional = true
[dependencies.cfg-if] [dependencies.cfg-if]
version = "1.0" version = "1.0"
[dependencies.packed_simd]
version = "0.3.9"
optional = true
[dependencies.serde] [dependencies.serde]
version = "1.0" version = "1.0"
optional = true optional = true
@ -75,4 +74,10 @@ fast-legacy-encode = [
less-slow-big5-hanzi-encode = [] less-slow-big5-hanzi-encode = []
less-slow-gb-hanzi-encode = [] less-slow-gb-hanzi-encode = []
less-slow-kanji-encode = [] less-slow-kanji-encode = []
simd-accel = ["any_all_workaround"] simd-accel = [
"packed_simd",
"packed_simd/into_bits",
]
[badges.travis-ci]
repository = "hsivonen/encoding_rs"

View file

@ -167,15 +167,13 @@ There are currently these optional cargo features:
### `simd-accel` ### `simd-accel`
Enables SIMD acceleration using the nightly-dependent `portable_simd` standard Enables SIMD acceleration using the nightly-dependent `packed_simd` crate.
library feature.
This is an opt-in feature, because enabling this feature _opts out_ of Rust's This is an opt-in feature, because enabling this feature _opts out_ of Rust's
guarantees of future compilers compiling old code (aka. "stability story"). guarantees of future compilers compiling old code (aka. "stability story").
Currently, this has not been tested to be an improvement except for these Currently, this has not been tested to be an improvement except for these
targets and enabling the `simd-accel` feature is expected to break the build targets:
on other targets:
* x86_64 * x86_64
* i686 * i686
@ -187,6 +185,22 @@ above, and you are prepared _to have to revise your configuration when updating
Rust_, you should enable this feature. Otherwise, please _do not_ enable this Rust_, you should enable this feature. Otherwise, please _do not_ enable this
feature. feature.
_Note!_ If you are compiling for a target that does not have 128-bit SIMD
enabled as part of the target definition and you are enabling 128-bit SIMD
using `-C target_feature`, you need to enable the `core_arch` Cargo feature
for `packed_simd` to compile a crates.io snapshot of `core_arch` instead of
using the standard-library copy of `core::arch`, because the `core::arch`
module of the pre-compiled standard library has been compiled with the
assumption that the CPU doesn't have 128-bit SIMD. At present this applies
mainly to 32-bit ARM targets whose first component does not include the
substring `neon`.
The encoding_rs side of things has not been properly set up for POWER,
PowerPC, MIPS, etc., SIMD at this time, so even if you were to follow
the advice from the previous paragraph, you probably shouldn't use
the `simd-accel` option on the less mainstream architectures at this
time.
Used by Firefox. Used by Firefox.
### `serde` ### `serde`
@ -367,9 +381,8 @@ as semver-breaking, because this crate depends on `cfg-if`, which doesn't
appear to treat MSRV changes as semver-breaking, so it would be useless for appear to treat MSRV changes as semver-breaking, so it would be useless for
this crate to treat MSRV changes as semver-breaking. this crate to treat MSRV changes as semver-breaking.
As of 2024-04-04, MSRV appears to be Rust 1.36.0 for using the crate and As of 2021-02-04, MSRV appears to be Rust 1.36.0 for using the crate and
1.42.0 for doc tests to pass without errors about the global allocator. 1.42.0 for doc tests to pass without errors about the global allocator.
With the `simd-accel` feature, the MSRV is even higher.
## Compatibility with rust-encoding ## Compatibility with rust-encoding
@ -433,17 +446,10 @@ To regenerate the generated code:
- [x] Add actually fast CJK encode options. - [x] Add actually fast CJK encode options.
- [ ] ~Investigate [Bob Steagall's lookup table acceleration for UTF-8](https://github.com/BobSteagall/CppNow2018/blob/master/FastConversionFromUTF-8/Fast%20Conversion%20From%20UTF-8%20with%20C%2B%2B%2C%20DFAs%2C%20and%20SSE%20Intrinsics%20-%20Bob%20Steagall%20-%20C%2B%2BNow%202018.pdf).~ - [ ] ~Investigate [Bob Steagall's lookup table acceleration for UTF-8](https://github.com/BobSteagall/CppNow2018/blob/master/FastConversionFromUTF-8/Fast%20Conversion%20From%20UTF-8%20with%20C%2B%2B%2C%20DFAs%2C%20and%20SSE%20Intrinsics%20-%20Bob%20Steagall%20-%20C%2B%2BNow%202018.pdf).~
- [x] Provide a build mode that works without `alloc` (with lesser API surface). - [x] Provide a build mode that works without `alloc` (with lesser API surface).
- [x] Migrate to `std::simd` ~once it is stable and declare 1.0.~ - [ ] Migrate to `std::simd` once it is stable and declare 1.0.
- [ ] Migrate `unsafe` slice access by larger types than `u8`/`u16` to `align_to`.
## Release Notes ## Release Notes
### 0.8.34
* Use the `portable_simd` nightly feature of the standard library instead of the `packed_simd` crate. Only affects the `simd-accel` optional nightly feature.
* Internal documentation improvements and minor code improvements around `unsafe`.
* Added `rust-version` to `Cargo.toml`.
### 0.8.33 ### 0.8.33
* Use `packed_simd` instead of `packed_simd_2` again now that updates are back under the `packed_simd` name. Only affects the `simd-accel` optional nightly feature. * Use `packed_simd` instead of `packed_simd_2` again now that updates are back under the `packed_simd` name. Only affects the `simd-accel` optional nightly feature.

File diff suppressed because it is too large Load diff

View file

@ -34,7 +34,7 @@ use crate::simd_funcs::*;
all(target_endian = "little", target_feature = "neon") all(target_endian = "little", target_feature = "neon")
) )
))] ))]
use core::simd::u16x8; use packed_simd::u16x8;
use super::DecoderResult; use super::DecoderResult;
use super::EncoderResult; use super::EncoderResult;
@ -90,23 +90,19 @@ impl Endian for LittleEndian {
#[derive(Debug, Copy, Clone)] #[derive(Debug, Copy, Clone)]
struct UnalignedU16Slice { struct UnalignedU16Slice {
// Safety invariant: ptr must be valid for reading 2*len bytes
ptr: *const u8, ptr: *const u8,
len: usize, len: usize,
} }
impl UnalignedU16Slice { impl UnalignedU16Slice {
/// Safety: ptr must be valid for reading 2*len bytes
#[inline(always)] #[inline(always)]
pub unsafe fn new(ptr: *const u8, len: usize) -> UnalignedU16Slice { pub unsafe fn new(ptr: *const u8, len: usize) -> UnalignedU16Slice {
// Safety: field invariant passed up to caller here
UnalignedU16Slice { ptr, len } UnalignedU16Slice { ptr, len }
} }
#[inline(always)] #[inline(always)]
pub fn trim_last(&mut self) { pub fn trim_last(&mut self) {
assert!(self.len > 0); assert!(self.len > 0);
// Safety: invariant upheld here: a slice is still valid with a shorter len
self.len -= 1; self.len -= 1;
} }
@ -117,9 +113,7 @@ impl UnalignedU16Slice {
assert!(i < self.len); assert!(i < self.len);
unsafe { unsafe {
let mut u: MaybeUninit<u16> = MaybeUninit::uninit(); let mut u: MaybeUninit<u16> = MaybeUninit::uninit();
// Safety: i is at most len - 1, which works here
::core::ptr::copy_nonoverlapping(self.ptr.add(i * 2), u.as_mut_ptr() as *mut u8, 2); ::core::ptr::copy_nonoverlapping(self.ptr.add(i * 2), u.as_mut_ptr() as *mut u8, 2);
// Safety: valid read above lets us do this
u.assume_init() u.assume_init()
} }
} }
@ -127,13 +121,8 @@ impl UnalignedU16Slice {
#[cfg(feature = "simd-accel")] #[cfg(feature = "simd-accel")]
#[inline(always)] #[inline(always)]
pub fn simd_at(&self, i: usize) -> u16x8 { pub fn simd_at(&self, i: usize) -> u16x8 {
// Safety: i/len are on the scale of u16s, each one corresponds to 2 u8s
assert!(i + SIMD_STRIDE_SIZE / 2 <= self.len); assert!(i + SIMD_STRIDE_SIZE / 2 <= self.len);
let byte_index = i * 2; let byte_index = i * 2;
// Safety: load16_unaligned needs SIMD_STRIDE_SIZE=16 u8 elements to read,
// or 16/2 = 8 u16 elements to read.
// We have checked that we have at least that many above.
unsafe { to_u16_lanes(load16_unaligned(self.ptr.add(byte_index))) } unsafe { to_u16_lanes(load16_unaligned(self.ptr.add(byte_index))) }
} }
@ -147,7 +136,6 @@ impl UnalignedU16Slice {
// XXX the return value should be restricted not to // XXX the return value should be restricted not to
// outlive self. // outlive self.
assert!(from <= self.len); assert!(from <= self.len);
// Safety: This upholds the same invariant: `from` is in bounds and we're returning a shorter slice
unsafe { UnalignedU16Slice::new(self.ptr.add(from * 2), self.len - from) } unsafe { UnalignedU16Slice::new(self.ptr.add(from * 2), self.len - from) }
} }
@ -156,8 +144,6 @@ impl UnalignedU16Slice {
pub fn copy_bmp_to<E: Endian>(&self, other: &mut [u16]) -> Option<(u16, usize)> { pub fn copy_bmp_to<E: Endian>(&self, other: &mut [u16]) -> Option<(u16, usize)> {
assert!(self.len <= other.len()); assert!(self.len <= other.len());
let mut offset = 0; let mut offset = 0;
// Safety: SIMD_STRIDE_SIZE is measured in bytes, whereas len is in u16s. We check we can
// munch SIMD_STRIDE_SIZE / 2 u16s which means we can write SIMD_STRIDE_SIZE u8s
if SIMD_STRIDE_SIZE / 2 <= self.len { if SIMD_STRIDE_SIZE / 2 <= self.len {
let len_minus_stride = self.len - SIMD_STRIDE_SIZE / 2; let len_minus_stride = self.len - SIMD_STRIDE_SIZE / 2;
loop { loop {
@ -165,7 +151,6 @@ impl UnalignedU16Slice {
if E::OPPOSITE_ENDIAN { if E::OPPOSITE_ENDIAN {
simd = simd_byte_swap(simd); simd = simd_byte_swap(simd);
} }
// Safety: we have enough space on the other side to write this
unsafe { unsafe {
store8_unaligned(other.as_mut_ptr().add(offset), simd); store8_unaligned(other.as_mut_ptr().add(offset), simd);
} }
@ -173,7 +158,6 @@ impl UnalignedU16Slice {
break; break;
} }
offset += SIMD_STRIDE_SIZE / 2; offset += SIMD_STRIDE_SIZE / 2;
// Safety: This ensures we still have space for writing SIMD_STRIDE_SIZE u8s
if offset > len_minus_stride { if offset > len_minus_stride {
break; break;
} }
@ -252,7 +236,6 @@ fn copy_unaligned_basic_latin_to_ascii<E: Endian>(
) -> CopyAsciiResult<usize, (u16, usize)> { ) -> CopyAsciiResult<usize, (u16, usize)> {
let len = ::core::cmp::min(src.len(), dst.len()); let len = ::core::cmp::min(src.len(), dst.len());
let mut offset = 0; let mut offset = 0;
// Safety: This check ensures we are able to read/write at least SIMD_STRIDE_SIZE elements
if SIMD_STRIDE_SIZE <= len { if SIMD_STRIDE_SIZE <= len {
let len_minus_stride = len - SIMD_STRIDE_SIZE; let len_minus_stride = len - SIMD_STRIDE_SIZE;
loop { loop {
@ -266,13 +249,10 @@ fn copy_unaligned_basic_latin_to_ascii<E: Endian>(
break; break;
} }
let packed = simd_pack(first, second); let packed = simd_pack(first, second);
// Safety: We are able to write SIMD_STRIDE_SIZE elements in this iteration
unsafe { unsafe {
store16_unaligned(dst.as_mut_ptr().add(offset), packed); store16_unaligned(dst.as_mut_ptr().add(offset), packed);
} }
offset += SIMD_STRIDE_SIZE; offset += SIMD_STRIDE_SIZE;
// Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which ensures that we can write at least SIMD_STRIDE_SIZE elements
// in the next iteration
if offset > len_minus_stride { if offset > len_minus_stride {
break; break;
} }
@ -657,7 +637,7 @@ impl<'a> Utf16Destination<'a> {
self.write_code_unit((0xDC00 + (astral & 0x3FF)) as u16); self.write_code_unit((0xDC00 + (astral & 0x3FF)) as u16);
} }
#[inline(always)] #[inline(always)]
fn write_surrogate_pair(&mut self, high: u16, low: u16) { pub fn write_surrogate_pair(&mut self, high: u16, low: u16) {
self.write_code_unit(high); self.write_code_unit(high);
self.write_code_unit(low); self.write_code_unit(low);
} }
@ -666,7 +646,6 @@ impl<'a> Utf16Destination<'a> {
self.write_bmp_excl_ascii(combined); self.write_bmp_excl_ascii(combined);
self.write_bmp_excl_ascii(combining); self.write_bmp_excl_ascii(combining);
} }
// Safety-usable invariant: CopyAsciiResult::GoOn will only contain bytes >=0x80
#[inline(always)] #[inline(always)]
pub fn copy_ascii_from_check_space_bmp<'b>( pub fn copy_ascii_from_check_space_bmp<'b>(
&'b mut self, &'b mut self,
@ -680,8 +659,6 @@ impl<'a> Utf16Destination<'a> {
} else { } else {
(DecoderResult::InputEmpty, src_remaining.len()) (DecoderResult::InputEmpty, src_remaining.len())
}; };
// Safety: This function is documented as needing valid pointers for src/dest and len, which
// is true since we've passed the minimum length of the two
match unsafe { match unsafe {
ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
} { } {
@ -690,20 +667,16 @@ impl<'a> Utf16Destination<'a> {
self.pos += length; self.pos += length;
return CopyAsciiResult::Stop((pending, source.pos, self.pos)); return CopyAsciiResult::Stop((pending, source.pos, self.pos));
} }
// Safety: the function is documented as returning bytes >=0x80 in the Some
Some((non_ascii, consumed)) => { Some((non_ascii, consumed)) => {
source.pos += consumed; source.pos += consumed;
self.pos += consumed; self.pos += consumed;
source.pos += 1; // +1 for non_ascii source.pos += 1; // +1 for non_ascii
// Safety: non-ascii bubbled out here
non_ascii non_ascii
} }
} }
}; };
// Safety: non-ascii returned here
CopyAsciiResult::GoOn((non_ascii_ret, Utf16BmpHandle::new(self))) CopyAsciiResult::GoOn((non_ascii_ret, Utf16BmpHandle::new(self)))
} }
// Safety-usable invariant: CopyAsciiResult::GoOn will only contain bytes >=0x80
#[inline(always)] #[inline(always)]
pub fn copy_ascii_from_check_space_astral<'b>( pub fn copy_ascii_from_check_space_astral<'b>(
&'b mut self, &'b mut self,
@ -718,8 +691,6 @@ impl<'a> Utf16Destination<'a> {
} else { } else {
(DecoderResult::InputEmpty, src_remaining.len()) (DecoderResult::InputEmpty, src_remaining.len())
}; };
// Safety: This function is documented as needing valid pointers for src/dest and len, which
// is true since we've passed the minimum length of the two
match unsafe { match unsafe {
ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
} { } {
@ -728,13 +699,11 @@ impl<'a> Utf16Destination<'a> {
self.pos += length; self.pos += length;
return CopyAsciiResult::Stop((pending, source.pos, self.pos)); return CopyAsciiResult::Stop((pending, source.pos, self.pos));
} }
// Safety: the function is documented as returning bytes >=0x80 in the Some
Some((non_ascii, consumed)) => { Some((non_ascii, consumed)) => {
source.pos += consumed; source.pos += consumed;
self.pos += consumed; self.pos += consumed;
if self.pos + 1 < dst_len { if self.pos + 1 < dst_len {
source.pos += 1; // +1 for non_ascii source.pos += 1; // +1 for non_ascii
// Safety: non-ascii bubbled out here
non_ascii non_ascii
} else { } else {
return CopyAsciiResult::Stop(( return CopyAsciiResult::Stop((
@ -746,7 +715,6 @@ impl<'a> Utf16Destination<'a> {
} }
} }
}; };
// Safety: non-ascii returned here
CopyAsciiResult::GoOn((non_ascii_ret, Utf16AstralHandle::new(self))) CopyAsciiResult::GoOn((non_ascii_ret, Utf16AstralHandle::new(self)))
} }
#[inline(always)] #[inline(always)]

View file

@ -689,7 +689,7 @@
//! for discussion about the UTF-16 family. //! for discussion about the UTF-16 family.
#![no_std] #![no_std]
#![cfg_attr(feature = "simd-accel", feature(core_intrinsics, portable_simd))] #![cfg_attr(feature = "simd-accel", feature(core_intrinsics))]
#[cfg(feature = "alloc")] #[cfg(feature = "alloc")]
#[cfg_attr(test, macro_use)] #[cfg_attr(test, macro_use)]
@ -699,6 +699,17 @@ extern crate core;
#[macro_use] #[macro_use]
extern crate cfg_if; extern crate cfg_if;
#[cfg(all(
feature = "simd-accel",
any(
target_feature = "sse2",
all(target_endian = "little", target_arch = "aarch64"),
all(target_endian = "little", target_feature = "neon")
)
))]
#[macro_use(shuffle)]
extern crate packed_simd;
#[cfg(feature = "serde")] #[cfg(feature = "serde")]
extern crate serde; extern crate serde;

View file

@ -116,11 +116,6 @@ macro_rules! by_unit_check_alu {
} }
let len_minus_stride = len - ALU_ALIGNMENT / unit_size; let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len { if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
// Safety: the above check lets us perform 4 consecutive reads of
// length ALU_ALIGNMENT / unit_size. ALU_ALIGNMENT is the size of usize, and unit_size
// is the size of the `src` pointer, so this is equal to performing four usize reads.
//
// This invariant is upheld on all loop iterations
let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size)); let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
loop { loop {
let unroll_accu = unsafe { *(src.add(offset) as *const usize) } let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
@ -139,14 +134,12 @@ macro_rules! by_unit_check_alu {
return false; return false;
} }
offset += 4 * (ALU_ALIGNMENT / unit_size); offset += 4 * (ALU_ALIGNMENT / unit_size);
// Safety: this check lets us continue to perform the 4 reads earlier
if offset > len_minus_unroll { if offset > len_minus_unroll {
break; break;
} }
} }
} }
while offset <= len_minus_stride { while offset <= len_minus_stride {
// Safety: the above check lets us perform one usize read.
accu |= unsafe { *(src.add(offset) as *const usize) }; accu |= unsafe { *(src.add(offset) as *const usize) };
offset += ALU_ALIGNMENT / unit_size; offset += ALU_ALIGNMENT / unit_size;
} }
@ -196,11 +189,6 @@ macro_rules! by_unit_check_simd {
} }
let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size; let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len { if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
// Safety: the above check lets us perform 4 consecutive reads of
// length SIMD_STRIDE_SIZE / unit_size. SIMD_STRIDE_SIZE is the size of $simd_ty, and unit_size
// is the size of the `src` pointer, so this is equal to performing four $simd_ty reads.
//
// This invariant is upheld on all loop iterations
let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size)); let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
loop { loop {
let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) } let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
@ -220,7 +208,6 @@ macro_rules! by_unit_check_simd {
return false; return false;
} }
offset += 4 * (SIMD_STRIDE_SIZE / unit_size); offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
// Safety: this check lets us continue to perform the 4 reads earlier
if offset > len_minus_unroll { if offset > len_minus_unroll {
break; break;
} }
@ -228,7 +215,6 @@ macro_rules! by_unit_check_simd {
} }
let mut simd_accu = $splat; let mut simd_accu = $splat;
while offset <= len_minus_stride { while offset <= len_minus_stride {
// Safety: the above check lets us perform one $simd_ty read.
simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) }; simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
offset += SIMD_STRIDE_SIZE / unit_size; offset += SIMD_STRIDE_SIZE / unit_size;
} }
@ -248,8 +234,8 @@ macro_rules! by_unit_check_simd {
cfg_if! { cfg_if! {
if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] { if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
use crate::simd_funcs::*; use crate::simd_funcs::*;
use core::simd::u8x16; use packed_simd::u8x16;
use core::simd::u16x8; use packed_simd::u16x8;
const SIMD_ALIGNMENT: usize = 16; const SIMD_ALIGNMENT: usize = 16;

View file

@ -7,74 +7,55 @@
// option. This file may not be copied, modified, or distributed // option. This file may not be copied, modified, or distributed
// except according to those terms. // except according to those terms.
use any_all_workaround::all_mask16x8; use packed_simd::u16x8;
use any_all_workaround::all_mask8x16; use packed_simd::u8x16;
use any_all_workaround::any_mask16x8; use packed_simd::IntoBits;
use any_all_workaround::any_mask8x16;
use core::simd::cmp::SimdPartialEq;
use core::simd::cmp::SimdPartialOrd;
use core::simd::mask16x8;
use core::simd::mask8x16;
use core::simd::simd_swizzle;
use core::simd::u16x8;
use core::simd::u8x16;
use core::simd::ToBytes;
// TODO: Migrate unaligned access to stdlib code if/when the RFC // TODO: Migrate unaligned access to stdlib code if/when the RFC
// https://github.com/rust-lang/rfcs/pull/1725 is implemented. // https://github.com/rust-lang/rfcs/pull/1725 is implemented.
/// Safety invariant: ptr must be valid for an unaligned read of 16 bytes
#[inline(always)] #[inline(always)]
pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 { pub unsafe fn load16_unaligned(ptr: *const u8) -> u8x16 {
let mut simd = ::core::mem::MaybeUninit::<u8x16>::uninit(); let mut simd = ::core::mem::uninitialized();
::core::ptr::copy_nonoverlapping(ptr, simd.as_mut_ptr() as *mut u8, 16); ::core::ptr::copy_nonoverlapping(ptr, &mut simd as *mut u8x16 as *mut u8, 16);
// Safety: copied 16 bytes of initialized memory into this, it is now initialized simd
simd.assume_init()
} }
/// Safety invariant: ptr must be valid for an aligned-for-u8x16 read of 16 bytes
#[allow(dead_code)] #[allow(dead_code)]
#[inline(always)] #[inline(always)]
pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 { pub unsafe fn load16_aligned(ptr: *const u8) -> u8x16 {
*(ptr as *const u8x16) *(ptr as *const u8x16)
} }
/// Safety invariant: ptr must be valid for an unaligned store of 16 bytes
#[inline(always)] #[inline(always)]
pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) { pub unsafe fn store16_unaligned(ptr: *mut u8, s: u8x16) {
::core::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16); ::core::ptr::copy_nonoverlapping(&s as *const u8x16 as *const u8, ptr, 16);
} }
/// Safety invariant: ptr must be valid for an aligned-for-u8x16 store of 16 bytes
#[allow(dead_code)] #[allow(dead_code)]
#[inline(always)] #[inline(always)]
pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) { pub unsafe fn store16_aligned(ptr: *mut u8, s: u8x16) {
*(ptr as *mut u8x16) = s; *(ptr as *mut u8x16) = s;
} }
/// Safety invariant: ptr must be valid for an unaligned read of 16 bytes
#[inline(always)] #[inline(always)]
pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 { pub unsafe fn load8_unaligned(ptr: *const u16) -> u16x8 {
let mut simd = ::core::mem::MaybeUninit::<u16x8>::uninit(); let mut simd = ::core::mem::uninitialized();
::core::ptr::copy_nonoverlapping(ptr as *const u8, simd.as_mut_ptr() as *mut u8, 16); ::core::ptr::copy_nonoverlapping(ptr as *const u8, &mut simd as *mut u16x8 as *mut u8, 16);
// Safety: copied 16 bytes of initialized memory into this, it is now initialized simd
simd.assume_init()
} }
/// Safety invariant: ptr must be valid for an aligned-for-u16x8 read of 16 bytes
#[allow(dead_code)] #[allow(dead_code)]
#[inline(always)] #[inline(always)]
pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 { pub unsafe fn load8_aligned(ptr: *const u16) -> u16x8 {
*(ptr as *const u16x8) *(ptr as *const u16x8)
} }
/// Safety invariant: ptr must be valid for an unaligned store of 16 bytes
#[inline(always)] #[inline(always)]
pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) { pub unsafe fn store8_unaligned(ptr: *mut u16, s: u16x8) {
::core::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16); ::core::ptr::copy_nonoverlapping(&s as *const u16x8 as *const u8, ptr as *mut u8, 16);
} }
/// Safety invariant: ptr must be valid for an aligned-for-u16x8 store of 16 bytes
#[allow(dead_code)] #[allow(dead_code)]
#[inline(always)] #[inline(always)]
pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) { pub unsafe fn store8_aligned(ptr: *mut u16, s: u16x8) {
@ -119,7 +100,7 @@ pub fn simd_byte_swap(s: u16x8) -> u16x8 {
#[inline(always)] #[inline(always)]
pub fn to_u16_lanes(s: u8x16) -> u16x8 { pub fn to_u16_lanes(s: u8x16) -> u16x8 {
u16x8::from_ne_bytes(s) s.into_bits()
} }
cfg_if! { cfg_if! {
@ -127,11 +108,10 @@ cfg_if! {
// Expose low-level mask instead of higher-level conclusion, // Expose low-level mask instead of higher-level conclusion,
// because the non-ASCII case would perform less well otherwise. // because the non-ASCII case would perform less well otherwise.
// Safety-usable invariant: This returned value is whether each high bit is set
#[inline(always)] #[inline(always)]
pub fn mask_ascii(s: u8x16) -> i32 { pub fn mask_ascii(s: u8x16) -> i32 {
unsafe { unsafe {
_mm_movemask_epi8(s.into()) _mm_movemask_epi8(s.into_bits())
} }
} }
@ -145,16 +125,14 @@ cfg_if! {
#[inline(always)] #[inline(always)]
pub fn simd_is_ascii(s: u8x16) -> bool { pub fn simd_is_ascii(s: u8x16) -> bool {
unsafe { unsafe {
// Safety: We have cfg()d the correct platform _mm_movemask_epi8(s.into_bits()) == 0
_mm_movemask_epi8(s.into()) == 0
} }
} }
} else if #[cfg(target_arch = "aarch64")]{ } else if #[cfg(target_arch = "aarch64")]{
#[inline(always)] #[inline(always)]
pub fn simd_is_ascii(s: u8x16) -> bool { pub fn simd_is_ascii(s: u8x16) -> bool {
unsafe { unsafe {
// Safety: We have cfg()d the correct platform vmaxvq_u8(s.into_bits()) < 0x80
vmaxvq_u8(s.into()) < 0x80
} }
} }
} else { } else {
@ -163,7 +141,7 @@ cfg_if! {
// This optimizes better on ARM than // This optimizes better on ARM than
// the lt formulation. // the lt formulation.
let highest_ascii = u8x16::splat(0x7F); let highest_ascii = u8x16::splat(0x7F);
!any_mask8x16(s.simd_gt(highest_ascii)) !s.gt(highest_ascii).any()
} }
} }
} }
@ -176,21 +154,20 @@ cfg_if! {
return true; return true;
} }
let above_str_latin1 = u8x16::splat(0xC4); let above_str_latin1 = u8x16::splat(0xC4);
s.simd_lt(above_str_latin1).all() s.lt(above_str_latin1).all()
} }
} else if #[cfg(target_arch = "aarch64")]{ } else if #[cfg(target_arch = "aarch64")]{
#[inline(always)] #[inline(always)]
pub fn simd_is_str_latin1(s: u8x16) -> bool { pub fn simd_is_str_latin1(s: u8x16) -> bool {
unsafe { unsafe {
// Safety: We have cfg()d the correct platform vmaxvq_u8(s.into_bits()) < 0xC4
vmaxvq_u8(s.into()) < 0xC4
} }
} }
} else { } else {
#[inline(always)] #[inline(always)]
pub fn simd_is_str_latin1(s: u8x16) -> bool { pub fn simd_is_str_latin1(s: u8x16) -> bool {
let above_str_latin1 = u8x16::splat(0xC4); let above_str_latin1 = u8x16::splat(0xC4);
all_mask8x16(s.simd_lt(above_str_latin1)) s.lt(above_str_latin1).all()
} }
} }
} }
@ -200,23 +177,21 @@ cfg_if! {
#[inline(always)] #[inline(always)]
pub fn simd_is_basic_latin(s: u16x8) -> bool { pub fn simd_is_basic_latin(s: u16x8) -> bool {
unsafe { unsafe {
// Safety: We have cfg()d the correct platform vmaxvq_u16(s.into_bits()) < 0x80
vmaxvq_u16(s.into()) < 0x80
} }
} }
#[inline(always)] #[inline(always)]
pub fn simd_is_latin1(s: u16x8) -> bool { pub fn simd_is_latin1(s: u16x8) -> bool {
unsafe { unsafe {
// Safety: We have cfg()d the correct platform vmaxvq_u16(s.into_bits()) < 0x100
vmaxvq_u16(s.into()) < 0x100
} }
} }
} else { } else {
#[inline(always)] #[inline(always)]
pub fn simd_is_basic_latin(s: u16x8) -> bool { pub fn simd_is_basic_latin(s: u16x8) -> bool {
let above_ascii = u16x8::splat(0x80); let above_ascii = u16x8::splat(0x80);
all_mask16x8(s.simd_lt(above_ascii)) s.lt(above_ascii).all()
} }
#[inline(always)] #[inline(always)]
@ -225,7 +200,7 @@ cfg_if! {
// seems faster in this case while the above // seems faster in this case while the above
// function is better the other way round... // function is better the other way round...
let highest_latin1 = u16x8::splat(0xFF); let highest_latin1 = u16x8::splat(0xFF);
!any_mask16x8(s.simd_gt(highest_latin1)) !s.gt(highest_latin1).any()
} }
} }
} }
@ -234,7 +209,7 @@ cfg_if! {
pub fn contains_surrogates(s: u16x8) -> bool { pub fn contains_surrogates(s: u16x8) -> bool {
let mask = u16x8::splat(0xF800); let mask = u16x8::splat(0xF800);
let surrogate_bits = u16x8::splat(0xD800); let surrogate_bits = u16x8::splat(0xD800);
any_mask16x8((s & mask).simd_eq(surrogate_bits)) (s & mask).eq(surrogate_bits).any()
} }
cfg_if! { cfg_if! {
@ -242,8 +217,7 @@ cfg_if! {
macro_rules! aarch64_return_false_if_below_hebrew { macro_rules! aarch64_return_false_if_below_hebrew {
($s:ident) => ({ ($s:ident) => ({
unsafe { unsafe {
// Safety: We have cfg()d the correct platform if vmaxvq_u16($s.into_bits()) < 0x0590 {
if vmaxvq_u16($s.into()) < 0x0590 {
return false; return false;
} }
} }
@ -260,7 +234,7 @@ cfg_if! {
macro_rules! non_aarch64_return_false_if_all { macro_rules! non_aarch64_return_false_if_all {
($s:ident) => ({ ($s:ident) => ({
if all_mask16x8($s) { if $s.all() {
return false; return false;
} }
}) })
@ -271,7 +245,7 @@ cfg_if! {
macro_rules! in_range16x8 { macro_rules! in_range16x8 {
($s:ident, $start:expr, $end:expr) => {{ ($s:ident, $start:expr, $end:expr) => {{
// SIMD sub is wrapping // SIMD sub is wrapping
($s - u16x8::splat($start)).simd_lt(u16x8::splat($end - $start)) ($s - u16x8::splat($start)).lt(u16x8::splat($end - $start))
}}; }};
} }
@ -285,44 +259,43 @@ pub fn is_u16x8_bidi(s: u16x8) -> bool {
aarch64_return_false_if_below_hebrew!(s); aarch64_return_false_if_below_hebrew!(s);
let below_hebrew = s.simd_lt(u16x8::splat(0x0590)); let below_hebrew = s.lt(u16x8::splat(0x0590));
non_aarch64_return_false_if_all!(below_hebrew); non_aarch64_return_false_if_all!(below_hebrew);
if all_mask16x8( if (below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802)).all() {
below_hebrew | in_range16x8!(s, 0x0900, 0x200F) | in_range16x8!(s, 0x2068, 0xD802),
) {
return false; return false;
} }
// Quick refutation failed. Let's do the full check. // Quick refutation failed. Let's do the full check.
any_mask16x8( (in_range16x8!(s, 0x0590, 0x0900)
(in_range16x8!(s, 0x0590, 0x0900) | in_range16x8!(s, 0xFB1D, 0xFE00)
| in_range16x8!(s, 0xFB1D, 0xFE00) | in_range16x8!(s, 0xFE70, 0xFEFF)
| in_range16x8!(s, 0xFE70, 0xFEFF) | in_range16x8!(s, 0xD802, 0xD804)
| in_range16x8!(s, 0xD802, 0xD804) | in_range16x8!(s, 0xD83A, 0xD83C)
| in_range16x8!(s, 0xD83A, 0xD83C) | s.eq(u16x8::splat(0x200F))
| s.simd_eq(u16x8::splat(0x200F)) | s.eq(u16x8::splat(0x202B))
| s.simd_eq(u16x8::splat(0x202B)) | s.eq(u16x8::splat(0x202E))
| s.simd_eq(u16x8::splat(0x202E)) | s.eq(u16x8::splat(0x2067)))
| s.simd_eq(u16x8::splat(0x2067))), .any()
)
} }
#[inline(always)] #[inline(always)]
pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) { pub fn simd_unpack(s: u8x16) -> (u16x8, u16x8) {
let first: u8x16 = simd_swizzle!( unsafe {
s, let first: u8x16 = shuffle!(
u8x16::splat(0), s,
[0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23] u8x16::splat(0),
); [0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23]
let second: u8x16 = simd_swizzle!( );
s, let second: u8x16 = shuffle!(
u8x16::splat(0), s,
[8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31] u8x16::splat(0),
); [8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31]
(u16x8::from_ne_bytes(first), u16x8::from_ne_bytes(second)) );
(first.into_bits(), second.into_bits())
}
} }
cfg_if! { cfg_if! {
@ -330,20 +303,21 @@ cfg_if! {
#[inline(always)] #[inline(always)]
pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 { pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
unsafe { unsafe {
// Safety: We have cfg()d the correct platform _mm_packus_epi16(a.into_bits(), b.into_bits()).into_bits()
_mm_packus_epi16(a.into(), b.into()).into()
} }
} }
} else { } else {
#[inline(always)] #[inline(always)]
pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 { pub fn simd_pack(a: u16x8, b: u16x8) -> u8x16 {
let first: u8x16 = a.to_ne_bytes(); unsafe {
let second: u8x16 = b.to_ne_bytes(); let first: u8x16 = a.into_bits();
simd_swizzle!( let second: u8x16 = b.into_bits();
first, shuffle!(
second, first,
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30] second,
) [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
)
}
} }
} }
} }

View file

@ -53,9 +53,6 @@ impl SingleByteDecoder {
// statically omit the bound check when accessing // statically omit the bound check when accessing
// `[u16; 128]` with an index // `[u16; 128]` with an index
// `non_ascii as usize - 0x80usize`. // `non_ascii as usize - 0x80usize`.
//
// Safety: `non_ascii` is a u8 byte >=0x80, from the invariants
// on Utf8Destination::copy_ascii_from_check_space_bmp()
let mapped = let mapped =
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) }; unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
// let mapped = self.table[non_ascii as usize - 0x80usize]; // let mapped = self.table[non_ascii as usize - 0x80usize];
@ -154,12 +151,9 @@ impl SingleByteDecoder {
} else { } else {
(DecoderResult::InputEmpty, src.len()) (DecoderResult::InputEmpty, src.len())
}; };
// Safety invariant: converted <= length. Quite often we have `converted < length`
// which will be separately marked.
let mut converted = 0usize; let mut converted = 0usize;
'outermost: loop { 'outermost: loop {
match unsafe { match unsafe {
// Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
ascii_to_basic_latin( ascii_to_basic_latin(
src.as_ptr().add(converted), src.as_ptr().add(converted),
dst.as_mut_ptr().add(converted), dst.as_mut_ptr().add(converted),
@ -170,12 +164,6 @@ impl SingleByteDecoder {
return (pending, length, length); return (pending, length, length);
} }
Some((mut non_ascii, consumed)) => { Some((mut non_ascii, consumed)) => {
// Safety invariant: `converted <= length` upheld, since this can only consume
// up to `length - converted` bytes.
//
// Furthermore, in this context,
// we can assume `converted < length` since this branch is only ever hit when
// ascii_to_basic_latin fails to consume the entire slice
converted += consumed; converted += consumed;
'middle: loop { 'middle: loop {
// `converted` doesn't count the reading of `non_ascii` yet. // `converted` doesn't count the reading of `non_ascii` yet.
@ -184,9 +172,6 @@ impl SingleByteDecoder {
// statically omit the bound check when accessing // statically omit the bound check when accessing
// `[u16; 128]` with an index // `[u16; 128]` with an index
// `non_ascii as usize - 0x80usize`. // `non_ascii as usize - 0x80usize`.
//
// Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
// the invariants of `ascii_to_basic_latin()`, and our table has enough space for that.
let mapped = let mapped =
unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) }; unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
// let mapped = self.table[non_ascii as usize - 0x80usize]; // let mapped = self.table[non_ascii as usize - 0x80usize];
@ -198,10 +183,9 @@ impl SingleByteDecoder {
); );
} }
unsafe { unsafe {
// Safety: As mentioned above, `converted < length` // The bound check has already been performed
*(dst.get_unchecked_mut(converted)) = mapped; *(dst.get_unchecked_mut(converted)) = mapped;
} }
// Safety: `converted <= length` upheld, since `converted < length` before this
converted += 1; converted += 1;
// Next, handle ASCII punctuation and non-ASCII without // Next, handle ASCII punctuation and non-ASCII without
// going back to ASCII acceleration. Non-ASCII scripts // going back to ASCII acceleration. Non-ASCII scripts
@ -214,10 +198,7 @@ impl SingleByteDecoder {
if converted == length { if converted == length {
return (pending, length, length); return (pending, length, length);
} }
// Safety: We are back to `converted < length` because of the == above
// and can perform this check.
let mut b = unsafe { *(src.get_unchecked(converted)) }; let mut b = unsafe { *(src.get_unchecked(converted)) };
// Safety: `converted < length` is upheld for this loop
'innermost: loop { 'innermost: loop {
if b > 127 { if b > 127 {
non_ascii = b; non_ascii = b;
@ -227,20 +208,15 @@ impl SingleByteDecoder {
// byte unconditionally instead of trying to unread it // byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride. // to make it part of the next SIMD stride.
unsafe { unsafe {
// Safety: `converted < length` is true for this loop
*(dst.get_unchecked_mut(converted)) = u16::from(b); *(dst.get_unchecked_mut(converted)) = u16::from(b);
} }
// Safety: We are now at `converted <= length`. We should *not* `continue`
// the loop without reverifying
converted += 1; converted += 1;
if b < 60 { if b < 60 {
// We've got punctuation // We've got punctuation
if converted == length { if converted == length {
return (pending, length, length); return (pending, length, length);
} }
// Safety: we're back to `converted <= length` because of the == above
b = unsafe { *(src.get_unchecked(converted)) }; b = unsafe { *(src.get_unchecked(converted)) };
// Safety: The loop continues as `converted < length`
continue 'innermost; continue 'innermost;
} }
// We've got markup or ASCII text // We've got markup or ASCII text
@ -258,8 +234,6 @@ impl SingleByteDecoder {
loop { loop {
if let Some((non_ascii, offset)) = validate_ascii(bytes) { if let Some((non_ascii, offset)) = validate_ascii(bytes) {
total += offset; total += offset;
// Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
// the invariants of `validate_ascii()`, and our table has enough space for that.
let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) }; let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
if mapped != u16::from(non_ascii) { if mapped != u16::from(non_ascii) {
return total; return total;
@ -410,12 +384,9 @@ impl SingleByteEncoder {
} else { } else {
(EncoderResult::InputEmpty, src.len()) (EncoderResult::InputEmpty, src.len())
}; };
// Safety invariant: converted <= length. Quite often we have `converted < length`
// which will be separately marked.
let mut converted = 0usize; let mut converted = 0usize;
'outermost: loop { 'outermost: loop {
match unsafe { match unsafe {
// Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
basic_latin_to_ascii( basic_latin_to_ascii(
src.as_ptr().add(converted), src.as_ptr().add(converted),
dst.as_mut_ptr().add(converted), dst.as_mut_ptr().add(converted),
@ -426,23 +397,15 @@ impl SingleByteEncoder {
return (pending, length, length); return (pending, length, length);
} }
Some((mut non_ascii, consumed)) => { Some((mut non_ascii, consumed)) => {
// Safety invariant: `converted <= length` upheld, since this can only consume
// up to `length - converted` bytes.
//
// Furthermore, in this context,
// we can assume `converted < length` since this branch is only ever hit when
// basic_latin_to_ascii fails to consume the entire slice
converted += consumed; converted += consumed;
'middle: loop { 'middle: loop {
// `converted` doesn't count the reading of `non_ascii` yet. // `converted` doesn't count the reading of `non_ascii` yet.
match self.encode_u16(non_ascii) { match self.encode_u16(non_ascii) {
Some(byte) => { Some(byte) => {
unsafe { unsafe {
// Safety: we're allowed this access since `converted < length`
*(dst.get_unchecked_mut(converted)) = byte; *(dst.get_unchecked_mut(converted)) = byte;
} }
converted += 1; converted += 1;
// `converted <= length` now
} }
None => { None => {
// At this point, we need to know if we // At this point, we need to know if we
@ -458,8 +421,6 @@ impl SingleByteEncoder {
converted, converted,
); );
} }
// Safety: converted < length from outside the match, and `converted + 1 != length`,
// So `converted + 1 < length` as well. We're in bounds
let second = let second =
u32::from(unsafe { *src.get_unchecked(converted + 1) }); u32::from(unsafe { *src.get_unchecked(converted + 1) });
if second & 0xFC00u32 != 0xDC00u32 { if second & 0xFC00u32 != 0xDC00u32 {
@ -471,18 +432,6 @@ impl SingleByteEncoder {
} }
// The next code unit is a low surrogate. // The next code unit is a low surrogate.
let astral: char = unsafe { let astral: char = unsafe {
// Safety: We can rely on non_ascii being 0xD800-0xDBFF since the high bits are 0xD800
// Then, (non_ascii << 10 - 0xD800 << 10) becomes between (0 to 0x3FF) << 10, which is between
// 0 and 0xFFC00. Adding the 0x10000 gives a range of 0x10000 to 0x10FC00. Subtracting the 0xDC00
// gives 0x2400 to 0x102000
// The second term is between 0xDC00 and 0xDFFF from the check above. This gives a maximum
// possible range of (0x2400 + 0xDC00) to (0x102000 + 0xDFFF) which is 0x10000 to 0x10FFFF.
// This is in range.
//
// From a Unicode principles perspective this can also be verified as we have checked that `non_ascii` is a high surrogate
// (0xD800..=0xDBFF), and that `second` is a low surrogate (`0xDC00..=0xDFFF`), and we are applying reverse of the UTF-16 transformation
// algorithm <https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF>, by applying the high surrogate - 0xD800 to the
// high ten bits, and the low surrogate - 0xDc00 to the low ten bits, and then adding 0x10000
::core::char::from_u32_unchecked( ::core::char::from_u32_unchecked(
(u32::from(non_ascii) << 10) + second (u32::from(non_ascii) << 10) + second
- (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32), - (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32),
@ -507,7 +456,6 @@ impl SingleByteEncoder {
converted + 1, // +1 `for non_ascii` converted + 1, // +1 `for non_ascii`
converted, converted,
); );
// Safety: This branch diverges, so no need to uphold invariants on `converted`
} }
} }
// Next, handle ASCII punctuation and non-ASCII without // Next, handle ASCII punctuation and non-ASCII without
@ -521,12 +469,8 @@ impl SingleByteEncoder {
if converted == length { if converted == length {
return (pending, length, length); return (pending, length, length);
} }
// Safety: we're back to `converted < length` due to the == above and can perform
// the unchecked read
let mut unit = unsafe { *(src.get_unchecked(converted)) }; let mut unit = unsafe { *(src.get_unchecked(converted)) };
'innermost: loop { 'innermost: loop {
// Safety: This loop always begins with `converted < length`, see
// the invariant outside and the comment on the continue below
if unit > 127 { if unit > 127 {
non_ascii = unit; non_ascii = unit;
continue 'middle; continue 'middle;
@ -535,25 +479,19 @@ impl SingleByteEncoder {
// byte unconditionally instead of trying to unread it // byte unconditionally instead of trying to unread it
// to make it part of the next SIMD stride. // to make it part of the next SIMD stride.
unsafe { unsafe {
// Safety: Can rely on converted < length
*(dst.get_unchecked_mut(converted)) = unit as u8; *(dst.get_unchecked_mut(converted)) = unit as u8;
} }
converted += 1; converted += 1;
// `converted <= length` here
if unit < 60 { if unit < 60 {
// We've got punctuation // We've got punctuation
if converted == length { if converted == length {
return (pending, length, length); return (pending, length, length);
} }
// Safety: `converted < length` due to the == above. The read is safe.
unit = unsafe { *(src.get_unchecked(converted)) }; unit = unsafe { *(src.get_unchecked(converted)) };
// Safety: This only happens if `converted < length`, maintaining it
continue 'innermost; continue 'innermost;
} }
// We've got markup or ASCII text // We've got markup or ASCII text
continue 'outermost; continue 'outermost;
// Safety: All other routes to here diverge so the continue is the only
// way to run the innermost loop.
} }
} }
} }

View file

@ -14,13 +14,12 @@ use crate::variant::*;
cfg_if! { cfg_if! {
if #[cfg(feature = "simd-accel")] { if #[cfg(feature = "simd-accel")] {
use simd_funcs::*; use simd_funcs::*;
use core::simd::u16x8; use packed_simd::u16x8;
use core::simd::cmp::SimdPartialOrd;
#[inline(always)] #[inline(always)]
fn shift_upper(unpacked: u16x8) -> u16x8 { fn shift_upper(unpacked: u16x8) -> u16x8 {
let highest_ascii = u16x8::splat(0x7F); let highest_ascii = u16x8::splat(0x7F);
unpacked + unpacked.simd_gt(highest_ascii).select(u16x8::splat(0xF700), u16x8::splat(0)) } unpacked + unpacked.gt(highest_ascii).select(u16x8::splat(0xF700), u16x8::splat(0)) }
} else { } else {
} }
} }
@ -117,15 +116,10 @@ impl UserDefinedDecoder {
let simd_iterations = length >> 4; let simd_iterations = length >> 4;
let src_ptr = src.as_ptr(); let src_ptr = src.as_ptr();
let dst_ptr = dst.as_mut_ptr(); let dst_ptr = dst.as_mut_ptr();
// Safety: This is `for i in 0..length / 16`
for i in 0..simd_iterations { for i in 0..simd_iterations {
// Safety: This is in bounds: length is the minimum valid length for both src/dst
// and i ranges to length/16, so multiplying by 16 will always be `< length` and can do
// a 16 byte read
let input = unsafe { load16_unaligned(src_ptr.add(i * 16)) }; let input = unsafe { load16_unaligned(src_ptr.add(i * 16)) };
let (first, second) = simd_unpack(input); let (first, second) = simd_unpack(input);
unsafe { unsafe {
// Safety: same as above, but this is two consecutive 8-byte reads
store8_unaligned(dst_ptr.add(i * 16), shift_upper(first)); store8_unaligned(dst_ptr.add(i * 16), shift_upper(first));
store8_unaligned(dst_ptr.add((i * 16) + 8), shift_upper(second)); store8_unaligned(dst_ptr.add((i * 16) + 8), shift_upper(second));
} }

File diff suppressed because one or more lines are too long

83
third_party/rust/packed_simd/Cargo.toml vendored Normal file
View file

@ -0,0 +1,83 @@
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.
[package]
edition = "2018"
name = "packed_simd"
version = "0.3.9"
build = "build.rs"
description = "Portable Packed SIMD vectors"
homepage = "https://github.com/rust-lang/packed_simd"
documentation = "https://docs.rs/crate/packed_simd/"
readme = "README.md"
keywords = [
"simd",
"vector",
"portability",
]
categories = [
"hardware-support",
"concurrency",
"no-std",
"data-structures",
]
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-lang/packed_simd"
[package.metadata.docs.rs]
features = ["into_bits"]
rustdoc-args = [
"--cfg",
"doc_cfg",
]
[dependencies.cfg-if]
version = "1.0.0"
[dependencies.core_arch]
version = "0.1.5"
optional = true
[dependencies.num-traits]
version = "0.2.14"
features = ["libm"]
default-features = false
[dev-dependencies.arrayvec]
version = "^0.5"
default-features = false
[dev-dependencies.paste]
version = "^1"
[features]
default = []
into_bits = []
libcore_neon = []
[target."cfg(target_arch = \"x86_64\")".dependencies.sleef-sys]
version = "0.1.2"
optional = true
[target.wasm32-unknown-unknown.dev-dependencies.wasm-bindgen]
version = "=0.2.87"
[target.wasm32-unknown-unknown.dev-dependencies.wasm-bindgen-test]
version = "=0.3.37"
[badges.is-it-maintained-issue-resolution]
repository = "rust-lang/packed_simd"
[badges.is-it-maintained-open-issues]
repository = "rust-lang/packed_simd"
[badges.maintenance]
status = "experimental"

144
third_party/rust/packed_simd/README.md vendored Normal file
View file

@ -0,0 +1,144 @@
# `Simd<[T; N]>`
## Implementation of [Rust RFC #2366: `std::simd`][rfc2366]
[![Latest Version]][crates.io] [![docs]][master_docs]
**WARNING**: this crate only supports the most recent nightly Rust toolchain
and will be superseded by [`#![feature(portable_simd)]`](https://github.com/rust-lang/portable-simd).
## Documentation
* [API docs (`master` branch)][master_docs]
* [Performance guide][perf_guide]
* [API docs (`docs.rs`)][docs.rs]
* [RFC2366 `std::simd`][rfc2366]: contains motivation, design rationale,
discussion, etc.
## Examples
Most of the examples come with both a scalar and a vectorized implementation.
* [`aobench`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/aobench)
* [`fannkuch_redux`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/fannkuch_redux)
* [`matrix inverse`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/matrix_inverse)
* [`mandelbrot`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/mandelbrot)
* [`n-body`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/nbody)
* [`options_pricing`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/options_pricing)
* [`spectral_norm`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/spectral_norm)
* [`triangle transform`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/triangle_xform)
* [`stencil`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/stencil)
* [`vector dot product`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/dot_product)
## Cargo features
* `into_bits` (default: disabled): enables `FromBits`/`IntoBits` trait
implementations for the vector types. These allow reinterpreting the bits of a
vector type as those of another vector type safely by just using the
`.into_bits()` method.
## Performance
The following [ISPC] examples are also part of `packed_simd`'s
[`examples/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/examples/)
directory, where `packed_simd`+[`rayon`][rayon] are used to emulate [ISPC]'s
Single-Program-Multiple-Data (SPMD) programming model. The performance results
on different hardware are shown in the `readme.md` of each example. The following
table summarizes the performance ranges, where `+` means speed-up and `-`
slowdown:
* `aobench`: `[-1.02x, +1.53x]`,
* `stencil`: `[+1.06x, +1.72x]`,
* `mandelbrot`: `[-1.74x, +1.2x]`,
* `options_pricing`:
* `black_scholes`: `+1.0x`
* `binomial_put`: `+1.4x`
While SPMD is not the intended use case for `packed_simd`, it is possible to
combine the library with [`rayon`][rayon] to poorly emulate [ISPC]'s SPMD programming
model in Rust. Writing performant code is not as straightforward as with
[ISPC], but with some care (e.g. see the [Performance Guide][perf_guide]) one
can easily match and often out-perform [ISPC]'s "default performance".
## Platform support
The following table describes the supported platforms: `build` shows whether
the library compiles without issues for a given target, while `run` shows
whether the test suite passes for a given target.
| **Linux** | **build** | **run** |
|---------------------------------------|-----------|---------|
| `i586-unknown-linux-gnu` | ✓ | ✗ |
| `i686-unknown-linux-gnu` | ✓ | ✗ |
| `x86_64-unknown-linux-gnu` | ✓ | ✓ |
| `arm-unknown-linux-gnueabihf` | ✓ | ✓ |
| `armv7-unknown-linux-gnueabi` | ✓ | ✓ |
| `aarch64-unknown-linux-gnu` | ✓ | ✓ |
| `powerpc-unknown-linux-gnu` | ✓ | ✗ |
| `powerpc64-unknown-linux-gnu` | ✓ | ✗ |
| `powerpc64le-unknown-linux-gnu` | ✓ | ✓ |
| `s390x-unknown-linux-gnu` | ✓ | ✗ |
| `sparc64-unknown-linux-gnu` | ✓ | ✗ |
| `thumbv7neon-unknown-linux-gnueabihf` | ✓ | ✓ |
| **MacOSX** | **build** | **run** |
| `x86_64-apple-darwin` | ✓ | ✓ |
| **Android** | **build** | **run** |
| `x86_64-linux-android` | ✓ | ✓ |
| `armv7-linux-androideabi` | ✓ | ✗ |
| `aarch64-linux-android` | ✓ | ✗ |
| `thumbv7neon-linux-androideabi` | ✓ | ✗ |
| **iOS** | **build** | **run** |
| `x86_64-apple-ios` | ✗ | ✗ |
| `aarch64-apple-ios` | ✗ | ✗ |
## Machine code verification
The
[`verify/`](https://github.com/rust-lang-nursery/packed_simd/tree/master/verify)
crate's tests disassemble the portable packed vector APIs at run-time and
compare the generated machine code against the desired one to make sure that
this crate remains efficient.
## License
This project is licensed under either of
* [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0)
([LICENSE-APACHE](LICENSE-APACHE))
* [MIT License](http://opensource.org/licenses/MIT)
([LICENSE-MIT](LICENSE-MIT))
at your option.
## Contributing
We welcome all people who want to contribute.
Please see the [contributing instructions] for more information.
Contributions in any form (issues, pull requests, etc.) to this project
must adhere to Rust's [Code of Conduct].
Unless you explicitly state otherwise, any contribution intentionally submitted
for inclusion in `packed_simd` by you, as defined in the Apache-2.0 license, shall be
dual licensed as above, without any additional terms or conditions.
[travis]: https://travis-ci.com/rust-lang/packed_simd
[Travis-CI Status]: https://travis-ci.com/rust-lang/packed_simd.svg?branch=master
[appveyor]: https://ci.appveyor.com/project/gnzlbg/packed-simd
[Appveyor Status]: https://ci.appveyor.com/api/projects/status/hd7v9dvr442hgdix?svg=true
[Latest Version]: https://img.shields.io/crates/v/packed_simd.svg
[crates.io]: https://crates.io/crates/packed_simd
[docs]: https://docs.rs/packed_simd/badge.svg
[docs.rs]: https://docs.rs/packed_simd
[master_docs]: https://rust-lang-nursery.github.io/packed_simd/packed_simd/
[perf_guide]: https://rust-lang-nursery.github.io/packed_simd/perf-guide/
[rfc2366]: https://github.com/rust-lang/rfcs/pull/2366
[ISPC]: https://ispc.github.io/
[rayon]: https://crates.io/crates/rayon
[boost_license]: https://www.boost.org/LICENSE_1_0.txt
[SLEEF]: https://sleef.org/
[sleef_sys]: https://crates.io/crates/sleef-sys
[contributing instructions]: contributing.md
[Code of Conduct]: https://www.rust-lang.org/en-US/conduct.html

View file

@ -0,0 +1,3 @@
# Commit status contexts; only the Travis CI push build is listed.
# NOTE(review): presumably read by a merge bot to decide when a change may
# land — confirm which tool consumes this file.
status = [
"continuous-integration/travis-ci/push"
]

6
third_party/rust/packed_simd/build.rs vendored Normal file
View file

@ -0,0 +1,6 @@
/// Build script entry point: emits the `libcore_neon` cfg when the name of
/// the compilation target contains `neon`.
///
/// Panics when the `TARGET` environment variable is not set.
fn main() {
    let triple = std::env::var("TARGET").expect("TARGET environment variable not defined");
    let neon_in_name = triple.contains("neon");
    if neon_in_name {
        println!("cargo:rustc-cfg=libcore_neon");
    }
}

71
third_party/rust/packed_simd/ci/all.sh vendored Executable file
View file

@ -0,0 +1,71 @@
#!/usr/bin/env bash
#
# Runs one maintenance operation (clean, check_fmt, fmt or clippy) in the
# current directory, in every examples/* directory, in verify/verify and in
# micro_benchmarks.
set -ex

# Abort with a message when no operation was given.
: "${1?The all.sh script requires one argument.}"
op=$1

cargo_clean() { cargo clean; }
cargo_check_fmt() { cargo fmt --all -- --check; }
cargo_fmt() { cargo fmt --all; }
cargo_clippy() { cargo clippy --all -- -D clippy::perf; }

# Map the requested operation onto one of the helpers above.
CMD="-1"
case $op in
    clean*) CMD=cargo_clean ;;
    check_fmt*) CMD=cargo_check_fmt ;;
    fmt*) CMD=cargo_fmt ;;
    clippy) CMD=cargo_clippy ;;
    *)
        echo "Unknown operation: \"${op}\""
        exit 1
        ;;
esac

echo "Operation is: ${CMD}"

# On src/
$CMD

# Check examples/ -- each directory is visited in a subshell so the working
# directory of this script never changes.
for example in examples/*/; do
    (
        cd "${example%/}"
        $CMD
    )
done

# Same for the verification crate and the micro benchmarks.
for crate_dir in verify/verify micro_benchmarks; do
    (
        cd "${crate_dir}"
        $CMD
    )
done

View file

@ -0,0 +1,21 @@
#!/usr/bin/env sh
# Copyright 2016 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
#
# Downloads the Android NDK archive named below into the current directory,
# unpacks it and renames the unpacked directory to `ndk`.
# The script does not read any command-line arguments.
set -ex
ANDROID_NDK_URL=https://dl.google.com/android/repository
ANDROID_NDK_ARCHIVE=android-ndk-r25b-linux.zip
# -f makes curl fail on HTTP errors; -O keeps the remote file name.
curl -fO "$ANDROID_NDK_URL/$ANDROID_NDK_ARCHIVE"
unzip -q $ANDROID_NDK_ARCHIVE
rm $ANDROID_NDK_ARCHIVE
mv android-ndk-* ndk
# NOTE(review): after the rename above this glob should no longer match
# anything; looks like a defensive cleanup — confirm it is still wanted.
rm -rf android-ndk-*

View file

@ -0,0 +1,60 @@
#!/usr/bin/env sh
# Copyright 2016 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
#
# Installs the Android SDK tools into ./sdk and creates an emulator image
# (AVD) for the architecture given as $1 (arm, armv7, aarch64, i686 or
# x86_64). The AVD is named after $1.
set -ex
# Prep the SDK and emulator
#
# Note that the update process requires that we accept a bunch of licenses, and
# we can't just pipe `yes` into it for some reason, so we take the same strategy
# located in https://github.com/appunite/docker by just wrapping it in a script
# which apparently magically accepts the licenses.
mkdir sdk
curl --retry 5 https://dl.google.com/android/repository/sdk-tools-linux-3859397.zip -O
unzip -d sdk sdk-tools-linux-3859397.zip
# Translate the architecture name into the ABI name used in the
# system-image package identifiers below.
case "$1" in
arm | armv7)
abi=armeabi-v7a
;;
aarch64)
abi=arm64-v8a
;;
i686)
abi=x86
;;
x86_64)
abi=x86_64
;;
*)
echo "invalid arch: $1"
exit 1
;;
esac;
# --no_https avoids
# javax.net.ssl.SSLHandshakeException: sun.security.validator.ValidatorException: No trusted certificate found
yes | ./sdk/tools/bin/sdkmanager --licenses --no_https
yes | ./sdk/tools/bin/sdkmanager --no_https \
"emulator" \
"platform-tools" \
"platforms;android-24" \
"system-images;android-24;default;$abi"
# "no" is fed to avdmanager's interactive prompt.
# NOTE(review): presumably the "custom hardware profile" question — confirm.
echo "no" |
./sdk/tools/bin/avdmanager create avd \
--name "${1}" \
--package "system-images;android-24;default;$abi"

View file

@ -0,0 +1,56 @@
#!/usr/bin/env bash
# Copyright 2017 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.
#
# Downloads an Android system image archive and copies the dynamic linker and
# libc/libm/libdl out of its system.img into /system.
#
# Usage: <this script> <arch> <archive-name>
#   arch          x86_64 or arm64 select the 64-bit linker/libraries; any
#                 other value selects the 32-bit ones
#   archive-name  file name appended to ${URL} to form the download address
set -ex
URL=https://dl.google.com/android/repository/sys-img/android
main() {
    local arch="${1}"
    local name="${2}"
    local dest=/system
    local td
    td="$(mktemp -d)"

    # -y keeps the install non-interactive, matching the purge at the end.
    apt-get install -y --no-install-recommends e2tools

    pushd "${td}"
    curl --retry 5 -O "${URL}/${name}"
    unzip -q "${name}"

    local system
    system="$(find . -name system.img)"
    mkdir -p "${dest}"/{bin,lib,lib64}

    # Extract android linker and libraries to /system
    # This allows android executables to be run directly (or with qemu)
    if [ "${arch}" = "x86_64" ] || [ "${arch}" = "arm64" ]; then
        e2cp -p "${system}:/bin/linker64" "${dest}/bin/"
        e2cp -p "${system}:/lib64/libdl.so" "${dest}/lib64/"
        e2cp -p "${system}:/lib64/libc.so" "${dest}/lib64/"
        e2cp -p "${system}:/lib64/libm.so" "${dest}/lib64/"
    else
        e2cp -p "${system}:/bin/linker" "${dest}/bin/"
        e2cp -p "${system}:/lib/libdl.so" "${dest}/lib/"
        e2cp -p "${system}:/lib/libc.so" "${dest}/lib/"
        e2cp -p "${system}:/lib/libm.so" "${dest}/lib/"
    fi

    # clean up
    apt-get purge --auto-remove -y e2tools
    popd
    rm -rf "${td}"
}
main "${@}"

32
third_party/rust/packed_simd/ci/benchmark.sh vendored Executable file
View file

@ -0,0 +1,32 @@
#!/usr/bin/env bash
#
# Runs all benchmarks. Controlled by the following environment variables:
#
# FEATURES={} - cargo features to pass to all benchmarks (e.g. core_arch,sleef-sys,ispc)
# NORUN={1} - only builds the benchmarks
set -ex
if [[ ${NORUN} != 1 ]]; then
# Most benchmarks require hyperfine; require it upfront.
hash hyperfine 2>/dev/null || { echo >&2 "hyperfine is not in PATH."; exit 1; }
fi
# If the ispc benchmark feature is enabled, ispc must be in the path of the
# benchmarks.
if echo "$FEATURES" | grep -q "ispc"; then
hash ispc 2>/dev/null || { echo >&2 "ispc is not in PATH."; exit 1; }
fi
# An example with a benchmark.sh is a benchmark:
for dir in examples/*/
do
# Strip the trailing slash left by the glob.
dir=${dir%*/}
cd ${dir%*/}
if [ -f "benchmark.sh" ]; then
./benchmark.sh
fi
# Return to the directory the loop started from.
cd -
done

View file

@ -0,0 +1,176 @@
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// This is a script to deploy and execute a binary on an iOS simulator.
// The primary use of this is to be able to run unit tests on the simulator and
// retrieve the results.
//
// To do this through Cargo instead, use Dinghy
// (https://github.com/snipsco/dinghy): cargo dinghy install, then cargo dinghy
// test.
use std::env;
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;
use std::process;
use std::process::Command;
/// Unwraps a `Result`, yielding the `Ok` value; on `Err` it panics with the
/// stringified expression followed by the error.
macro_rules! t {
    ($e:expr) => {
        match $e {
            Ok(value) => value,
            Err(error) => panic!("{} failed with: {}", stringify!($e), error),
        }
    };
}
// Step one: Wrap as an app
//
// Recreates the `ios_simulator_app` directory, copies the test binary into it
// under the name `crate_name`, and writes an `Info.plist` that names that
// file as the bundle executable with identifier `com.rust.unittests`.
// Any filesystem error other than the initial removal panics through `t!`.
fn package_as_simulator_app(crate_name: &str, test_binary_path: &Path) {
println!("Packaging simulator app");
// The directory may not exist yet, so the result of the removal is discarded.
drop(fs::remove_dir_all("ios_simulator_app"));
t!(fs::create_dir("ios_simulator_app"));
t!(fs::copy(test_binary_path,
Path::new("ios_simulator_app").join(crate_name)));
let mut f = t!(File::create("ios_simulator_app/Info.plist"));
// The single `{}` placeholder receives `crate_name`.
t!(f.write_all(format!(r#"
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC
"-//Apple//DTD PLIST 1.0//EN"
"http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>CFBundleExecutable</key>
<string>{}</string>
<key>CFBundleIdentifier</key>
<string>com.rust.unittests</string>
</dict>
</plist>
"#, crate_name).as_bytes()));
}
// Step two: Start the iOS simulator
//
// Looks for a simulator named `rust_ios` in the output of
// `xcrun simctl list`. When none is listed it is created; when the listed one
// is already booted it is shut down. The simulator is then erased and booted.
//
// Panics if more than one listed line mentions `rust_ios`, if the listing is
// not valid UTF-8, or if any `xcrun` invocation fails.
fn start_simulator() {
    println!("Looking for iOS simulator");
    let output = t!(Command::new("xcrun").arg("simctl").arg("list").output());
    assert!(output.status.success());

    let mut simulator_exists = false;
    let mut simulator_booted = false;
    let stdout = t!(String::from_utf8(output.stdout));
    for line in stdout.lines().filter(|l| l.contains("rust_ios")) {
        // A second matching line means the name is ambiguous.
        if simulator_exists {
            panic!("Duplicate rust_ios simulators found. Please \
                    double-check xcrun simctl list.");
        }
        simulator_exists = true;
        simulator_booted = line.contains("(Booted)");
    }

    if !simulator_exists {
        println!("Creating iOS simulator");
        Command::new("xcrun")
            .arg("simctl")
            .arg("create")
            .arg("rust_ios")
            .arg("com.apple.CoreSimulator.SimDeviceType.iPhone-SE")
            .arg("com.apple.CoreSimulator.SimRuntime.iOS-10-2")
            .check_status();
    } else if simulator_booted {
        println!("Shutting down already-booted simulator");
        Command::new("xcrun")
            .arg("simctl")
            .arg("shutdown")
            .arg("rust_ios")
            .check_status();
    }

    println!("Starting iOS simulator");
    // We can't uninstall the app (if present) as that will hang if the
    // simulator isn't completely booted; just erase the simulator instead.
    Command::new("xcrun").arg("simctl").arg("erase").arg("rust_ios").check_status();
    Command::new("xcrun").arg("simctl").arg("boot").arg("rust_ios").check_status();
}
// Step three: Install the app
fn install_app_to_simulator() {
println!("Installing app to simulator");
Command::new("xcrun")
.arg("simctl")
.arg("install")
.arg("booted")
.arg("ios_simulator_app/")
.check_status();
}
// Step four: Run the app
//
// Launches `com.rust.unittests` on the booted simulator with its console
// attached, echoes the captured stdout/stderr, shuts the `rust_ios`
// simulator down and finally panics unless the output contains a
// `test result: ok` line and no line containing `FAILED`.
fn run_app_on_simulator() {
    println!("Running app");
    let output = t!(Command::new("xcrun")
        .arg("simctl")
        .arg("launch")
        .arg("--console")
        .arg("booted")
        .arg("com.rust.unittests")
        .output());

    // Decode stdout once; it is both printed and scanned for the verdict.
    let stdout = String::from_utf8_lossy(&output.stdout);
    println!("stdout --\n{}\n", stdout);
    println!("stderr --\n{}\n", String::from_utf8_lossy(&output.stderr));

    let failed = stdout.lines().any(|l| l.contains("FAILED"));
    let passed = stdout.lines().any(|l| l.contains("test result: ok"));

    // Shut down before reporting so a failing run does not leave the
    // simulator booted.
    println!("Shutting down simulator");
    Command::new("xcrun")
        .arg("simctl")
        .arg("shutdown")
        .arg("rust_ios")
        .check_status();

    if failed || !passed {
        panic!("tests didn't pass");
    }
}
/// Extension trait for running a command and insisting that it succeeds.
trait CheckStatus {
/// Runs the command and panics unless it exits successfully.
fn check_status(&mut self);
}
impl CheckStatus for Command {
fn check_status(&mut self) {
// Echo the command (Debug form) before running it.
println!("\trunning: {:?}", self);
// Panics both when the command cannot be run and when it exits unsuccessfully.
assert!(t!(self.status()).success());
}
}
/// Entry point: expects exactly one argument, the path of the test
/// executable, then packages it, starts the simulator, installs the app and
/// runs it. Prints a usage line and exits with -1 on a wrong argument count.
fn main() {
    let argv: Vec<String> = env::args().collect();
    if argv.len() != 2 {
        println!("Usage: {} <executable>", argv[0]);
        process::exit(-1);
    }

    let binary = Path::new(&argv[1]);
    // The bundle executable is named after the binary's file name.
    let name = binary.file_name().unwrap().to_str().unwrap();

    package_as_simulator_app(name, binary);
    start_simulator();
    install_app_to_simulator();
    run_app_on_simulator();
}

View file

@ -0,0 +1,47 @@
# Image for the aarch64 Android target: installs the Android NDK and SDK via
# the android-install-*.sh helpers and starts an emulator when the container
# starts.
FROM ubuntu:16.04
RUN dpkg --add-architecture i386 && \
apt-get update && \
apt-get install -y --no-install-recommends \
file \
make \
curl \
ca-certificates \
python \
unzip \
expect \
openjdk-9-jre \
libstdc++6:i386 \
libpulse0 \
gcc \
libc6-dev
WORKDIR /android/
# Copies every build-context file whose name starts with "android".
COPY android* /android/
ENV ANDROID_ARCH=aarch64
ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools
# NOTE(review): confirm android-install-ndk.sh accepts an architecture
# argument and produces the ndk-$ANDROID_ARCH directory referenced in PATH.
RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
RUN sh /android/android-install-sdk.sh $ANDROID_ARCH
# Move the Android state directory under /tmp (HOME is set to /tmp below) and
# make it accessible to any user.
RUN mv /root/.android /tmp
RUN chmod 777 -R /tmp/.android
RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*
ENV PATH=$PATH:/rust/bin \
CARGO_TARGET_AARCH64_LINUX_ANDROID_LINKER=aarch64-linux-android-gcc \
CARGO_TARGET_AARCH64_LINUX_ANDROID_RUNNER=/tmp/runtest \
OBJDUMP=aarch64-linux-android-objdump \
HOME=/tmp
# The runner source is compiled to /tmp/runtest by the ENTRYPOINT below.
ADD runtest-android.rs /tmp/runtest.rs
ENTRYPOINT [ \
"bash", \
"-c", \
# set SHELL so android can detect a 64bits system, see
# http://stackoverflow.com/a/41789144
"SHELL=/bin/dash /android/sdk/emulator/emulator @aarch64 -no-window & \
rustc /tmp/runtest.rs -o /tmp/runtest && \
exec \"$@\"", \
"--" \
]

View file

@ -0,0 +1,14 @@
# Cross-compilation image for aarch64-unknown-linux-gnu: aarch64 GCC
# toolchain for linking, user-mode qemu for running the produced binaries.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
ca-certificates \
libc6-dev \
gcc-aarch64-linux-gnu \
libc6-dev-arm64-cross \
qemu-user \
make \
file
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc \
CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_RUNNER="qemu-aarch64 -L /usr/aarch64-linux-gnu" \
OBJDUMP=aarch64-linux-gnu-objdump

View file

@ -0,0 +1,15 @@
# Cross-compilation image for arm-unknown-linux-gnueabi: soft-float ARM GCC
# toolchain for linking, user-mode qemu for running the produced binaries.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
ca-certificates \
libc6-dev \
libc6-armel-cross \
libc6-dev-armel-cross \
binutils-arm-linux-gnueabi \
gcc-arm-linux-gnueabi \
qemu-user \
make \
file
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_LINKER=arm-linux-gnueabi-gcc \
CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABI_RUNNER="qemu-arm -L /usr/arm-linux-gnueabi" \
OBJDUMP=arm-linux-gnueabi-objdump

View file

@ -0,0 +1,13 @@
# Cross-compilation image for arm-unknown-linux-gnueabihf: hard-float ARM GCC
# toolchain for linking, user-mode qemu for running the produced binaries.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
ca-certificates \
libc6-dev \
gcc-arm-linux-gnueabihf \
libc6-dev-armhf-cross \
qemu-user \
make \
file
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \
CARGO_TARGET_ARM_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \
OBJDUMP=arm-linux-gnueabihf-objdump

View file

@ -0,0 +1,47 @@
# Image for the arm-linux-androideabi target: installs the Android NDK and
# SDK via the android-install-*.sh helpers and starts an emulator when the
# container starts.
FROM ubuntu:16.04
RUN dpkg --add-architecture i386 && \
apt-get update && \
apt-get install -y --no-install-recommends \
file \
make \
curl \
ca-certificates \
python \
unzip \
expect \
openjdk-9-jre \
libstdc++6:i386 \
libpulse0 \
gcc \
libc6-dev
WORKDIR /android/
# Copies every build-context file whose name starts with "android".
COPY android* /android/
ENV ANDROID_ARCH=arm
ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools
# NOTE(review): confirm android-install-ndk.sh accepts an architecture
# argument and produces the ndk-$ANDROID_ARCH directory referenced in PATH.
RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
RUN sh /android/android-install-sdk.sh $ANDROID_ARCH
# Move the Android state directory under /tmp (HOME is set to /tmp below) and
# make it accessible to any user.
RUN mv /root/.android /tmp
RUN chmod 777 -R /tmp/.android
RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*
ENV PATH=$PATH:/rust/bin \
CARGO_TARGET_ARM_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \
CARGO_TARGET_ARM_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \
OBJDUMP=arm-linux-androideabi-objdump \
HOME=/tmp
# The runner source is compiled to /tmp/runtest by the ENTRYPOINT below.
ADD runtest-android.rs /tmp/runtest.rs
ENTRYPOINT [ \
"bash", \
"-c", \
# set SHELL so android can detect a 64bits system, see
# http://stackoverflow.com/a/41789144
"SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \
rustc /tmp/runtest.rs -o /tmp/runtest && \
exec \"$@\"", \
"--" \
]

View file

@ -0,0 +1,13 @@
# Cross-compilation image for armv7-unknown-linux-gnueabihf: hard-float ARM
# GCC toolchain for linking, user-mode qemu for running the produced binaries.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
ca-certificates \
libc6-dev \
gcc-arm-linux-gnueabihf \
libc6-dev-armhf-cross \
qemu-user \
make \
file
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \
CARGO_TARGET_ARMV7_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \
OBJDUMP=arm-linux-gnueabihf-objdump

View file

@ -0,0 +1,7 @@
# Minimal build image: multilib GCC plus basic build tools. No cross linker
# or test runner is configured here.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc-multilib \
libc6-dev \
file \
make \
ca-certificates

View file

@ -0,0 +1,7 @@
# Minimal build image: multilib GCC plus basic build tools. No cross linker
# or test runner is configured here.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc-multilib \
libc6-dev \
file \
make \
ca-certificates

View file

@ -0,0 +1,13 @@
# Cross-compilation image for mips-unknown-linux-gnu: MIPS GCC toolchain for
# linking, user-mode qemu for running the produced binaries.
FROM ubuntu:18.04
# `qemu-user` is listed once; the original listed it twice.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc libc6-dev qemu-user ca-certificates \
    gcc-mips-linux-gnu libc6-dev-mips-cross \
    qemu-system-mips \
    make \
    file
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_LINKER=mips-linux-gnu-gcc \
    CARGO_TARGET_MIPS_UNKNOWN_LINUX_GNU_RUNNER="qemu-mips -L /usr/mips-linux-gnu" \
    OBJDUMP=mips-linux-gnu-objdump

View file

@ -0,0 +1,10 @@
# Cross-compilation image for mips64-unknown-linux-gnuabi64: MIPS64 GCC
# toolchain for linking, user-mode qemu for running the produced binaries.
FROM ubuntu:18.04
# `qemu-user` is listed once; the original listed it twice.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc libc6-dev qemu-user ca-certificates \
    gcc-mips64-linux-gnuabi64 libc6-dev-mips64-cross \
    qemu-system-mips64
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_LINKER=mips64-linux-gnuabi64-gcc \
    CARGO_TARGET_MIPS64_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64 -L /usr/mips64-linux-gnuabi64" \
    OBJDUMP=mips64-linux-gnuabi64-objdump

View file

@ -0,0 +1,10 @@
# Cross-compilation image for mips64el-unknown-linux-gnuabi64: little-endian
# MIPS64 GCC toolchain for linking, user-mode qemu for running the binaries.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc libc6-dev qemu-user ca-certificates \
gcc-mips64el-linux-gnuabi64 libc6-dev-mips64el-cross \
qemu-system-mips64el
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_LINKER=mips64el-linux-gnuabi64-gcc \
CARGO_TARGET_MIPS64EL_UNKNOWN_LINUX_GNUABI64_RUNNER="qemu-mips64el -L /usr/mips64el-linux-gnuabi64" \
OBJDUMP=mips64el-linux-gnuabi64-objdump

View file

@ -0,0 +1,25 @@
# Cross-compilation image for mipsel-unknown-linux-musl: uses a prebuilt
# OpenWrt musl toolchain unpacked into /toolchain, with user-mode qemu as the
# runner.
# NOTE(review): ubuntu:18.10 is an interim release that has reached end of
# life — confirm `apt-get update` still works against this base image.
FROM ubuntu:18.10
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
gcc \
libc6-dev \
make \
qemu-user \
qemu-system-mips \
bzip2 \
curl \
file
RUN mkdir /toolchain
# Note that this originally came from:
# https://downloads.openwrt.org/snapshots/trunk/malta/generic/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2
RUN curl -L https://ci-mirrors.rust-lang.org/libc/OpenWrt-Toolchain-malta-le_gcc-5.3.0_musl-1.1.15.Linux-x86_64.tar.bz2 | \
tar xjf - -C /toolchain --strip-components=2
# The toolchain's bin directory is put on PATH; qemu uses /toolchain as sysroot.
ENV PATH=$PATH:/rust/bin:/toolchain/bin \
CC_mipsel_unknown_linux_musl=mipsel-openwrt-linux-gcc \
CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_LINKER=mipsel-openwrt-linux-gcc \
CARGO_TARGET_MIPSEL_UNKNOWN_LINUX_MUSL_RUNNER="qemu-mipsel -L /toolchain"

View file

@ -0,0 +1,13 @@
# Cross-compilation image for powerpc-unknown-linux-gnu: PowerPC GCC
# toolchain for linking, user-mode qemu for running the produced binaries.
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc libc6-dev qemu-user ca-certificates \
gcc-powerpc-linux-gnu libc6-dev-powerpc-cross \
qemu-system-ppc \
make \
file
# The runner selects an explicit CPU model (-cpu Vger); -L points qemu at the
# cross sysroot.
ENV CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_LINKER=powerpc-linux-gnu-gcc \
CARGO_TARGET_POWERPC_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc -cpu Vger -L /usr/powerpc-linux-gnu" \
CC=powerpc-linux-gnu-gcc \
OBJDUMP=powerpc-linux-gnu-objdump

View file

@ -0,0 +1,17 @@
# Cross-compilation image for powerpc64-unknown-linux-gnu: PowerPC64 GCC
# toolchain for linking, user-mode qemu for running the produced binaries.
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
ca-certificates \
libc6-dev \
gcc-powerpc64-linux-gnu \
libc6-dev-ppc64-cross \
qemu-user \
qemu-system-ppc \
make \
file
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_LINKER=powerpc64-linux-gnu-gcc \
CARGO_TARGET_POWERPC64_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64 -L /usr/powerpc64-linux-gnu" \
CC=powerpc64-linux-gnu-gcc \
OBJDUMP=powerpc64-linux-gnu-objdump

View file

@ -0,0 +1,11 @@
# Cross-compilation image for powerpc64le-unknown-linux-gnu: little-endian
# PowerPC64 GCC toolchain for linking, user-mode qemu for running the binaries.
FROM ubuntu:22.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc libc6-dev qemu-user ca-certificates \
gcc-powerpc64le-linux-gnu libc6-dev-ppc64el-cross \
qemu-system-ppc file make
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_LINKER=powerpc64le-linux-gnu-gcc \
CARGO_TARGET_POWERPC64LE_UNKNOWN_LINUX_GNU_RUNNER="qemu-ppc64le -L /usr/powerpc64le-linux-gnu" \
CC=powerpc64le-linux-gnu-gcc \
OBJDUMP=powerpc64le-linux-gnu-objdump

View file

@ -0,0 +1,20 @@
# Cross-compilation image for s390x-unknown-linux-gnu: s390x GCC/G++
# toolchain for linking, user-mode qemu for running the produced binaries.
FROM ubuntu:22.04
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
curl \
cmake \
gcc \
libc6-dev \
g++-s390x-linux-gnu \
libc6-dev-s390x-cross \
qemu-user \
make \
file
# -L points qemu at the cross sysroot; CC_/CXX_ select the cross compilers
# for this target only.
ENV CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_LINKER=s390x-linux-gnu-gcc \
CARGO_TARGET_S390X_UNKNOWN_LINUX_GNU_RUNNER="qemu-s390x -L /usr/s390x-linux-gnu" \
CC_s390x_unknown_linux_gnu=s390x-linux-gnu-gcc \
CXX_s390x_unknown_linux_gnu=s390x-linux-gnu-g++ \
OBJDUMP=s390x-linux-gnu-objdump

View file

@ -0,0 +1,18 @@
# Cross-compilation image for sparc64-unknown-linux-gnu. Binaries are run in
# a full-system qemu VM through the /test-runner-linux script rather than
# with user-mode qemu.
FROM debian:bookworm
RUN apt-get update && apt-get install -y --no-install-recommends \
curl ca-certificates \
gcc libc6-dev \
gcc-sparc64-linux-gnu libc6-dev-sparc64-cross \
qemu-system-sparc64 openbios-sparc seabios ipxe-qemu \
p7zip-full cpio
# Run the sparc64 setup script at image build time.
COPY linux-sparc64.sh /
RUN bash /linux-sparc64.sh
COPY test-runner-linux /
ENV CARGO_TARGET_SPARC64_UNKNOWN_LINUX_GNU_LINKER=sparc64-linux-gnu-gcc \
CARGO_TARGET_SPARC64_UNKNOWN_LINUX_GNU_RUNNER="/test-runner-linux sparc64" \
CC_sparc64_unknown_linux_gnu=sparc64-linux-gnu-gcc \
PATH=$PATH:/rust/bin

View file

@ -0,0 +1,47 @@
# Image for the thumbv7neon-linux-androideabi target: installs the Android
# NDK and SDK for the `arm` architecture via the android-install-*.sh helpers
# and starts an emulator when the container starts.
FROM ubuntu:16.04
RUN dpkg --add-architecture i386 && \
apt-get update && \
apt-get install -y --no-install-recommends \
file \
make \
curl \
ca-certificates \
python \
unzip \
expect \
openjdk-9-jre \
libstdc++6:i386 \
libpulse0 \
gcc \
libc6-dev
WORKDIR /android/
# Copies every build-context file whose name starts with "android".
COPY android* /android/
ENV ANDROID_ARCH=arm
ENV PATH=$PATH:/android/ndk-$ANDROID_ARCH/bin:/android/sdk/tools:/android/sdk/platform-tools
# NOTE(review): confirm android-install-ndk.sh accepts an architecture
# argument and produces the ndk-$ANDROID_ARCH directory referenced in PATH.
RUN sh /android/android-install-ndk.sh $ANDROID_ARCH
RUN sh /android/android-install-sdk.sh $ANDROID_ARCH
# Move the Android state directory under /tmp (HOME is set to /tmp below) and
# make it accessible to any user.
RUN mv /root/.android /tmp
RUN chmod 777 -R /tmp/.android
RUN chmod 755 /android/sdk/tools/* /android/sdk/emulator/qemu/linux-x86_64/*
ENV PATH=$PATH:/rust/bin \
CARGO_TARGET_THUMBV7NEON_LINUX_ANDROIDEABI_LINKER=arm-linux-androideabi-gcc \
CARGO_TARGET_THUMBV7NEON_LINUX_ANDROIDEABI_RUNNER=/tmp/runtest \
OBJDUMP=arm-linux-androideabi-objdump \
HOME=/tmp
# The runner source is compiled to /tmp/runtest by the ENTRYPOINT below.
ADD runtest-android.rs /tmp/runtest.rs
ENTRYPOINT [ \
"bash", \
"-c", \
# set SHELL so android can detect a 64bits system, see
# http://stackoverflow.com/a/41789144
"SHELL=/bin/dash /android/sdk/emulator/emulator @arm -no-window & \
rustc /tmp/runtest.rs -o /tmp/runtest && \
exec \"$@\"", \
"--" \
]

View file

@ -0,0 +1,13 @@
# Cross-compilation image for thumbv7neon-unknown-linux-gnueabihf: hard-float
# ARM GCC toolchain for linking, user-mode qemu for running the binaries.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
ca-certificates \
libc6-dev \
gcc-arm-linux-gnueabihf \
libc6-dev-armhf-cross \
qemu-user \
make \
file
# -L points qemu at the cross sysroot.
ENV CARGO_TARGET_THUMBV7NEON_UNKNOWN_LINUX_GNUEABIHF_LINKER=arm-linux-gnueabihf-gcc \
CARGO_TARGET_THUMBV7NEON_UNKNOWN_LINUX_GNUEABIHF_RUNNER="qemu-arm -L /usr/arm-linux-gnueabihf" \
OBJDUMP=arm-linux-gnueabihf-objdump

View file

@ -0,0 +1,39 @@
# Image for wasm32-unknown-unknown: builds wabt from source, downloads
# wasm-bindgen's test runner and node, and links through a shim around LLD.
FROM ubuntu:22.04
RUN apt-get update -y && apt-get install -y --no-install-recommends \
ca-certificates \
clang \
cmake \
curl \
git \
libc6-dev \
make \
ninja-build \
python-is-python3 \
xz-utils
# Install `wasm2wat`
RUN git clone --recursive https://github.com/WebAssembly/wabt
RUN make -C wabt -j$(nproc)
ENV PATH=$PATH:/wabt/bin
# Install `wasm-bindgen-test-runner`
RUN curl -L https://github.com/rustwasm/wasm-bindgen/releases/download/0.2.87/wasm-bindgen-0.2.87-x86_64-unknown-linux-musl.tar.gz \
| tar xzf -
# Keep in sync with the version on Cargo.toml.
ENV PATH=$PATH:/wasm-bindgen-0.2.87-x86_64-unknown-linux-musl
ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_RUNNER=wasm-bindgen-test-runner
# Install `node`
RUN curl https://nodejs.org/dist/v14.16.0/node-v14.16.0-linux-x64.tar.xz | tar xJf -
ENV PATH=$PATH:/node-v14.16.0-linux-x64/bin
# We use a shim linker that removes `--strip-debug` when passed to LLD. While
# this typically results in invalid debug information in release mode it doesn't
# result in an invalid names section which is what we're interested in.
COPY lld-shim.rs /
ENV CARGO_TARGET_WASM32_UNKNOWN_UNKNOWN_LINKER=/tmp/lld-shim
# Rustc isn't available until this container starts, so defer compilation of the
# shim.
# NOTE(review): this is the shell form of ENTRYPOINT; Docker documents that
# the shell form does not receive `docker run` arguments, so "$@" may always
# be empty here — confirm the intended command actually runs.
ENTRYPOINT /rust/bin/rustc /lld-shim.rs -o /tmp/lld-shim && exec bash "$@"

View file

@ -0,0 +1,31 @@
# Image for x86_64-linux-android: installs the Android NDK and extracts the
# Android linker and core libraries from a system image into /system.
FROM ubuntu:20.04
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
curl \
gcc \
libc-dev \
python \
unzip \
file \
make
WORKDIR /android/
ENV ANDROID_ARCH=x86_64
COPY android-install-ndk.sh /android/
RUN sh /android/android-install-ndk.sh
ENV STDARCH_ASSERT_INSTR_LIMIT=30
# We do not run x86_64-linux-android tests on an android emulator.
# See ci/android-sysimage.sh for information about how tests are run.
COPY android-sysimage.sh /android/
RUN bash /android/android-sysimage.sh x86_64 x86_64-24_r07.zip
# The NDK's LLVM toolchain directory is put on PATH so the clang wrappers
# named below resolve.
ENV PATH=$PATH:/rust/bin:/android/ndk/toolchains/llvm/prebuilt/linux-x86_64/bin \
CARGO_TARGET_X86_64_LINUX_ANDROID_LINKER=x86_64-linux-android21-clang \
CC_x86_64_linux_android=x86_64-linux-android21-clang \
CXX_x86_64_linux_android=x86_64-linux-android21-clang++ \
OBJDUMP=llvm-objdump \
HOME=/tmp

View file

@ -0,0 +1,16 @@
# Image for x86_64-unknown-linux-gnu that runs the produced binaries under the
# Intel SDE emulator (sde64) downloaded below.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
libc6-dev \
file \
make \
ca-certificates \
wget \
bzip2 \
cmake \
libclang-dev \
clang
RUN wget https://github.com/gnzlbg/intel_sde/raw/master/sde-external-8.16.0-2018-01-30-lin.tar.bz2
RUN tar -xjf sde-external-8.16.0-2018-01-30-lin.tar.bz2
# The trailing "--" separates sde64's own options from the test binary and
# its arguments.
ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUNNER="/sde-external-8.16.0-2018-01-30-lin/sde64 --"

View file

@ -0,0 +1,10 @@
# Native build image: GCC, cmake and clang/libclang plus basic build tools.
# No cross linker or test runner is configured here.
FROM ubuntu:18.04
RUN apt-get update && apt-get install -y --no-install-recommends \
gcc \
libc6-dev \
file \
make \
ca-certificates \
cmake \
libclang-dev \
clang

27
third_party/rust/packed_simd/ci/dox.sh vendored Executable file
View file

@ -0,0 +1,27 @@
#!/bin/sh
# Builds the API documentation and the performance guide into target/doc and,
# on a Travis non-PR build of the master branch, publishes target/doc to the
# gh-pages branch.
set -ex
rm -rf target/doc
mkdir -p target/doc
# Build API documentation
cargo doc --features=into_bits
# Build Performance Guide
# FIXME: https://github.com/rust-lang-nursery/mdBook/issues/780
# mdbook build perf-guide -d target/doc/perf-guide
cd perf-guide
mdbook build
cd -
cp -r perf-guide/book target/doc/perf-guide
# If we're on travis, not a PR, and on the right branch, publish!
if [ "$TRAVIS_PULL_REQUEST" = "false" ] && [ "$TRAVIS_BRANCH" = "master" ]; then
# NOTE(review): presumably version diagnostics for the tools used below — confirm.
python3 -vV
pip -vV
python3.9 -vV
pip install ghp_import --user
ghp-import -n target/doc
# GH_PAGES supplies the credentials embedded in the push URL.
git push -qf https://${GH_PAGES}@github.com/${TRAVIS_REPO_SLUG}.git gh-pages
fi

View file

@ -0,0 +1,18 @@
# Prepares /qemu for running s390x binaries in a qemu system VM: downloads a
# firmware image, a Debian installer kernel and initrd, and unpacks the initrd
# into /qemu/init with libgcc_s added.
set -ex
mkdir -m 777 /qemu
cd /qemu
curl -LO https://github.com/qemu/qemu/raw/master/pc-bios/s390-ccw.img
curl -LO http://ftp.debian.org/debian/dists/testing/main/installer-s390x/20170828/images/generic/kernel.debian
curl -LO http://ftp.debian.org/debian/dists/testing/main/installer-s390x/20170828/images/generic/initrd.debian
mv kernel.debian kernel
mv initrd.debian initrd.gz
# Unpack the initrd so more files can be added to it.
mkdir init
cd init
gunzip -c ../initrd.gz | cpio -id
rm ../initrd.gz
cp /usr/s390x-linux-gnu/lib/libgcc_s.so.1 usr/lib/
# Leave the unpacked tree writable by any user.
chmod a+w .

View file

@ -0,0 +1,17 @@
# Prepares /qemu for running sparc64 binaries in a qemu system VM: extracts
# the kernel and initrd from a Debian netinst ISO and unpacks the initrd into
# /qemu/init with libgcc_s added.
set -ex
mkdir -m 777 /qemu
cd /qemu
curl -LO https://cdimage.debian.org/cdimage/ports/9.0/sparc64/iso-cd/debian-9.0-sparc64-NETINST-1.iso
7z e debian-9.0-sparc64-NETINST-1.iso boot/initrd.gz
7z e debian-9.0-sparc64-NETINST-1.iso boot/sparc64
mv sparc64 kernel
rm debian-9.0-sparc64-NETINST-1.iso
# Unpack the initrd so more files can be added to it.
mkdir init
cd init
gunzip -c ../initrd.gz | cpio -id
rm ../initrd.gz
cp /usr/sparc64-linux-gnu/lib/libgcc_s.so.1 usr/lib/
# Leave the unpacked tree writable by any user.
chmod a+w .

View file

@ -0,0 +1,11 @@
use std::os::unix::prelude::*;
use std::process::Command;
use std::env;
/// Linker shim: forwards every command-line argument except `--strip-debug`
/// to `rust-lld`, replacing the current process. Only returns (by panicking)
/// when the exec itself fails.
fn main() {
    let forwarded: Vec<String> = env::args()
        .skip(1)
        .filter(|arg| arg != "--strip-debug")
        .collect();
    // `exec` only comes back with an error; on success it never returns.
    let error = Command::new("rust-lld").args(&forwarded).exec();
    panic!("failed to exec: {}", error);
}

View file

@ -0,0 +1,17 @@
#!/usr/bin/env sh
# Reports the first `.rs` file under the current directory that contains a
# line of 79 or more characters that does not contain "http".
set -x
# NOTE(review): `success` is exported but never read in this script.
export success=true
find . -iname '*.rs' | while read -r file; do
# First grep keeps lines with at least 79 characters; the second drops lines
# that contain "http".
result=$(grep '.\{79\}' "${file}" | grep --invert 'http')
if [ "${result}" = "" ]
then
:
else
echo "file \"${file}\": $result"
exit 1
fi
done

38
third_party/rust/packed_simd/ci/run-docker.sh vendored Executable file
View file

@ -0,0 +1,38 @@
# Small script to run tests for a target (or all targets) inside all the
# respective docker images.
#
# Environment:
#   TARGET     directory name under ci/docker/ to build and run; when empty,
#              every directory under ci/docker/ is run in turn
#   NORUN, NOVERIFY, RUSTFLAGS  forwarded unchanged into the container
set -ex

# Builds the docker image for the target named by $1 and runs ci/run.sh in it.
run() {
    # Use the argument rather than the caller's environment so the
    # "all targets" loop below selects a different Dockerfile per iteration.
    TARGET="${1}"
    echo "Building docker container for TARGET=${TARGET} RUSTFLAGS=${RUSTFLAGS}"
    docker build -t packed_simd -f "ci/docker/${TARGET}/Dockerfile" ci/
    mkdir -p target
    # The docker directory may carry an "-emulated" suffix that is not part
    # of the target name passed to cargo.
    target=$(echo "${TARGET}" | sed 's/-emulated//')
    echo "Running docker"
    docker run \
        --user "$(id -u)":"$(id -g)" \
        --rm \
        --init \
        --volume "$HOME/.cargo":/cargo \
        --env CARGO_HOME=/cargo \
        --volume "$(rustc --print sysroot)":/rust:ro \
        --env TARGET="$target" \
        --env NORUN \
        --env NOVERIFY \
        --env RUSTFLAGS \
        --volume "$(pwd)":/checkout:ro \
        --volume "$(pwd)"/target:/checkout/target \
        --workdir /checkout \
        --privileged \
        packed_simd \
        bash \
        -c 'PATH=$PATH:/rust/bin exec ci/run.sh'
}

if [ -z "${TARGET}" ]; then
    for d in $(ls ci/docker/); do
        run "$d"
    done
else
    run "${TARGET}"
fi

99
third_party/rust/packed_simd/ci/run.sh vendored Executable file
View file

@ -0,0 +1,99 @@
#!/usr/bin/env bash
# Builds (NORUN=1) or tests the crate for ${TARGET}, once per group of
# `test_v*` cfg flags, in debug and release mode, and then builds/tests the
# code-generation verification crate unless NOVERIFY=1.
set -ex
: ${TARGET?"The TARGET environment variable must be set."}
# Tests are all super fast anyway, and they fault often enough on travis that
# having only one thread increases debuggability to be worth it.
#export RUST_TEST_THREADS=1
#export RUST_BACKTRACE=full
#export RUST_TEST_NOCAPTURE=1
# Some appveyor builds run out-of-memory; this attempts to mitigate that:
# https://github.com/rust-lang-nursery/packed_simd/issues/39
# export RUSTFLAGS="${RUSTFLAGS} -C codegen-units=1"
# export CARGO_BUILD_JOBS=1
export CARGO_SUBCMD=test
if [[ "${NORUN}" == "1" ]]; then
export CARGO_SUBCMD=build
fi
# iOS simulator targets: build the deploy helper and register it as the cargo
# runner for both simulator targets.
if [[ ${TARGET} == "x86_64-apple-ios" ]] || [[ ${TARGET} == "i386-apple-ios" ]]; then
export RUSTFLAGS="${RUSTFLAGS} -Clink-arg=-mios-simulator-version-min=7.0"
rustc ./ci/deploy_and_run_on_ios_simulator.rs -o $HOME/runtest
export CARGO_TARGET_X86_64_APPLE_IOS_RUNNER=$HOME/runtest
export CARGO_TARGET_I386_APPLE_IOS_RUNNER=$HOME/runtest
fi
# The source directory is read-only. Need to copy internal crates to the target
# directory for their Cargo.lock to be properly written.
mkdir target || true
rustc --version
cargo --version
echo "TARGET=${TARGET}"
echo "HOST=${HOST}"
echo "RUSTFLAGS=${RUSTFLAGS}"
echo "NORUN=${NORUN}"
echo "NOVERIFY=${NOVERIFY}"
echo "CARGO_SUBCMD=${CARGO_SUBCMD}"
echo "CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS}"
echo "CARGO_INCREMENTAL=${CARGO_INCREMENTAL}"
echo "RUST_TEST_THREADS=${RUST_TEST_THREADS}"
echo "RUST_BACKTRACE=${RUST_BACKTRACE}"
echo "RUST_TEST_NOCAPTURE=${RUST_TEST_NOCAPTURE}"
# Runs `cargo ${CARGO_SUBCMD}` for ${TARGET} with the given extra arguments.
# Output is captured in target/output and only printed when cargo fails.
cargo_test() {
cmd="cargo ${CARGO_SUBCMD} --verbose --target=${TARGET} ${@}"
# When tests are actually run, pass --quiet to the test binary (except on wasm32).
if [ "${NORUN}" != "1" ]
then
if [ "$TARGET" != "wasm32-unknown-unknown" ]
then
cmd="$cmd -- --quiet"
fi
fi
mkdir target || true
# tee's stdout is redirected into the file, so nothing is echoed here.
${cmd} 2>&1 | tee > target/output
# PIPESTATUS[0] is cargo's exit status, not tee's.
if [[ ${PIPESTATUS[0]} != 0 ]]; then
cat target/output
return 1
fi
}
# Runs cargo_test three times, each with a different group of `test_v*` cfg
# flags appended to RUSTFLAGS, then restores RUSTFLAGS.
cargo_test_impl() {
ORIGINAL_RUSTFLAGS=${RUSTFLAGS}
RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v16 --cfg test_v32 --cfg test_v64" cargo_test ${@}
RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v128 --cfg test_v256" cargo_test ${@}
RUSTFLAGS="${ORIGINAL_RUSTFLAGS} --cfg test_v512" cargo_test ${@}
RUSTFLAGS=${ORIGINAL_RUSTFLAGS}
}
# Debug run:
if [[ "${TARGET}" != "wasm32-unknown-unknown" ]]; then
# Run wasm32-unknown-unknown in release mode only
cargo_test_impl
fi
# Both branches currently run the same command; see the FIXMEs.
if [[ "${TARGET}" == "x86_64-unknown-linux-gnu" ]] || [[ "${TARGET}" == "x86_64-pc-windows-msvc" ]]; then
# use sleef on linux and windows x86_64 builds
# FIXME: Use `core_arch,sleef-sys` features once they work again
cargo_test_impl --release --features=into_bits
else
# FIXME: Use `core_arch` feature once it works again
cargo_test_impl --release --features=into_bits
fi
# Verify code generation
if [[ "${NOVERIFY}" != "1" ]]; then
cp -r verify/verify target/verify
export STDSIMD_ASSERT_INSTR_LIMIT=30
if [[ "${TARGET}" == "i586-unknown-linux-gnu" ]]; then
export STDSIMD_ASSERT_INSTR_LIMIT=50
fi
cargo_test --release --manifest-path=target/verify/Cargo.toml
fi
# FIXME: Figure out which examples take too long to run and ignore or adjust those
#. ci/run_examples.sh

View file

@ -0,0 +1,51 @@
# Runs all examples.
#
# Each example is copied into target/ and handed to `cargo_test`, which must
# already be defined by the script that sources this file.
# NOTE(review): run.sh references this file via `. ci/run_examples.sh`
# (currently commented out); when sourced, the `exit 0` statements below also
# end the calling script.

# FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/55
# All examples fail to build for `armv7-apple-ios`.
if [[ ${TARGET} == "armv7-apple-ios" ]]; then
    exit 0
fi

# FIXME: travis exceeds 50 minutes on these targets
# Skipping the examples is an attempt at preventing travis from timing-out
# The Android names must match the real target triples or the skip never
# triggers.
if [[ ${TARGET} == "arm-linux-androideabi" ]] || [[ ${TARGET} == "aarch64-linux-android" ]] \
    || [[ ${TARGET} == "sparc64-unknown-linux-gnu" ]]; then
    exit 0
fi

if [[ ${TARGET} == "wasm32-unknown-unknown" ]]; then
    exit 0
fi

cp -r examples/aobench target/aobench
cargo_test --manifest-path=target/aobench/Cargo.toml --release --no-default-features
cargo_test --manifest-path=target/aobench/Cargo.toml --release --features=256bit

cp -r examples/dot_product target/dot_product
cargo_test --manifest-path=target/dot_product/Cargo.toml --release

cp -r examples/fannkuch_redux target/fannkuch_redux
cargo_test --manifest-path=target/fannkuch_redux/Cargo.toml --release

# FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/56
if [[ ${TARGET} != "i586-unknown-linux-gnu" ]]; then
    cp -r examples/mandelbrot target/mandelbrot
    cargo_test --manifest-path=target/mandelbrot/Cargo.toml --release
fi

cp -r examples/matrix_inverse target/matrix_inverse
cargo_test --manifest-path=target/matrix_inverse/Cargo.toml --release

cp -r examples/nbody target/nbody
cargo_test --manifest-path=target/nbody/Cargo.toml --release

cp -r examples/spectral_norm target/spectral_norm
cargo_test --manifest-path=target/spectral_norm/Cargo.toml --release

if [[ ${TARGET} != "i586-unknown-linux-gnu" ]]; then
    cp -r examples/stencil target/stencil
    cargo_test --manifest-path=target/stencil/Cargo.toml --release
fi

cp -r examples/triangle_xform target/triangle_xform
cargo_test --manifest-path=target/triangle_xform/Cargo.toml --release

View file

@ -0,0 +1,45 @@
use std::env;
use std::process::Command;
use std::path::{Path, PathBuf};
/// Returns `true` when `stdout` contains at least one test summary line (a
/// line starting with `test result`) and every such line reports success.
///
/// Output without any summary line is treated as a failure: in that case the
/// test binary never got far enough to report a result.
fn test_run_succeeded(stdout: &str) -> bool {
    let mut summaries = stdout
        .lines()
        .filter(|l| l.starts_with("test result"))
        .peekable();
    // `Iterator::all` is vacuously true on an empty iterator, so require at
    // least one summary line explicitly.
    summaries.peek().is_some()
        && summaries.all(|l| l.contains("test result: ok") && l.contains("0 failed"))
}

/// Pushes a test binary to an Android device with `adb`, runs it there and
/// panics unless the run reports success.
///
/// Expects exactly one argument (the path of the test binary); any `--quiet`
/// arguments are ignored.
fn main() {
    let args = env::args_os()
        .skip(1)
        .filter(|arg| arg != "--quiet")
        .collect::<Vec<_>>();
    assert_eq!(args.len(), 1);

    let test = PathBuf::from(&args[0]);
    let file_name = test
        .file_name()
        .expect("test path must end in a file name");
    let dst = Path::new("/data/local/tmp").join(file_name);

    let status = Command::new("adb")
        .arg("wait-for-device")
        .status()
        .expect("failed to run: adb wait-for-device");
    assert!(status.success());

    let status = Command::new("adb")
        .arg("push")
        .arg(&test)
        .arg(&dst)
        .status()
        .expect("failed to run: adb push");
    assert!(status.success());

    let output = Command::new("adb")
        .arg("shell")
        .arg(&dst)
        .output()
        .expect("failed to run: adb shell");

    // Print the captured output before any check so that a failing run still
    // shows its diagnostics.
    let stdout = String::from_utf8_lossy(&output.stdout);
    println!(
        "status: {}\nstdout ---\n{}\nstderr ---\n{}",
        output.status,
        stdout,
        String::from_utf8_lossy(&output.stderr)
    );

    // Check the status of the `adb shell` invocation itself, not the stale
    // status of the earlier `adb push`.
    assert!(output.status.success(), "adb shell exited unsuccessfully");

    if !test_run_succeeded(&stdout) {
        panic!("failed to find successful test run");
    }
}

View file

@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Fetches the prebuilt ISPC binary matching `TARGET` and copies it to `./ispc`.

set -ex

# Get latest ISPC binary for the target and put it in the path
git clone https://github.com/gnzlbg/ispc-binaries
# Quoted so the target name is passed to `cp` as a single path argument.
cp "ispc-binaries/ispc-${TARGET}" ispc

View file

@ -0,0 +1,24 @@
#!/bin/sh
# Runs a program as `init` inside a QEMU system emulator and reports whether
# its output contains a failure.
#
# Arguments:
#   $1  suffix of the `qemu-system-*` binary to run
#   $2  path of the program; it is copied into the initrd as `/prog`
#
# Exit status: non-zero when the captured output contains `FAILED`.

set -e

arch=$1
prog=$2

cd /qemu/init
cp -f "$prog" prog

# Pack the init directory into a gzip-compressed `newc` cpio archive.
find . | cpio --create --format='newc' --quiet | gzip > ../initrd.gz
cd ..

# `|| true` keeps `set -e` from aborting when `timeout` or the emulator
# returns a non-zero status; success is decided from the output below.
timeout 30s "qemu-system-$arch" \
    -m 1024 \
    -nographic \
    -kernel kernel \
    -initrd initrd.gz \
    -append init=/prog > output || true

# remove kernel messages
tr -d '\r' < output | grep -E -v '^\['

# if the output contains a failure, return error
! grep FAILED output > /dev/null

View file

@ -0,0 +1,67 @@
# Contributing to `packed_simd`
Welcome! If you are reading this document, it means you are interested in contributing
to the `packed_simd` crate.
## Reporting issues
All issues with this crate are tracked using GitHub's [Issue Tracker].
You can use issues to bring bugs to the attention of the maintainers, to discuss
certain problems encountered with the crate, or to request new features (although
feature requests should be limited to things mentioned in the [RFC]).
One thing to keep in mind is to always use the **latest** nightly toolchain when
working on this crate. Due to the nature of this project, we use a lot of unstable
features, meaning breakage happens often.
[Issue Tracker]: https://github.com/rust-lang-nursery/packed_simd/issues
[RFC]: https://github.com/rust-lang/rfcs/pull/2366
### LLVM issues
The Rust compiler relies on [LLVM](https://llvm.org/) for machine code generation,
and quite a few LLVM bugs have been discovered during the development of this project.
If you encounter issues with incorrect/suboptimal codegen, which you do not encounter
when using the [SIMD vendor intrinsics](https://doc.rust-lang.org/nightly/std/arch/),
it is likely the issue is with LLVM, or this crate's interaction with it.
You should first open an issue **in this repo** to help us track the problem, and we
will help determine what is the exact cause of the problem.
If LLVM is indeed the cause, the issue will be reported upstream to the
[LLVM bugtracker](https://bugs.llvm.org/).
## Submitting Pull Requests
New code is submitted to the crate using GitHub's [pull request] mechanism.
You should first fork this repository, make your changes (preferably in a new
branch), then use GitHub's web UI to create a new PR.
[pull request]: https://help.github.com/articles/about-pull-requests/
### Examples
The `examples` directory contains code showcasing SIMD code written with this crate,
usually in comparison to scalar or ISPC code. If you have a project / idea which
uses SIMD, we'd love to add it to the examples list.
Every example should include a small `README`, describing the example code's purpose.
If your example could potentially work as a benchmark, then add a `benchmark.sh`
script to allow running the example benchmark code in CI. See an existing example's
[`benchmark.sh`](examples/aobench/benchmark.sh) for a sample.
Don't forget to update the crate's top-level `README` with a link to your example.
### Perf guide
The objective of the [performance guide][perf-guide] is to be a comprehensive
resource detailing the process of optimizing Rust code with SIMD support.
If you believe a certain section could be reworded, or if you have any tips & tricks
related to SIMD which you'd like to share, please open a PR.
[mdBook] is used to manage the formatting of the guide as a book.
[perf-guide]: https://rust-lang-nursery.github.io/packed_simd/perf-guide/
[mdBook]: https://github.com/rust-lang-nursery/mdBook

View file

@ -0,0 +1,12 @@
[book]
authors = ["Gonzalo Brito Gadeschi", "Gabriel Majeri"]
multilingual = false
src = "src"
title = "Rust SIMD Performance Guide"
description = "This book describes how to write performant SIMD code in Rust."
[build]
create-missing = false
[output.html]
additional-css = ["./src/ascii.css"]

View file

@ -0,0 +1,21 @@
# Summary
[Introduction](./introduction.md)
- [Floating-point Math](./float-math/fp.md)
- [Short-vector Math Library](./float-math/svml.md)
- [Approximate functions](./float-math/approx.md)
- [Fused multiply-accumulate](./float-math/fma.md)
- [Target features](./target-feature/features.md)
- [Using `RUSTFLAGS`](./target-feature/rustflags.md)
- [Using the `target_feature` attribute](./target-feature/attribute.md)
- [Interaction with inlining](./target-feature/inlining.md)
- [Detecting features at runtime](./target-feature/runtime.md)
- [Bounds checking](./bound_checks.md)
- [Vertical and horizontal operations](./vert-hor-ops.md)
- [Performance profiling](./prof/profiling.md)
- [Profiling on Linux](./prof/linux.md)
- [Using machine code analyzers](./prof/mca.md)

View file

@ -0,0 +1,4 @@
code {
/* "Source Code Pro" breaks ASCII art */
font-family: Consolas, "Ubuntu Mono", Menlo, "DejaVu Sans Mono", monospace;
}

View file

@ -0,0 +1,22 @@
# Bounds checking
Reading and writing packed vectors to/from slices is checked by default.
Independently of the configuration options used, the safe functions:
* `Simd<[T; N]>::from_slice_aligned(& s[..])`
* `Simd<[T; N]>::write_to_slice_aligned(&mut s[..])`
always check that:
* the slice is big enough to hold the vector
* the slice is suitably aligned to perform an aligned load/store for a `Simd<[T;
N]>` (this alignment is often much larger than that of `T`).
There are `_unaligned` versions that use unaligned load and stores, as well as
`unsafe` `_unchecked` versions that do not perform any checks iff `debug-assertions =
false` / `debug = false`. That is, the `_unchecked` methods do still assert size
and alignment in debug builds and could also do so in release builds depending
on the configuration options.
These assertions do often significantly impact performance and you should be
aware of them.

View file

@ -0,0 +1,8 @@
# Approximate functions
<!-- TODO:
Explain that they exist, that they are often _much_ faster, how to use them,
that people should check whether the error is good enough for their
applications. Explain that this error is currently unstable and might change.
-->

View file

@ -0,0 +1,6 @@
# Fused Multiply Add
<!-- TODO:
Explain that this is a compound operation, infinite precision, difference
between `mul_add` and `mul_adde`, that LLVM cannot do this by itself, etc.
-->

View file

@ -0,0 +1,3 @@
# Floating-point math
This chapter contains information pertaining to working with floating-point numbers.

View file

@ -0,0 +1,7 @@
# Short Vector Math Library
<!-- TODO:
Explain how short-vector math is performed by default (just scalarized libm calls).
Explain how to enable `sleef`, etc.
-->

View file

@ -0,0 +1,26 @@
# Introduction
## What is SIMD
<!-- TODO:
describe what SIMD is, which algorithms can benefit from it,
give usage examples
-->
## History of SIMD in Rust
<!-- TODO:
discuss history of unstable std::simd,
stabilization of std::arch, etc.
-->
## Discover packed_simd
<!-- TODO: describe scope of this project -->
Writing fast and portable SIMD algorithms using `packed_simd` is, unfortunately,
not trivial. There are many pitfalls that one should be aware of, and some idioms
that help avoid those pitfalls.
This book attempts to document these best practices and provides practical examples
on how to apply the tips to _your_ code.

View file

@ -0,0 +1,107 @@
# Performance profiling on Linux
## Using `perf`
[perf](https://perf.wiki.kernel.org/) is the most powerful performance profiler
for Linux, featuring support for various hardware Performance Monitoring Units,
as well as integration with the kernel's performance events framework.
We will only look at how the `perf` command can be used to profile SIMD code.
Full system profiling is outside of the scope of this book.
### Recording
The first step is to record a program's execution during an average workload.
It helps if you can isolate the parts of your program which have performance
issues, and set up a benchmark which can be easily (re)run.
Build the benchmark binary in release mode, after having enabled debug info:
```sh
$ cargo build --release
Finished release [optimized + debuginfo] target(s) in 0.02s
```
Then use the `perf record` subcommand:
```sh
$ perf record --call-graph=dwarf ./target/release/my-program
[ perf record: Woken up 10 times to write data ]
[ perf record: Captured and wrote 2,356 MB perf.data (292 samples) ]
```
Instead of using `--call-graph=dwarf`, which can become pretty slow, you can use
`--call-graph=lbr` if you have a processor with support for Last Branch Record
(i.e. Intel Haswell and newer).
`perf` will, by default, record the count of CPU cycles it takes to execute
various parts of your program. You can use the `-e` command line option
to enable other performance events, such as `cache-misses`. Use `perf list`
to get a list of all hardware counters supported by your CPU.
### Viewing the report
The next step is getting a bird's eye view of the program's execution.
`perf` provides a `ncurses`-based interface which will get you started.
Use `perf report` to open a visualization of your program's performance:
```sh
perf report --hierarchy -M intel
```
`--hierarchy` will display a tree-like structure of where your program spent
most of its time. `-M intel` enables disassembly output with Intel syntax, which
is subjectively more readable than the default AT&T syntax.
Here is the output from profiling the `nbody` benchmark:
```
- 100,00% nbody
- 94,18% nbody
+ 93,48% [.] nbody_lib::simd::advance
+ 0,70% [.] nbody_lib::run
+ 5,06% libc-2.28.so
```
If you move with the arrow keys to any node in the tree, you can then press `a`
to have `perf` _annotate_ that node. This means it will:
- disassemble the function
- associate every instruction with the percentage of time which was spent executing it
- interleave the disassembly with the source code,
assuming it found the debug symbols
(you can use `s` to toggle this behaviour)
`perf` will, by default, open the instruction which it identified as being the
hottest spot in the function:
```
0,76 │ movapd xmm2,xmm0
0,38 │ movhlps xmm2,xmm0
│ addpd xmm2,xmm0
│ unpcklpd xmm1,xmm2
12,50 │ sqrtpd xmm0,xmm1
1,52 │ mulpd xmm0,xmm1
```
In this case, `sqrtpd` will be highlighted in red, since that's the instruction
which the CPU spends most of its time executing.
## Using Valgrind
Valgrind is a set of tools which initially helped C/C++ programmers find unsafe
memory accesses in their code. Nowadays the project also has
- a heap profiler called `massif`
- a cache utilization profiler called `cachegrind`
- a call-graph performance profiler called `callgrind`
<!--
TODO: explain valgrind's dynamic binary translation, warn about massive
slowdown, talk about `kcachegrind` for a GUI
-->

View file

@ -0,0 +1,100 @@
# Machine code analysis tools
## The microarchitecture of modern CPUs
While you might have heard of Instruction Set Architectures, such as `x86` or
`arm` or `mips`, the term _microarchitecture_ (also written here as _µ-arch_),
refers to the internal details of an actual family of CPUs, such as Intel's
_Haswell_ or AMD's _Jaguar_.
Replacing scalar code with SIMD code will improve performance on all CPUs
supporting the required vector extensions.
However, due to microarchitectural differences, the actual speed-up at
runtime might vary.
**Example**: a simple example arises when optimizing for AMD K8 CPUs.
The assembly generated for an empty function should look like this:
```asm
nop
ret
```
The `nop` is used to align the `ret` instruction for better performance.
However, the compiler will actually generate the following code:
```asm
repz ret
```
The `repz` prefix will repeat the following instruction until a certain
condition is met. Of course, in this situation, the function will simply immediately
return, and the `ret` instruction is still aligned.
However, AMD K8's branch predictor performs better with the latter code.
For those looking to absolutely maximize performance for a certain target µ-arch,
you will have to read some CPU manuals, or ask the compiler to do it for you
with `-C target-cpu`.
### Summary of CPU internals
Modern processors are able to execute instructions out-of-order for better performance,
by utilizing tricks such as [branch prediction], [instruction pipelining],
or [superscalar execution].
[branch prediction]: https://en.wikipedia.org/wiki/Branch_predictor
[instruction pipelining]: https://en.wikipedia.org/wiki/Instruction_pipelining
[superscalar execution]: https://en.wikipedia.org/wiki/Superscalar_processor
SIMD instructions are also subject to these optimizations, meaning it can get pretty
difficult to determine where the slowdown happens.
For example, if the profiler reports a store operation is slow, one of two things
could be happening:
- the store is limited by the CPU's memory bandwidth, which is actually an ideal
scenario, all things considered;
- memory bandwidth is nowhere near its peak, but the value to be stored is at the
end of a long chain of operations, and this store is where the profiler
encountered the pipeline stall;
Since most profilers are simple tools which don't understand the subtleties of
instruction scheduling, you will need more specialized tools to tell these cases apart.
## Analyzing the machine code
Certain tools have knowledge of internal CPU microarchitecture, i.e. they know
- how many physical [register files] a CPU actually has
- what is the latency / throughput of an instruction
- what [µ-ops] are generated for a set of instructions
and many other architectural details.
[register files]: https://en.wikipedia.org/wiki/Register_file
[µ-ops]: https://en.wikipedia.org/wiki/Micro-operation
These tools are therefore able to provide accurate information as to why some
instructions are inefficient, and where the bottleneck is.
The disadvantage is that the output of these tools requires advanced knowledge
of the target architecture to understand, i.e. they **cannot** point out what
the cause of the issue is explicitly.
## Intel's Architecture Code Analyzer (IACA)
[IACA] is a free tool offered by Intel for analyzing the performance of various
computational kernels.
Being a proprietary, closed source tool, it _only_ supports Intel's µ-arches.
[IACA]: https://software.intel.com/en-us/articles/intel-architecture-code-analyzer
## llvm-mca
<!--
TODO: once LLVM 7 gets released, write a chapter on using llvm-mca
with SIMD disassembly.
-->

View file

@ -0,0 +1,14 @@
# Performance profiling
While the rest of the book provides practical advice on how to improve the performance
of SIMD code, this chapter is dedicated to [**performance profiling**][profiling].
Profiling consists of recording a program's execution in order to identify program
hotspots.
**Important**: most profilers require debug information in order to accurately
link the program hotspots back to the corresponding source code lines. Rust will
disable debug info generation by default for optimized builds, but you can change
that [in your `Cargo.toml`][cargo-ref].
[profiling]: https://en.wikipedia.org/wiki/Profiling_(computer_programming)
[cargo-ref]: https://doc.rust-lang.org/cargo/reference/manifest.html#the-profile-sections

View file

@ -0,0 +1,5 @@
# The `target_feature` attribute
<!-- TODO:
Explain the `#[target_feature]` attribute
-->

View file

@ -0,0 +1,13 @@
# Enabling target features
Not all processors of a certain architecture will have SIMD processing units,
and using a SIMD instruction which is not supported will trigger undefined behavior.
To allow building safe, portable programs, the Rust compiler will **not**, by default,
generate any sort of vector instructions, unless it can statically determine
they are supported. For example, on AMD64, SSE2 support is architecturally guaranteed.
The `x86_64-apple-darwin` target enables up to SSSE3. To get a definitive list of
which features are enabled by default on various platforms, refer to the target
specifications [in the compiler's source code][targets].
[targets]: https://github.com/rust-lang/rust/tree/master/src/librustc_target/spec

View file

@ -0,0 +1,5 @@
# Inlining
<!-- TODO:
Explain how the `#[target_feature]` attribute interacts with inlining
-->

View file

@ -0,0 +1,31 @@
# Target features in practice
Using `RUSTFLAGS` will allow the crate being compiled, as well as all its
transitive dependencies to use certain target features.
A technique used to avoid undefined behavior at runtime is to compile and
ship multiple binaries, each compiled with a certain set of features.
This might not be feasible in some cases, and can quickly get out of hand
as more and more vector extensions are added to an architecture.
Rust can be more flexible: you can build a single binary/library which automatically
picks the best supported vector instructions depending on the host machine.
The trick consists of monomorphizing parts of the code during building, and then
using run-time feature detection to select the right code path when running.
<!-- TODO
Explain how to create efficient functions that dispatch to different
implementations at run-time without issues (e.g. using `#[inline(always)]` for
the impls, wrapping in `#[target_feature]`, and then wrapping those in a function
that does run-time feature detection).
-->
**NOTE** (x86 specific): because the AVX (256-bit) registers extend the existing
SSE (128-bit) registers, mixing SSE and AVX instructions in a program can cause
performance issues.
The solution is to compile all code, even the code written with 128-bit vectors,
with the AVX target feature enabled. This will cause the compiler to prefix the
generated instructions with the [VEX] prefix.
[VEX]: https://en.wikipedia.org/wiki/VEX_prefix

View file

@ -0,0 +1,5 @@
# Detecting host features at runtime
<!-- TODO:
Explain cost (how it works).
-->

View file

@ -0,0 +1,77 @@
# Using RUSTFLAGS
One of the easiest ways to benefit from SIMD is to allow the compiler
to generate code using certain vector instruction extensions.
The environment variable `RUSTFLAGS` can be used to pass options for code
generation to the Rust compiler. These flags will affect **all** compiled crates.
There are two flags which can be used to enable specific vector extensions:
## target-feature
- Syntax: `-C target-feature=<features>`
- Provides the compiler with a comma-separated set of instruction extensions
to enable.
**Example**: Use `-C target-feature=+sse3,+avx` to enable generating instructions
for [Streaming SIMD Extensions 3](https://en.wikipedia.org/wiki/SSE3) and
[Advanced Vector Extensions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions).
- To list target triples for all targets supported by Rust, use:
```sh
rustc --print target-list
```
- To list all supported target features for a certain target triple, use:
```sh
rustc --target=${TRIPLE} --print target-features
```
- Note that all CPU features are independent, and will have to be enabled individually.
**Example**: Setting `-C target-feature=+avx2` will _not_ enable `fma`, even though
all CPUs which support AVX2 also support FMA. To enable both, one has to use
`-C target-feature=+avx2,+fma`
- Some features also depend on other features, which need to be enabled for the
target instructions to be generated.
**Example**: Unless `v7` is specified as the target CPU (see below), to enable
NEON on ARM it is necessary to use `-C target-feature=+v7,+neon`.
## target-cpu
- Syntax: `-C target-cpu=<cpu>`
- Sets the identifier of a CPU family / model for which to build and optimize the code.
**Example**: `RUSTFLAGS='-C target-cpu=cortex-a75'`
- To list all supported target CPUs for a certain target triple, use:
```sh
rustc --target=${TRIPLE} --print target-cpus
```
**Example**:
```sh
rustc --target=i686-pc-windows-msvc --print target-cpus
```
- The compiler will translate this into a list of target features. Therefore,
individual feature checks (`#[cfg(target_feature = "...")]`) will still
work properly.
- It will cause the code generator to optimize the generated code for that
specific CPU model.
- Using `native` as the CPU model will cause Rust to generate and optimize code
for the CPU running the compiler. It is useful when building programs which you
plan to only use locally. This should never be used when the generated programs
are meant to be run on other computers, such as when packaging for distribution
or cross-compiling.

View file

@ -0,0 +1,76 @@
# Vertical and horizontal operations
In SIMD terminology, each vector has a certain "width" (number of lanes).
A vector processor is able to perform two kinds of operations on a vector:
- Vertical operations:
operate on two vectors of the same width, result has same width
**Example**: vertical addition of two `f32x4` vectors
%0 == | 2 | -3.5 | 0 | 7 |
+ + + +
%1 == | 4 | 1.5 | -1 | 0 |
= = = =
%0 + %1 == | 6 | -2 | -1 | 7 |
- Horizontal operations:
reduce the elements of two vectors in some way,
the result's elements combine information from the two original ones
**Example**: horizontal addition of two `i64x2` vectors
%0 == | 1 | 3 |
└─+───┘
└───────┐
%1 == | 4 | -1 | │
└─+──┘ │
└───┐ │
│ │
┌─────│───┘
▼ ▼
%0 + %1 == | 4 | 3 |
## Performance consideration of horizontal operations
The result of vertical operations, like vector negation: `-a`, for a given lane,
does not depend on the result of the operation for the other lanes. The result
of horizontal operations, like the vector `sum` reduction: `a.sum()`, depends on
the value of all vector lanes.
In virtually all architectures vertical operations are fast, while horizontal
operations are, by comparison, very slow.
Consider the following two functions for computing the sum of all `f32` values
in a slice:
```rust
fn fast_sum(x: &[f32]) -> f32 {
assert!(x.len() % 4 == 0);
let mut sum = f32x4::splat(0.); // [0., 0., 0., 0.]
for i in (0..x.len()).step_by(4) {
sum += f32x4::from_slice_unaligned(&x[i..]);
}
sum.sum()
}
fn slow_sum(x: &[f32]) -> f32 {
assert!(x.len() % 4 == 0);
let mut sum: f32 = 0.;
for i in (0..x.len()).step_by(4) {
sum += f32x4::from_slice_unaligned(&x[i..]).sum();
}
sum
}
```
The inner loop over the slice is where the bulk of the work actually happens.
There, the `fast_sum` function performs vertical operations into a vector, doing
a single horizontal reduction at the end, while the `slow_sum` function performs
horizontal vector operations inside of the loop.
On all widely-used architectures, `fast_sum` is a large constant factor faster
than `slow_sum`. You can run the [slice_sum]() example and see for yourself. On
the particular machine tested there the algorithm using the horizontal vector
addition is 2.7x slower than the one using vertical vector operations!

View file

@ -0,0 +1 @@
nightly

View file

@ -0,0 +1,5 @@
max_width = 110
use_small_heuristics = "Max"
wrap_comments = true
edition = "2018"
error_on_line_overflow = true

309
third_party/rust/packed_simd/src/api.rs vendored Normal file
View file

@ -0,0 +1,309 @@
//! Implements the Simd<[T; N]> APIs
#[macro_use]
mod bitmask;
pub(crate) mod cast;
#[macro_use]
mod cmp;
#[macro_use]
mod default;
#[macro_use]
mod fmt;
#[macro_use]
mod from;
#[macro_use]
mod hash;
#[macro_use]
mod math;
#[macro_use]
mod minimal;
#[macro_use]
mod ops;
#[macro_use]
mod ptr;
#[macro_use]
mod reductions;
#[macro_use]
mod select;
#[macro_use]
mod shuffle;
#[macro_use]
mod shuffle1_dyn;
#[macro_use]
mod slice;
#[macro_use]
mod swap_bytes;
#[macro_use]
mod bit_manip;
#[cfg(feature = "into_bits")]
pub(crate) mod into_bits;
// Expands to the API of one vector type by forwarding its arguments to the
// per-feature `impl_*!` / `test_*!` macros invoked below. Those macros are not
// defined in this block; they come from the `#[macro_use]` modules of this file.
//
// Arguments:
// - `[$elem_ty; $elem_n]`: the `[T; N]` shape, forwarded to nearly every call.
// - `$tuple_id`: identifier forwarded to every call.
// - `$mask_ty`: forwarded to `impl_cmp_vertical!` and `test_select!`.
// - `$ielem_ty`: forwarded to `impl_minimal_iuf!` and the reduction macros.
// - `$ibitmask_ty`: forwarded to `impl_bitmask!`.
// - `$test_tt`: token tree forwarded unchanged to the invoked macros.
// - `$elem_ids`: identifier list forwarded to `impl_minimal_iuf!`.
// - `From: $from_vec_ty`: identifier list forwarded to `impl_from_vectors!`.
// - `$doc`: attributes forwarded to `impl_minimal_iuf!`.
//
// NOTE(review): presumably the signed-integer variant - unlike `impl_u` it
// invokes `impl_ops_vector_neg!` and passes `(-1, 0)` to `impl_bitmask!`; confirm.
macro_rules! impl_i {
([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident
| $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),*
| From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {
impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
| $($elem_ids),* | $(#[$doc])*);
// operators
impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_bitwise!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0)
);
impl_ops_scalar_bitwise!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0)
);
impl_ops_vector_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_scalar_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_rotates!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_neg!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_int_min_max!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt
);
// reductions
impl_reduction_integer_arithmetic!(
[$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
);
impl_reduction_min_max!(
[$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
);
impl_reduction_bitwise!(
[$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
| (|x|{ x as $elem_ty }) | (!(0 as $elem_ty), 0)
);
// formatting
impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_fmt_lower_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_fmt_upper_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_fmt_octal!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_fmt_binary!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
// conversions, defaults, hashing, slices and bit-level helpers
impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 1));
impl_from_vectors!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),*
);
impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_hash!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_swap_bytes!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_bit_manip!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
// comparisons
impl_cmp_partial_eq!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1)
);
impl_cmp_eq!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1));
impl_cmp_vertical!(
[$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1, 0) | $test_tt
);
impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_cmp_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1));
impl_bitmask!($tuple_id | $ibitmask_ty | (-1, 0) | $test_tt);
// test-only macros
test_select!($elem_ty, $mask_ty, $tuple_id, (1, 2) | $test_tt);
test_cmp_partial_ord_int!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
}
}
// Expands to the API of one vector type by forwarding its arguments to the
// per-feature `impl_*!` / `test_*!` macros invoked below. Those macros are not
// defined in this block; they come from the `#[macro_use]` modules of this file.
//
// Takes the same argument list as `impl_i`:
// `[$elem_ty; $elem_n]: $tuple_id, $mask_ty | $ielem_ty, $ibitmask_ty
//  | $test_tt | $elem_ids,* | From: $from_vec_ty,* | #[$doc]*`.
//
// Differences from `impl_i` visible in this file: `impl_ops_vector_neg!` is
// not invoked, `impl_cmp_partial_eq!` receives `(1, 0)`, and `impl_bitmask!`
// receives `($ielem_ty::max_value(), 0)`.
// NOTE(review): presumably the unsigned-integer variant - confirm.
macro_rules! impl_u {
([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident
| $ielem_ty:ident, $ibitmask_ty:ident | $test_tt:tt | $($elem_ids:ident),*
| From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {
impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
| $($elem_ids),* | $(#[$doc])*);
// operators (no negation here)
impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_bitwise!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0)
);
impl_ops_scalar_bitwise!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (!(0 as $elem_ty), 0)
);
impl_ops_vector_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_scalar_shifts!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_rotates!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_int_min_max!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt
);
// reductions
impl_reduction_integer_arithmetic!(
[$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
);
impl_reduction_min_max!(
[$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
);
impl_reduction_bitwise!(
[$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
| (|x|{ x as $elem_ty }) | (!(0 as $elem_ty), 0)
);
// formatting
impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_fmt_lower_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_fmt_upper_hex!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_fmt_octal!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_fmt_binary!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
// conversions, defaults, hashing, slices and bit-level helpers
impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 1));
impl_from_vectors!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),*
);
impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_hash!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_swap_bytes!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_bit_manip!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
// comparisons
impl_cmp_partial_eq!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (1, 0)
);
impl_cmp_eq!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1));
impl_cmp_vertical!(
[$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1, 0) | $test_tt
);
impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_cmp_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (0, 1));
impl_bitmask!($tuple_id | $ibitmask_ty | ($ielem_ty::max_value(), 0) |
$test_tt);
// test-only macros
test_select!($elem_ty, $mask_ty, $tuple_id, (1, 2) | $test_tt);
test_cmp_partial_ord_int!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
}
}
// Expands to the API of one vector type by forwarding its arguments to the
// per-feature `impl_*!` / `test_*!` macros invoked below. Those macros are not
// defined in this block; they come from the `#[macro_use]` modules of this file.
//
// Arguments:
// - `[$elem_ty; $elem_n]`: the `[T; N]` shape, forwarded to nearly every call.
// - `$tuple_id`: identifier forwarded to every call.
// - `$mask_ty`: forwarded to `impl_float_category!`, `impl_cmp_vertical!` and
//   `test_select!`.
// - `$ielem_ty`: forwarded to `impl_minimal_iuf!` and `impl_reduction_min_max!`.
// - `$test_tt`: token tree forwarded unchanged to the invoked macros.
// - `$elem_ids`: identifier list forwarded to `impl_minimal_iuf!`.
// - `From: $from_vec_ty`: identifier list forwarded to `impl_from_vectors!`.
// - `$doc`: attributes forwarded to `impl_minimal_iuf!`.
//
// Unlike `impl_i` / `impl_u` there is no `$ibitmask_ty` argument, and no
// bitwise, shift, rotate, hash, `Eq`/`Ord` or hex/octal/binary formatting
// macros are invoked; float-specific macros are invoked instead (the test
// values are float literals).
macro_rules! impl_f {
([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident
| $ielem_ty:ident | $test_tt:tt | $($elem_ids:ident),*
| From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {
impl_minimal_iuf!([$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
| $($elem_ids),* | $(#[$doc])*);
// operators
impl_ops_vector_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_scalar_arithmetic!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_neg!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_ops_vector_float_min_max!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt
);
// reductions
impl_reduction_float_arithmetic!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_reduction_min_max!(
[$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
);
// formatting, conversions, defaults, equality and slices
impl_fmt_debug!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_from_array!([$elem_ty; $elem_n]: $tuple_id | $test_tt | (1., 1.));
impl_from_vectors!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),*
);
impl_default!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_cmp_partial_eq!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (1., 0.)
);
impl_slice_from_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_slice_write_to_slice!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
// float-only macros
impl_float_consts!([$elem_ty; $elem_n]: $tuple_id);
impl_float_category!([$elem_ty; $elem_n]: $tuple_id, $mask_ty);
// floating-point math
impl_math_float_abs!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_cos!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_exp!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_ln!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_mul_add!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_mul_adde!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_powf!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_recpre!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_rsqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_sin!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_sqrt!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_sqrte!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_math_float_tanh!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
// vertical comparisons
impl_cmp_vertical!(
[$elem_ty; $elem_n]: $tuple_id, $mask_ty, false, (1., 0.)
| $test_tt
);
// test-only macros
test_select!($elem_ty, $mask_ty, $tuple_id, (1., 2.) | $test_tt);
test_reduction_float_min_max!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt
);
test_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
}
}
// Implements the API of a mask vector type by expanding, in order, the
// per-feature macros listed in the body.
//
// Arguments, in matcher order:
// * `[$elem_ty; $elem_n]`: mask lane type and lane count,
// * `$tuple_id`: name of the mask vector type being implemented,
// * `$ielem_ty`: forwarded to `impl_minimal_mask!` and
//   `impl_reduction_bitwise!`,
// * `$ibitmask_ty`: integer type returned by the `bitmask()` method that
//   `impl_bitmask!` generates,
// * `$test_tt`: token forwarded to every sub-macro that can emit tests,
// * `$elem_ids`: identifier list forwarded to `impl_minimal_mask!`,
// * `From: ...`: vector types forwarded to `impl_from_vectors!`,
// * trailing `#[...]` attributes: forwarded to `impl_minimal_mask!`.
//
// Note that `impl_reduction_bitwise!`, `impl_fmt_debug!` and `impl_default!`
// are given `bool` as the element type rather than `$elem_ty`.
macro_rules! impl_m {
([$elem_ty:ident; $elem_n:expr]: $tuple_id:ident
| $ielem_ty:ident, $ibitmask_ty:ident
| $test_tt:tt | $($elem_ids:ident),* | From: $($from_vec_ty:ident),*
| $(#[$doc:meta])*) => {
// type definition and basic accessors
impl_minimal_mask!(
[$elem_ty; $elem_n]: $tuple_id | $ielem_ty | $test_tt
| $($elem_ids),* | $(#[$doc])*
);
// bitwise operators and reductions; `(true, false)` are the sample lane
// values handed to the sub-macros
impl_ops_vector_mask_bitwise!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false)
);
impl_ops_scalar_mask_bitwise!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false)
);
impl_reduction_bitwise!(
[bool; $elem_n]: $tuple_id | $ielem_ty | $test_tt
| (|x|{ x != 0 }) | (true, false)
);
impl_reduction_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
// formatting, conversions, defaults and equality
impl_fmt_debug!([bool; $elem_n]: $tuple_id | $test_tt);
impl_from_array!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt
| (crate::$elem_ty::new(true), true)
);
impl_from_vectors!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | $($from_vec_ty),*
);
impl_default!([bool; $elem_n]: $tuple_id | $test_tt);
impl_cmp_partial_eq!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false)
);
impl_cmp_eq!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (true, false)
);
// the mask is its own comparison result type: `$tuple_id` is passed both as
// the vector and as the mask argument, with the flag set to `true`
impl_cmp_vertical!(
[$elem_ty; $elem_n]: $tuple_id, $tuple_id, true, (true, false)
| $test_tt
);
// selection, ordering, shuffles and bitmask extraction
impl_select!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_cmp_partial_ord!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_cmp_ord!(
[$elem_ty; $elem_n]: $tuple_id | $test_tt | (false, true)
);
impl_shuffle1_dyn!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
impl_bitmask!($tuple_id | $ibitmask_ty | (true, false) | $test_tt);
// test-only expansions
test_cmp_partial_ord_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
test_shuffle1_dyn_mask!([$elem_ty; $elem_n]: $tuple_id | $test_tt);
}
}
// Implements the API of a vector whose lanes are of the pointer type
// `$elem_ty`, passing the `ref_` token to `impl_minimal_p!` and adding
// read support only.
//
// `$mask_ty`, `$usize_ty` and `$isize_ty` are forwarded to `impl_minimal_p!`
// (`$mask_ty` also to `impl_ptr_read!`). The `From: ...` list is accepted by
// the matcher but is not used anywhere in the expansion.
macro_rules! impl_const_p {
([$elem_ty:ty; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident,
$usize_ty:ident, $isize_ty:ident
| $test_tt:tt | $($elem_ids:ident),*
| From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {
// `(1 as $elem_ty, 0 as $elem_ty)` is a pair of sample lane values built by
// casting integers to the pointer type
impl_minimal_p!(
[$elem_ty; $elem_n]: $tuple_id, $mask_ty, $usize_ty, $isize_ty
| ref_ | $test_tt | $($elem_ids),*
| (1 as $elem_ty, 0 as $elem_ty) | $(#[$doc])*
);
impl_ptr_read!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt);
}
}
// Implements the API of a vector whose lanes are of the pointer type
// `$elem_ty`, passing the `ref_mut_` token to `impl_minimal_p!` and adding
// both read and write support.
//
// `$mask_ty`, `$usize_ty` and `$isize_ty` are forwarded to `impl_minimal_p!`
// (`$mask_ty` also to `impl_ptr_read!` and `impl_ptr_write!`). The
// `From: ...` list is accepted by the matcher but is not used anywhere in
// the expansion.
macro_rules! impl_mut_p {
([$elem_ty:ty; $elem_n:expr]: $tuple_id:ident, $mask_ty:ident,
$usize_ty:ident, $isize_ty:ident
| $test_tt:tt | $($elem_ids:ident),*
| From: $($from_vec_ty:ident),* | $(#[$doc:meta])*) => {
// `(1 as $elem_ty, 0 as $elem_ty)` is a pair of sample lane values built by
// casting integers to the pointer type
impl_minimal_p!(
[$elem_ty; $elem_n]: $tuple_id, $mask_ty, $usize_ty, $isize_ty
| ref_mut_ | $test_tt | $($elem_ids),*
| (1 as $elem_ty, 0 as $elem_ty) | $(#[$doc])*
);
impl_ptr_read!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt);
impl_ptr_write!([$elem_ty; $elem_n]: $tuple_id, $mask_ty | $test_tt);
}
}

View file

@ -0,0 +1,129 @@
//! Bit manipulations.
// Implements the lane-wise bit-counting methods (`count_ones`, `count_zeros`,
// `leading_zeros`, `trailing_zeros`) for the integer vector type `$id` whose
// lanes are of type `$elem_ty`, plus tests gated on `$test_tt`.
//
// `$elem_count` is accepted by the matcher but not used in the expansion.
macro_rules! impl_bit_manip {
    ([$elem_ty:ident; $elem_count:expr]: $id:ident | $test_tt:tt) => {
        impl $id {
            /// Returns the number of ones in the binary representation of
            /// the lanes of `self`.
            #[inline]
            pub fn count_ones(self) -> Self {
                super::codegen::bit_manip::BitManip::ctpop(self)
            }
            /// Returns the number of zeros in the binary representation of
            /// the lanes of `self`.
            #[inline]
            pub fn count_zeros(self) -> Self {
                // zeros of `self` are exactly the ones of `!self`
                super::codegen::bit_manip::BitManip::ctpop(!self)
            }
            /// Returns the number of leading zeros in the binary
            /// representation of the lanes of `self`.
            #[inline]
            pub fn leading_zeros(self) -> Self {
                super::codegen::bit_manip::BitManip::ctlz(self)
            }
            /// Returns the number of trailing zeros in the binary
            /// representation of the lanes of `self`.
            #[inline]
            pub fn trailing_zeros(self) -> Self {
                super::codegen::bit_manip::BitManip::cttz(self)
            }
        }
        test_if! {
            $test_tt:
            paste::item! {
                #[allow(overflowing_literals)]
                pub mod [<$id _bit_manip>] {
                    use super::*;
                    const LANE_WIDTH: usize = mem::size_of::<$elem_ty>() * 8;
                    // Checks the vector method `$func` against the scalar
                    // method of the same name applied lane by lane.
                    macro_rules! test_func {
                        ($x:expr, $func:ident) => {{
                            let mut actual = $x;
                            for i in 0..$id::lanes() {
                                actual = actual.replace(
                                    i,
                                    $x.extract(i).$func() as $elem_ty
                                );
                            }
                            let expected = $x.$func();
                            assert_eq!(actual, expected);
                        }};
                    }
                    const BYTES: [u8; 64] = [
                        0, 1, 2, 3, 4, 5, 6, 7,
                        8, 9, 10, 11, 12, 13, 14, 15,
                        16, 17, 18, 19, 20, 21, 22, 23,
                        24, 25, 26, 27, 28, 29, 30, 31,
                        32, 33, 34, 35, 36, 37, 38, 39,
                        40, 41, 42, 43, 44, 45, 46, 47,
                        48, 49, 50, 51, 52, 53, 54, 55,
                        56, 57, 58, 59, 60, 61, 62, 63,
                    ];
                    // Builds a vector whose bytes are the leading bytes of
                    // `BYTES`.
                    fn load_bytes() -> $id {
                        // A `const` is materialised as a fresh temporary at
                        // every use, so a pointer taken directly from `BYTES`
                        // would not outlive the statement that created it.
                        // Copy the pattern into a local that stays alive for
                        // the whole read instead.
                        let bytes: [u8; 64] = BYTES;
                        assert!(mem::size_of::<$id>() <= bytes.len());
                        // SAFETY: `bytes` is a live local of 64 initialised
                        // bytes, the assertion above keeps the read inside
                        // it, `read_unaligned` places no alignment
                        // requirement on the `u8`-aligned source, and the
                        // lanes are integers, for which every bit pattern is
                        // a value.
                        unsafe {
                            (bytes.as_ptr() as *const $id).read_unaligned()
                        }
                    }
                    #[cfg_attr(not(target_arch = "wasm32"), test)]
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                    fn count_ones() {
                        test_func!($id::splat(0), count_ones);
                        test_func!($id::splat(!0), count_ones);
                        test_func!(load_bytes(), count_ones);
                    }
                    #[cfg_attr(not(target_arch = "wasm32"), test)]
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                    fn count_zeros() {
                        test_func!($id::splat(0), count_zeros);
                        test_func!($id::splat(!0), count_zeros);
                        test_func!(load_bytes(), count_zeros);
                    }
                    #[cfg_attr(not(target_arch = "wasm32"), test)]
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                    fn leading_zeros() {
                        test_func!($id::splat(0), leading_zeros);
                        test_func!($id::splat(1), leading_zeros);
                        // some implementations use `pshufb` which has unique
                        // behavior when the 8th bit is set.
                        test_func!($id::splat(0b1000_0010), leading_zeros);
                        test_func!($id::splat(!0), leading_zeros);
                        test_func!(
                            $id::splat(1 << (LANE_WIDTH - 1)),
                            leading_zeros
                        );
                        test_func!(load_bytes(), leading_zeros);
                    }
                    #[cfg_attr(not(target_arch = "wasm32"), test)]
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                    fn trailing_zeros() {
                        test_func!($id::splat(0), trailing_zeros);
                        test_func!($id::splat(1), trailing_zeros);
                        test_func!($id::splat(0b1000_0010), trailing_zeros);
                        test_func!($id::splat(!0), trailing_zeros);
                        test_func!(
                            $id::splat(1 << (LANE_WIDTH - 1)),
                            trailing_zeros
                        );
                        test_func!(load_bytes(), trailing_zeros);
                    }
                }
            }
        }
    };
}

View file

@ -0,0 +1,79 @@
//! Bitmask API
// Adds a `bitmask()` method returning `$ibitmask_ty` to the vector type
// `$id`, plus a test gated on `$test_tt`. `$set` / `$clear` are lane values
// the test uses for "bit expected" / "bit not expected".
macro_rules! impl_bitmask {
    ($id:ident | $ibitmask_ty:ident | ($set:expr, $clear:expr)
     | $test_tt:tt) => {
        impl $id {
            /// Collects the most significant bit of every vector lane into
            /// an integer bitmask.
            ///
            /// For vectors with fewer than 8 lanes, the bits that have no
            /// corresponding lane are cleared.
            #[inline]
            pub fn bitmask(self) -> $ibitmask_ty {
                unsafe { codegen::llvm::simd_bitmask(self.0) }
            }
        }
        test_if! {
            $test_tt:
            paste::item! {
                #[cfg(not(
                    // FIXME: https://github.com/rust-lang-nursery/packed_simd/issues/210
                    target_endian = "big"
                ))]
                pub mod [<$id _bitmask>] {
                    use super::*;
                    #[cfg_attr(not(target_arch = "wasm32"), test)]
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                    fn bitmask() {
                        let lane_count = $id::lanes();

                        // no lane set: every bit must be zero
                        let none_set = $id::splat($clear as _);
                        let expected: $ibitmask_ty = 0;
                        assert_eq!(none_set.bitmask(), expected);

                        // even lanes set: bit `lane` expected for each of them
                        let mut even_set = $id::splat($clear as _);
                        let mut expected: $ibitmask_ty = 0;
                        for lane in (0..lane_count).step_by(2) {
                            even_set = even_set.replace(lane, $set as _);
                            expected |= 1 << lane;
                        }
                        assert_eq!(even_set.bitmask(), expected);

                        // odd lanes set
                        let mut odd_set = $id::splat($clear as _);
                        let mut expected: $ibitmask_ty = 0;
                        for lane in (1..lane_count).step_by(2) {
                            odd_set = odd_set.replace(lane, $set as _);
                            expected |= 1 << lane;
                        }
                        assert_eq!(odd_set.bitmask(), expected);

                        // every lane set
                        let all_set = $id::splat($set as _);
                        let mut expected: $ibitmask_ty = 0;
                        for lane in 0..lane_count {
                            expected |= 1 << lane;
                        }
                        assert_eq!(all_set.bitmask(), expected);
                    }
                }
            }
        }
    };
}

View file

@ -0,0 +1,108 @@
//! Implementation of `FromCast` and `Cast`.
#![allow(clippy::module_name_repetitions)]
/// Numeric cast from `T` to `Self`.
///
/// > Note: This is a temporary workaround until the conversion traits
/// > specified in [RFC2484] are implemented.
///
/// Numeric cast between vectors with the same number of lanes, such that:
///
/// * casting integer vectors whose lane types have the same size (e.g. `i32xN`
/// -> `u32xN`) is a **no-op**,
///
/// * casting from a larger integer to a smaller integer (e.g. `u32xN` ->
/// `u8xN`) will **truncate**,
///
/// * casting from a smaller integer to a larger integer (e.g. `u8xN` ->
/// `u32xN`) will:
/// * **zero-extend** if the source is unsigned, or
/// * **sign-extend** if the source is signed,
///
/// * casting from a float to an integer will **round the float towards zero**,
///
/// * casting from an integer to float will produce the floating point
/// representation of the integer, **rounding to nearest, ties to even**,
///
/// * casting from an `f32` to an `f64` is perfect and lossless,
///
/// * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.
///
/// [RFC2484]: https://github.com/rust-lang/rfcs/pull/2484
pub trait FromCast<T>: crate::marker::Sized {
/// Numeric cast from `T` to `Self`.
fn from_cast(_: T) -> Self;
}
/// Numeric cast from `Self` to `T`.
///
/// > Note: This is a temporary workaround until the conversion traits
/// > specified in [RFC2484] are implemented.
///
/// Numeric cast between vectors with the same number of lanes, such that:
///
/// * casting integer vectors whose lane types have the same size (e.g. `i32xN`
/// -> `u32xN`) is a **no-op**,
///
/// * casting from a larger integer to a smaller integer (e.g. `u32xN` ->
/// `u8xN`) will **truncate**,
///
/// * casting from a smaller integer to a larger integer (e.g. `u8xN` ->
/// `u32xN`) will:
/// * **zero-extend** if the source is unsigned, or
/// * **sign-extend** if the source is signed,
///
/// * casting from a float to an integer will **round the float towards zero**,
///
/// * casting from an integer to float will produce the floating point
/// representation of the integer, **rounding to nearest, ties to even**,
///
/// * casting from an `f32` to an `f64` is perfect and lossless,
///
/// * casting from an `f64` to an `f32` **rounds to nearest, ties to even**.
///
/// [RFC2484]: https://github.com/rust-lang/rfcs/pull/2484
pub trait Cast<T>: crate::marker::Sized {
/// Numeric cast from `self` to `T`.
fn cast(self) -> T;
}
/// Any `Dst: FromCast<Src>` automatically gives `Src: Cast<Dst>`; `cast`
/// simply forwards to `Dst::from_cast`.
impl<Src, Dst: FromCast<Src>> Cast<Dst> for Src {
    #[inline]
    fn cast(self) -> Dst {
        <Dst as FromCast<Src>>::from_cast(self)
    }
}
/// Casting a type to itself is the identity, which makes `FromCast` (and,
/// through the blanket impl, `Cast`) reflexive.
impl<Same> FromCast<Same> for Same {
    #[inline]
    fn from_cast(value: Same) -> Same {
        value
    }
}
#[macro_use]
mod macros;
mod v16;
pub use self::v16::*;
mod v32;
pub use self::v32::*;
mod v64;
pub use self::v64::*;
mod v128;
pub use self::v128::*;
mod v256;
pub use self::v256::*;
mod v512;
pub use self::v512::*;

View file

@ -0,0 +1,82 @@
//! Macros implementing `FromCast`
// Implements `FromCast<$src> for $dst` as a `simd_cast` of the wrapped
// vector, and (gated on `$test_tt`) a test module checking that both types
// have the same number of lanes.
macro_rules! impl_from_cast_ {
    ($dst:ident[$test_tt:tt]: $src:ident) => {
        impl crate::api::cast::FromCast<$src> for $dst {
            #[inline]
            fn from_cast(value: $src) -> Self {
                debug_assert_eq!($src::lanes(), $dst::lanes());
                Simd(unsafe { crate::llvm::simd_cast(value.0) })
            }
        }
        test_if! {
            $test_tt:
            paste::item! {
                pub mod [<$dst _from_cast_ $src>] {
                    use super::*;
                    #[cfg_attr(not(target_arch = "wasm32"), test)]
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                    fn test() {
                        assert_eq!($dst::lanes(), $src::lanes());
                    }
                }
            }
        }
    };
}
// Expands `impl_from_cast_!` once per listed source type, all with the same
// destination `$dst` and test gate `$test_tt`.
macro_rules! impl_from_cast {
    ($dst:ident[$test_tt:tt]: $($src:ident),*) => {
        $( impl_from_cast_!($dst[$test_tt]: $src); )*
    };
}
// Implements `FromCast<$src> for $dst` where `$dst` is a mask vector: a lane
// of the result is `true` exactly when the source lane differs from the
// corresponding lane of `$src::default()`. The test (gated on `$test_tt`)
// checks the lane counts and that a default source yields no set lane.
macro_rules! impl_from_cast_mask_ {
    ($dst:ident[$test_tt:tt]: $src:ident) => {
        impl crate::api::cast::FromCast<$src> for $dst {
            #[inline]
            fn from_cast(value: $src) -> Self {
                debug_assert_eq!($src::lanes(), $dst::lanes());
                let differs_from_default = value.ne($src::default());
                differs_from_default
                    .select($dst::splat(true), $dst::splat(false))
            }
        }
        test_if! {
            $test_tt:
            paste::item! {
                pub mod [<$dst _from_cast_ $src>] {
                    use super::*;
                    #[cfg_attr(not(target_arch = "wasm32"), test)]
                    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
                    fn test() {
                        assert_eq!($dst::lanes(), $src::lanes());
                        let source = $src::default();
                        let mask: $dst = source.cast();
                        assert!(mask.none());
                    }
                }
            }
        }
    };
}
// Expands `impl_from_cast_mask_!` once per listed source type, all with the
// same mask destination `$dst` and test gate `$test_tt`.
macro_rules! impl_from_cast_mask {
    ($dst:ident[$test_tt:tt]: $($src:ident),*) => {
        $( impl_from_cast_mask_!($dst[$test_tt]: $src); )*
    };
}
// Mirror image of `impl_from_cast!`: `$src` is the single source type and
// every listed `$dst` gets a `FromCast<$src>` impl via `impl_from_cast_!`.
// The `allow` keeps the macro from being reported when nothing invokes it.
#[allow(unused)]
macro_rules! impl_into_cast {
    ($src:ident[$test_tt:tt]: $($dst:ident),*) => {
        $( impl_from_cast_!($dst[$test_tt]: $src); )*
    };
}

View file

@ -0,0 +1,302 @@
//! `FromCast` and `Cast` implementations for portable 128-bit wide vectors
#[rustfmt::skip]
use crate::*;
// Casts into the 8-bit-lane `...x16` vectors from the other `...x16` types.
// Each list omits its own destination: the blanket `impl<T> FromCast<T> for T`
// already covers the identity cast. The mask destination goes through
// `impl_from_cast_mask!`, which sets a lane to `true` when the source lane
// differs from the source type's default.
impl_from_cast!(i8x16[test_v128]: u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16);
impl_from_cast!(u8x16[test_v128]: i8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16);
impl_from_cast_mask!(m8x16[test_v128]: i8x16, u8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16);
// Casts into the 16-bit-lane `...x8` vectors (`i16x8`, `u16x8`, `m16x8`) from
// the other `...x8` types; each list omits its own destination.
impl_from_cast!(
i16x8[test_v128]: i8x8,
u8x8,
m8x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
impl_from_cast!(
u16x8[test_v128]: i8x8,
u8x8,
m8x8,
i16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m16x8[test_v128]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
// Casts into the 32-bit-lane `...x4` vectors (`i32x4`, `u32x4`, `f32x4`,
// `m32x4`) from the other `...x4` types; each list omits its own destination.
impl_from_cast!(
i32x4[test_v128]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
impl_from_cast!(
u32x4[test_v128]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
impl_from_cast!(
f32x4[test_v128]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m32x4[test_v128]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
// Casts into the 64-bit-lane `...x2` vectors (`i64x2`, `u64x2`, `f64x2`,
// `m64x2`) from the other `...x2` types; each list omits its own destination.
impl_from_cast!(
i64x2[test_v128]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
impl_from_cast!(
u64x2[test_v128]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
impl_from_cast!(
f64x2[test_v128]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m64x2[test_v128]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
// Casts into the pointer-sized-lane `...sizex2` vectors (`isizex2`,
// `usizex2`, `msizex2`) from the other `...x2` types; each list omits its own
// destination.
impl_from_cast!(
isizex2[test_v128]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
usizex2,
msizex2
);
impl_from_cast!(
usizex2[test_v128]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
msizex2
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
msizex2[test_v128]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2
);
// FIXME[test_v128]: 64-bit single element vectors into_cast impls
impl_from_cast!(i128x1[test_v128]: u128x1, m128x1);
impl_from_cast!(u128x1[test_v128]: i128x1, m128x1);
impl_from_cast!(m128x1[test_v128]: i128x1, u128x1);

View file

@ -0,0 +1,68 @@
//! `FromCast` and `Cast` implementations for portable 16-bit wide vectors
#[rustfmt::skip]
use crate::*;
// Casts into the 8-bit-lane `...x2` vectors (`i8x2`, `u8x2`, `m8x2`) from the
// other `...x2` types. Each list omits its own destination: the blanket
// `impl<T> FromCast<T> for T` already covers the identity cast.
impl_from_cast!(
i8x2[test_v16]: u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
impl_from_cast!(
u8x2[test_v16]: i8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m8x2[test_v16]: i8x2,
u8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);

View file

@ -0,0 +1,298 @@
//! `FromCast` and `Cast` implementations for portable 256-bit wide vectors
#[rustfmt::skip]
use crate::*;
// Casts into the 8-bit-lane `...x32` vectors from the other `...x32` types.
// Each list omits its own destination: the blanket `impl<T> FromCast<T> for T`
// already covers the identity cast. Mask destinations go through
// `impl_from_cast_mask!`, which sets a lane to `true` when the source lane
// differs from the source type's default.
impl_from_cast!(i8x32[test_v256]: u8x32, m8x32, i16x32, u16x32, m16x32);
impl_from_cast!(u8x32[test_v256]: i8x32, m8x32, i16x32, u16x32, m16x32);
impl_from_cast_mask!(m8x32[test_v256]: i8x32, u8x32, i16x32, u16x32, m16x32);
// Casts into the 16-bit-lane `...x16` vectors from the other `...x16` types.
impl_from_cast!(i16x16[test_v256]: i8x16, u8x16, m8x16, u16x16, m16x16, i32x16, u32x16, f32x16, m32x16);
impl_from_cast!(u16x16[test_v256]: i8x16, u8x16, m8x16, i16x16, m16x16, i32x16, u32x16, f32x16, m32x16);
impl_from_cast_mask!(m16x16[test_v256]: i8x16, u8x16, m8x16, i16x16, u16x16, i32x16, u32x16, f32x16, m32x16);
// Casts into the 32-bit-lane `...x8` vectors (`i32x8`, `u32x8`, `f32x8`,
// `m32x8`) from the other `...x8` types; each list omits its own destination.
impl_from_cast!(
i32x8[test_v256]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
impl_from_cast!(
u32x8[test_v256]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
impl_from_cast!(
f32x8[test_v256]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m32x8[test_v256]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
// Casts into the 64-bit-lane `...x4` vectors (`i64x4`, `u64x4`, `f64x4`,
// `m64x4`) from the other `...x4` types; each list omits its own destination.
impl_from_cast!(
i64x4[test_v256]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
impl_from_cast!(
u64x4[test_v256]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
impl_from_cast!(
f64x4[test_v256]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m64x4[test_v256]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
// Casts into the 128-bit-lane `...x2` vectors (`i128x2`, `u128x2`, `m128x2`)
// from the other `...x2` types; each list omits its own destination.
impl_from_cast!(
i128x2[test_v256]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
impl_from_cast!(
u128x2[test_v256]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m128x2[test_v256]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
m64x2,
f64x2,
i128x2,
u128x2,
isizex2,
usizex2,
msizex2
);
// Casts into the pointer-sized-lane `...sizex4` vectors (`isizex4`,
// `usizex4`, `msizex4`) from the other `...x4` types; each list omits its own
// destination.
impl_from_cast!(
isizex4[test_v256]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
usizex4,
msizex4
);
impl_from_cast!(
usizex4[test_v256]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
msizex4
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
msizex4[test_v256]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4
);

View file

@ -0,0 +1,132 @@
//! `FromCast` and `Cast` implementations for portable 32-bit wide vectors
#[rustfmt::skip]
use crate::*;
// Casts into the 8-bit-lane `...x4` vectors (`i8x4`, `u8x4`, `m8x4`) from the
// other `...x4` types. Each list omits its own destination: the blanket
// `impl<T> FromCast<T> for T` already covers the identity cast.
impl_from_cast!(
i8x4[test_v32]: u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
impl_from_cast!(
u8x4[test_v32]: i8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m8x4[test_v32]: i8x4,
u8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
// Casts into the 16-bit-lane `...x2` vectors (`i16x2`, `u16x2`, `m16x2`) from
// the other `...x2` types; each list omits its own destination.
impl_from_cast!(
i16x2[test_v32]: i8x2,
u8x2,
m8x2,
u16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
impl_from_cast!(
u16x2[test_v32]: i8x2,
u8x2,
m8x2,
i16x2,
m16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m16x2[test_v32]: i8x2,
u8x2,
m8x2,
i16x2,
u16x2,
i32x2,
u32x2,
f32x2,
m32x2,
i64x2,
u64x2,
f64x2,
m64x2,
i128x2,
u128x2,
m128x2,
isizex2,
usizex2,
msizex2
);

View file

@ -0,0 +1,209 @@
//! `FromCast` and `Cast` implementations for portable 512-bit wide vectors
#[rustfmt::skip]
use crate::*;
// Casts into the 8-bit-lane `...x64` vectors from the other `...x64` types.
// Each list omits its own destination: the blanket `impl<T> FromCast<T> for T`
// already covers the identity cast. Mask destinations go through
// `impl_from_cast_mask!`, which sets a lane to `true` when the source lane
// differs from the source type's default.
impl_from_cast!(i8x64[test_v512]: u8x64, m8x64);
impl_from_cast!(u8x64[test_v512]: i8x64, m8x64);
impl_from_cast_mask!(m8x64[test_v512]: i8x64, u8x64);
// Casts into the 16-bit-lane `...x32` vectors from the other `...x32` types.
impl_from_cast!(i16x32[test_v512]: i8x32, u8x32, m8x32, u16x32, m16x32);
impl_from_cast!(u16x32[test_v512]: i8x32, u8x32, m8x32, i16x32, m16x32);
impl_from_cast_mask!(m16x32[test_v512]: i8x32, u8x32, m8x32, i16x32, u16x32);
// Casts into the 32-bit-lane `...x16` vectors from the other `...x16` types.
impl_from_cast!(i32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, u32x16, f32x16, m32x16);
impl_from_cast!(u32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, f32x16, m32x16);
impl_from_cast!(f32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, m32x16);
impl_from_cast_mask!(m32x16[test_v512]: i8x16, u8x16, m8x16, i16x16, u16x16, m16x16, i32x16, u32x16, f32x16);
// Casts into the 64-bit-lane `...x8` vectors (`i64x8`, `u64x8`, `f64x8`,
// `m64x8`) from the other `...x8` types; each list omits its own destination.
impl_from_cast!(
i64x8[test_v512]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
impl_from_cast!(
u64x8[test_v512]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
f64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
impl_from_cast!(
f64x8[test_v512]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
m64x8,
isizex8,
usizex8,
msizex8
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m64x8[test_v512]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
isizex8,
usizex8,
msizex8
);
// Casts into the 128-bit-lane `...x4` vectors (`i128x4`, `u128x4`, `m128x4`)
// from the other `...x4` types; each list omits its own destination.
impl_from_cast!(
i128x4[test_v512]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
u128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
impl_from_cast!(
u128x4[test_v512]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
f64x4,
m64x4,
i128x4,
m128x4,
isizex4,
usizex4,
msizex4
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
m128x4[test_v512]: i8x4,
u8x4,
m8x4,
i16x4,
u16x4,
m16x4,
i32x4,
u32x4,
f32x4,
m32x4,
i64x4,
u64x4,
m64x4,
f64x4,
i128x4,
u128x4,
isizex4,
usizex4,
msizex4
);
// Casts into the pointer-sized-lane `...sizex8` vectors (`isizex8`,
// `usizex8`, `msizex8`) from the other `...x8` types; each list omits its own
// destination.
impl_from_cast!(
isizex8[test_v512]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
usizex8,
msizex8
);
impl_from_cast!(
usizex8[test_v512]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
msizex8
);
// mask destination: lanes become `true` where the source differs from its
// default value
impl_from_cast_mask!(
msizex8[test_v512]: i8x8,
u8x8,
m8x8,
i16x8,
u16x8,
m16x8,
i32x8,
u32x8,
f32x8,
m32x8,
i64x8,
u64x8,
f64x8,
m64x8,
isizex8,
usizex8
);

Some files were not shown because too many files have changed in this diff Show more