forked from mirrors/gecko-dev
		
	Bug 1757483 - Update libjxl and highway r=tnikkel
Differential Revision: https://phabricator.services.mozilla.com/D139919
This commit is contained in:
		
							parent
							
								
									baf0a36e00
								
							
						
					
					
						commit
						2765398421
					
				
					 142 changed files with 13857 additions and 6858 deletions
				
			
		|  | @ -22,6 +22,7 @@ EXPORTS.hwy += [ | |||
|     "/third_party/highway/hwy/detect_targets.h", | ||||
|     "/third_party/highway/hwy/foreach_target.h", | ||||
|     "/third_party/highway/hwy/highway.h", | ||||
|     "/third_party/highway/hwy/highway_export.h", | ||||
|     "/third_party/highway/hwy/targets.h", | ||||
| ] | ||||
| 
 | ||||
|  |  | |||
|  | @ -20,11 +20,11 @@ origin: | |||
| 
 | ||||
|   # Human-readable identifier for this version/release | ||||
|   # Generally "version NNN", "tag SSS", "bookmark SSS" | ||||
|   release: commit e69083a12a05caf037cabecdf1b248b7579705a5 (2021-11-11T08:20:00Z). | ||||
|   release: commit f13e3b956eb226561ac79427893ec0afd66f91a8 (2022-02-15T18:19:21Z). | ||||
| 
 | ||||
|   # Revision to pull in | ||||
|   # Must be a long or short commit SHA (long preferred) | ||||
|   revision: e69083a12a05caf037cabecdf1b248b7579705a5 | ||||
|   revision: f13e3b956eb226561ac79427893ec0afd66f91a8 | ||||
| 
 | ||||
|   # The package's license, where possible using the mnemonic from | ||||
|   # https://spdx.org/licenses/ | ||||
|  |  | |||
|  | @ -10,9 +10,9 @@ origin: | |||
| 
 | ||||
|   url: https://github.com/libjxl/libjxl | ||||
| 
 | ||||
|   release: commit 4322679b1c418addc2284c5ea84fc2c3935b4a75 (2022-02-07T20:56:39Z). | ||||
|   release: commit 89875cba4d18485ec9692c80b747b59b73ce712e (2022-02-28T16:03:42Z). | ||||
| 
 | ||||
|   revision: 4322679b1c418addc2284c5ea84fc2c3935b4a75 | ||||
|   revision: 89875cba4d18485ec9692c80b747b59b73ce712e | ||||
| 
 | ||||
|   license: Apache-2.0 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										57
									
								
								third_party/highway/.github/workflows/build_test.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										57
									
								
								third_party/highway/.github/workflows/build_test.yml
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,57 @@ | |||
| # Copyright 2021 Google LLC | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #      http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| 
 | ||||
| name: Build / test | ||||
| on: [push, pull_request] | ||||
| jobs: | ||||
|   cmake: | ||||
|     name: Build and test ${{ matrix.name }} | ||||
|     runs-on: ubuntu-18.04 | ||||
|     strategy: | ||||
|       matrix: | ||||
|         include: | ||||
|           - name: Clang-5.0 | ||||
|             extra_deps: clang-5.0 | ||||
|             c_compiler: clang-5.0 | ||||
|             cxx_compiler: clang++-5.0 | ||||
| 
 | ||||
|           - name: Clang-6.0 | ||||
|             extra_deps: clang-6.0 | ||||
|             c_compiler: clang-6.0 | ||||
|             cxx_compiler: clang++-6.0 | ||||
| 
 | ||||
|     steps: | ||||
|       - uses: actions/checkout@v2 | ||||
| 
 | ||||
|       - name: Install deps | ||||
|         run: sudo apt-get install ${{ matrix.extra_deps }} | ||||
| 
 | ||||
|       - name: Build and test | ||||
|         run: | | ||||
|           export CMAKE_BUILD_PARALLEL_LEVEL=2 | ||||
|           export CTEST_PARALLEL_LEVEL=2 | ||||
|           CXXFLAGS=-Werror CC=${{ matrix.c_compiler }} CXX=${{ matrix.cxx_compiler }} cmake -B out . | ||||
|           cmake --build out | ||||
|           ctest --test-dir out | ||||
| 
 | ||||
|   bazel: | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - uses: actions/checkout@v2 | ||||
|       - uses: bazelbuild/setup-bazelisk@v1 | ||||
|       - uses: actions/cache@v2 | ||||
|         with: | ||||
|           path: ~/.cache/bazel | ||||
|           key: bazel-${{ runner.os }} | ||||
|       - run: bazel build //... | ||||
							
								
								
									
										106
									
								
								third_party/highway/BUILD
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										106
									
								
								third_party/highway/BUILD
									
									
									
									
										vendored
									
									
								
							|  | @ -1,6 +1,6 @@ | |||
| load("@bazel_skylib//lib:selects.bzl", "selects") | ||||
| load("@rules_cc//cc:defs.bzl", "cc_test") | ||||
| 
 | ||||
| load("@rules_cc//cc:defs.bzl", "cc_test") | ||||
| package(default_visibility = ["//visibility:public"]) | ||||
| 
 | ||||
| licenses(["notice"]) | ||||
|  | @ -18,6 +18,11 @@ config_setting( | |||
|     flag_values = {"@bazel_tools//tools/cpp:compiler": "msvc"}, | ||||
| ) | ||||
| 
 | ||||
| config_setting( | ||||
|     name = "compiler_emscripten", | ||||
|     values = {"cpu": "wasm32"}, | ||||
| ) | ||||
| 
 | ||||
| # See https://github.com/bazelbuild/bazel/issues/12707 | ||||
| config_setting( | ||||
|     name = "compiler_gcc_bug", | ||||
|  | @ -41,13 +46,6 @@ selects.config_setting_group( | |||
|     ], | ||||
| ) | ||||
| 
 | ||||
| config_setting( | ||||
|     name = "emulate_sve", | ||||
|     values = { | ||||
|         "copt": "-DHWY_EMULATE_SVE", | ||||
|     }, | ||||
| ) | ||||
| 
 | ||||
| # Additional warnings for Clang OR GCC (skip for MSVC) | ||||
| CLANG_GCC_COPTS = [ | ||||
|     "-Wunused-parameter", | ||||
|  | @ -82,7 +80,7 @@ COPTS = select({ | |||
|     "//conditions:default": CLANG_GCC_COPTS + CLANG_ONLY_COPTS, | ||||
| }) + select({ | ||||
|     "@platforms//cpu:riscv64": [ | ||||
|         "-march=rv64gcv0p10", | ||||
|         "-march=rv64gcv1p0", | ||||
|         "-menable-experimental-extensions", | ||||
|     ], | ||||
|     "//conditions:default": [ | ||||
|  | @ -112,28 +110,32 @@ cc_library( | |||
|         "hwy/base.h", | ||||
|         "hwy/cache_control.h", | ||||
|         "hwy/detect_compiler_arch.h",  # private | ||||
|         "hwy/detect_targets.h",  # private | ||||
|         "hwy/targets.h", | ||||
|         "hwy/highway_export.h", | ||||
|     ], | ||||
|     compatible_with = [], | ||||
|     copts = COPTS, | ||||
|     textual_hdrs = [ | ||||
|         # These are textual because config macros influence them: | ||||
|         "hwy/detect_targets.h",  # private | ||||
|         "hwy/targets.h", | ||||
|         # End of list | ||||
|         "hwy/highway.h",  # public | ||||
|         "hwy/foreach_target.h",  # public | ||||
|         "hwy/ops/arm_neon-inl.h", | ||||
|         "hwy/ops/arm_sve-inl.h", | ||||
|         "hwy/ops/generic_ops-inl.h", | ||||
|         "hwy/ops/rvv-inl.h", | ||||
|         "hwy/ops/scalar-inl.h", | ||||
|         "hwy/ops/set_macros-inl.h", | ||||
|         "hwy/ops/shared-inl.h", | ||||
|         "hwy/ops/wasm_128-inl.h", | ||||
|         "hwy/ops/x86_128-inl.h", | ||||
|         "hwy/ops/x86_256-inl.h", | ||||
|         "hwy/ops/x86_512-inl.h", | ||||
|     ], | ||||
|     deps = select({ | ||||
|         ":emulate_sve": ["//third_party/farm_sve"], | ||||
|         # Select avoids recompiling native arch if only non-native changed | ||||
|     ] + select({ | ||||
|         ":compiler_emscripten": ["hwy/ops/wasm_128-inl.h"], | ||||
|         "//conditions:default": [], | ||||
|     }) + select({ | ||||
|         "@platforms//cpu:riscv64": ["hwy/ops/rvv-inl.h"], | ||||
|         "//conditions:default": [], | ||||
|     }), | ||||
| ) | ||||
|  | @ -144,7 +146,9 @@ cc_library( | |||
|     textual_hdrs = [ | ||||
|         "hwy/contrib/dot/dot-inl.h", | ||||
|     ], | ||||
|     deps = [":hwy"], | ||||
|     deps = [ | ||||
|         ":hwy", | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| cc_library( | ||||
|  | @ -156,7 +160,9 @@ cc_library( | |||
|         "hwy/contrib/image/image.h", | ||||
|     ], | ||||
|     compatible_with = [], | ||||
|     deps = [":hwy"], | ||||
|     deps = [ | ||||
|         ":hwy", | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| cc_library( | ||||
|  | @ -165,16 +171,9 @@ cc_library( | |||
|     textual_hdrs = [ | ||||
|         "hwy/contrib/math/math-inl.h", | ||||
|     ], | ||||
|     deps = [":hwy"], | ||||
| ) | ||||
| 
 | ||||
| cc_library( | ||||
|     name = "sort", | ||||
|     compatible_with = [], | ||||
|     textual_hdrs = [ | ||||
|         "hwy/contrib/sort/sort-inl.h", | ||||
|     deps = [ | ||||
|         ":hwy", | ||||
|     ], | ||||
|     deps = [":hwy"], | ||||
| ) | ||||
| 
 | ||||
| # Everything required for tests that use Highway. | ||||
|  | @ -188,7 +187,9 @@ cc_library( | |||
|     ], | ||||
|     # Must not depend on a gtest variant, which can conflict with the | ||||
|     # GUNIT_INTERNAL_BUILD_MODE defined by the test. | ||||
|     deps = [":hwy"], | ||||
|     deps = [ | ||||
|         ":hwy", | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| cc_library( | ||||
|  | @ -212,7 +213,9 @@ cc_library( | |||
|     srcs = ["hwy/examples/skeleton.cc"], | ||||
|     hdrs = ["hwy/examples/skeleton.h"], | ||||
|     textual_hdrs = ["hwy/examples/skeleton-inl.h"], | ||||
|     deps = [":hwy"], | ||||
|     deps = [ | ||||
|         ":hwy", | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| cc_binary( | ||||
|  | @ -226,7 +229,7 @@ HWY_TESTS = [ | |||
|     ("hwy/contrib/dot/", "dot_test"), | ||||
|     ("hwy/contrib/image/", "image_test"), | ||||
|     ("hwy/contrib/math/", "math_test"), | ||||
|     ("hwy/contrib/sort/", "sort_test"), | ||||
|     # contrib/sort has its own BUILD; we add it to GUITAR_TESTS. | ||||
|     ("hwy/examples/", "skeleton_test"), | ||||
|     ("hwy/", "nanobenchmark_test"), | ||||
|     ("hwy/", "aligned_allocator_test"), | ||||
|  | @ -239,13 +242,27 @@ HWY_TESTS = [ | |||
|     ("hwy/tests/", "compare_test"), | ||||
|     ("hwy/tests/", "convert_test"), | ||||
|     ("hwy/tests/", "crypto_test"), | ||||
|     ("hwy/tests/", "demote_test"), | ||||
|     ("hwy/tests/", "logical_test"), | ||||
|     ("hwy/tests/", "mask_test"), | ||||
|     ("hwy/tests/", "memory_test"), | ||||
|     ("hwy/tests/", "shift_test"), | ||||
|     ("hwy/tests/", "swizzle_test"), | ||||
|     ("hwy/tests/", "test_util_test"), | ||||
| ] | ||||
| 
 | ||||
| HWY_TEST_DEPS = [ | ||||
|     ":dot", | ||||
|     ":hwy", | ||||
|     ":hwy_test_util", | ||||
|     ":image", | ||||
|     ":math", | ||||
|     ":nanobenchmark", | ||||
|     ":skeleton", | ||||
|     "//hwy/contrib/sort:vqsort", | ||||
|     "@com_google_googletest//:gtest_main", | ||||
| ] | ||||
| 
 | ||||
| [ | ||||
|     [ | ||||
|         cc_test( | ||||
|  | @ -265,6 +282,18 @@ HWY_TESTS = [ | |||
|                 "@platforms//cpu:riscv64": ["fully_static_link"], | ||||
|                 "//conditions:default": [], | ||||
|             }), | ||||
|             linkopts = select({ | ||||
|                 ":compiler_emscripten": [ | ||||
|                     "-s ASSERTIONS=2", | ||||
|                     "-s ENVIRONMENT=node,shell,web", | ||||
|                     "-s ERROR_ON_UNDEFINED_SYMBOLS=1", | ||||
|                     "-s DEMANGLE_SUPPORT=1", | ||||
|                     "-s EXIT_RUNTIME=1", | ||||
|                     "-s ALLOW_MEMORY_GROWTH=1", | ||||
|                     "--pre-js $(location :preamble.js.lds)", | ||||
|                 ], | ||||
|                 "//conditions:default": [], | ||||
|             }), | ||||
|             linkstatic = select({ | ||||
|                 "@platforms//cpu:riscv64": True, | ||||
|                 "//conditions:default": False, | ||||
|  | @ -272,17 +301,10 @@ HWY_TESTS = [ | |||
|             local_defines = ["HWY_IS_TEST"], | ||||
|             # for test_suite. | ||||
|             tags = ["hwy_ops_test"], | ||||
|             deps = [ | ||||
|                 ":dot", | ||||
|                 ":hwy", | ||||
|                 ":hwy_test_util", | ||||
|                 ":image", | ||||
|                 ":math", | ||||
|                 ":nanobenchmark", | ||||
|                 ":skeleton", | ||||
|                 ":sort", | ||||
|                 "@com_google_googletest//:gtest_main", | ||||
|             ], | ||||
|             deps = HWY_TEST_DEPS + select({ | ||||
|                 ":compiler_emscripten": [":preamble.js.lds"], | ||||
|                 "//conditions:default": [], | ||||
|             }), | ||||
|         ), | ||||
|     ] | ||||
|     for subdir, test in HWY_TESTS | ||||
|  | @ -293,3 +315,5 @@ test_suite( | |||
|     name = "hwy_ops_tests", | ||||
|     tags = ["hwy_ops_test"], | ||||
| ) | ||||
| 
 | ||||
| # Placeholder for integration test, do not remove | ||||
|  |  | |||
							
								
								
									
										113
									
								
								third_party/highway/CMakeLists.txt
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										113
									
								
								third_party/highway/CMakeLists.txt
									
									
									
									
										vendored
									
									
								
							|  | @ -19,11 +19,13 @@ if(POLICY CMP0083) | |||
|   cmake_policy(SET CMP0083 NEW) | ||||
| endif() | ||||
| 
 | ||||
| project(hwy VERSION 0.15.0)  # Keep in sync with highway.h version | ||||
| project(hwy VERSION 0.16.0)  # Keep in sync with highway.h version | ||||
| 
 | ||||
| # Directly define the ABI version from the cmake project() version values: | ||||
| set(LIBRARY_VERSION "${hwy_VERSION}") | ||||
| set(LIBRARY_SOVERSION ${hwy_VERSION_MAJOR}) | ||||
| 
 | ||||
| set(CMAKE_CXX_STANDARD 11) | ||||
| set(CMAKE_CXX_EXTENSIONS OFF) | ||||
| set(CMAKE_CXX_STANDARD_REQUIRED YES) | ||||
| 
 | ||||
| # Enable PIE binaries by default if supported. | ||||
| include(CheckPIESupported OPTIONAL RESULT_VARIABLE CHECK_PIE_SUPPORTED) | ||||
|  | @ -40,13 +42,14 @@ if (NOT CMAKE_BUILD_TYPE) | |||
|   set(CMAKE_BUILD_TYPE RelWithDebInfo) | ||||
| endif() | ||||
| 
 | ||||
| set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON?") | ||||
| set(HWY_CMAKE_ARM7 OFF CACHE BOOL "Set copts for ARMv7 with NEON (requires vfpv4)?") | ||||
| 
 | ||||
| # Unconditionally adding -Werror risks breaking the build when new warnings | ||||
| # arise due to compiler/platform changes. Enable this in CI/tests. | ||||
| set(HWY_WARNINGS_ARE_ERRORS OFF CACHE BOOL "Add -Werror flag?") | ||||
| 
 | ||||
| set(HWY_EXAMPLES_TESTS_INSTALL ON CACHE BOOL "Build examples, tests, install?") | ||||
| set(HWY_ENABLE_EXAMPLES ON CACHE BOOL "Build examples") | ||||
| set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install library") | ||||
| 
 | ||||
| include(CheckCXXSourceCompiles) | ||||
| check_cxx_source_compiles( | ||||
|  | @ -64,7 +67,32 @@ set(HWY_CONTRIB_SOURCES | |||
|     hwy/contrib/image/image.cc | ||||
|     hwy/contrib/image/image.h | ||||
|     hwy/contrib/math/math-inl.h | ||||
|     hwy/contrib/sort/sort-inl.h | ||||
|     hwy/contrib/sort/disabled_targets.h | ||||
|     hwy/contrib/sort/shared-inl.h | ||||
|     hwy/contrib/sort/sorting_networks-inl.h | ||||
|     hwy/contrib/sort/traits-inl.h | ||||
|     hwy/contrib/sort/traits128-inl.h | ||||
|     hwy/contrib/sort/vqsort-inl.h | ||||
|     hwy/contrib/sort/vqsort.cc | ||||
|     hwy/contrib/sort/vqsort.h | ||||
|     hwy/contrib/sort/vqsort_128a.cc | ||||
|     hwy/contrib/sort/vqsort_128d.cc | ||||
|     hwy/contrib/sort/vqsort_f32a.cc | ||||
|     hwy/contrib/sort/vqsort_f32d.cc | ||||
|     hwy/contrib/sort/vqsort_f64a.cc | ||||
|     hwy/contrib/sort/vqsort_f64d.cc | ||||
|     hwy/contrib/sort/vqsort_i16a.cc | ||||
|     hwy/contrib/sort/vqsort_i16d.cc | ||||
|     hwy/contrib/sort/vqsort_i32a.cc | ||||
|     hwy/contrib/sort/vqsort_i32d.cc | ||||
|     hwy/contrib/sort/vqsort_i64a.cc | ||||
|     hwy/contrib/sort/vqsort_i64d.cc | ||||
|     hwy/contrib/sort/vqsort_u16a.cc | ||||
|     hwy/contrib/sort/vqsort_u16d.cc | ||||
|     hwy/contrib/sort/vqsort_u32a.cc | ||||
|     hwy/contrib/sort/vqsort_u32d.cc | ||||
|     hwy/contrib/sort/vqsort_u64a.cc | ||||
|     hwy/contrib/sort/vqsort_u64d.cc | ||||
| ) | ||||
| 
 | ||||
| set(HWY_SOURCES | ||||
|  | @ -76,6 +104,7 @@ set(HWY_SOURCES | |||
|     hwy/detect_targets.h  # private | ||||
|     hwy/foreach_target.h | ||||
|     hwy/highway.h | ||||
|     hwy/highway_export.h | ||||
|     hwy/nanobenchmark.cc | ||||
|     hwy/nanobenchmark.h | ||||
|     hwy/ops/arm_neon-inl.h | ||||
|  | @ -192,20 +221,59 @@ else() | |||
| 
 | ||||
| endif()  # !MSVC | ||||
| 
 | ||||
| add_library(hwy STATIC ${HWY_SOURCES}) | ||||
| # By default prefer STATIC build (legacy behavior) | ||||
| option(BUILD_SHARED_LIBS "Build shared libraries" OFF) | ||||
| option(HWY_FORCE_STATIC_LIBS "Ignore BUILD_SHARED_LIBS" OFF) | ||||
| # only expose shared/static options to advanced users: | ||||
| mark_as_advanced(BUILD_SHARED_LIBS) | ||||
| mark_as_advanced(HWY_FORCE_STATIC_LIBS) | ||||
| # Define visibility settings globally: | ||||
| set(CMAKE_CXX_VISIBILITY_PRESET hidden) | ||||
| set(CMAKE_VISIBILITY_INLINES_HIDDEN 1) | ||||
| 
 | ||||
| # Mimic the "add_library" logic and add an override. | ||||
| set(HWY_LIBRARY_TYPE "SHARED") | ||||
| if (NOT BUILD_SHARED_LIBS OR HWY_FORCE_STATIC_LIBS) | ||||
|   set(HWY_LIBRARY_TYPE "STATIC") | ||||
| endif() | ||||
| 
 | ||||
| # This preprocessor define will drive the build, also used in the *.pc files: | ||||
| if("${HWY_LIBRARY_TYPE}" STREQUAL "SHARED") | ||||
|   set(DLLEXPORT_TO_DEFINE "HWY_SHARED_DEFINE") | ||||
| else() | ||||
|   set(DLLEXPORT_TO_DEFINE "HWY_STATIC_DEFINE") | ||||
| endif() | ||||
| 
 | ||||
| add_library(hwy ${HWY_LIBRARY_TYPE} ${HWY_SOURCES}) | ||||
| target_compile_definitions(hwy PUBLIC "${DLLEXPORT_TO_DEFINE}") | ||||
| target_compile_options(hwy PRIVATE ${HWY_FLAGS}) | ||||
| set_property(TARGET hwy PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
| set_target_properties(hwy PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) | ||||
| target_include_directories(hwy PUBLIC ${CMAKE_CURRENT_LIST_DIR}) | ||||
| target_compile_features(hwy PUBLIC cxx_std_11) | ||||
| set_target_properties(hwy PROPERTIES | ||||
|   LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version) | ||||
| # not supported by MSVC/Clang, safe to skip (we use DLLEXPORT annotations) | ||||
| if(UNIX AND NOT APPLE) | ||||
|   set_property(TARGET hwy APPEND_STRING PROPERTY | ||||
|     LINK_FLAGS " -Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/hwy/hwy.version") | ||||
| endif() | ||||
| 
 | ||||
| add_library(hwy_contrib STATIC ${HWY_CONTRIB_SOURCES}) | ||||
| add_library(hwy_contrib ${HWY_LIBRARY_TYPE} ${HWY_CONTRIB_SOURCES}) | ||||
| target_link_libraries(hwy_contrib hwy) | ||||
| target_compile_options(hwy_contrib PRIVATE ${HWY_FLAGS}) | ||||
| set_property(TARGET hwy_contrib PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
| set_target_properties(hwy_contrib PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) | ||||
| target_include_directories(hwy_contrib PUBLIC ${CMAKE_CURRENT_LIST_DIR}) | ||||
| target_compile_features(hwy_contrib PUBLIC cxx_std_11) | ||||
| 
 | ||||
| add_library(hwy_test STATIC ${HWY_TEST_SOURCES}) | ||||
| add_library(hwy_test ${HWY_LIBRARY_TYPE} ${HWY_TEST_SOURCES}) | ||||
| target_link_libraries(hwy_test hwy) | ||||
| target_compile_options(hwy_test PRIVATE ${HWY_FLAGS}) | ||||
| set_property(TARGET hwy_test PROPERTY POSITION_INDEPENDENT_CODE ON) | ||||
| set_target_properties(hwy_test PROPERTIES VERSION ${LIBRARY_VERSION} SOVERSION ${LIBRARY_SOVERSION}) | ||||
| target_include_directories(hwy_test PUBLIC ${CMAKE_CURRENT_LIST_DIR}) | ||||
| target_compile_features(hwy_test PUBLIC cxx_std_11) | ||||
| 
 | ||||
| # -------------------------------------------------------- hwy_list_targets | ||||
| # Generate a tool to print the compiled-in targets as defined by the current | ||||
|  | @ -219,17 +287,22 @@ target_include_directories(hwy_list_targets PRIVATE | |||
| # A bare target name also cannot always be run (due to the lack of a '.\' prefix). | ||||
| # Thus the effective command to run should contain the full path | ||||
| # and the emulator prefix (if any). | ||||
| if (NOT CMAKE_CROSSCOMPILING OR CMAKE_CROSSCOMPILING_EMULATOR) | ||||
| add_custom_command(TARGET hwy_list_targets POST_BUILD | ||||
|     COMMAND ${CMAKE_CROSSCOMPILING_EMULATOR} $<TARGET_FILE:hwy_list_targets> || (exit 0)) | ||||
| endif() | ||||
| 
 | ||||
| # -------------------------------------------------------- | ||||
| # Allow skipping the following sections for projects that do not need them: | ||||
| # tests, examples, benchmarks and installation. | ||||
| if (HWY_EXAMPLES_TESTS_INSTALL) | ||||
| 
 | ||||
| # -------------------------------------------------------- install library | ||||
| if (HWY_ENABLE_INSTALL) | ||||
| 
 | ||||
| install(TARGETS hwy | ||||
|   DESTINATION "${CMAKE_INSTALL_LIBDIR}") | ||||
|   LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" | ||||
|   ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" | ||||
|   RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") | ||||
| # Install all the headers keeping the relative path to the current directory | ||||
| # when installing them. | ||||
| foreach (source ${HWY_SOURCES}) | ||||
|  | @ -241,7 +314,9 @@ foreach (source ${HWY_SOURCES}) | |||
| endforeach() | ||||
| 
 | ||||
| install(TARGETS hwy_contrib | ||||
|   DESTINATION "${CMAKE_INSTALL_LIBDIR}") | ||||
|   LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" | ||||
|   ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" | ||||
|   RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") | ||||
| # Install all the headers keeping the relative path to the current directory | ||||
| # when installing them. | ||||
| foreach (source ${HWY_CONTRIB_SOURCES}) | ||||
|  | @ -253,7 +328,9 @@ foreach (source ${HWY_CONTRIB_SOURCES}) | |||
| endforeach() | ||||
| 
 | ||||
| install(TARGETS hwy_test | ||||
|   DESTINATION "${CMAKE_INSTALL_LIBDIR}") | ||||
|   LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" | ||||
|   ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" | ||||
|   RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") | ||||
| # Install all the headers keeping the relative path to the current directory | ||||
| # when installing them. | ||||
| foreach (source ${HWY_TEST_SOURCES}) | ||||
|  | @ -272,7 +349,9 @@ foreach (pc libhwy.pc libhwy-contrib.pc libhwy-test.pc) | |||
|       DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig") | ||||
| endforeach() | ||||
| 
 | ||||
| endif() # HWY_ENABLE_INSTALL | ||||
| # -------------------------------------------------------- Examples | ||||
| if (HWY_ENABLE_EXAMPLES) | ||||
| 
 | ||||
| # Avoids mismatch between GTest's static CRT and our dynamic. | ||||
| set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) | ||||
|  | @ -280,7 +359,6 @@ set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) | |||
| # Programming exercise with integrated benchmark | ||||
| add_executable(hwy_benchmark hwy/examples/benchmark.cc) | ||||
| target_sources(hwy_benchmark PRIVATE | ||||
|     hwy/nanobenchmark.cc | ||||
|     hwy/nanobenchmark.h) | ||||
| # Try adding either -DHWY_COMPILE_ONLY_SCALAR or -DHWY_COMPILE_ONLY_STATIC to | ||||
| # observe the difference in targets printed. | ||||
|  | @ -289,6 +367,7 @@ target_link_libraries(hwy_benchmark hwy) | |||
| set_target_properties(hwy_benchmark | ||||
|     PROPERTIES RUNTIME_OUTPUT_DIRECTORY "examples/") | ||||
| 
 | ||||
| endif() # HWY_ENABLE_EXAMPLES | ||||
| # -------------------------------------------------------- Tests | ||||
| 
 | ||||
| include(CTest) | ||||
|  | @ -352,9 +431,11 @@ set(HWY_TEST_FILES | |||
|   hwy/tests/compare_test.cc | ||||
|   hwy/tests/convert_test.cc | ||||
|   hwy/tests/crypto_test.cc | ||||
|   hwy/tests/demote_test.cc | ||||
|   hwy/tests/logical_test.cc | ||||
|   hwy/tests/mask_test.cc | ||||
|   hwy/tests/memory_test.cc | ||||
|   hwy/tests/shift_test.cc | ||||
|   hwy/tests/swizzle_test.cc | ||||
|   hwy/tests/test_util_test.cc | ||||
| ) | ||||
|  | @ -377,7 +458,7 @@ foreach (TESTFILE IN LISTS HWY_TEST_FILES) | |||
|     target_link_libraries(${TESTNAME} hwy hwy_contrib hwy_test gtest gtest_main) | ||||
|   endif() | ||||
|   # Output test targets in the test directory. | ||||
|   set_target_properties(${TESTNAME} PROPERTIES PREFIX "tests/") | ||||
|   set_target_properties(${TESTNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "tests") | ||||
| 
 | ||||
|   if (HWY_EMSCRIPTEN) | ||||
|     set_target_properties(${TESTNAME} PROPERTIES LINK_FLAGS "-s SINGLE_FILE=1") | ||||
|  | @ -394,5 +475,3 @@ endforeach () | |||
| target_sources(skeleton_test PRIVATE hwy/examples/skeleton.cc) | ||||
| 
 | ||||
| endif() # BUILD_TESTING | ||||
| 
 | ||||
| endif() # HWY_EXAMPLES_TESTS_INSTALL | ||||
|  |  | |||
							
								
								
									
										2
									
								
								third_party/highway/CMakeLists.txt.in
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								third_party/highway/CMakeLists.txt.in
									
									
									
									
										vendored
									
									
								
							|  | @ -5,7 +5,7 @@ project(googletest-download NONE) | |||
| include(ExternalProject) | ||||
| ExternalProject_Add(googletest | ||||
|   GIT_REPOSITORY    https://github.com/google/googletest.git | ||||
|   GIT_TAG           master | ||||
|   GIT_TAG           43efa0a4efd40c78b9210d15373112081899a97c | ||||
|   SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" | ||||
|   BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" | ||||
|   CONFIGURE_COMMAND "" | ||||
|  |  | |||
							
								
								
									
										171
									
								
								third_party/highway/README.md
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										171
									
								
								third_party/highway/README.md
									
									
									
									
										vendored
									
									
								
							|  | @ -1,30 +1,95 @@ | |||
| # Efficient and performance-portable SIMD | ||||
| # Efficient and performance-portable vector software | ||||
| 
 | ||||
| Highway is a C++ library for SIMD (Single Instruction, Multiple Data), i.e. | ||||
| applying the same operation to multiple 'lanes' using a single CPU instruction. | ||||
| [//]: # (placeholder, do not remove) | ||||
| 
 | ||||
| ## Why Highway? | ||||
| Highway is a C++ library that provides portable SIMD/vector intrinsics. | ||||
| 
 | ||||
| - more portable (same source code) than platform-specific intrinsics, | ||||
| - works on a wider range of compilers than compiler-specific vector extensions, | ||||
| - more dependable than autovectorization, | ||||
| - easier to write/maintain than assembly language, | ||||
| - supports **runtime dispatch**, | ||||
| - supports **variable-length vector** architectures. | ||||
| ## Why | ||||
| 
 | ||||
| We are passionate about high-performance software. We see major untapped | ||||
| potential in CPUs (servers, mobile, desktops). Highway is for engineers who want | ||||
| to reliably and economically push the boundaries of what is possible in | ||||
| software. | ||||
| 
 | ||||
| ## How | ||||
| 
 | ||||
| CPUs provide SIMD/vector instructions that apply the same operation to multiple | ||||
| data items. This can reduce energy usage e.g. *fivefold* because fewer | ||||
| instructions are executed. We also often see *5-10x* speedups. | ||||
| 
 | ||||
| Highway makes SIMD/vector programming practical and workable according to these | ||||
| guiding principles: | ||||
| 
 | ||||
| **Does what you expect**: Highway is a C++ library with carefully-chosen | ||||
| functions that map well to CPU instructions without extensive compiler | ||||
| transformations. The resulting code is more predictable and robust to code | ||||
| changes/compiler updates than autovectorization. | ||||
| 
 | ||||
| **Works on widely-used platforms**: Highway supports four architectures; the | ||||
| same application code can target eight instruction sets, including those with | ||||
| 'scalable' vectors (size unknown at compile time). Highway only requires C++11 | ||||
| and supports four families of compilers. If you would like to use Highway on | ||||
| other platforms, please raise an issue. | ||||
| 
 | ||||
| **Flexible to deploy**: Applications using Highway can run on heterogeneous | ||||
| clouds or client devices, choosing the best available instruction set at | ||||
| runtime. Alternatively, developers may choose to target a single instruction set | ||||
| without any runtime overhead. In both cases, the application code is the same | ||||
| except for swapping `HWY_STATIC_DISPATCH` with `HWY_DYNAMIC_DISPATCH` plus one | ||||
| line of code. | ||||
| 
 | ||||
| **Suitable for a variety of domains**: Highway provides an extensive set of | ||||
| operations, used for image processing (floating-point), compression, video | ||||
| analysis, linear algebra, cryptography, sorting and random generation. We | ||||
| recognise that new use-cases may require additional ops and are happy to add | ||||
| them where it makes sense (e.g. no performance cliffs on some architectures). If | ||||
| you would like to discuss, please file an issue. | ||||
| 
 | ||||
| **Rewards data-parallel design**: Highway provides tools such as Gather, | ||||
| MaskedLoad, and FixedTag to enable speedups for legacy data structures. However, | ||||
| the biggest gains are unlocked by designing algorithms and data structures for | ||||
| scalable vectors. Helpful techniques include batching, structure-of-array | ||||
| layouts, and aligned/padded allocations. | ||||
| 
 | ||||
| ## Examples | ||||
| 
 | ||||
| Online demos using Compiler Explorer: | ||||
| 
 | ||||
| -   [generating code for multiple targets](https://gcc.godbolt.org/z/n6rx6xK5h) (recommended) | ||||
| -   [single target using -m flags](https://gcc.godbolt.org/z/rGnjMevKG) | ||||
| 
 | ||||
| Projects using Highway (to add yours, feel free to raise an issue or contact us | ||||
| via the email below): | ||||
| 
 | ||||
| *   [iresearch database index](https://github.com/iresearch-toolkit/iresearch/blob/e7638e7a4b99136ca41f82be6edccf01351a7223/core/utils/simd_utils.hpp) | ||||
| *   [JPEG XL image codec](https://github.com/libjxl/libjxl) | ||||
| *   [Grok JPEG 2000 image codec](https://github.com/GrokImageCompression/grok) | ||||
| *   [vectorized Quicksort](https://github.com/google/highway/tree/master/hwy/contrib/sort) | ||||
| 
 | ||||
| ## Current status | ||||
| 
 | ||||
| ### Targets | ||||
| 
 | ||||
| Supported targets: scalar, S-SSE3, SSE4, AVX2, AVX-512, AVX3_DL (~Icelake, | ||||
| requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, | ||||
| requires opt-in by defining `HWY_WANT_AVX3_DL`), NEON (ARMv7 and v8), SVE, SVE2, | ||||
| WASM SIMD. | ||||
| 
 | ||||
| SVE is tested using farm_sve (see acknowledgments). SVE2 is implemented but not | ||||
| yet validated. A subset of RVV is implemented and tested with GCC and QEMU. | ||||
| Work is underway to compile using LLVM, which has different intrinsics with AVL. | ||||
| SVE was initially tested using farm_sve (see acknowledgments). A subset of RVV | ||||
| is implemented and tested with LLVM and QEMU. Work is underway to add RVV ops | ||||
| which were not yet supported by GCC. | ||||
| 
 | ||||
| Version 0.11 is considered stable enough to use in other projects, and is | ||||
| expected to remain backwards compatible unless serious issues are discovered | ||||
| while finishing the RVV target. After that, Highway will reach version 1.0. | ||||
| ### Versioning | ||||
| 
 | ||||
| Highway releases aim to follow the semver.org system (MAJOR.MINOR.PATCH), | ||||
| incrementing MINOR after backward-compatible additions and PATCH after | ||||
| backward-compatible fixes. We recommend using releases (rather than the Git tip) | ||||
| because they are tested more extensively, see below. | ||||
| 
 | ||||
| Version 0.11 is considered stable enough to use in other projects. | ||||
| Version 1.0 will signal an increased focus on backwards compatibility and will | ||||
| be reached after the RVV target is finished (planned for 2022H1). | ||||
| 
 | ||||
| ### Testing | ||||
| 
 | ||||
| Continuous integration tests build with a recent version of Clang (running on | ||||
| x86 and QEMU for ARM) and MSVC from VS2015 (running on x86). | ||||
|  | @ -33,13 +98,15 @@ Before releases, we also test on x86 with Clang and GCC, and ARMv7/8 via | |||
| GCC cross-compile and QEMU. See the | ||||
| [testing process](g3doc/release_testing_process.md) for details. | ||||
| 
 | ||||
| ### Related modules | ||||
| 
 | ||||
| The `contrib` directory contains SIMD-related utilities: an image class with | ||||
| aligned rows, and a math library (16 functions already implemented, mostly | ||||
| trigonometry). | ||||
| aligned rows, a math library (16 functions already implemented, mostly | ||||
| trigonometry), and functions for computing dot products and sorting. | ||||
| 
 | ||||
| ## Installation | ||||
| 
 | ||||
| This project uses cmake to generate and build. In a Debian-based system you can | ||||
| This project uses CMake to generate and build. In a Debian-based system you can | ||||
| install it via: | ||||
| 
 | ||||
| ```bash | ||||
|  | @ -55,7 +122,8 @@ installing gtest separately: | |||
| sudo apt install libgtest-dev | ||||
| ``` | ||||
| 
 | ||||
| To build and test the library the standard cmake workflow can be used: | ||||
| To build Highway as a shared or static library (depending on BUILD_SHARED_LIBS), | ||||
| the standard CMake workflow can be used: | ||||
| 
 | ||||
| ```bash | ||||
| mkdir -p build && cd build | ||||
|  | @ -76,31 +144,40 @@ and their parameters, and the [instruction_matrix](g3doc/instruction_matrix.pdf) | |||
| indicates the number of instructions per operation. | ||||
| 
 | ||||
| We recommend using full SIMD vectors whenever possible for maximum performance | ||||
| portability. To obtain them, pass a `HWY_FULL(float)` tag to functions such as | ||||
| `Zero/Set/Load`. There is also the option of a vector of up to `N` (a power of | ||||
| two <= 16/sizeof(T)) lanes of type `T`: `HWY_CAPPED(T, N)`. If `HWY_TARGET == | ||||
| HWY_SCALAR`, the vector always has one lane. For all other targets, up to | ||||
| 128-bit vectors are guaranteed to be available. | ||||
| portability. To obtain them, pass a `ScalableTag<float>` (or equivalently | ||||
| `HWY_FULL(float)`) tag to functions such as `Zero/Set/Load`. There are two | ||||
| alternatives for use-cases requiring an upper bound on the lanes: | ||||
| 
 | ||||
| Functions using Highway must be inside `namespace HWY_NAMESPACE {` | ||||
| (possibly nested in one or more other namespaces defined by the project), and | ||||
| additionally either prefixed with `HWY_ATTR`, or residing between | ||||
| `HWY_BEFORE_NAMESPACE()` and `HWY_AFTER_NAMESPACE()`. | ||||
| -   For up to a power of two `N`, specify `CappedTag<T, N>` (or | ||||
|     equivalently `HWY_CAPPED(T, N)`). This is useful for data structures such as | ||||
|     a narrow matrix. A loop is still required because vectors may actually have | ||||
|     fewer than `N` lanes. | ||||
| 
 | ||||
| -   For exactly a power of two `N` lanes, specify `FixedTag<T, N>`. The largest | ||||
|     supported `N` depends on the target, but is guaranteed to be at least | ||||
|     `16/sizeof(T)`. | ||||
| 
 | ||||
| Functions using Highway must either be inside `namespace HWY_NAMESPACE {` | ||||
| (possibly nested in one or more other namespaces defined by the project), OR | ||||
| each op must be prefixed with `hn::`, e.g. `namespace hn = hwy::HWY_NAMESPACE; | ||||
| hn::LoadDup128()`. Additionally, each function using Highway must either be | ||||
| prefixed with `HWY_ATTR`, OR reside between `HWY_BEFORE_NAMESPACE()` and | ||||
| `HWY_AFTER_NAMESPACE()`. | ||||
| 
 | ||||
| *   For static dispatch, `HWY_TARGET` will be the best available target among | ||||
|     `HWY_BASELINE_TARGETS`, i.e. those allowed for use by the compiler (see | ||||
|     [quick-reference](g3doc/quick_reference.md)). Functions inside `HWY_NAMESPACE` | ||||
|     can be called using `HWY_STATIC_DISPATCH(func)(args)` within the same module | ||||
|     they are defined in. You can call the function from other modules by | ||||
|     wrapping it in a regular function and declaring the regular function in a | ||||
|     header. | ||||
|     [quick-reference](g3doc/quick_reference.md)). Functions inside | ||||
|     `HWY_NAMESPACE` can be called using `HWY_STATIC_DISPATCH(func)(args)` within | ||||
|     the same module they are defined in. You can call the function from other | ||||
|     modules by wrapping it in a regular function and declaring the regular | ||||
|     function in a header. | ||||
| 
 | ||||
| *   For dynamic dispatch, a table of function pointers is generated via the | ||||
|     `HWY_EXPORT` macro that is used by `HWY_DYNAMIC_DISPATCH(func)(args)` to | ||||
|     call the best function pointer for the current CPU's supported targets. A | ||||
|     module is automatically compiled for each target in `HWY_TARGETS` (see | ||||
|     [quick-reference](g3doc/quick_reference.md)) if `HWY_TARGET_INCLUDE` is | ||||
|     defined and foreach_target.h is included. | ||||
|     defined and `foreach_target.h` is included. | ||||
| 
 | ||||
| ## Compiler flags | ||||
| 
 | ||||
|  | @ -123,17 +200,17 @@ ensure proper VEX code generation for AVX2 targets. | |||
| To vectorize a loop, "strip-mining" transforms it into an outer loop and inner | ||||
| loop with number of iterations matching the preferred vector width. | ||||
| 
 | ||||
| In this section, let `T` denote the element type, `d = HWY_FULL(T)`, `count` the | ||||
| number of elements to process, and `N = Lanes(d)` the number of lanes in a full | ||||
| vector. Assume the loop body is given as a function `template<bool partial, | ||||
| class D> void LoopBody(D d, size_t max_n)`. | ||||
| In this section, let `T` denote the element type, `d = ScalableTag<T>`, `count` | ||||
| the number of elements to process, and `N = Lanes(d)` the number of lanes in a | ||||
| full vector. Assume the loop body is given as a function `template<bool partial, | ||||
| class D> void LoopBody(D d, size_t index, size_t max_n)`. | ||||
| 
 | ||||
| Highway offers several ways to express loops where `N` need not divide `count`: | ||||
| 
 | ||||
| *   Ensure all inputs/outputs are padded. Then the loop is simply | ||||
| 
 | ||||
|     ``` | ||||
|     for (size_t i = 0; i < count; i += N) LoopBody<false>(d, 0); | ||||
|     for (size_t i = 0; i < count; i += N) LoopBody<false>(d, i, 0); | ||||
|     ``` | ||||
|     Here, the template parameter and third function argument are not needed. | ||||
| 
 | ||||
|  | @ -149,8 +226,8 @@ Highway offers several ways to express loops where `N` need not divide `count`: | |||
| 
 | ||||
|     ``` | ||||
|     size_t i = 0; | ||||
|     for (; i + N <= count; i += N) LoopBody<false>(d, 0); | ||||
|     for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), 0); | ||||
|     for (; i + N <= count; i += N) LoopBody<false>(d, i, 0); | ||||
|     for (; i < count; ++i) LoopBody<false>(HWY_CAPPED(T, 1)(), i, 0); | ||||
|     ``` | ||||
|     The template parameter and third function argument are again not needed. | ||||
| 
 | ||||
|  | @ -163,18 +240,20 @@ Highway offers several ways to express loops where `N` need not divide `count`: | |||
|     ``` | ||||
|     size_t i = 0; | ||||
|     for (; i + N <= count; i += N) { | ||||
|       LoopBody<false>(d, 0); | ||||
|       LoopBody<false>(d, i, 0); | ||||
|     } | ||||
|     if (i < count) { | ||||
|       LoopBody<true>(d, count - i); | ||||
|       LoopBody<true>(d, i, count - i); | ||||
|     } | ||||
|     ``` | ||||
|     Now the template parameter and second function argument can be used inside | ||||
|     Now the template parameter and third function argument can be used inside | ||||
|     `LoopBody` to 'blend' the new partial vector with previous memory contents: | ||||
|     `Store(IfThenElse(FirstN(d, max_n), partial, prev_full), d, aligned_pointer);`. | ||||
| 
 | ||||
|     This is a good default when it is infeasible to ensure vectors are padded. | ||||
|     In contrast to the scalar loop, only a single final iteration is needed. | ||||
|     The increased code size from two loop bodies is expected to be worthwhile | ||||
|     because it avoids the cost of masking in all but the final iteration. | ||||
| 
 | ||||
| ## Additional resources | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										12
									
								
								third_party/highway/debian/changelog
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										12
									
								
								third_party/highway/debian/changelog
									
									
									
									
										vendored
									
									
								
							|  | @ -1,3 +1,15 @@ | |||
| highway (0.16.0-1) UNRELEASED; urgency=medium | ||||
| 
 | ||||
|   * Add contrib/sort (vectorized quicksort) | ||||
|   * Add IfNegativeThenElse, IfVecThenElse | ||||
|   * Add Reverse2,4,8, ReverseBlocks, DupEven/Odd, AESLastRound | ||||
|   * Add OrAnd, Min128, Max128, Lt128, SumsOf8 | ||||
|   * Support capped/partial vectors on RVV/SVE, int64 in WASM | ||||
|   * Support SVE2, shared library build | ||||
|   * Remove deprecated overloads without the required d arg (UpperHalf etc.) | ||||
| 
 | ||||
|  -- Jan Wassenberg <janwas@google.com>  Thu, 03 Feb 2022 11:00:00 +0100 | ||||
| 
 | ||||
| highway (0.15.0-1) UNRELEASED; urgency=medium | ||||
| 
 | ||||
|   * New ops: CompressBlendedStore, ConcatOdd/Even, IndicesFromVec | ||||
|  |  | |||
							
								
								
									
										21
									
								
								third_party/highway/hwy/aligned_allocator.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										21
									
								
								third_party/highway/hwy/aligned_allocator.h
									
									
									
									
										vendored
									
									
								
							|  | @ -18,8 +18,11 @@ | |||
| // Memory allocator with support for alignment and offsets.
 | ||||
| 
 | ||||
| #include <stddef.h> | ||||
| 
 | ||||
| #include <memory> | ||||
| 
 | ||||
| #include "hwy/highway_export.h" | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
| // Minimum alignment of allocated memory for use in HWY_ASSUME_ALIGNED, which
 | ||||
|  | @ -36,15 +39,15 @@ using FreePtr = void (*)(void* opaque, void* memory); | |||
| // bytes of newly allocated memory, aligned to the larger of HWY_ALIGNMENT and
 | ||||
| // the vector size. Calls `alloc_ptr` with the passed `opaque_ptr` to obtain
 | ||||
| // memory, or uses malloc() if `alloc_ptr` is null.
 | ||||
| void* AllocateAlignedBytes(size_t payload_size, AllocPtr alloc_ptr, | ||||
|                            void* opaque_ptr); | ||||
| HWY_DLLEXPORT void* AllocateAlignedBytes(size_t payload_size, | ||||
|                                          AllocPtr alloc_ptr, void* opaque_ptr); | ||||
| 
 | ||||
| // Frees all memory. No effect if `aligned_pointer` == nullptr, otherwise it
 | ||||
| // must have been returned from a previous call to `AllocateAlignedBytes`.
 | ||||
| // Calls `free_ptr` with the passed `opaque_ptr` pointer to free the memory; if
 | ||||
| // `free_ptr` function is null, uses the default free().
 | ||||
| void FreeAlignedBytes(const void* aligned_pointer, FreePtr free_ptr, | ||||
|                       void* opaque_ptr); | ||||
| HWY_DLLEXPORT void FreeAlignedBytes(const void* aligned_pointer, | ||||
|                                     FreePtr free_ptr, void* opaque_ptr); | ||||
| 
 | ||||
| // Class that deletes the aligned pointer passed to operator() calling the
 | ||||
| // destructor before freeing the pointer. This is equivalent to the
 | ||||
|  | @ -76,8 +79,10 @@ class AlignedDeleter { | |||
|   // array. TypeArrayDeleter<T> would match this prototype.
 | ||||
|   using ArrayDeleter = void (*)(void* t_ptr, size_t t_size); | ||||
| 
 | ||||
|   static void DeleteAlignedArray(void* aligned_pointer, FreePtr free_ptr, | ||||
|                                  void* opaque_ptr, ArrayDeleter deleter); | ||||
|   HWY_DLLEXPORT static void DeleteAlignedArray(void* aligned_pointer, | ||||
|                                                FreePtr free_ptr, | ||||
|                                                void* opaque_ptr, | ||||
|                                                ArrayDeleter deleter); | ||||
| 
 | ||||
|   FreePtr free_; | ||||
|   void* opaque_ptr_; | ||||
|  | @ -107,8 +112,8 @@ template <typename T, typename... Args> | |||
| AlignedUniquePtr<T> MakeUniqueAligned(Args&&... args) { | ||||
|   T* ptr = static_cast<T*>(AllocateAlignedBytes( | ||||
|       sizeof(T), /*alloc_ptr=*/nullptr, /*opaque_ptr=*/nullptr)); | ||||
|   return AlignedUniquePtr<T>( | ||||
|       new (ptr) T(std::forward<Args>(args)...), AlignedDeleter()); | ||||
|   return AlignedUniquePtr<T>(new (ptr) T(std::forward<Args>(args)...), | ||||
|                              AlignedDeleter()); | ||||
| } | ||||
| 
 | ||||
| // Helpers for array allocators (avoids overflow)
 | ||||
|  |  | |||
							
								
								
									
										252
									
								
								third_party/highway/hwy/base.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										252
									
								
								third_party/highway/hwy/base.h
									
									
									
									
										vendored
									
									
								
							|  | @ -24,6 +24,7 @@ | |||
| #include <cfloat> | ||||
| 
 | ||||
| #include "hwy/detect_compiler_arch.h" | ||||
| #include "hwy/highway_export.h" | ||||
| 
 | ||||
| //------------------------------------------------------------------------------
 | ||||
| // Compiler-specific definitions
 | ||||
|  | @ -184,10 +185,6 @@ | |||
|   } while (0) | ||||
| #endif | ||||
| 
 | ||||
| #if defined(HWY_EMULATE_SVE) | ||||
| class FarmFloat16; | ||||
| #endif | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
| //------------------------------------------------------------------------------
 | ||||
|  | @ -205,7 +202,9 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16; | |||
| //------------------------------------------------------------------------------
 | ||||
| // Alignment
 | ||||
| 
 | ||||
| // For stack-allocated partial arrays or LoadDup128.
 | ||||
| // Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
 | ||||
| // should be allocated dynamically via aligned_allocator.h because Lanes() may
 | ||||
| // exceed the stack size.
 | ||||
| #if HWY_ARCH_X86 | ||||
| #define HWY_ALIGN_MAX alignas(64) | ||||
| #elif HWY_ARCH_RVV && defined(__riscv_vector) | ||||
|  | @ -228,9 +227,7 @@ static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16; | |||
| 
 | ||||
| #pragma pack(push, 1) | ||||
| 
 | ||||
| #if defined(HWY_EMULATE_SVE) | ||||
| using float16_t = FarmFloat16; | ||||
| #elif HWY_NATIVE_FLOAT16 | ||||
| #if HWY_NATIVE_FLOAT16 | ||||
| using float16_t = __fp16; | ||||
| // Clang does not allow __fp16 arguments, but scalar.h requires LaneType
 | ||||
| // arguments, so use a wrapper.
 | ||||
|  | @ -253,15 +250,15 @@ using float64_t = double; | |||
| //------------------------------------------------------------------------------
 | ||||
| // Controlling overload resolution (SFINAE)
 | ||||
| 
 | ||||
| template <bool Condition, class T> | ||||
| template <bool Condition> | ||||
| struct EnableIfT {}; | ||||
| template <class T> | ||||
| struct EnableIfT<true, T> { | ||||
|   using type = T; | ||||
| template <> | ||||
| struct EnableIfT<true> { | ||||
|   using type = void; | ||||
| }; | ||||
| 
 | ||||
| template <bool Condition, class T = void> | ||||
| using EnableIf = typename EnableIfT<Condition, T>::type; | ||||
| template <bool Condition> | ||||
| using EnableIf = typename EnableIfT<Condition>::type; | ||||
| 
 | ||||
| template <typename T, typename U> | ||||
| struct IsSameT { | ||||
|  | @ -283,7 +280,7 @@ HWY_API constexpr bool IsSame() { | |||
| //
 | ||||
| // Note that enabling for exactly 128 bits is unnecessary because a function can
 | ||||
| // simply be overloaded with Vec128<T> and/or Full128<T> tag. Enabling for other
 | ||||
| // sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T)>.
 | ||||
| // sizes (e.g. 64 bit) can be achieved via Simd<T, 8 / sizeof(T), 0>.
 | ||||
| #define HWY_IF_LE128(T, N) hwy::EnableIf<N * sizeof(T) <= 16>* = nullptr | ||||
| #define HWY_IF_LE64(T, N) hwy::EnableIf<N * sizeof(T) <= 8>* = nullptr | ||||
| #define HWY_IF_LE32(T, N) hwy::EnableIf<N * sizeof(T) <= 4>* = nullptr | ||||
|  | @ -319,102 +316,6 @@ struct RemoveConstT<const T> { | |||
| template <class T> | ||||
| using RemoveConst = typename RemoveConstT<T>::type; | ||||
| 
 | ||||
| //------------------------------------------------------------------------------
 | ||||
| // Type traits
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API constexpr bool IsFloat() { | ||||
|   // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
 | ||||
|   // from a float, not compared.
 | ||||
|   return IsSame<T, float>() || IsSame<T, double>(); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API constexpr bool IsSigned() { | ||||
|   return T(0) > T(-1); | ||||
| } | ||||
| template <> | ||||
| constexpr bool IsSigned<float16_t>() { | ||||
|   return true; | ||||
| } | ||||
| template <> | ||||
| constexpr bool IsSigned<bfloat16_t>() { | ||||
|   return true; | ||||
| } | ||||
| 
 | ||||
| // Largest/smallest representable integer values.
 | ||||
| template <typename T> | ||||
| HWY_API constexpr T LimitsMax() { | ||||
|   static_assert(!IsFloat<T>(), "Only for integer types"); | ||||
|   return IsSigned<T>() ? T((1ULL << (sizeof(T) * 8 - 1)) - 1) | ||||
|                        : static_cast<T>(~0ull); | ||||
| } | ||||
| template <typename T> | ||||
| HWY_API constexpr T LimitsMin() { | ||||
|   static_assert(!IsFloat<T>(), "Only for integer types"); | ||||
|   return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0); | ||||
| } | ||||
| 
 | ||||
| // Largest/smallest representable value (integer or float). This naming avoids
 | ||||
| // confusion with numeric_limits<float>::min() (the smallest positive value).
 | ||||
| template <typename T> | ||||
| HWY_API constexpr T LowestValue() { | ||||
|   return LimitsMin<T>(); | ||||
| } | ||||
| template <> | ||||
| constexpr float LowestValue<float>() { | ||||
|   return -FLT_MAX; | ||||
| } | ||||
| template <> | ||||
| constexpr double LowestValue<double>() { | ||||
|   return -DBL_MAX; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API constexpr T HighestValue() { | ||||
|   return LimitsMax<T>(); | ||||
| } | ||||
| template <> | ||||
| constexpr float HighestValue<float>() { | ||||
|   return FLT_MAX; | ||||
| } | ||||
| template <> | ||||
| constexpr double HighestValue<double>() { | ||||
|   return DBL_MAX; | ||||
| } | ||||
| 
 | ||||
| // Returns bitmask of the exponent field in IEEE binary32/64.
 | ||||
| template <typename T> | ||||
| constexpr T ExponentMask() { | ||||
|   static_assert(sizeof(T) == 0, "Only instantiate the specializations"); | ||||
|   return 0; | ||||
| } | ||||
| template <> | ||||
| constexpr uint32_t ExponentMask<uint32_t>() { | ||||
|   return 0x7F800000; | ||||
| } | ||||
| template <> | ||||
| constexpr uint64_t ExponentMask<uint64_t>() { | ||||
|   return 0x7FF0000000000000ULL; | ||||
| } | ||||
| 
 | ||||
| // Returns 1 << mantissa_bits as a floating-point number. All integers whose
 | ||||
| // absolute value are less than this can be represented exactly.
 | ||||
| template <typename T> | ||||
| constexpr T MantissaEnd() { | ||||
|   static_assert(sizeof(T) == 0, "Only instantiate the specializations"); | ||||
|   return 0; | ||||
| } | ||||
| template <> | ||||
| constexpr float MantissaEnd<float>() { | ||||
|   return 8388608.0f;  // 1 << 23
 | ||||
| } | ||||
| template <> | ||||
| constexpr double MantissaEnd<double>() { | ||||
|   // floating point literal with p52 requires C++17.
 | ||||
|   return 4503599627370496.0;  // 1 << 52
 | ||||
| } | ||||
| 
 | ||||
| //------------------------------------------------------------------------------
 | ||||
| // Type relations
 | ||||
| 
 | ||||
|  | @ -556,6 +457,118 @@ using SignedFromSize = typename detail::TypeFromSize<N>::Signed; | |||
| template <size_t N> | ||||
| using FloatFromSize = typename detail::TypeFromSize<N>::Float; | ||||
| 
 | ||||
| //------------------------------------------------------------------------------
 | ||||
| // Type traits
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API constexpr bool IsFloat() { | ||||
|   // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to or
 | ||||
|   // from a float, not compared.
 | ||||
|   return IsSame<T, float>() || IsSame<T, double>(); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API constexpr bool IsSigned() { | ||||
|   return T(0) > T(-1); | ||||
| } | ||||
| template <> | ||||
| constexpr bool IsSigned<float16_t>() { | ||||
|   return true; | ||||
| } | ||||
| template <> | ||||
| constexpr bool IsSigned<bfloat16_t>() { | ||||
|   return true; | ||||
| } | ||||
| 
 | ||||
| // Largest/smallest representable integer values.
 | ||||
| template <typename T> | ||||
| HWY_API constexpr T LimitsMax() { | ||||
|   static_assert(!IsFloat<T>(), "Only for integer types"); | ||||
|   using TU = MakeUnsigned<T>; | ||||
|   return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1) | ||||
|                                       : static_cast<TU>(~0ull)); | ||||
| } | ||||
| template <typename T> | ||||
| HWY_API constexpr T LimitsMin() { | ||||
|   static_assert(!IsFloat<T>(), "Only for integer types"); | ||||
|   return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0); | ||||
| } | ||||
| 
 | ||||
| // Largest/smallest representable value (integer or float). This naming avoids
 | ||||
| // confusion with numeric_limits<float>::min() (the smallest positive value).
 | ||||
| template <typename T> | ||||
| HWY_API constexpr T LowestValue() { | ||||
|   return LimitsMin<T>(); | ||||
| } | ||||
| template <> | ||||
| constexpr float LowestValue<float>() { | ||||
|   return -FLT_MAX; | ||||
| } | ||||
| template <> | ||||
| constexpr double LowestValue<double>() { | ||||
|   return -DBL_MAX; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API constexpr T HighestValue() { | ||||
|   return LimitsMax<T>(); | ||||
| } | ||||
| template <> | ||||
| constexpr float HighestValue<float>() { | ||||
|   return FLT_MAX; | ||||
| } | ||||
| template <> | ||||
| constexpr double HighestValue<double>() { | ||||
|   return DBL_MAX; | ||||
| } | ||||
| 
 | ||||
| // Returns bitmask of the exponent field in IEEE binary32/64.
 | ||||
| template <typename T> | ||||
| constexpr T ExponentMask() { | ||||
|   static_assert(sizeof(T) == 0, "Only instantiate the specializations"); | ||||
|   return 0; | ||||
| } | ||||
| template <> | ||||
| constexpr uint32_t ExponentMask<uint32_t>() { | ||||
|   return 0x7F800000; | ||||
| } | ||||
| template <> | ||||
| constexpr uint64_t ExponentMask<uint64_t>() { | ||||
|   return 0x7FF0000000000000ULL; | ||||
| } | ||||
| 
 | ||||
| // Returns bitmask of the mantissa field in IEEE binary32/64.
 | ||||
| template <typename T> | ||||
| constexpr T MantissaMask() { | ||||
|   static_assert(sizeof(T) == 0, "Only instantiate the specializations"); | ||||
|   return 0; | ||||
| } | ||||
| template <> | ||||
| constexpr uint32_t MantissaMask<uint32_t>() { | ||||
|   return 0x007FFFFF; | ||||
| } | ||||
| template <> | ||||
| constexpr uint64_t MantissaMask<uint64_t>() { | ||||
|   return 0x000FFFFFFFFFFFFFULL; | ||||
| } | ||||
| 
 | ||||
| // Returns 1 << mantissa_bits as a floating-point number. All integers whose
 | ||||
| // absolute value is less than this can be represented exactly.
 | ||||
| template <typename T> | ||||
| constexpr T MantissaEnd() { | ||||
|   static_assert(sizeof(T) == 0, "Only instantiate the specializations"); | ||||
|   return 0; | ||||
| } | ||||
| template <> | ||||
| constexpr float MantissaEnd<float>() { | ||||
|   return 8388608.0f;  // 1 << 23
 | ||||
| } | ||||
| template <> | ||||
| constexpr double MantissaEnd<double>() { | ||||
|   // floating point literal with p52 requires C++17.
 | ||||
|   return 4503599627370496.0;  // 1 << 52
 | ||||
| } | ||||
| 
 | ||||
| //------------------------------------------------------------------------------
 | ||||
| // Helper functions
 | ||||
| 
 | ||||
|  | @ -661,14 +674,21 @@ HWY_API size_t PopCount(uint64_t x) { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| // Skip HWY_API due to GCC "function not considered for inlining". Previously
 | ||||
| // such errors were caused by underlying type mismatches, but it's not clear
 | ||||
| // what is still mismatched despite all the casts.
 | ||||
| template <typename TI> | ||||
| HWY_API constexpr size_t FloorLog2(TI x) { | ||||
|   return x == 1 ? 0 : FloorLog2(x >> 1) + 1; | ||||
| /*HWY_API*/ constexpr size_t FloorLog2(TI x) { | ||||
|   return x == TI{1} | ||||
|              ? 0 | ||||
|              : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1); | ||||
| } | ||||
| 
 | ||||
| template <typename TI> | ||||
| HWY_API constexpr size_t CeilLog2(TI x) { | ||||
|   return x == 1 ? 0 : FloorLog2(x - 1) + 1; | ||||
| /*HWY_API*/ constexpr size_t CeilLog2(TI x) { | ||||
|   return x == TI{1} | ||||
|              ? 0 | ||||
|              : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1); | ||||
| } | ||||
| 
 | ||||
| #if HWY_COMPILER_MSVC && HWY_ARCH_X86_64 | ||||
|  | @ -727,7 +747,7 @@ HWY_API bfloat16_t BF16FromF32(float f) { | |||
|   return bf; | ||||
| } | ||||
| 
 | ||||
| HWY_NORETURN void HWY_FORMAT(3, 4) | ||||
| HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4) | ||||
|     Abort(const char* file, int line, const char* format, ...); | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
|  |  | |||
							
								
								
									
										6
									
								
								third_party/highway/hwy/cache_control.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										6
									
								
								third_party/highway/hwy/cache_control.h
									
									
									
									
										vendored
									
									
								
							|  | @ -36,9 +36,7 @@ | |||
| // undefine them in this header; these functions are anyway deprecated.
 | ||||
| // TODO(janwas): remove when these functions are removed.
 | ||||
| #pragma push_macro("LoadFence") | ||||
| #pragma push_macro("StoreFence") | ||||
| #undef LoadFence | ||||
| #undef StoreFence | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
|  | @ -72,9 +70,6 @@ HWY_INLINE HWY_ATTR_CACHE void FlushStream() { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| // DEPRECATED, replace with `FlushStream`.
 | ||||
| HWY_INLINE HWY_ATTR_CACHE void StoreFence() { FlushStream(); } | ||||
| 
 | ||||
| // Optionally begins loading the cache line containing "p" to reduce latency of
 | ||||
| // subsequent actual loads.
 | ||||
| template <typename T> | ||||
|  | @ -109,7 +104,6 @@ HWY_INLINE HWY_ATTR_CACHE void Pause() { | |||
| }  // namespace hwy
 | ||||
| 
 | ||||
| // TODO(janwas): remove when these functions are removed. (See above.)
 | ||||
| #pragma pop_macro("StoreFence") | ||||
| #pragma pop_macro("LoadFence") | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CACHE_CONTROL_H_
 | ||||
|  |  | |||
|  | @ -14,15 +14,14 @@ | |||
| 
 | ||||
| #include "hwy/contrib/image/image.h" | ||||
| 
 | ||||
| #include <algorithm>  // swap
 | ||||
| #include <cstddef> | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/image/image.cc" | ||||
| 
 | ||||
| #include <algorithm>  // swap
 | ||||
| 
 | ||||
| #include "hwy/foreach_target.h" | ||||
| #include "hwy/highway.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
|  |  | |||
|  | @ -27,12 +27,13 @@ | |||
| 
 | ||||
| #include "hwy/aligned_allocator.h" | ||||
| #include "hwy/base.h" | ||||
| #include "hwy/highway_export.h" | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
| // Type-independent parts of Image<> - reduces code duplication and facilitates
 | ||||
| // moving member function implementations to cc file.
 | ||||
| struct ImageBase { | ||||
| struct HWY_CONTRIB_DLLEXPORT ImageBase { | ||||
|   // Returns required alignment in bytes for externally allocated memory.
 | ||||
|   static size_t VectorSize(); | ||||
| 
 | ||||
|  | @ -100,8 +101,7 @@ struct ImageBase { | |||
|  protected: | ||||
|   // Returns pointer to the start of a row.
 | ||||
|   HWY_INLINE void* VoidRow(const size_t y) const { | ||||
| #if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ | ||||
|     defined(THREAD_SANITIZER) | ||||
| #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN | ||||
|     if (y >= ysize_) { | ||||
|       HWY_ABORT("Row(%" PRIu64 ") >= %u\n", static_cast<uint64_t>(y), ysize_); | ||||
|     } | ||||
|  | @ -291,8 +291,7 @@ class Image3 { | |||
|  private: | ||||
|   // Returns pointer to the start of a row.
 | ||||
|   HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const { | ||||
| #if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ | ||||
|     defined(THREAD_SANITIZER) | ||||
| #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN | ||||
|     if (c >= kNumPlanes || y >= ysize()) { | ||||
|       HWY_ABORT("PlaneRow(%" PRIu64 ", %" PRIu64 ") >= %" PRIu64 "\n", | ||||
|                 static_cast<uint64_t>(c), static_cast<uint64_t>(y), | ||||
|  |  | |||
|  | @ -51,7 +51,7 @@ struct TestAlignedT { | |||
|         for (size_t y = 0; y < ysize; ++y) { | ||||
|           T* HWY_RESTRICT row = img.MutableRow(y); | ||||
|           for (size_t x = 0; x < xsize; x += Lanes(d)) { | ||||
|             const auto values = Iota(d, dist(rng)); | ||||
|             const auto values = Iota(d, static_cast<T>(dist(rng))); | ||||
|             Store(values, d, row + x); | ||||
|           } | ||||
|         } | ||||
|  |  | |||
|  | @ -486,7 +486,7 @@ struct AsinImpl<float> { | |||
|   } | ||||
| }; | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | ||||
| 
 | ||||
| template <> | ||||
| struct AsinImpl<double> { | ||||
|  | @ -531,7 +531,7 @@ struct AtanImpl<float> { | |||
|   } | ||||
| }; | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | ||||
| 
 | ||||
| template <> | ||||
| struct AtanImpl<double> { | ||||
|  | @ -635,7 +635,7 @@ struct CosSinImpl<float> { | |||
|   } | ||||
| }; | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | ||||
| 
 | ||||
| template <> | ||||
| struct CosSinImpl<double> { | ||||
|  | @ -787,7 +787,7 @@ struct LogImpl<float> { | |||
|   } | ||||
| }; | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 | ||||
| template <> | ||||
| struct ExpImpl<double> { | ||||
|   // Rounds double toward zero and returns as int32_t.
 | ||||
|  |  | |||
|  | @ -61,7 +61,7 @@ HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T), | |||
| 
 | ||||
|   uint64_t max_ulp = 0; | ||||
|   // Emulation is slower, so cannot afford as many.
 | ||||
|   constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(10000)); | ||||
|   constexpr UintT kSamplesPerRange = static_cast<UintT>(AdjustedReps(4000)); | ||||
|   for (int range_index = 0; range_index < range_count; ++range_index) { | ||||
|     const UintT start = ranges[range_index][0]; | ||||
|     const UintT stop = ranges[range_index][1]; | ||||
|  | @ -96,24 +96,11 @@ HWY_NOINLINE void TestMath(const std::string name, T (*fx1)(T), | |||
|   HWY_ASSERT(max_ulp <= max_error_ulp); | ||||
| } | ||||
| 
 | ||||
| // TODO(janwas): remove once RVV supports fractional LMUL
 | ||||
| #undef DEFINE_MATH_TEST_FUNC | ||||
| #if HWY_TARGET == HWY_RVV | ||||
| 
 | ||||
| #define DEFINE_MATH_TEST_FUNC(NAME)                    \ | ||||
|   HWY_NOINLINE void TestAll##NAME() {                  \ | ||||
|     ForFloatTypes(ForShrinkableVectors<Test##NAME>()); \ | ||||
|   } | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| #define DEFINE_MATH_TEST_FUNC(NAME)                 \ | ||||
|   HWY_NOINLINE void TestAll##NAME() {               \ | ||||
|     ForFloatTypes(ForPartialVectors<Test##NAME>()); \ | ||||
|   } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #undef DEFINE_MATH_TEST | ||||
| #define DEFINE_MATH_TEST(NAME, F32x1, F32xN, F32_MIN, F32_MAX, F32_ERROR, \ | ||||
|                          F64x1, F64xN, F64_MIN, F64_MAX, F64_ERROR)       \ | ||||
|  |  | |||
							
								
								
									
										133
									
								
								third_party/highway/hwy/contrib/sort/BUILD
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										133
									
								
								third_party/highway/hwy/contrib/sort/BUILD
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,133 @@ | |||
| package(default_visibility = ["//visibility:public"]) | ||||
| 
 | ||||
| licenses(["notice"]) | ||||
| 
 | ||||
| # Unused on Bazel builds, where this is not defined/known; Copybara replaces | ||||
| # usages with an empty list. | ||||
| COMPAT = [ | ||||
|     "//buildenv/target:non_prod",  # includes mobile/vendor. | ||||
| ] | ||||
| 
 | ||||
| cc_library( | ||||
|     name = "vqsort", | ||||
|     srcs = [ | ||||
|         # Split into separate files to reduce MSVC build time. | ||||
|         "vqsort.cc", | ||||
|         "vqsort_i16a.cc", | ||||
|         "vqsort_i16d.cc", | ||||
|         "vqsort_u16a.cc", | ||||
|         "vqsort_u16d.cc", | ||||
|         "vqsort_f32a.cc", | ||||
|         "vqsort_f32d.cc", | ||||
|         "vqsort_i32a.cc", | ||||
|         "vqsort_i32d.cc", | ||||
|         "vqsort_u32a.cc", | ||||
|         "vqsort_u32d.cc", | ||||
|         "vqsort_f64a.cc", | ||||
|         "vqsort_f64d.cc", | ||||
|         "vqsort_i64a.cc", | ||||
|         "vqsort_i64d.cc", | ||||
|         "vqsort_u64a.cc", | ||||
|         "vqsort_u64d.cc", | ||||
|         "vqsort_128a.cc", | ||||
|         "vqsort_128d.cc", | ||||
|     ], | ||||
|     hdrs = [ | ||||
|         "disabled_targets.h", | ||||
|         "vqsort.h",  # public interface | ||||
|     ], | ||||
|     compatible_with = [], | ||||
|     textual_hdrs = [ | ||||
|         "shared-inl.h", | ||||
|         "sorting_networks-inl.h", | ||||
|         "traits-inl.h", | ||||
|         "traits128-inl.h", | ||||
|         "vqsort-inl.h", | ||||
|     ], | ||||
|     deps = [ | ||||
|         # Only if VQSORT_SECURE_RNG is set. | ||||
|         # "//third_party/absl/random", | ||||
|         "//:hwy", | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| # ----------------------------------------------------------------------------- | ||||
| # Internal-only targets | ||||
| 
 | ||||
| cc_library( | ||||
|     name = "helpers", | ||||
|     testonly = 1, | ||||
|     textual_hdrs = [ | ||||
|         "algo-inl.h", | ||||
|         "result-inl.h", | ||||
|     ], | ||||
|     deps = [ | ||||
|         ":vqsort", | ||||
|         "//:nanobenchmark", | ||||
|         # Required for HAVE_PDQSORT, but that is unused and this is | ||||
|         # unavailable to Bazel builds, hence commented out. | ||||
|         # "//third_party/boost/allowed", | ||||
|         # Avoid ips4o and thus TBB to work around hwloc build failure. | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| cc_binary( | ||||
|     name = "print_network", | ||||
|     testonly = 1, | ||||
|     srcs = ["print_network.cc"], | ||||
|     deps = [ | ||||
|         ":helpers", | ||||
|         ":vqsort", | ||||
|         "//:hwy", | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| cc_test( | ||||
|     name = "sort_test", | ||||
|     size = "medium", | ||||
|     srcs = ["sort_test.cc"], | ||||
|     features = ["fully_static_link"], | ||||
|     linkstatic = True, | ||||
|     local_defines = ["HWY_IS_TEST"], | ||||
|     # for test_suite. | ||||
|     tags = ["hwy_ops_test"], | ||||
|     deps = [ | ||||
|         ":helpers", | ||||
|         ":vqsort", | ||||
|         "@com_google_googletest//:gtest_main", | ||||
|         "//:hwy", | ||||
|         "//:hwy_test_util", | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| cc_binary( | ||||
|     name = "bench_sort", | ||||
|     testonly = 1, | ||||
|     srcs = ["bench_sort.cc"], | ||||
|     features = ["fully_static_link"], | ||||
|     linkstatic = True, | ||||
|     local_defines = ["HWY_IS_TEST"], | ||||
|     deps = [ | ||||
|         ":helpers", | ||||
|         ":vqsort", | ||||
|         "@com_google_googletest//:gtest_main", | ||||
|         "//:hwy", | ||||
|         "//:hwy_test_util", | ||||
|     ], | ||||
| ) | ||||
| 
 | ||||
| cc_binary( | ||||
|     name = "bench_parallel", | ||||
|     testonly = 1, | ||||
|     srcs = ["bench_parallel.cc"], | ||||
|     features = ["fully_static_link"], | ||||
|     linkstatic = True, | ||||
|     local_defines = ["HWY_IS_TEST"], | ||||
|     deps = [ | ||||
|         ":helpers", | ||||
|         ":vqsort", | ||||
|         "@com_google_googletest//:gtest_main", | ||||
|         "//:hwy", | ||||
|         "//:hwy_test_util", | ||||
|     ], | ||||
| ) | ||||
							
								
								
									
										395
									
								
								third_party/highway/hwy/contrib/sort/algo-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										395
									
								
								third_party/highway/hwy/contrib/sort/algo-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,395 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Normal include guard for target-independent parts
 | ||||
| #ifndef HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_ | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| #include <string.h>  // memcpy | ||||
| 
 | ||||
| #include <algorithm> | ||||
| #include <cmath>  // std::abs | ||||
| #include <vector> | ||||
| 
 | ||||
| #include "hwy/base.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| // Third-party algorithms
 | ||||
| #define HAVE_AVX2SORT 0 | ||||
| #define HAVE_IPS4O 0 | ||||
| #define HAVE_PARALLEL_IPS4O (HAVE_IPS4O && 1) | ||||
| #define HAVE_PDQSORT 0 | ||||
| #define HAVE_SORT512 0 | ||||
| 
 | ||||
| #if HAVE_AVX2SORT | ||||
| HWY_PUSH_ATTRIBUTES("avx2,avx") | ||||
| #include "avx2sort.h" | ||||
| HWY_POP_ATTRIBUTES | ||||
| #endif | ||||
| #if HAVE_IPS4O | ||||
| #include "third_party/ips4o/include/ips4o.hpp" | ||||
| #include "third_party/ips4o/include/ips4o/thread_pool.hpp" | ||||
| #endif | ||||
| #if HAVE_PDQSORT | ||||
| #include "third_party/boost/allowed/sort/sort.hpp" | ||||
| #endif | ||||
| #if HAVE_SORT512 | ||||
| #include "sort512.h" | ||||
| #endif | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
// Distribution of generated input keys. The numeric suffix matches the width
// of the bit mask that MaskForDist applies to the random bits.
enum class Dist { kUniform8, kUniform16, kUniform32 };

// Returns the distributions to iterate over; kUniform8 is currently disabled.
// `inline` because this is defined in a header: a non-inline definition would
// yield duplicate symbols if two translation units of one binary include it.
inline std::vector<Dist> AllDist() {
  return {/*Dist::kUniform8,*/ Dist::kUniform16, Dist::kUniform32};
}

// Returns a short, human-readable name for `dist`. `inline` for the same
// reason as AllDist.
inline const char* DistName(Dist dist) {
  switch (dist) {
    case Dist::kUniform8:
      return "uniform8";
    case Dist::kUniform16:
      return "uniform16";
    case Dist::kUniform32:
      return "uniform32";
  }
  // Only reached if `dist` holds a value outside the enumerators above.
  return "unreachable";
}
| 
 | ||||
| template <typename T> | ||||
| class InputStats { | ||||
|  public: | ||||
|   void Notify(T value) { | ||||
|     min_ = std::min(min_, value); | ||||
|     max_ = std::max(max_, value); | ||||
|     sumf_ += static_cast<double>(value); | ||||
|     count_ += 1; | ||||
|   } | ||||
| 
 | ||||
|   bool operator==(const InputStats& other) const { | ||||
|     if (count_ != other.count_) { | ||||
|       HWY_ABORT("count %d vs %d\n", static_cast<int>(count_), | ||||
|                 static_cast<int>(other.count_)); | ||||
|     } | ||||
| 
 | ||||
|     if (min_ != other.min_ || max_ != other.max_) { | ||||
|       HWY_ABORT("minmax %f/%f vs %f/%f\n", double(min_), double(max_), | ||||
|                 double(other.min_), double(other.max_)); | ||||
|     } | ||||
| 
 | ||||
|     // Sum helps detect duplicated/lost values
 | ||||
|     if (sumf_ != other.sumf_) { | ||||
|       // Allow some tolerance because kUniform32 * num can exceed double
 | ||||
|       // precision.
 | ||||
|       const double mul = 1E-9;  // prevent destructive cancellation
 | ||||
|       const double err = std::abs(sumf_ * mul - other.sumf_ * mul); | ||||
|       if (err > 1E-3) { | ||||
|         HWY_ABORT("Sum mismatch %.15e %.15e (%f) min %g max %g\n", sumf_, | ||||
|                   other.sumf_, err, double(min_), double(max_)); | ||||
|       } | ||||
|     } | ||||
| 
 | ||||
|     return true; | ||||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   T min_ = hwy::HighestValue<T>(); | ||||
|   T max_ = hwy::LowestValue<T>(); | ||||
|   double sumf_ = 0.0; | ||||
|   size_t count_ = 0; | ||||
| }; | ||||
| 
 | ||||
// Sorting algorithms that can be benchmarked/tested. Third-party entries only
// exist if the corresponding HAVE_* macro is nonzero.
enum class Algo {
#if HAVE_AVX2SORT
  kSEA,
#endif
#if HAVE_IPS4O
  kIPS4O,
#endif
#if HAVE_PARALLEL_IPS4O
  kParallelIPS4O,
#endif
#if HAVE_PDQSORT
  kPDQ,
#endif
#if HAVE_SORT512
  kSort512,
#endif
  kStd,
  kVQSort,
  kHeap,
};

// Returns a short, human-readable name for `algo`. `inline` because this is
// defined in a header: a non-inline definition would yield duplicate symbols
// if two translation units of one binary include it.
inline const char* AlgoName(Algo algo) {
  switch (algo) {
#if HAVE_AVX2SORT
    case Algo::kSEA:
      return "sea";
#endif
#if HAVE_IPS4O
    case Algo::kIPS4O:
      return "ips4o";
#endif
#if HAVE_PARALLEL_IPS4O
    case Algo::kParallelIPS4O:
      return "par_ips4o";
#endif
#if HAVE_PDQSORT
    case Algo::kPDQ:
      return "pdq";
#endif
#if HAVE_SORT512
    case Algo::kSort512:
      return "sort512";
#endif
    case Algo::kStd:
      return "std";
    case Algo::kVQSort:
      return "vq";
    case Algo::kHeap:
      return "heap";
  }
  // Only reached if `algo` holds a value outside the enumerators above.
  return "unreachable";
}
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_INL_H_
 | ||||
| 
 | ||||
| // Per-target
 | ||||
| #if defined(HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE) == \ | ||||
|     defined(HWY_TARGET_TOGGLE) | ||||
| #ifdef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE | ||||
| #undef HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE | ||||
| #else | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE | ||||
| #endif | ||||
| 
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/traits128-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h"  // HeapSort | ||||
| #include "hwy/tests/test_util-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| class Xorshift128Plus { | ||||
|   static HWY_INLINE uint64_t SplitMix64(uint64_t z) { | ||||
|     z = (z ^ (z >> 30)) * 0xBF58476D1CE4E5B9ull; | ||||
|     z = (z ^ (z >> 27)) * 0x94D049BB133111EBull; | ||||
|     return z ^ (z >> 31); | ||||
|   } | ||||
| 
 | ||||
|  public: | ||||
|   // Generates two vectors of 64-bit seeds via SplitMix64 and stores into
 | ||||
|   // `seeds`. Generating these afresh in each ChoosePivot is too expensive.
 | ||||
|   template <class DU64> | ||||
|   static void GenerateSeeds(DU64 du64, TFromD<DU64>* HWY_RESTRICT seeds) { | ||||
|     seeds[0] = SplitMix64(0x9E3779B97F4A7C15ull); | ||||
|     for (size_t i = 1; i < 2 * Lanes(du64); ++i) { | ||||
|       seeds[i] = SplitMix64(seeds[i - 1]); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   // Need to pass in the state because vector cannot be class members.
 | ||||
|   template <class DU64> | ||||
|   static Vec<DU64> RandomBits(DU64 /* tag */, Vec<DU64>& state0, | ||||
|                               Vec<DU64>& state1) { | ||||
|     Vec<DU64> s1 = state0; | ||||
|     Vec<DU64> s0 = state1; | ||||
|     const Vec<DU64> bits = Add(s1, s0); | ||||
|     state0 = s0; | ||||
|     s1 = Xor(s1, ShiftLeft<23>(s1)); | ||||
|     state1 = Xor(s1, Xor(s0, Xor(ShiftRight<18>(s1), ShiftRight<5>(s0)))); | ||||
|     return bits; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <typename T, class DU64, HWY_IF_NOT_FLOAT(T)> | ||||
| Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1, | ||||
|                        const Vec<DU64> mask) { | ||||
|   const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1); | ||||
|   return And(bits, mask); | ||||
| } | ||||
| 
 | ||||
| // Important to avoid denormals, which are flushed to zero by SIMD but not
 | ||||
| // scalar sorts, and NaN, which may be ordered differently in scalar vs. SIMD.
 | ||||
| template <typename T, class DU64, HWY_IF_FLOAT(T)> | ||||
| Vec<DU64> RandomValues(DU64 du64, Vec<DU64>& s0, Vec<DU64>& s1, | ||||
|                        const Vec<DU64> mask) { | ||||
|   const Vec<DU64> bits = Xorshift128Plus::RandomBits(du64, s0, s1); | ||||
|   const Vec<DU64> values = And(bits, mask); | ||||
| #if HWY_TARGET == HWY_SCALAR  // Cannot repartition u64 to i32
 | ||||
|   const RebindToSigned<DU64> di; | ||||
| #else | ||||
|   const Repartition<MakeSigned<T>, DU64> di; | ||||
| #endif | ||||
|   const RebindToFloat<decltype(di)> df; | ||||
|   // Avoid NaN/denormal by converting from (range-limited) integer.
 | ||||
|   const Vec<DU64> no_nan = | ||||
|       And(values, Set(du64, MantissaMask<MakeUnsigned<T>>())); | ||||
|   return BitCast(du64, ConvertTo(df, BitCast(di, no_nan))); | ||||
| } | ||||
| 
 | ||||
| template <class DU64> | ||||
| Vec<DU64> MaskForDist(DU64 du64, const Dist dist, size_t sizeof_t) { | ||||
|   switch (sizeof_t) { | ||||
|     case 2: | ||||
|       return Set(du64, (dist == Dist::kUniform8) ? 0x00FF00FF00FF00FFull | ||||
|                                                  : 0xFFFFFFFFFFFFFFFFull); | ||||
|     case 4: | ||||
|       return Set(du64, (dist == Dist::kUniform8)    ? 0x000000FF000000FFull | ||||
|                        : (dist == Dist::kUniform16) ? 0x0000FFFF0000FFFFull | ||||
|                                                     : 0xFFFFFFFFFFFFFFFFull); | ||||
|     case 8: | ||||
|       return Set(du64, (dist == Dist::kUniform8)    ? 0x00000000000000FFull | ||||
|                        : (dist == Dist::kUniform16) ? 0x000000000000FFFFull | ||||
|                                                     : 0x00000000FFFFFFFFull); | ||||
|     default: | ||||
|       HWY_ABORT("Logic error"); | ||||
|       return Zero(du64); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| InputStats<T> GenerateInput(const Dist dist, T* v, size_t num) { | ||||
|   SortTag<uint64_t> du64; | ||||
|   using VU64 = Vec<decltype(du64)>; | ||||
|   const size_t N64 = Lanes(du64); | ||||
|   auto buf = hwy::AllocateAligned<uint64_t>(2 * N64); | ||||
|   Xorshift128Plus::GenerateSeeds(du64, buf.get()); | ||||
|   auto s0 = Load(du64, buf.get()); | ||||
|   auto s1 = Load(du64, buf.get() + N64); | ||||
| 
 | ||||
|   const VU64 mask = MaskForDist(du64, dist, sizeof(T)); | ||||
| 
 | ||||
|   const Repartition<T, decltype(du64)> d; | ||||
|   const size_t N = Lanes(d); | ||||
|   size_t i = 0; | ||||
|   for (; i + N <= num; i += N) { | ||||
|     const VU64 bits = RandomValues<T>(du64, s0, s1, mask); | ||||
| #if HWY_ARCH_RVV | ||||
|     // v may not be 64-bit aligned
 | ||||
|     StoreU(bits, du64, buf.get()); | ||||
|     memcpy(v + i, buf.get(), N64 * sizeof(uint64_t)); | ||||
| #else | ||||
|     StoreU(bits, du64, reinterpret_cast<uint64_t*>(v + i)); | ||||
| #endif | ||||
|   } | ||||
|   if (i < num) { | ||||
|     const VU64 bits = RandomValues<T>(du64, s0, s1, mask); | ||||
|     StoreU(bits, du64, buf.get()); | ||||
|     memcpy(v + i, buf.get(), (num - i) * sizeof(T)); | ||||
|   } | ||||
| 
 | ||||
|   InputStats<T> input_stats; | ||||
|   for (size_t i = 0; i < num; ++i) { | ||||
|     input_stats.Notify(v[i]); | ||||
|   } | ||||
|   return input_stats; | ||||
| } | ||||
| 
 | ||||
| struct ThreadLocal { | ||||
|   Sorter sorter; | ||||
| }; | ||||
| 
 | ||||
| struct SharedState { | ||||
| #if HAVE_PARALLEL_IPS4O | ||||
|   ips4o::StdThreadPool pool{ | ||||
|       HWY_MIN(16, static_cast<int>(std::thread::hardware_concurrency() / 2))}; | ||||
| #endif | ||||
|   std::vector<ThreadLocal> tls{1}; | ||||
| }; | ||||
| 
 | ||||
| template <class Order, typename T> | ||||
| void Run(Algo algo, T* HWY_RESTRICT inout, size_t num, SharedState& shared, | ||||
|          size_t thread) { | ||||
|   using detail::HeapSort; | ||||
|   using detail::LaneTraits; | ||||
|   using detail::SharedTraits; | ||||
| 
 | ||||
|   switch (algo) { | ||||
| #if HAVE_AVX2SORT | ||||
|     case Algo::kSEA: | ||||
|       return avx2::quicksort(inout, static_cast<int>(num)); | ||||
| #endif | ||||
| 
 | ||||
| #if HAVE_IPS4O | ||||
|     case Algo::kIPS4O: | ||||
|       if (Order().IsAscending()) { | ||||
|         return ips4o::sort(inout, inout + num, std::less<T>()); | ||||
|       } else { | ||||
|         return ips4o::sort(inout, inout + num, std::greater<T>()); | ||||
|       } | ||||
| #endif | ||||
| 
 | ||||
| #if HAVE_PARALLEL_IPS4O | ||||
|     case Algo::kParallelIPS4O: | ||||
|       if (Order().IsAscending()) { | ||||
|         return ips4o::parallel::sort(inout, inout + num, std::less<T>()); | ||||
|       } else { | ||||
|         return ips4o::parallel::sort(inout, inout + num, std::greater<T>()); | ||||
|       } | ||||
| #endif | ||||
| 
 | ||||
| #if HAVE_SORT512 | ||||
|     case Algo::kSort512: | ||||
|       HWY_ABORT("not supported"); | ||||
|       //    return Sort512::Sort(inout, num);
 | ||||
| #endif | ||||
| 
 | ||||
| #if HAVE_PDQSORT | ||||
|     case Algo::kPDQ: | ||||
|       if (Order().IsAscending()) { | ||||
|         return boost::sort::pdqsort_branchless(inout, inout + num, | ||||
|                                                std::less<T>()); | ||||
|       } else { | ||||
|         return boost::sort::pdqsort_branchless(inout, inout + num, | ||||
|                                                std::greater<T>()); | ||||
|       } | ||||
| #endif | ||||
| 
 | ||||
|     case Algo::kStd: | ||||
|       if (Order().IsAscending()) { | ||||
|         return std::sort(inout, inout + num, std::less<T>()); | ||||
|       } else { | ||||
|         return std::sort(inout, inout + num, std::greater<T>()); | ||||
|       } | ||||
| 
 | ||||
|     case Algo::kVQSort: | ||||
|       return shared.tls[thread].sorter(inout, num, Order()); | ||||
| 
 | ||||
|     case Algo::kHeap: | ||||
|       HWY_ASSERT(sizeof(T) < 16); | ||||
|       if (Order().IsAscending()) { | ||||
|         const SharedTraits<LaneTraits<detail::OrderAscending>> st; | ||||
|         return HeapSort(st, inout, num); | ||||
|       } else { | ||||
|         const SharedTraits<LaneTraits<detail::OrderDescending>> st; | ||||
|         return HeapSort(st, inout, num); | ||||
|       } | ||||
| 
 | ||||
|     default: | ||||
|       HWY_ABORT("Not implemented"); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_ALGO_TOGGLE
 | ||||
							
								
								
									
										243
									
								
								third_party/highway/hwy/contrib/sort/bench_parallel.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										243
									
								
								third_party/highway/hwy/contrib/sort/bench_parallel.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,243 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Concurrent, independent sorts for generating more memory traffic and testing
 | ||||
| // scalability.
 | ||||
| 
 | ||||
| // clang-format off
 | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_parallel.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/algo-inl.h" | ||||
| #include "hwy/contrib/sort/result-inl.h" | ||||
| #include "hwy/aligned_allocator.h" | ||||
| // Last
 | ||||
| #include "hwy/tests/test_util-inl.h" | ||||
| // clang-format on
 | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| #include <stdio.h> | ||||
| 
 | ||||
| #include <condition_variable>  //NOLINT
 | ||||
| #include <functional> | ||||
| #include <memory> | ||||
| #include <mutex>   //NOLINT
 | ||||
| #include <thread>  //NOLINT
 | ||||
| #include <utility> | ||||
| #include <vector> | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| namespace { | ||||
| 
 | ||||
| #if HWY_TARGET != HWY_SCALAR | ||||
| 
 | ||||
| class ThreadPool { | ||||
|  public: | ||||
|   // Starts the given number of worker threads and blocks until they are ready.
 | ||||
|   explicit ThreadPool( | ||||
|       const size_t num_threads = std::thread::hardware_concurrency() / 2) | ||||
|       : num_threads_(num_threads) { | ||||
|     HWY_ASSERT(num_threads_ > 0); | ||||
|     threads_.reserve(num_threads_); | ||||
|     for (size_t i = 0; i < num_threads_; ++i) { | ||||
|       threads_.emplace_back(ThreadFunc, this, i); | ||||
|     } | ||||
| 
 | ||||
|     WorkersReadyBarrier(); | ||||
|   } | ||||
| 
 | ||||
|   ThreadPool(const ThreadPool&) = delete; | ||||
|   ThreadPool& operator&(const ThreadPool&) = delete; | ||||
| 
 | ||||
|   // Waits for all threads to exit.
 | ||||
|   ~ThreadPool() { | ||||
|     StartWorkers(kWorkerExit); | ||||
| 
 | ||||
|     for (std::thread& thread : threads_) { | ||||
|       thread.join(); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   size_t NumThreads() const { return threads_.size(); } | ||||
| 
 | ||||
|   template <class Func> | ||||
|   void RunOnThreads(size_t max_threads, const Func& func) { | ||||
|     task_ = &CallClosure<Func>; | ||||
|     data_ = &func; | ||||
|     StartWorkers(max_threads); | ||||
|     WorkersReadyBarrier(); | ||||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   // After construction and between calls to Run, workers are "ready", i.e.
 | ||||
|   // waiting on worker_start_cv_. They are "started" by sending a "command"
 | ||||
|   // and notifying all worker_start_cv_ waiters. (That is why all workers
 | ||||
|   // must be ready/waiting - otherwise, the notification will not reach all of
 | ||||
|   // them and the main thread waits in vain for them to report readiness.)
 | ||||
|   using WorkerCommand = uint64_t; | ||||
| 
 | ||||
|   static constexpr WorkerCommand kWorkerWait = ~1ULL; | ||||
|   static constexpr WorkerCommand kWorkerExit = ~2ULL; | ||||
| 
 | ||||
|   // Calls a closure (lambda with captures).
 | ||||
|   template <class Closure> | ||||
|   static void CallClosure(const void* f, size_t thread) { | ||||
|     (*reinterpret_cast<const Closure*>(f))(thread); | ||||
|   } | ||||
| 
 | ||||
|   void WorkersReadyBarrier() { | ||||
|     std::unique_lock<std::mutex> lock(mutex_); | ||||
|     // Typically only a single iteration.
 | ||||
|     while (workers_ready_ != threads_.size()) { | ||||
|       workers_ready_cv_.wait(lock); | ||||
|     } | ||||
|     workers_ready_ = 0; | ||||
| 
 | ||||
|     // Safely handle spurious worker wakeups.
 | ||||
|     worker_start_command_ = kWorkerWait; | ||||
|   } | ||||
| 
 | ||||
|   // Precondition: all workers are ready.
 | ||||
|   void StartWorkers(const WorkerCommand worker_command) { | ||||
|     std::unique_lock<std::mutex> lock(mutex_); | ||||
|     worker_start_command_ = worker_command; | ||||
|     // Workers will need this lock, so release it before they wake up.
 | ||||
|     lock.unlock(); | ||||
|     worker_start_cv_.notify_all(); | ||||
|   } | ||||
| 
 | ||||
|   static void ThreadFunc(ThreadPool* self, size_t thread) { | ||||
|     // Until kWorkerExit command received:
 | ||||
|     for (;;) { | ||||
|       std::unique_lock<std::mutex> lock(self->mutex_); | ||||
|       // Notify main thread that this thread is ready.
 | ||||
|       if (++self->workers_ready_ == self->num_threads_) { | ||||
|         self->workers_ready_cv_.notify_one(); | ||||
|       } | ||||
|     RESUME_WAIT: | ||||
|       // Wait for a command.
 | ||||
|       self->worker_start_cv_.wait(lock); | ||||
|       const WorkerCommand command = self->worker_start_command_; | ||||
|       switch (command) { | ||||
|         case kWorkerWait:    // spurious wakeup:
 | ||||
|           goto RESUME_WAIT;  // lock still held, avoid incrementing ready.
 | ||||
|         case kWorkerExit: | ||||
|           return;  // exits thread
 | ||||
|         default: | ||||
|           break; | ||||
|       } | ||||
| 
 | ||||
|       lock.unlock(); | ||||
|       // Command is the maximum number of threads that should run the task.
 | ||||
|       HWY_ASSERT(command < self->NumThreads()); | ||||
|       if (thread < command) { | ||||
|         self->task_(self->data_, thread); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   const size_t num_threads_; | ||||
| 
 | ||||
|   // Unmodified after ctor, but cannot be const because we call thread::join().
 | ||||
|   std::vector<std::thread> threads_; | ||||
| 
 | ||||
|   std::mutex mutex_;  // guards both cv and their variables.
 | ||||
|   std::condition_variable workers_ready_cv_; | ||||
|   size_t workers_ready_ = 0; | ||||
|   std::condition_variable worker_start_cv_; | ||||
|   WorkerCommand worker_start_command_; | ||||
| 
 | ||||
|   // Written by main thread, read by workers (after mutex lock/unlock).
 | ||||
|   std::function<void(const void*, size_t)> task_;  // points to CallClosure
 | ||||
|   const void* data_;                               // points to caller's Func
 | ||||
| }; | ||||
| 
 | ||||
| template <class Order, typename T> | ||||
| void RunWithoutVerify(const Dist dist, const size_t num, const Algo algo, | ||||
|                       SharedState& shared, size_t thread) { | ||||
|   auto aligned = hwy::AllocateAligned<T>(num); | ||||
| 
 | ||||
|   (void)GenerateInput(dist, aligned.get(), num); | ||||
| 
 | ||||
|   const Timestamp t0; | ||||
|   Run<Order>(algo, aligned.get(), num, shared, thread); | ||||
|   HWY_ASSERT(aligned[0] < aligned[num - 1]); | ||||
| } | ||||
| 
 | ||||
| void BenchParallel() { | ||||
|   // Not interested in benchmark results for other targets
 | ||||
|   if (HWY_TARGET != HWY_AVX3) return; | ||||
| 
 | ||||
|   ThreadPool pool; | ||||
|   const size_t NT = pool.NumThreads(); | ||||
| 
 | ||||
|   using T = int64_t; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
| 
 | ||||
|   size_t num = 100 * 1000 * 1000; | ||||
| 
 | ||||
| #if HAVE_IPS4O | ||||
|   const Algo algo = Algo::kIPS4O; | ||||
| #else | ||||
|   const Algo algo = Algo::kVQSort; | ||||
| #endif | ||||
|   const Dist dist = Dist::kUniform16; | ||||
| 
 | ||||
|   SharedState shared; | ||||
|   shared.tls.resize(NT); | ||||
| 
 | ||||
|   std::vector<Result> results; | ||||
|   for (size_t nt = 1; nt < NT; nt += HWY_MAX(1, NT / 16)) { | ||||
|     Timestamp t0; | ||||
|     // Default capture because MSVC wants algo/dist but clang does not.
 | ||||
|     pool.RunOnThreads(nt, [=, &shared](size_t thread) { | ||||
|       RunWithoutVerify<SortAscending, T>(dist, num, algo, shared, thread); | ||||
|     }); | ||||
|     const double sec = SecondsSince(t0); | ||||
|     results.push_back(MakeResult<T>(algo, dist, st, num, nt, sec)); | ||||
|     results.back().Print(); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| void BenchParallel() {} | ||||
| #endif | ||||
| 
 | ||||
| }  // namespace
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| 
 | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_BEFORE_TEST(BenchParallel); | ||||
| HWY_EXPORT_AND_TEST_P(BenchParallel, BenchParallel); | ||||
| }  // namespace
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| // Ought not to be necessary, but without this, no tests run on RVV.
 | ||||
| int main(int argc, char** argv) { | ||||
|   ::testing::InitGoogleTest(&argc, argv); | ||||
|   return RUN_ALL_TESTS(); | ||||
| } | ||||
| 
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										259
									
								
								third_party/highway/hwy/contrib/sort/bench_sort.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										259
									
								
								third_party/highway/hwy/contrib/sort/bench_sort.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,259 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // clang-format off
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/bench_sort.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/algo-inl.h" | ||||
| #include "hwy/contrib/sort/result-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| #include "hwy/contrib/sort/sorting_networks-inl.h"  // SharedTraits
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/traits128-inl.h" | ||||
| #include "hwy/tests/test_util-inl.h" | ||||
| // clang-format on
 | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| #include <stdio.h> | ||||
| #include <string.h>  // memcpy
 | ||||
| 
 | ||||
| #include <vector> | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| namespace { | ||||
| using detail::LaneTraits; | ||||
| using detail::OrderAscending; | ||||
| using detail::OrderDescending; | ||||
| using detail::SharedTraits; | ||||
| 
 | ||||
| #if HWY_TARGET != HWY_SCALAR | ||||
| using detail::OrderAscending128; | ||||
| using detail::OrderDescending128; | ||||
| using detail::Traits128; | ||||
| 
 | ||||
| template <class Traits, typename T> | ||||
| HWY_NOINLINE void BenchPartition() { | ||||
|   const SortTag<T> d; | ||||
|   detail::SharedTraits<Traits> st; | ||||
|   const Dist dist = Dist::kUniform8; | ||||
|   double sum = 0.0; | ||||
| 
 | ||||
|   const size_t max_log2 = AdjustedLog2Reps(20); | ||||
|   for (size_t log2 = max_log2; log2 < max_log2 + 1; ++log2) { | ||||
|     const size_t num = 1ull << log2; | ||||
|     auto aligned = hwy::AllocateAligned<T>(num); | ||||
|     auto buf = | ||||
|         hwy::AllocateAligned<T>(hwy::SortConstants::PartitionBufNum(Lanes(d))); | ||||
| 
 | ||||
|     std::vector<double> seconds; | ||||
|     const size_t num_reps = (1ull << (14 - log2 / 2)) * kReps; | ||||
|     for (size_t rep = 0; rep < num_reps; ++rep) { | ||||
|       (void)GenerateInput(dist, aligned.get(), num); | ||||
| 
 | ||||
|       const Timestamp t0; | ||||
| 
 | ||||
|       detail::Partition(d, st, aligned.get(), 0, num - 1, Set(d, T(128)), | ||||
|                         buf.get()); | ||||
|       seconds.push_back(SecondsSince(t0)); | ||||
|       // 'Use' the result to prevent optimizing out the partition.
 | ||||
|       sum += static_cast<double>(aligned.get()[num / 2]); | ||||
|     } | ||||
| 
 | ||||
|     MakeResult<T>(Algo::kVQSort, dist, st, num, 1, | ||||
|                   SummarizeMeasurements(seconds)) | ||||
|         .Print(); | ||||
|   } | ||||
|   HWY_ASSERT(sum != 999999);  // Prevent optimizing out
 | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void BenchAllPartition() { | ||||
|   // Not interested in benchmark results for these targets
 | ||||
|   if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 || | ||||
|       HWY_TARGET == HWY_AVX2) { | ||||
|     return; | ||||
|   } | ||||
| 
 | ||||
|   BenchPartition<LaneTraits<OrderDescending>, float>(); | ||||
|   BenchPartition<LaneTraits<OrderAscending>, int64_t>(); | ||||
|   BenchPartition<Traits128<OrderDescending128>, uint64_t>(); | ||||
| } | ||||
| 
 | ||||
| template <class Traits, typename T> | ||||
| HWY_NOINLINE void BenchBase(std::vector<Result>& results) { | ||||
|   // Not interested in benchmark results for these targets
 | ||||
|   if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) { | ||||
|     return; | ||||
|   } | ||||
| 
 | ||||
|   const SortTag<T> d; | ||||
|   detail::SharedTraits<Traits> st; | ||||
|   const Dist dist = Dist::kUniform32; | ||||
| 
 | ||||
|   const size_t N = Lanes(d); | ||||
|   const size_t num = SortConstants::BaseCaseNum(N); | ||||
|   auto keys = hwy::AllocateAligned<T>(num); | ||||
|   auto buf = hwy::AllocateAligned<T>(num + N); | ||||
| 
 | ||||
|   std::vector<double> seconds; | ||||
|   double sum = 0;                             // prevents elision
 | ||||
|   constexpr size_t kMul = AdjustedReps(600);  // ensures long enough to measure
 | ||||
| 
 | ||||
|   for (size_t rep = 0; rep < kReps; ++rep) { | ||||
|     InputStats<T> input_stats = GenerateInput(dist, keys.get(), num); | ||||
| 
 | ||||
|     const Timestamp t0; | ||||
|     for (size_t i = 0; i < kMul; ++i) { | ||||
|       detail::BaseCase(d, st, keys.get(), num, buf.get()); | ||||
|       sum += static_cast<double>(keys[0]); | ||||
|     } | ||||
|     seconds.push_back(SecondsSince(t0)); | ||||
|     // printf("%f\n", seconds.back());
 | ||||
| 
 | ||||
|     HWY_ASSERT(VerifySort(st, input_stats, keys.get(), num, "BenchBase")); | ||||
|   } | ||||
|   HWY_ASSERT(sum < 1E99); | ||||
|   results.push_back(MakeResult<T>(Algo::kVQSort, dist, st, num * kMul, 1, | ||||
|                                   SummarizeMeasurements(seconds))); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void BenchAllBase() { | ||||
|   // Not interested in benchmark results for these targets
 | ||||
|   if (HWY_TARGET == HWY_SSSE3) { | ||||
|     return; | ||||
|   } | ||||
| 
 | ||||
|   std::vector<Result> results; | ||||
|   BenchBase<LaneTraits<OrderAscending>, float>(results); | ||||
|   BenchBase<LaneTraits<OrderDescending>, int64_t>(results); | ||||
|   BenchBase<Traits128<OrderAscending128>, uint64_t>(results); | ||||
|   for (const Result& r : results) { | ||||
|     r.Print(); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| std::vector<Algo> AlgoForBench() { | ||||
|   return { | ||||
| #if HAVE_AVX2SORT | ||||
|     Algo::kSEA, | ||||
| #endif | ||||
| #if HAVE_PARALLEL_IPS4O | ||||
|         Algo::kParallelIPS4O, | ||||
| #endif | ||||
| #if HAVE_IPS4O | ||||
|         Algo::kIPS4O, | ||||
| #endif | ||||
| #if HAVE_PDQSORT | ||||
|         Algo::kPDQ, | ||||
| #endif | ||||
| #if HAVE_SORT512 | ||||
|         Algo::kSort512, | ||||
| #endif | ||||
|         // Algo::kStd,  // too slow to always benchmark
 | ||||
|         // Algo::kHeap,  // too slow to always benchmark
 | ||||
|         Algo::kVQSort, | ||||
|   }; | ||||
| } | ||||
| 
 | ||||
| template <class Traits, typename T> | ||||
| HWY_NOINLINE void BenchSort(size_t num) { | ||||
|   SharedState shared; | ||||
|   detail::SharedTraits<Traits> st; | ||||
|   auto aligned = hwy::AllocateAligned<T>(num); | ||||
|   for (Algo algo : AlgoForBench()) { | ||||
|     for (Dist dist : AllDist()) { | ||||
|       std::vector<double> seconds; | ||||
|       for (size_t rep = 0; rep < kReps; ++rep) { | ||||
|         InputStats<T> input_stats = GenerateInput(dist, aligned.get(), num); | ||||
| 
 | ||||
|         const Timestamp t0; | ||||
|         Run<typename Traits::Order>(algo, aligned.get(), num, shared, | ||||
|                                     /*thread=*/0); | ||||
|         seconds.push_back(SecondsSince(t0)); | ||||
|         // printf("%f\n", seconds.back());
 | ||||
| 
 | ||||
|         HWY_ASSERT( | ||||
|             VerifySort(st, input_stats, aligned.get(), num, "BenchSort")); | ||||
|       } | ||||
|       MakeResult<T>(algo, dist, st, num, 1, SummarizeMeasurements(seconds)) | ||||
|           .Print(); | ||||
|     }  // dist
 | ||||
|   }    // algo
 | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void BenchAllSort() { | ||||
|   // Not interested in benchmark results for these targets
 | ||||
|   if (HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4) { | ||||
|     return; | ||||
|   } | ||||
| 
 | ||||
|   constexpr size_t K = 1000; | ||||
|   constexpr size_t M = K * K; | ||||
|   (void)K; | ||||
|   (void)M; | ||||
|   for (size_t num : { | ||||
| #if HAVE_PARALLEL_IPS4O | ||||
|          100 * M, | ||||
| #else | ||||
|          AdjustedReps(1 * M), | ||||
| #endif | ||||
|        }) { | ||||
|     // BenchSort<LaneTraits<OrderAscending>, float>(num);
 | ||||
|     // BenchSort<LaneTraits<OrderDescending>, double>(num);
 | ||||
|     // BenchSort<LaneTraits<OrderAscending>, int16_t>(num);
 | ||||
|     BenchSort<LaneTraits<OrderDescending>, int32_t>(num); | ||||
|     BenchSort<LaneTraits<OrderAscending>, int64_t>(num); | ||||
|     // BenchSort<LaneTraits<OrderDescending>, uint16_t>(num);
 | ||||
|     // BenchSort<LaneTraits<OrderDescending>, uint32_t>(num);
 | ||||
|     // BenchSort<LaneTraits<OrderAscending>, uint64_t>(num);
 | ||||
| 
 | ||||
|     BenchSort<Traits128<OrderAscending128>, uint64_t>(num); | ||||
|     // BenchSort<Traits128<OrderAscending128>, uint64_t>(num);
 | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #else | ||||
// No-op stand-ins compiled when HWY_TARGET == HWY_SCALAR (the #else branch of
// the "#if HWY_TARGET != HWY_SCALAR" above), so that the three names used by
// the test registration below are still defined for that target.
| void BenchAllPartition() {} | ||||
| void BenchAllBase() {} | ||||
| void BenchAllSort() {} | ||||
| #endif | ||||
| 
 | ||||
| }  // namespace
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| 
 | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_BEFORE_TEST(BenchSort); | ||||
| HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllPartition); | ||||
| HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllBase); | ||||
| HWY_EXPORT_AND_TEST_P(BenchSort, BenchAllSort); | ||||
| }  // namespace
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| // Ought not to be necessary, but without this, no tests run on RVV.
 | ||||
| int main(int argc, char** argv) { | ||||
|   ::testing::InitGoogleTest(&argc, argv); | ||||
|   return RUN_ALL_TESTS(); | ||||
| } | ||||
| 
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										30
									
								
								third_party/highway/hwy/contrib/sort/disabled_targets.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								third_party/highway/hwy/contrib/sort/disabled_targets.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,30 @@ | |||
| // Copyright 2022 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Speed up MSVC builds by building fewer targets. This header must be included
 | ||||
| // from all TUs that contain a HWY_DYNAMIC_DISPATCH to vqsort, i.e. vqsort_*.cc.
 | ||||
| // However, users of vqsort.h are unaffected.
 | ||||
| 
 | ||||
| #ifndef HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_ | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_ | ||||
| 
 | ||||
| #include "hwy/base.h" | ||||
| 
 | ||||
// MSVC only: discard whatever HWY_DISABLED_TARGETS was defined as before and
// replace it with a mask that disables the SSSE3 and SSE4 targets.
| #if HWY_COMPILER_MSVC | ||||
| #undef HWY_DISABLED_TARGETS | ||||
| // HWY_SCALAR remains, so there will still be a valid target to call.
 | ||||
| #define HWY_DISABLED_TARGETS (HWY_SSSE3 | HWY_SSE4) | ||||
| #endif  // HWY_COMPILER_MSVC
 | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_DISABLED_TARGETS_H_
 | ||||
							
								
								
									
										190
									
								
								third_party/highway/hwy/contrib/sort/print_network.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										190
									
								
								third_party/highway/hwy/contrib/sort/print_network.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,190 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include <stdio.h> | ||||
| 
 | ||||
| #include <algorithm> | ||||
| 
 | ||||
| #include "hwy/base.h" | ||||
| 
 | ||||
// Based on A.7 in "Entwurf und Implementierung vektorisierter
// Sortieralgorithmen" and code by Mark Blacher.
//
// Each PrintMergeNetwork16xN function writes source lines for a merge network
// over the 16 rows v0..vf to stdout. The four networks share the same sequence
// of row reversals and Sort2 calls and differ only in the name of the reversal
// function and in the per-row stages appended at the end, so the shared part
// lives in the helpers below.

// Prints "v<i> = st.<func>(d, v<i>);" with the row index in hexadecimal.
static void PrintRowOp(const char* func, int i) {
  printf("v%x = st.%s(d, v%x);\n", i, func, i);
}

// Prints a Sort2 call on rows `a` and `b` (indices in hexadecimal).
static void PrintSort2(int a, int b) {
  printf("st.Sort2(d, v%x, v%x);\n", a, b);
}

// Prints one application of `func` for each of the 16 rows, in order.
static void PrintAllRows(const char* func) {
  for (int i = 0; i < 16; ++i) {
    PrintRowOp(func, i);
  }
}

// Prints the part common to all networks: four rounds, each reversing some
// rows with `reverse` and then pairing rows with Sort2, over groups of
// 16, 8, 4 and finally 2 rows.
static void PrintMergeStages(const char* reverse) {
  // Groups of 16 rows: reverse the upper half, then pair row i with 15 - i.
  for (int i = 8; i < 16; ++i) {
    PrintRowOp(reverse, i);
  }
  for (int i = 0; i < 8; ++i) {
    PrintSort2(i, 15 - i);
  }

  // Groups of 8 rows.
  for (int i = 0; i < 4; ++i) {
    PrintRowOp(reverse, i + 4);
    PrintRowOp(reverse, i + 12);
  }
  for (int i = 0; i < 4; ++i) {
    PrintSort2(i, 7 - i);
    PrintSort2(i + 8, 15 - i);
  }

  // Groups of 4 rows.
  for (int i = 0; i < 16; i += 4) {
    PrintRowOp(reverse, i + 2);
    PrintRowOp(reverse, i + 3);
  }
  for (int i = 0; i < 16; i += 4) {
    PrintSort2(i, i + 3);
    PrintSort2(i + 1, i + 2);
  }

  // Groups of 2 rows.
  for (int i = 0; i < 16; i += 2) {
    PrintRowOp(reverse, i + 1);
  }
  for (int i = 0; i < 16; i += 2) {
    PrintSort2(i, i + 1);
  }
}

// Network using SwapAdjacent; unlike the others, the blank separator line is
// printed after the network rather than before it.
void PrintMergeNetwork16x2() {
  PrintMergeStages("SwapAdjacent");
  PrintAllRows("SortPairsDistance1<kOrder>");
  printf("\n");
}

// Network using Reverse4, preceded by a blank line.
void PrintMergeNetwork16x4() {
  printf("\n");

  PrintMergeStages("Reverse4");
  PrintAllRows("SortPairsReverse4");
  PrintAllRows("SortPairsDistance1<kOrder>");
}

// Network using ReverseKeys8, preceded by a blank line.
void PrintMergeNetwork16x8() {
  printf("\n");

  PrintMergeStages("ReverseKeys8");
  PrintAllRows("SortPairsReverse8");
  PrintAllRows("SortPairsDistance2<kOrder>");
  PrintAllRows("SortPairsDistance1<kOrder>");
}

// Network using ReverseKeys16, preceded by a blank line.
void PrintMergeNetwork16x16() {
  printf("\n");

  PrintMergeStages("ReverseKeys16");
  PrintAllRows("SortPairsReverse16<kOrder>");
  PrintAllRows("SortPairsDistance4<kOrder>");
  PrintAllRows("SortPairsDistance2<kOrder>");
  PrintAllRows("SortPairsDistance1<kOrder>");
}
| 
 | ||||
| int main(int argc, char** argv) { | ||||
|   PrintMergeNetwork16x2(); | ||||
|   PrintMergeNetwork16x4(); | ||||
|   PrintMergeNetwork16x8(); | ||||
|   PrintMergeNetwork16x16(); | ||||
|   return 0; | ||||
| } | ||||
							
								
								
									
										149
									
								
								third_party/highway/hwy/contrib/sort/result-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										149
									
								
								third_party/highway/hwy/contrib/sort/result-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,149 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/algo-inl.h" | ||||
| 
 | ||||
| // Normal include guard for non-SIMD parts
 | ||||
| #ifndef HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_ | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_ | ||||
| 
 | ||||
| #include <time.h> | ||||
| 
 | ||||
| #include <algorithm>  // std::sort | ||||
| #include <string> | ||||
| 
 | ||||
| #include "hwy/base.h" | ||||
| #include "hwy/nanobenchmark.h" | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
| struct Timestamp { | ||||
|   Timestamp() { t = platform::Now(); } | ||||
|   double t; | ||||
| }; | ||||
| 
 | ||||
| double SecondsSince(const Timestamp& t0) { | ||||
|   const Timestamp t1; | ||||
|   return t1.t - t0.t; | ||||
| } | ||||
| 
 | ||||
| constexpr size_t kReps = 30; | ||||
| 
 | ||||
// Returns a trimmed mean of `seconds` (we don't want to run an
// out-of-L3-cache sort often enough for the mode to be reliable).
//
// Sorts `seconds` in place (ascending) and averages the samples with indices
// in [size / 4, size / 2), i.e. the second-fastest quarter. The window scales
// with the number of samples rather than with a fixed repetition count, so it
// never reaches outside the vector and stays a quarter of the data for any
// sample count. For 30 samples this is the range [7, 15).
//
// Returns 0.0 for an empty vector. If the window would be empty (a single
// sample), all samples are averaged instead.
//
// Marked inline because this definition lives in a header.
inline double SummarizeMeasurements(std::vector<double>& seconds) {
  const size_t num = seconds.size();
  if (num == 0) return 0.0;

  std::sort(seconds.begin(), seconds.end());

  const size_t begin = num / 4;
  size_t end = num / 2;
  if (end <= begin) end = num;  // only reachable when num == 1

  double sum = 0.0;
  for (size_t i = begin; i < end; ++i) {
    sum += seconds[i];
  }
  return sum / static_cast<double>(end - begin);
}
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_INL_H_
 | ||||
| 
 | ||||
| // Per-target
 | ||||
| #if defined(HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE) == \ | ||||
|     defined(HWY_TARGET_TOGGLE) | ||||
| #ifdef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE | ||||
| #undef HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE | ||||
| #else | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE | ||||
| #endif | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| struct Result { | ||||
|   Result() {} | ||||
|   Result(const uint32_t target, const Algo algo, Dist dist, bool is128, | ||||
|          size_t num, size_t num_threads, double sec, size_t sizeof_t, | ||||
|          const char* type_name) | ||||
|       : target(target), | ||||
|         algo(algo), | ||||
|         dist(dist), | ||||
|         is128(is128), | ||||
|         num(num), | ||||
|         num_threads(num_threads), | ||||
|         sec(sec), | ||||
|         sizeof_t(sizeof_t), | ||||
|         type_name(type_name) {} | ||||
| 
 | ||||
|   void Print() const { | ||||
|     const double bytes = static_cast<double>(num) * | ||||
|                          static_cast<double>(num_threads) * | ||||
|                          static_cast<double>(sizeof_t); | ||||
|     printf("%10s: %12s: %7s: %9s: %.2E %4.0f MB/s (%2zu threads)\n", | ||||
|            hwy::TargetName(target), AlgoName(algo), | ||||
|            is128 ? "u128" : type_name.c_str(), DistName(dist), | ||||
|            static_cast<double>(num), bytes * 1E-6 / sec, num_threads); | ||||
|   } | ||||
| 
 | ||||
|   uint32_t target; | ||||
|   Algo algo; | ||||
|   Dist dist; | ||||
|   bool is128; | ||||
|   size_t num = 0; | ||||
|   size_t num_threads = 0; | ||||
|   double sec = 0.0; | ||||
|   size_t sizeof_t = 0; | ||||
|   std::string type_name; | ||||
| }; | ||||
| 
 | ||||
| template <typename T, class Traits> | ||||
| Result MakeResult(const Algo algo, Dist dist, Traits st, size_t num, | ||||
|                   size_t num_threads, double sec) { | ||||
|   char string100[100]; | ||||
|   hwy::detail::TypeName(hwy::detail::MakeTypeInfo<T>(), 1, string100); | ||||
|   return Result(HWY_TARGET, algo, dist, st.Is128(), num, num_threads, sec, | ||||
|                 sizeof(T), string100); | ||||
| } | ||||
| 
 | ||||
| template <class Traits, typename T> | ||||
| bool VerifySort(Traits st, const InputStats<T>& input_stats, const T* out, | ||||
|                 size_t num, const char* caller) { | ||||
|   constexpr size_t N1 = st.Is128() ? 2 : 1; | ||||
|   HWY_ASSERT(num >= N1); | ||||
| 
 | ||||
|   InputStats<T> output_stats; | ||||
|   // Ensure it matches the sort order
 | ||||
|   for (size_t i = 0; i < num - N1; i += N1) { | ||||
|     output_stats.Notify(out[i]); | ||||
|     if (N1 == 2) output_stats.Notify(out[i + 1]); | ||||
|     // Reverse order instead of checking !Compare1 so we accept equal keys.
 | ||||
|     if (st.Compare1(out + i + N1, out + i)) { | ||||
|       printf("%s: i=%d of %d: N1=%d %5.0f %5.0f vs. %5.0f %5.0f\n\n", caller, | ||||
|              static_cast<int>(i), static_cast<int>(num), static_cast<int>(N1), | ||||
|              double(out[i + 1]), double(out[i + 0]), double(out[i + N1 + 1]), | ||||
|              double(out[i + N1])); | ||||
|       HWY_ABORT("%d-bit sort is incorrect\n", | ||||
|                 static_cast<int>(sizeof(T) * 8 * N1)); | ||||
|     } | ||||
|   } | ||||
|   output_stats.Notify(out[num - N1]); | ||||
|   if (N1 == 2) output_stats.Notify(out[num - N1 + 1]); | ||||
| 
 | ||||
|   return input_stats == output_stats; | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_RESULT_TOGGLE
 | ||||
							
								
								
									
										104
									
								
								third_party/highway/hwy/contrib/sort/shared-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								third_party/highway/hwy/contrib/sort/shared-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,104 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Definitions shared between vqsort-inl and sorting_networks-inl.
 | ||||
| 
 | ||||
| // Normal include guard for target-independent parts
 | ||||
| #ifndef HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_ | ||||
| 
 | ||||
| #include "hwy/base.h" | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
| // Internal constants - these are to avoid magic numbers/literals and cannot be
 | ||||
| // changed without also changing the associated code.
 | ||||
| struct SortConstants { | ||||
| // SortingNetwork reshapes its input into a matrix. This is the maximum number
 | ||||
| // of *keys* per vector.
 | ||||
| #if HWY_COMPILER_MSVC | ||||
|   static constexpr size_t kMaxCols = 8;  // avoids build timeout
 | ||||
| #else | ||||
|   static constexpr size_t kMaxCols = 16;  // enough for u32 in 512-bit vector
 | ||||
| #endif | ||||
| 
 | ||||
|   // 16 rows is a compromise between using the 32 AVX-512/SVE/RVV registers,
 | ||||
|   // fitting within 16 AVX2 registers with only a few spills, keeping BaseCase
 | ||||
|   // code size reasonable (7 KiB for AVX-512 and 16 cols), and minimizing the
 | ||||
|   // extra logN factor for larger networks (for which only loose upper bounds
 | ||||
|   // on size are known).
 | ||||
|   static constexpr size_t kMaxRowsLog2 = 4; | ||||
|   static constexpr size_t kMaxRows = size_t{1} << kMaxRowsLog2; | ||||
| 
 | ||||
|   static HWY_INLINE size_t BaseCaseNum(size_t N) { | ||||
|     return kMaxRows * HWY_MIN(N, kMaxCols); | ||||
|   } | ||||
| 
 | ||||
|   // Unrolling is important (pipelining and amortizing branch mispredictions);
 | ||||
|   // 2x is sufficient to reach full memory bandwidth on SKX in Partition, but
 | ||||
|   // somewhat slower for sorting than 4x.
 | ||||
|   //
 | ||||
|   // To change, must also update left + 3 * N etc. in the loop.
 | ||||
|   static constexpr size_t kPartitionUnroll = 4; | ||||
| 
 | ||||
|   static HWY_INLINE size_t PartitionBufNum(size_t N) { | ||||
|     // The main loop reads kPartitionUnroll vectors, and first loads from
 | ||||
|     // both left and right beforehand, so it requires min = 2 *
 | ||||
|     // kPartitionUnroll vectors. To handle smaller amounts (only guaranteed
 | ||||
|     // >= BaseCaseNum), we partition the right side into a buffer. We need
 | ||||
|     // another vector at the end so CompressStore does not overwrite anything.
 | ||||
|     return (2 * kPartitionUnroll + 1) * N; | ||||
|   } | ||||
| 
 | ||||
|   // Chunk := group of keys loaded for sampling a pivot. Matches the typical
 | ||||
|   // cache line size of 64 bytes to get maximum benefit per L2 miss. If vectors
 | ||||
|   // are larger, use entire vectors to ensure we do not overrun the array.
 | ||||
|   static HWY_INLINE size_t LanesPerChunk(size_t sizeof_t, size_t N) { | ||||
|     return HWY_MAX(64 / sizeof_t, N); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_INL_H_
 | ||||
| 
 | ||||
| // Per-target
 | ||||
| #if defined(HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE) == \ | ||||
|     defined(HWY_TARGET_TOGGLE) | ||||
| #ifdef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE | ||||
| #undef HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE | ||||
| #else | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE | ||||
| #endif | ||||
| 
 | ||||
| #include "hwy/highway.h" | ||||
| 
 | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| // Default tag / vector width selector.
 | ||||
| // TODO(janwas): enable once LMUL < 1 is supported.
 | ||||
| #if HWY_TARGET == HWY_RVV && 0 | ||||
| template <typename T> | ||||
| using SortTag = ScalableTag<T, -1>; | ||||
| #else | ||||
| template <typename T> | ||||
| using SortTag = ScalableTag<T>; | ||||
| #endif | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_SHARED_TOGGLE
 | ||||
							
								
								
									
										682
									
								
								third_party/highway/hwy/contrib/sort/sort_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										682
									
								
								third_party/highway/hwy/contrib/sort/sort_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -12,160 +12,553 @@ | |||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include <inttypes.h> | ||||
| #include <stdint.h> | ||||
| #include <stdio.h> | ||||
| #include <stdlib.h> | ||||
| 
 | ||||
| // clang-format off
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/sort_test.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| #include "hwy/contrib/sort/sort-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/algo-inl.h" | ||||
| #include "hwy/contrib/sort/result-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h"  // BaseCase
 | ||||
| #include "hwy/tests/test_util-inl.h" | ||||
| // clang-format on
 | ||||
| 
 | ||||
| #include <stdint.h> | ||||
| #include <stdio.h> | ||||
| #include <string.h>  // memcpy
 | ||||
| 
 | ||||
| #include <algorithm>  // std::max
 | ||||
| #include <vector> | ||||
| 
 | ||||
| #undef VQSORT_TEST_IMPL | ||||
| #if (HWY_TARGET == HWY_SCALAR) || (defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD) | ||||
| // Scalar does not implement these, and MSVC non-debug builds time out.
 | ||||
| #define VQSORT_TEST_IMPL 0 | ||||
| #else | ||||
| #define VQSORT_TEST_IMPL 1 | ||||
| #endif | ||||
| 
 | ||||
| #undef VQSORT_TEST_SORT | ||||
| // MSVC non-debug builds time out.
 | ||||
| #if defined(_MSC_VER) && !HWY_IS_DEBUG_BUILD | ||||
| #define VQSORT_TEST_SORT 0 | ||||
| #else | ||||
| #define VQSORT_TEST_SORT 1 | ||||
| #endif | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| namespace { | ||||
| 
 | ||||
| #if HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86 | ||||
| 
 | ||||
| // Shorthand for SortBatchSize(d).
 | ||||
| template <class D> | ||||
| size_t K(D d) { | ||||
|   const size_t batch_size = SortBatchSize(d); | ||||
|   return batch_size; | ||||
| } | ||||
| 
 | ||||
| // Aborts unless the K(d) keys in `out` are ordered according to kOrder and
 | ||||
| // their sum equals the sum of the K(d) keys in `in`.
 | ||||
| template <SortOrder kOrder, class D> | ||||
| void Validate(D d, const TFromD<D>* in, const TFromD<D>* out) { | ||||
|   const size_t N = Lanes(d); | ||||
|   const size_t num_keys = K(d); | ||||
| 
 | ||||
|   // Every adjacent pair of outputs must satisfy the requested order.
 | ||||
|   for (size_t pos = 0; pos < num_keys - 1; ++pos) { | ||||
|     if (verify::Compare(out[pos], out[pos + 1], kOrder)) continue; | ||||
| 
 | ||||
|     // Print the offending pair, then both arrays, before aborting.
 | ||||
|     printf("range=%" PRIu64 " lane=%" PRIu64 " N=%" PRIu64 " %.0f %.0f\n\n", | ||||
|            static_cast<uint64_t>(pos), static_cast<uint64_t>(pos), | ||||
|            static_cast<uint64_t>(N), static_cast<float>(out[pos + 0]), | ||||
|            static_cast<float>(out[pos + 1])); | ||||
|     for (size_t k = 0; k < num_keys; ++k) { | ||||
|       printf("%.0f\n", static_cast<float>(out[k])); | ||||
|     } | ||||
| 
 | ||||
|     printf("\n\nin was:\n"); | ||||
|     for (size_t k = 0; k < num_keys; ++k) { | ||||
|       printf("%.0f\n", static_cast<float>(in[k])); | ||||
|     } | ||||
|     fflush(stdout); | ||||
|     HWY_ABORT("Sort is incorrect"); | ||||
|   } | ||||
| 
 | ||||
|   // Matching sums detect duplicated or lost values.
 | ||||
|   double expected_sum = 0.0; | ||||
|   double actual_sum = 0.0; | ||||
|   for (size_t k = 0; k < num_keys; ++k) { | ||||
|     expected_sum += in[k]; | ||||
|     actual_sum += out[k]; | ||||
|   } | ||||
|   if (expected_sum == actual_sum) return; | ||||
| 
 | ||||
|   for (size_t k = 0; k < num_keys; ++k) { | ||||
|     printf("%.0f  %.0f\n", static_cast<float>(in[k]), | ||||
|            static_cast<float>(out[k])); | ||||
|   } | ||||
|   HWY_ABORT("Mismatch"); | ||||
| } | ||||
| 
 | ||||
| // Sorts a batch initialized to K(d), K(d) - 1, ..., 1 in both orders.
 | ||||
| class TestReverse { | ||||
|   template <SortOrder kOrder, class D> | ||||
|   void TestOrder(D d, RandomState& /* rng */) { | ||||
|     using T = TFromD<D>; | ||||
|     const size_t N = Lanes(d); | ||||
|     HWY_ASSERT((N % 4) == 0); | ||||
| 
 | ||||
|     const size_t num_keys = K(d); | ||||
|     auto in = AllocateAligned<T>(num_keys); | ||||
|     auto inout = AllocateAligned<T>(num_keys); | ||||
| 
 | ||||
|     const size_t expected_size = SortBatchSize(d); | ||||
| 
 | ||||
|     // Decreasing values; `in` keeps the original for Validate.
 | ||||
|     for (size_t i = 0; i < num_keys; ++i) { | ||||
|       const T value = static_cast<T>(num_keys - i); | ||||
|       in[i] = value; | ||||
|       inout[i] = value; | ||||
|     } | ||||
| 
 | ||||
|     const size_t actual_size = SortBatch<kOrder>(d, inout.get()); | ||||
|     HWY_ASSERT_EQ(expected_size, actual_size); | ||||
|     Validate<kOrder>(d, in.get(), inout.get()); | ||||
|   } | ||||
| 
 | ||||
|  public: | ||||
|   // Runs the test for ascending, then descending order.
 | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     RandomState rng; | ||||
|     TestOrder<SortOrder::kAscending>(d, rng); | ||||
|     TestOrder<SortOrder::kDescending>(d, rng); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Runs TestReverse for int32_t and uint32_t with tags capped at 16 lanes.
 | ||||
| void TestAllReverse() { | ||||
|   const CappedTag<int32_t, 16> di; | ||||
|   const CappedTag<uint32_t, 16> du; | ||||
|   TestReverse test; | ||||
|   test(int32_t{0}, di); | ||||
|   test(uint32_t{0}, du); | ||||
| } | ||||
| 
 | ||||
| // Sorts batches in which one group of 8 keys takes every 0/1 combination
 | ||||
| // while all other keys are random.
 | ||||
| class TestRanges { | ||||
|   template <SortOrder kOrder, class D> | ||||
|   void TestOrder(D d, RandomState& rng) { | ||||
|     using T = TFromD<D>; | ||||
|     const size_t N = Lanes(d); | ||||
|     HWY_ASSERT((N % 4) == 0); | ||||
| 
 | ||||
|     const size_t num_keys = K(d); | ||||
|     auto in = AllocateAligned<T>(num_keys); | ||||
|     auto inout = AllocateAligned<T>(num_keys); | ||||
| 
 | ||||
|     const size_t expected_size = SortBatchSize(d); | ||||
| 
 | ||||
|     // For each range, try all 0/1 combinations; the remaining lanes are set
 | ||||
|     // to random inputs.
 | ||||
|     constexpr size_t kRange = 8; | ||||
|     constexpr size_t kNumCombinations = size_t{1} << kRange; | ||||
|     for (size_t range = 0; range < num_keys; range += kRange) { | ||||
|       for (size_t bits = 0; bits < kNumCombinations; ++bits) { | ||||
|         // Start with random values everywhere.
 | ||||
|         for (size_t i = 0; i < num_keys; ++i) { | ||||
|           const T random = static_cast<T>(Random32(&rng) & 0xFF); | ||||
|           inout[i] = random; | ||||
|           in[i] = random; | ||||
|         } | ||||
|         // Overwrite the current range with this combination of {0,1}. This is
 | ||||
|         // sufficient to establish correctness (arbitrary inputs could be
 | ||||
|         // mapped to 0/1 with a comparison predicate).
 | ||||
|         for (size_t i = 0; i < kRange; ++i) { | ||||
|           const T bit = static_cast<T>((bits >> i) & 1); | ||||
|           inout[range + i] = bit; | ||||
|           in[range + i] = bit; | ||||
|         } | ||||
| 
 | ||||
|         const size_t actual_size = SortBatch<kOrder>(d, inout.get()); | ||||
|         HWY_ASSERT_EQ(expected_size, actual_size); | ||||
|         Validate<kOrder>(d, in.get(), inout.get()); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|  public: | ||||
|   // Runs the test for ascending, then descending order with one RandomState.
 | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     RandomState rng; | ||||
|     TestOrder<SortOrder::kAscending>(d, rng); | ||||
|     TestOrder<SortOrder::kDescending>(d, rng); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Runs TestRanges for int32_t and uint32_t with tags capped at 16 lanes.
 | ||||
| void TestAllRanges() { | ||||
|   const CappedTag<int32_t, 16> di; | ||||
|   const CappedTag<uint32_t, 16> du; | ||||
|   TestRanges test; | ||||
|   test(int32_t{0}, di); | ||||
|   test(uint32_t{0}, du); | ||||
| } | ||||
| #if VQSORT_TEST_IMPL || VQSORT_TEST_SORT | ||||
| using detail::LaneTraits; | ||||
| using detail::OrderAscending; | ||||
| using detail::OrderAscending128; | ||||
| using detail::OrderDescending; | ||||
| using detail::OrderDescending128; | ||||
| using detail::SharedTraits; | ||||
| using detail::Traits128; | ||||
| #endif | ||||
| 
 | ||||
| #if !VQSORT_TEST_IMPL | ||||
| static void TestAllMedian() {} | ||||
| static void TestAllBaseCase() {} | ||||
| static void TestAllPartition() {} | ||||
| static void TestAllGenerator() {} | ||||
| #else | ||||
| void TestAllReverse() {} | ||||
| void TestAllRanges() {} | ||||
| #endif  // HWY_TARGET != HWY_SCALAR && HWY_ARCH_X86
 | ||||
| 
 | ||||
| // Checks detail::MedianOf3 for all eight combinations of three 0/1 inputs.
 | ||||
| template <class Traits> | ||||
| static HWY_NOINLINE void TestMedian3() { | ||||
|   using T = uint64_t; | ||||
|   using D = CappedTag<T, 1>; | ||||
|   using V = Vec<D>; | ||||
|   SharedTraits<Traits> st; | ||||
|   const D d; | ||||
|   for (uint32_t bits = 0; bits < 8; ++bits) { | ||||
|     // Bit i of `bits` is the value of input i.
 | ||||
|     const V v0 = Set(d, static_cast<T>((bits >> 0) & 1u)); | ||||
|     const V v1 = Set(d, static_cast<T>((bits >> 1) & 1u)); | ||||
|     const V v2 = Set(d, static_cast<T>((bits >> 2) & 1u)); | ||||
|     const T m = GetLane(detail::MedianOf3(st, v0, v1, v2)); | ||||
|     // If at least half (rounded up) of the bits are 1, so is the median.
 | ||||
|     const size_t count = PopCount(bits); | ||||
|     const T expected = (count >= 2) ? T{1} : T{0}; | ||||
|     HWY_ASSERT_EQ(expected, m); | ||||
|   } | ||||
| } | ||||
| // Runs TestMedian3 with LaneTraits<OrderAscending>.
 | ||||
| HWY_NOINLINE void TestAllMedian() { | ||||
|   TestMedian3<LaneTraits<OrderAscending> >(); | ||||
| } | ||||
| // Sorts increasing/decreasing inputs via BaseCase at several lengths and misalignments; checks order and sentinels.
 | ||||
| template <class Traits, typename T> | ||||
| static HWY_NOINLINE void TestBaseCaseAscDesc() { | ||||
|   SharedTraits<Traits> st; | ||||
|   const SortTag<T> d; | ||||
|   const size_t N = Lanes(d); | ||||
|   const size_t base_case_num = SortConstants::BaseCaseNum(N); | ||||
|   const size_t N1 = st.LanesPerKey(); | ||||
| // kDebug > 0 enables the printf output below; the buffers leave room for the misalignment and trailing sentinels.
 | ||||
|   constexpr int kDebug = 0; | ||||
|   auto aligned_keys = hwy::AllocateAligned<T>(N + base_case_num + N); | ||||
|   auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N); | ||||
| // Lengths (in lanes) to sort, up to base_case_num.
 | ||||
|   std::vector<size_t> lengths; | ||||
|   lengths.push_back(HWY_MAX(1, N1)); | ||||
|   lengths.push_back(3 * N1); | ||||
|   lengths.push_back(base_case_num / 2); | ||||
|   lengths.push_back(base_case_num / 2 + N1); | ||||
|   lengths.push_back(base_case_num - N1); | ||||
|   lengths.push_back(base_case_num); | ||||
| // Offsets (in lanes) of `keys` from the start of the aligned allocation.
 | ||||
|   std::vector<size_t> misalignments; | ||||
|   misalignments.push_back(0); | ||||
|   misalignments.push_back(1); | ||||
|   if (N >= 6) misalignments.push_back(N / 2 - 1); | ||||
|   misalignments.push_back(N / 2); | ||||
|   misalignments.push_back(N / 2 + 1); | ||||
|   misalignments.push_back(HWY_MIN(2 * N / 3 + 3, size_t{N - 1})); | ||||
| 
 | ||||
|   for (bool asc : {false, true}) { | ||||
|     for (size_t len : lengths) { | ||||
|       for (size_t misalign : misalignments) { | ||||
|         T* HWY_RESTRICT keys = aligned_keys.get() + misalign; | ||||
|         if (kDebug) { | ||||
|           printf("============%s asc %d N1 %d len %d misalign %d\n", | ||||
|                  hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(N1), | ||||
|                  static_cast<int>(len), static_cast<int>(misalign)); | ||||
|         } | ||||
| // Sentinels before the keys; verified after sorting to detect writes before `keys`.
 | ||||
|         for (size_t i = 0; i < misalign; ++i) { | ||||
|           aligned_keys[i] = hwy::LowestValue<T>(); | ||||
|         } | ||||
|         InputStats<T> input_stats; | ||||
|         for (size_t i = 0; i < len; ++i) { | ||||
|           keys[i] = | ||||
|               asc ? static_cast<T>(T(i) + 1) : static_cast<T>(T(len) - T(i)); | ||||
|           input_stats.Notify(keys[i]); | ||||
|           if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i])); | ||||
|         } | ||||
|         for (size_t i = len; i < base_case_num + N; ++i) { | ||||
|           keys[i] = hwy::LowestValue<T>(); | ||||
|         } | ||||
| 
 | ||||
|         detail::BaseCase(d, st, keys, len, buf.get()); | ||||
| 
 | ||||
|         if (kDebug >= 2) { | ||||
|           printf("out>>>>>>\n"); | ||||
|           for (size_t i = 0; i < len; ++i) { | ||||
|             printf("%3zu: %f\n", i, double(keys[i])); | ||||
|           } | ||||
|         } | ||||
| // The first len lanes must pass VerifySort and every sentinel must be unchanged.
 | ||||
|         HWY_ASSERT(VerifySort(st, input_stats, keys, len, "BaseAscDesc")); | ||||
|         for (size_t i = 0; i < misalign; ++i) { | ||||
|           if (aligned_keys[i] != hwy::LowestValue<T>()) | ||||
|             HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i)); | ||||
|         } | ||||
|         for (size_t i = len; i < base_case_num + N; ++i) { | ||||
|           if (keys[i] != hwy::LowestValue<T>()) | ||||
|             HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); | ||||
|         } | ||||
|       }  // misalign
 | ||||
|     }    // len
 | ||||
|   }      // asc
 | ||||
| } | ||||
| // Sorts inputs consisting only of 0 and 1 (taken from the bits of a counter) via BaseCase at several lengths.
 | ||||
| template <class Traits, typename T> | ||||
| static HWY_NOINLINE void TestBaseCase01() { | ||||
|   SharedTraits<Traits> st; | ||||
|   const SortTag<T> d; | ||||
|   const size_t N = Lanes(d); | ||||
|   const size_t base_case_num = SortConstants::BaseCaseNum(N); | ||||
|   const size_t N1 = st.LanesPerKey(); | ||||
| // kDebug > 0 enables the printf output below; `keys` leaves N extra lanes for trailing sentinels.
 | ||||
|   constexpr int kDebug = 0; | ||||
|   auto keys = hwy::AllocateAligned<T>(base_case_num + N); | ||||
|   auto buf = hwy::AllocateAligned<T>(base_case_num + 2 * N); | ||||
| // Lengths (in lanes) to sort, up to base_case_num.
 | ||||
|   std::vector<size_t> lengths; | ||||
|   lengths.push_back(HWY_MAX(1, N1)); | ||||
|   lengths.push_back(3 * N1); | ||||
|   lengths.push_back(base_case_num / 2); | ||||
|   lengths.push_back(base_case_num / 2 + N1); | ||||
|   lengths.push_back(base_case_num - N1); | ||||
|   lengths.push_back(base_case_num); | ||||
| 
 | ||||
|   for (size_t len : lengths) { | ||||
|     if (kDebug) { | ||||
|       printf("============%s 01 N1 %d len %d\n", hwy::TypeName(T(), 1).c_str(), | ||||
|              static_cast<int>(N1), static_cast<int>(len)); | ||||
|     } | ||||
|     const uint64_t kMaxBits = AdjustedLog2Reps(HWY_MIN(len, size_t{14})); | ||||
|     for (uint64_t bits = 0; bits < ((1ull << kMaxBits) - 1); ++bits) { | ||||
|       InputStats<T> input_stats; | ||||
|       for (size_t i = 0; i < len; ++i) { | ||||
|         keys[i] = (i < 64 && (bits & (1ull << i))) ? 1 : 0; | ||||
|         input_stats.Notify(keys[i]); | ||||
|         if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i])); | ||||
|       } | ||||
|       for (size_t i = len; i < base_case_num + N; ++i) { | ||||
|         keys[i] = hwy::LowestValue<T>(); | ||||
|       } | ||||
| 
 | ||||
|       detail::BaseCase(d, st, keys.get(), len, buf.get()); | ||||
| 
 | ||||
|       if (kDebug >= 2) { | ||||
|         printf("out>>>>>>\n"); | ||||
|         for (size_t i = 0; i < len; ++i) { | ||||
|           printf("%3zu: %f\n", i, double(keys[i])); | ||||
|         } | ||||
|       } | ||||
| // The first len lanes must pass VerifySort and the sentinels after them must be unchanged.
 | ||||
|       HWY_ASSERT(VerifySort(st, input_stats, keys.get(), len, "Base01")); | ||||
|       for (size_t i = len; i < base_case_num + N; ++i) { | ||||
|         if (keys[i] != hwy::LowestValue<T>()) | ||||
|           HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); | ||||
|       } | ||||
|     }  // bits
 | ||||
|   }    // len
 | ||||
| } | ||||
| // Runs TestBaseCaseAscDesc and then TestBaseCase01 for the given traits and lane type.
 | ||||
| template <class Traits, typename T> | ||||
| static HWY_NOINLINE void TestBaseCase() { | ||||
|   TestBaseCaseAscDesc<Traits, T>(); | ||||
|   TestBaseCase01<Traits, T>(); | ||||
| } | ||||
| // Runs TestBaseCase for LaneTraits (int32_t ascending, int64_t descending) and Traits128 (uint64_t, both orders).
 | ||||
| HWY_NOINLINE void TestAllBaseCase() { | ||||
|   // Workaround for stack overflow on MSVC debug.
 | ||||
| #if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3) | ||||
|   return; | ||||
| #endif | ||||
| 
 | ||||
|   TestBaseCase<LaneTraits<OrderAscending>, int32_t>(); | ||||
|   TestBaseCase<LaneTraits<OrderDescending>, int64_t>(); | ||||
|   TestBaseCase<Traits128<OrderAscending128>, uint64_t>(); | ||||
|   TestBaseCase<Traits128<OrderDescending128>, uint64_t>(); | ||||
| } | ||||
| // Aborts if st.Compare1(pivot, key) is true for a key in [left, border) or false for a key in [border, right).
 | ||||
| template <class Traits, typename T> | ||||
| static HWY_NOINLINE void VerifyPartition(Traits st, T* HWY_RESTRICT keys, | ||||
|                                          size_t left, size_t border, | ||||
|                                          size_t right, const size_t N1, | ||||
|                                          const T* pivot) { | ||||
|   /* for (size_t i = left; i < right; ++i) {
 | ||||
|      if (i == border) printf("--\n"); | ||||
|      printf("%4zu: %3d\n", i, keys[i]); | ||||
|    }*/ | ||||
| // Keys are visited in steps of N1 lanes, so all three indices must be multiples of N1.
 | ||||
|   HWY_ASSERT(left % N1 == 0); | ||||
|   HWY_ASSERT(border % N1 == 0); | ||||
|   HWY_ASSERT(right % N1 == 0); | ||||
|   const bool asc = typename Traits::Order().IsAscending(); | ||||
|   for (size_t i = left; i < border; i += N1) { | ||||
|     if (st.Compare1(pivot, keys + i)) { | ||||
|       HWY_ABORT( | ||||
|           "%s: asc %d left[%d] piv %.0f %.0f compares before %.0f %.0f " | ||||
|           "border %d", | ||||
|           hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i), | ||||
|           double(pivot[1]), double(pivot[0]), double(keys[i + 1]), | ||||
|           double(keys[i + 0]), static_cast<int>(border)); | ||||
|     } | ||||
|   } | ||||
|   for (size_t i = border; i < right; i += N1) { | ||||
|     if (!st.Compare1(pivot, keys + i)) { | ||||
|       HWY_ABORT( | ||||
|           "%s: asc %d right[%d] piv %.0f %.0f compares after %.0f %.0f " | ||||
|           "border %d", | ||||
|           hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(i), | ||||
|           double(pivot[1]), double(pivot[0]), double(keys[i + 1]), | ||||
|           double(keys[i]), static_cast<int>(border)); | ||||
|     } | ||||
|   } | ||||
| } | ||||
| // Calls detail::Partition for many left/len/pivot/misalign combinations and verifies the result and the sentinels around it.
 | ||||
| template <class Traits, typename T> | ||||
| static HWY_NOINLINE void TestPartition() { | ||||
|   const SortTag<T> d; | ||||
|   SharedTraits<Traits> st; | ||||
|   const bool asc = typename Traits::Order().IsAscending(); | ||||
|   const size_t N = Lanes(d); | ||||
|   constexpr int kDebug = 0; | ||||
|   const size_t base_case_num = SortConstants::BaseCaseNum(N); | ||||
|   // left + len + align
 | ||||
|   const size_t total = 32 + (base_case_num + 4 * HWY_MAX(N, 4)) + 2 * N; | ||||
|   auto aligned_keys = hwy::AllocateAligned<T>(total); | ||||
|   auto buf = hwy::AllocateAligned<T>(SortConstants::PartitionBufNum(N)); | ||||
| // left and len are masked with ~(N1 - 1), i.e. rounded down to a multiple of N1 (assumes N1 is a power of two).
 | ||||
|   const size_t N1 = st.LanesPerKey(); | ||||
|   for (bool in_asc : {false, true}) { | ||||
|     for (int left_i : {0, 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 22, 28, 29, 30, 31}) { | ||||
|       const size_t left = static_cast<size_t>(left_i) & ~(N1 - 1); | ||||
|       for (size_t ofs : {N, N + 1, N + 2, N + 3, 2 * N, 2 * N + 1, 2 * N + 2, | ||||
|                          2 * N + 3, 3 * N - 1, 4 * N - 3, 4 * N - 2}) { | ||||
|         const size_t len = (base_case_num + ofs) & ~(N1 - 1); | ||||
|         for (T pivot1 : | ||||
|              {T(0), T(len / 3), T(len / 2), T(2 * len / 3), T(len)}) { | ||||
|           const T pivot2[2] = {pivot1, 0}; | ||||
|           const auto pivot = st.SetKey(d, pivot2); | ||||
|           for (size_t misalign = 0; misalign < N; | ||||
|                misalign += st.LanesPerKey()) { | ||||
|             T* HWY_RESTRICT keys = aligned_keys.get() + misalign; | ||||
|             const size_t right = left + len; | ||||
|             if (kDebug) { | ||||
|               printf( | ||||
|                   "=========%s asc %d left %d len %d right %d piv %.0f %.0f\n", | ||||
|                   hwy::TypeName(T(), 1).c_str(), asc, static_cast<int>(left), | ||||
|                   static_cast<int>(len), static_cast<int>(right), | ||||
|                   double(pivot2[1]), double(pivot2[0])); | ||||
|             } | ||||
| // Everything outside [left, right) is filled with LowestValue sentinels; the range itself gets the test input.
 | ||||
|             for (size_t i = 0; i < misalign; ++i) { | ||||
|               aligned_keys[i] = hwy::LowestValue<T>(); | ||||
|             } | ||||
|             for (size_t i = 0; i < left; ++i) { | ||||
|               keys[i] = hwy::LowestValue<T>(); | ||||
|             } | ||||
|             for (size_t i = left; i < right; ++i) { | ||||
|               keys[i] = static_cast<T>(in_asc ? T(i + 1) - static_cast<T>(left) | ||||
|                                               : static_cast<T>(right) - T(i)); | ||||
|               if (kDebug >= 2) printf("%3zu: %f\n", i, double(keys[i])); | ||||
|             } | ||||
|             for (size_t i = right; i < total - misalign; ++i) { | ||||
|               keys[i] = hwy::LowestValue<T>(); | ||||
|             } | ||||
| 
 | ||||
|             size_t border = | ||||
|                 detail::Partition(d, st, keys, left, right, pivot, buf.get()); | ||||
| 
 | ||||
|             if (kDebug >= 2) { | ||||
|               printf("out>>>>>>\n"); | ||||
|               for (size_t i = left; i < right; ++i) { | ||||
|                 printf("%3zu: %f\n", i, double(keys[i])); | ||||
|               } | ||||
|               for (size_t i = right; i < total - misalign; ++i) { | ||||
|                 printf("%3zu: sentinel %f\n", i, double(keys[i])); | ||||
|               } | ||||
|             } | ||||
| // Check the partition itself, then that none of the sentinels was overwritten.
 | ||||
|             VerifyPartition(st, keys, left, border, right, N1, pivot2); | ||||
|             for (size_t i = 0; i < misalign; ++i) { | ||||
|               if (aligned_keys[i] != hwy::LowestValue<T>()) | ||||
|                 HWY_ABORT("Overrun misalign at %d\n", static_cast<int>(i)); | ||||
|             } | ||||
|             for (size_t i = 0; i < left; ++i) { | ||||
|               if (keys[i] != hwy::LowestValue<T>()) | ||||
|                 HWY_ABORT("Overrun left at %d\n", static_cast<int>(i)); | ||||
|             } | ||||
|             for (size_t i = right; i < total - misalign; ++i) { | ||||
|               if (keys[i] != hwy::LowestValue<T>()) | ||||
|                 HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); | ||||
|             } | ||||
|           }  // misalign
 | ||||
|         }    // pivot
 | ||||
|       }      // len
 | ||||
|     }        // left
 | ||||
|   }          // asc
 | ||||
| } | ||||
| // Runs TestPartition for several lane types with LaneTraits and for uint64_t with Traits128 in both orders.
 | ||||
| HWY_NOINLINE void TestAllPartition() { | ||||
|   TestPartition<LaneTraits<OrderAscending>, int16_t>(); | ||||
|   TestPartition<LaneTraits<OrderDescending>, int32_t>(); | ||||
|   TestPartition<LaneTraits<OrderAscending>, int64_t>(); | ||||
|   TestPartition<LaneTraits<OrderDescending>, float>(); | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   TestPartition<LaneTraits<OrderDescending>, double>(); | ||||
| #endif | ||||
|   TestPartition<Traits128<OrderAscending128>, uint64_t>(); | ||||
|   TestPartition<Traits128<OrderDescending128>, uint64_t>(); | ||||
| } | ||||
| 
 | ||||
| // Checks detail::Generator together with detail::RandomChunkIndex (used for
 | ||||
| // sample selection for choosing a pivot): every index must address a chunk
 | ||||
| // inside the range, and the mean index must be within 10% of the middle.
 | ||||
| template <typename TU> | ||||
| static HWY_NOINLINE void TestRandomGenerator() { | ||||
|   static_assert(!hwy::IsSigned<TU>(), ""); | ||||
|   SortTag<TU> du; | ||||
|   const size_t N = Lanes(du); | ||||
| 
 | ||||
|   detail::Generator rng(&N, N); | ||||
| 
 | ||||
|   // Use the shared helper rather than repeating its formula here, so the test
 | ||||
|   // cannot drift from the chunk size defined in SortConstants.
 | ||||
|   const size_t lanes_per_block = | ||||
|       SortConstants::LanesPerChunk(sizeof(TU), N);  // power of two
 | ||||
| 
 | ||||
|   for (uint32_t num_blocks = 2; num_blocks < 100000; | ||||
|        num_blocks = 3 * num_blocks / 2) { | ||||
|     // Generate some numbers and ensure all are in range
 | ||||
|     uint64_t sum = 0; | ||||
|     constexpr size_t kReps = 10000; | ||||
|     for (size_t rep = 0; rep < kReps; ++rep) { | ||||
|       const uint32_t bits = rng() & 0xFFFFFFFF; | ||||
|       const size_t index = detail::RandomChunkIndex(num_blocks, bits); | ||||
|       HWY_ASSERT(((index + 1) * lanes_per_block) <= | ||||
|                  num_blocks * lanes_per_block); | ||||
| 
 | ||||
|       sum += index; | ||||
|     } | ||||
| 
 | ||||
|     // Also ensure the mean is near the middle of the range
 | ||||
|     const double expected = (num_blocks - 1) / 2.0; | ||||
|     const double actual = double(sum) / kReps; | ||||
|     HWY_ASSERT(0.9 * expected <= actual && actual <= 1.1 * expected); | ||||
|   } | ||||
| } | ||||
| // Runs TestRandomGenerator for uint32_t and uint64_t lanes.
 | ||||
| HWY_NOINLINE void TestAllGenerator() { | ||||
|   TestRandomGenerator<uint32_t>(); | ||||
|   TestRandomGenerator<uint64_t>(); | ||||
| } | ||||
| 
 | ||||
| #endif  // VQSORT_TEST_IMPL
 | ||||
| 
 | ||||
| #if !VQSORT_TEST_SORT | ||||
| static void TestAllSort() {} | ||||
| #else | ||||
| 
 | ||||
| // Remembers input, and compares results to that of a reference algorithm.
 | ||||
| template <class Traits, typename T> | ||||
| class CompareResults { | ||||
|  public: | ||||
|   // Stores a private copy of the `num` lanes starting at `in`.
 | ||||
|   void SetInput(const T* in, size_t num) { | ||||
|     copy_.resize(num); | ||||
|     // An empty vector's data() may be null, and memcpy requires valid
 | ||||
|     // pointers even when the size is zero, so skip the call in that case.
 | ||||
|     if (num != 0) { | ||||
|       memcpy(copy_.data(), in, num * sizeof(T)); | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   // Sorts the stored copy with the reference algorithm (kPDQ if HAVE_PDQSORT,
 | ||||
|   // otherwise kStd) and compares it lane by lane with `output`. Prints the
 | ||||
|   // first mismatch to stderr and returns false; returns true if all match.
 | ||||
|   bool Verify(const T* output) { | ||||
| #if HAVE_PDQSORT | ||||
|     const Algo reference = Algo::kPDQ; | ||||
| #else | ||||
|     const Algo reference = Algo::kStd; | ||||
| #endif | ||||
|     SharedState shared; | ||||
|     using Order = typename Traits::Order; | ||||
|     Run<Order>(reference, copy_.data(), copy_.size(), shared, | ||||
|                /*thread=*/0); | ||||
| 
 | ||||
|     for (size_t i = 0; i < copy_.size(); ++i) { | ||||
|       if (copy_[i] != output[i]) { | ||||
|         fprintf(stderr, "Asc %d mismatch at %d: %A %A\n", Order().IsAscending(), | ||||
|                 static_cast<int>(i), double(copy_[i]), double(output[i])); | ||||
|         return false; | ||||
|       } | ||||
|     } | ||||
|     return true; | ||||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   // Copy of the input; sorted in place by Verify.
 | ||||
|   std::vector<T> copy_; | ||||
| }; | ||||
| // Returns the algorithms to test: always kHeap and kVQSort, plus those enabled by the HAVE_* macros.
 | ||||
| std::vector<Algo> AlgoForTest() { | ||||
|   return { | ||||
| #if HAVE_AVX2SORT | ||||
|     Algo::kSEA, | ||||
| #endif | ||||
| #if HAVE_IPS4O | ||||
|         Algo::kIPS4O, | ||||
| #endif | ||||
| #if HAVE_PDQSORT | ||||
|         Algo::kPDQ, | ||||
| #endif | ||||
| #if HAVE_SORT512 | ||||
|         Algo::kSort512, | ||||
| #endif | ||||
|         Algo::kHeap, Algo::kVQSort, | ||||
|   }; | ||||
| } | ||||
| // Sorts num lanes for every algorithm, distribution and misalignment; checks against a reference sort and the red zones.
 | ||||
| template <class Traits, typename T> | ||||
| void TestSort(size_t num) { | ||||
|   // TODO(janwas): fix
 | ||||
|   if (HWY_TARGET == HWY_SSSE3) return; | ||||
| // Workaround for stack overflow on clang-cl (/F 8388608 does not help).
 | ||||
| #if defined(_MSC_VER) && HWY_IS_DEBUG_BUILD && (HWY_TARGET == HWY_AVX3) | ||||
|   return; | ||||
| #endif | ||||
| 
 | ||||
|   SharedState shared; | ||||
|   SharedTraits<Traits> st; | ||||
| // kMaxMisalign lanes are reserved before and after the num keys for the misalignment and the red zones.
 | ||||
|   constexpr size_t kMaxMisalign = 16; | ||||
|   auto aligned = hwy::AllocateAligned<T>(kMaxMisalign + num + kMaxMisalign); | ||||
|   for (Algo algo : AlgoForTest()) { | ||||
| #if HAVE_IPS4O | ||||
|     if (st.Is128() && (algo == Algo::kIPS4O || algo == Algo::kParallelIPS4O)) { | ||||
|       continue; | ||||
|     } | ||||
| #endif | ||||
|     for (Dist dist : AllDist()) { | ||||
|       for (size_t misalign : {size_t{0}, size_t{st.LanesPerKey()}, | ||||
|                               size_t{3 * st.LanesPerKey()}, kMaxMisalign / 2}) { | ||||
|         T* keys = aligned.get() + misalign; | ||||
| 
 | ||||
|         // Set up red zones before/after the keys to sort
 | ||||
|         for (size_t i = 0; i < misalign; ++i) { | ||||
|           aligned[i] = hwy::LowestValue<T>(); | ||||
|         } | ||||
|         for (size_t i = 0; i < kMaxMisalign; ++i) { | ||||
|           keys[num + i] = hwy::HighestValue<T>(); | ||||
|         } | ||||
| #if HWY_IS_MSAN | ||||
|         __msan_poison(aligned.get(), misalign * sizeof(T)); | ||||
|         __msan_poison(keys + num, kMaxMisalign * sizeof(T)); | ||||
| #endif | ||||
|         InputStats<T> input_stats = GenerateInput(dist, keys, num); | ||||
| // Keep a copy of the unsorted input so the result can be compared with the reference algorithm.
 | ||||
|         CompareResults<Traits, T> compare; | ||||
|         compare.SetInput(keys, num); | ||||
| 
 | ||||
|         Run<typename Traits::Order>(algo, keys, num, shared, /*thread=*/0); | ||||
|         HWY_ASSERT(compare.Verify(keys)); | ||||
|         HWY_ASSERT(VerifySort(st, input_stats, keys, num, "TestSort")); | ||||
| 
 | ||||
|         // Check red zones
 | ||||
| #if HWY_IS_MSAN | ||||
|         __msan_unpoison(aligned.get(), misalign * sizeof(T)); | ||||
|         __msan_unpoison(keys + num, kMaxMisalign * sizeof(T)); | ||||
| #endif | ||||
|         for (size_t i = 0; i < misalign; ++i) { | ||||
|           if (aligned[i] != hwy::LowestValue<T>()) | ||||
|             HWY_ABORT("Overrun left at %d\n", static_cast<int>(i)); | ||||
|         } | ||||
|         for (size_t i = num; i < num + kMaxMisalign; ++i) { | ||||
|           if (keys[i] != hwy::HighestValue<T>()) | ||||
|             HWY_ABORT("Overrun right at %d\n", static_cast<int>(i)); | ||||
|         } | ||||
|       }  // misalign
 | ||||
|     }    // dist
 | ||||
|   }      // algo
 | ||||
| } | ||||
| 
 | ||||
| // Runs TestSort with 15000 lanes for 16/32/64-bit integers, float, double
 | ||||
| // (when available) and Traits128 keys in both orders.
 | ||||
| void TestAllSort() { | ||||
|   const size_t num = 15 * 1000; | ||||
| 
 | ||||
|   TestSort<LaneTraits<OrderAscending>, int16_t>(num); | ||||
|   TestSort<LaneTraits<OrderDescending>, uint16_t>(num); | ||||
| 
 | ||||
|   TestSort<LaneTraits<OrderDescending>, int32_t>(num); | ||||
|   TestSort<LaneTraits<OrderDescending>, uint32_t>(num); | ||||
| 
 | ||||
|   TestSort<LaneTraits<OrderAscending>, int64_t>(num); | ||||
|   TestSort<LaneTraits<OrderAscending>, uint64_t>(num); | ||||
| 
 | ||||
|   // WARNING: for float types, SIMD comparisons will flush denormals to zero,
 | ||||
|   // causing mismatches with scalar sorts. In this test, we avoid generating
 | ||||
|   // denormal inputs.
 | ||||
|   TestSort<LaneTraits<OrderAscending>, float>(num); | ||||
| #if HWY_HAVE_FLOAT64  // protects algo-inl's GenerateRandom
 | ||||
|   if (Sorter::HaveFloat64()) { | ||||
|     TestSort<LaneTraits<OrderDescending>, double>(num); | ||||
|   } | ||||
| #endif | ||||
| 
 | ||||
|   TestSort<Traits128<OrderAscending128>, uint64_t>(num); | ||||
|   // Previously the ascending case was run twice; cover descending as well,
 | ||||
|   // matching TestAllBaseCase and TestAllPartition.
 | ||||
|   TestSort<Traits128<OrderDescending128>, uint64_t>(num); | ||||
| } | ||||
| 
 | ||||
| #endif  // VQSORT_TEST_SORT
 | ||||
| 
 | ||||
| }  // namespace
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
|  | @ -174,9 +567,14 @@ HWY_AFTER_NAMESPACE(); | |||
| #if HWY_ONCE | ||||
| 
 | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_BEFORE_TEST(SortTest); | ||||
| HWY_EXPORT_AND_TEST_P(SortTest, TestAllReverse); | ||||
| HWY_EXPORT_AND_TEST_P(SortTest, TestAllRanges); | ||||
| HWY_EXPORT_AND_TEST_P(SortTest, TestAllMedian); | ||||
| HWY_EXPORT_AND_TEST_P(SortTest, TestAllBaseCase); | ||||
| HWY_EXPORT_AND_TEST_P(SortTest, TestAllPartition); | ||||
| HWY_EXPORT_AND_TEST_P(SortTest, TestAllGenerator); | ||||
| HWY_EXPORT_AND_TEST_P(SortTest, TestAllSort); | ||||
| }  // namespace
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| // Ought not to be necessary, but without this, no tests run on RVV.
 | ||||
|  | @ -185,4 +583,4 @@ int main(int argc, char** argv) { | |||
|   return RUN_ALL_TESTS(); | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| #endif  // HWY_ONCE
 | ||||
|  |  | |||
							
								
								
									
										686
									
								
								third_party/highway/hwy/contrib/sort/sorting_networks-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										686
									
								
								third_party/highway/hwy/contrib/sort/sorting_networks-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,686 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Per-target
 | ||||
| #if defined(HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE) == \ | ||||
|     defined(HWY_TARGET_TOGGLE) | ||||
| #ifdef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE | ||||
| #undef HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE | ||||
| #else | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE | ||||
| #endif | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/shared-inl.h"  // SortConstants | ||||
| #include "hwy/highway.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| namespace detail { | ||||
| 
 | ||||
| using Constants = hwy::SortConstants; | ||||
| 
 | ||||
| // ------------------------------ SharedTraits
 | ||||
| 
 | ||||
| // Code shared between all traits. It's unclear whether these can profitably be
 | ||||
| // specialized for Lane vs Block, or optimized like SortPairsDistance1 using
 | ||||
| // Compare/DupOdd.
 | ||||
| template <class Base> | ||||
| struct SharedTraits : public Base { | ||||
|   // Conditionally swaps lane 0 with 2, 1 with 3 etc. Sort2 orders `v` against its pair-swapped copy; OddEvenPairs then takes odd pairs from `swapped` and even pairs from `v`.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SortPairsDistance2(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     Vec<D> swapped = base->SwapAdjacentPairs(d, v); | ||||
|     base->Sort2(d, v, swapped); | ||||
|     return base->OddEvenPairs(d, swapped, v); | ||||
|   } | ||||
| 
 | ||||
|   // Swaps with the vector formed by reversing contiguous groups of 8 keys; OddEvenQuads then takes odd quads from `swapped` and even quads from `v`.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SortPairsReverse8(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     Vec<D> swapped = base->ReverseKeys8(d, v); | ||||
|     base->Sort2(d, v, swapped); | ||||
|     return base->OddEvenQuads(d, swapped, v); | ||||
|   } | ||||
| 
 | ||||
|   // Swaps with the vector formed by reversing contiguous groups of 16 keys. Uses ReverseKeys (whole vector), which the static_assert below justifies: kMaxCols <= 16.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SortPairsReverse16(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     static_assert(Constants::kMaxCols <= 16, "Need actual Reverse16"); | ||||
|     Vec<D> swapped = base->ReverseKeys(d, v); | ||||
|     base->Sort2(d, v, swapped); | ||||
|     return ConcatUpperLower(d, swapped, v);  // 8 = half of the vector
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // ------------------------------ Sorting network
 | ||||
| 
 | ||||
| // (Green's irregular) sorting network for independent columns in 16 vectors: 60 Sort2 compare-exchanges on v0..vf (updated in place); the statement order is the network and must not be changed.
 | ||||
| template <class D, class Traits, class V = Vec<D>> | ||||
| HWY_INLINE void Sort16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, | ||||
|                        V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, | ||||
|                        V& ve, V& vf) { | ||||
|   st.Sort2(d, v0, v1); | ||||
|   st.Sort2(d, v2, v3); | ||||
|   st.Sort2(d, v4, v5); | ||||
|   st.Sort2(d, v6, v7); | ||||
|   st.Sort2(d, v8, v9); | ||||
|   st.Sort2(d, va, vb); | ||||
|   st.Sort2(d, vc, vd); | ||||
|   st.Sort2(d, ve, vf); | ||||
|   st.Sort2(d, v0, v2); | ||||
|   st.Sort2(d, v1, v3); | ||||
|   st.Sort2(d, v4, v6); | ||||
|   st.Sort2(d, v5, v7); | ||||
|   st.Sort2(d, v8, va); | ||||
|   st.Sort2(d, v9, vb); | ||||
|   st.Sort2(d, vc, ve); | ||||
|   st.Sort2(d, vd, vf); | ||||
|   st.Sort2(d, v0, v4); | ||||
|   st.Sort2(d, v1, v5); | ||||
|   st.Sort2(d, v2, v6); | ||||
|   st.Sort2(d, v3, v7); | ||||
|   st.Sort2(d, v8, vc); | ||||
|   st.Sort2(d, v9, vd); | ||||
|   st.Sort2(d, va, ve); | ||||
|   st.Sort2(d, vb, vf); | ||||
|   st.Sort2(d, v0, v8); | ||||
|   st.Sort2(d, v1, v9); | ||||
|   st.Sort2(d, v2, va); | ||||
|   st.Sort2(d, v3, vb); | ||||
|   st.Sort2(d, v4, vc); | ||||
|   st.Sort2(d, v5, vd); | ||||
|   st.Sort2(d, v6, ve); | ||||
|   st.Sort2(d, v7, vf); | ||||
|   st.Sort2(d, v5, va); | ||||
|   st.Sort2(d, v6, v9); | ||||
|   st.Sort2(d, v3, vc); | ||||
|   st.Sort2(d, v7, vb); | ||||
|   st.Sort2(d, vd, ve); | ||||
|   st.Sort2(d, v4, v8); | ||||
|   st.Sort2(d, v1, v2); | ||||
|   st.Sort2(d, v1, v4); | ||||
|   st.Sort2(d, v7, vd); | ||||
|   st.Sort2(d, v2, v8); | ||||
|   st.Sort2(d, vb, ve); | ||||
|   st.Sort2(d, v2, v4); | ||||
|   st.Sort2(d, v5, v6); | ||||
|   st.Sort2(d, v9, va); | ||||
|   st.Sort2(d, vb, vd); | ||||
|   st.Sort2(d, v3, v8); | ||||
|   st.Sort2(d, v7, vc); | ||||
|   st.Sort2(d, v3, v5); | ||||
|   st.Sort2(d, v6, v8); | ||||
|   st.Sort2(d, v7, v9); | ||||
|   st.Sort2(d, va, vc); | ||||
|   st.Sort2(d, v3, v4); | ||||
|   st.Sort2(d, v5, v6); | ||||
|   st.Sort2(d, v7, v8); | ||||
|   st.Sort2(d, v9, va); | ||||
|   st.Sort2(d, vb, vc); | ||||
|   st.Sort2(d, v6, v7); | ||||
|   st.Sort2(d, v8, v9); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Merging networks
 | ||||
| 
 | ||||
| // Blacher's hybrid bitonic/odd-even networks, generated by print_network.cc.
 | ||||
| 
 | ||||
// Merge step that SortingNetwork applies after Sort16 when each row holds at
// least 2 keys. Alternates ReverseKeys2 on selected vectors with Sort2
// compare-exchanges between vector pairs, then finishes with
// SortPairsDistance1 within every vector. `st` supplies the traits operations;
// v0..vf are updated in place. The statement order is the generated network
// and must not be changed.
| template <class D, class Traits, class V = Vec<D>> | ||||
| HWY_INLINE void Merge2(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, | ||||
|                        V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, | ||||
|                        V& ve, V& vf) { | ||||
|   v8 = st.ReverseKeys2(d, v8); | ||||
|   v9 = st.ReverseKeys2(d, v9); | ||||
|   va = st.ReverseKeys2(d, va); | ||||
|   vb = st.ReverseKeys2(d, vb); | ||||
|   vc = st.ReverseKeys2(d, vc); | ||||
|   vd = st.ReverseKeys2(d, vd); | ||||
|   ve = st.ReverseKeys2(d, ve); | ||||
|   vf = st.ReverseKeys2(d, vf); | ||||
|   st.Sort2(d, v0, vf); | ||||
|   st.Sort2(d, v1, ve); | ||||
|   st.Sort2(d, v2, vd); | ||||
|   st.Sort2(d, v3, vc); | ||||
|   st.Sort2(d, v4, vb); | ||||
|   st.Sort2(d, v5, va); | ||||
|   st.Sort2(d, v6, v9); | ||||
|   st.Sort2(d, v7, v8); | ||||
|   v4 = st.ReverseKeys2(d, v4); | ||||
|   vc = st.ReverseKeys2(d, vc); | ||||
|   v5 = st.ReverseKeys2(d, v5); | ||||
|   vd = st.ReverseKeys2(d, vd); | ||||
|   v6 = st.ReverseKeys2(d, v6); | ||||
|   ve = st.ReverseKeys2(d, ve); | ||||
|   v7 = st.ReverseKeys2(d, v7); | ||||
|   vf = st.ReverseKeys2(d, vf); | ||||
|   st.Sort2(d, v0, v7); | ||||
|   st.Sort2(d, v8, vf); | ||||
|   st.Sort2(d, v1, v6); | ||||
|   st.Sort2(d, v9, ve); | ||||
|   st.Sort2(d, v2, v5); | ||||
|   st.Sort2(d, va, vd); | ||||
|   st.Sort2(d, v3, v4); | ||||
|   st.Sort2(d, vb, vc); | ||||
|   v2 = st.ReverseKeys2(d, v2); | ||||
|   v3 = st.ReverseKeys2(d, v3); | ||||
|   v6 = st.ReverseKeys2(d, v6); | ||||
|   v7 = st.ReverseKeys2(d, v7); | ||||
|   va = st.ReverseKeys2(d, va); | ||||
|   vb = st.ReverseKeys2(d, vb); | ||||
|   ve = st.ReverseKeys2(d, ve); | ||||
|   vf = st.ReverseKeys2(d, vf); | ||||
|   st.Sort2(d, v0, v3); | ||||
|   st.Sort2(d, v1, v2); | ||||
|   st.Sort2(d, v4, v7); | ||||
|   st.Sort2(d, v5, v6); | ||||
|   st.Sort2(d, v8, vb); | ||||
|   st.Sort2(d, v9, va); | ||||
|   st.Sort2(d, vc, vf); | ||||
|   st.Sort2(d, vd, ve); | ||||
|   v1 = st.ReverseKeys2(d, v1); | ||||
|   v3 = st.ReverseKeys2(d, v3); | ||||
|   v5 = st.ReverseKeys2(d, v5); | ||||
|   v7 = st.ReverseKeys2(d, v7); | ||||
|   v9 = st.ReverseKeys2(d, v9); | ||||
|   vb = st.ReverseKeys2(d, vb); | ||||
|   vd = st.ReverseKeys2(d, vd); | ||||
|   vf = st.ReverseKeys2(d, vf); | ||||
|   st.Sort2(d, v0, v1); | ||||
|   st.Sort2(d, v2, v3); | ||||
|   st.Sort2(d, v4, v5); | ||||
|   st.Sort2(d, v6, v7); | ||||
|   st.Sort2(d, v8, v9); | ||||
|   st.Sort2(d, va, vb); | ||||
|   st.Sort2(d, vc, vd); | ||||
|   st.Sort2(d, ve, vf); | ||||
|   v0 = st.SortPairsDistance1(d, v0); | ||||
|   v1 = st.SortPairsDistance1(d, v1); | ||||
|   v2 = st.SortPairsDistance1(d, v2); | ||||
|   v3 = st.SortPairsDistance1(d, v3); | ||||
|   v4 = st.SortPairsDistance1(d, v4); | ||||
|   v5 = st.SortPairsDistance1(d, v5); | ||||
|   v6 = st.SortPairsDistance1(d, v6); | ||||
|   v7 = st.SortPairsDistance1(d, v7); | ||||
|   v8 = st.SortPairsDistance1(d, v8); | ||||
|   v9 = st.SortPairsDistance1(d, v9); | ||||
|   va = st.SortPairsDistance1(d, va); | ||||
|   vb = st.SortPairsDistance1(d, vb); | ||||
|   vc = st.SortPairsDistance1(d, vc); | ||||
|   vd = st.SortPairsDistance1(d, vd); | ||||
|   ve = st.SortPairsDistance1(d, ve); | ||||
|   vf = st.SortPairsDistance1(d, vf); | ||||
| } | ||||
| 
 | ||||
// Merge step that SortingNetwork applies after Merge2 when each row holds at
// least 4 keys. Alternates ReverseKeys4 on selected vectors with Sort2
// compare-exchanges between vector pairs, then runs SortPairsReverse4 followed
// by SortPairsDistance1 within every vector. `st` supplies the traits
// operations; v0..vf are updated in place. The statement order is the
// generated network and must not be changed.
| template <class D, class Traits, class V = Vec<D>> | ||||
| HWY_INLINE void Merge4(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, | ||||
|                        V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, | ||||
|                        V& ve, V& vf) { | ||||
|   v8 = st.ReverseKeys4(d, v8); | ||||
|   v9 = st.ReverseKeys4(d, v9); | ||||
|   va = st.ReverseKeys4(d, va); | ||||
|   vb = st.ReverseKeys4(d, vb); | ||||
|   vc = st.ReverseKeys4(d, vc); | ||||
|   vd = st.ReverseKeys4(d, vd); | ||||
|   ve = st.ReverseKeys4(d, ve); | ||||
|   vf = st.ReverseKeys4(d, vf); | ||||
|   st.Sort2(d, v0, vf); | ||||
|   st.Sort2(d, v1, ve); | ||||
|   st.Sort2(d, v2, vd); | ||||
|   st.Sort2(d, v3, vc); | ||||
|   st.Sort2(d, v4, vb); | ||||
|   st.Sort2(d, v5, va); | ||||
|   st.Sort2(d, v6, v9); | ||||
|   st.Sort2(d, v7, v8); | ||||
|   v4 = st.ReverseKeys4(d, v4); | ||||
|   vc = st.ReverseKeys4(d, vc); | ||||
|   v5 = st.ReverseKeys4(d, v5); | ||||
|   vd = st.ReverseKeys4(d, vd); | ||||
|   v6 = st.ReverseKeys4(d, v6); | ||||
|   ve = st.ReverseKeys4(d, ve); | ||||
|   v7 = st.ReverseKeys4(d, v7); | ||||
|   vf = st.ReverseKeys4(d, vf); | ||||
|   st.Sort2(d, v0, v7); | ||||
|   st.Sort2(d, v8, vf); | ||||
|   st.Sort2(d, v1, v6); | ||||
|   st.Sort2(d, v9, ve); | ||||
|   st.Sort2(d, v2, v5); | ||||
|   st.Sort2(d, va, vd); | ||||
|   st.Sort2(d, v3, v4); | ||||
|   st.Sort2(d, vb, vc); | ||||
|   v2 = st.ReverseKeys4(d, v2); | ||||
|   v3 = st.ReverseKeys4(d, v3); | ||||
|   v6 = st.ReverseKeys4(d, v6); | ||||
|   v7 = st.ReverseKeys4(d, v7); | ||||
|   va = st.ReverseKeys4(d, va); | ||||
|   vb = st.ReverseKeys4(d, vb); | ||||
|   ve = st.ReverseKeys4(d, ve); | ||||
|   vf = st.ReverseKeys4(d, vf); | ||||
|   st.Sort2(d, v0, v3); | ||||
|   st.Sort2(d, v1, v2); | ||||
|   st.Sort2(d, v4, v7); | ||||
|   st.Sort2(d, v5, v6); | ||||
|   st.Sort2(d, v8, vb); | ||||
|   st.Sort2(d, v9, va); | ||||
|   st.Sort2(d, vc, vf); | ||||
|   st.Sort2(d, vd, ve); | ||||
|   v1 = st.ReverseKeys4(d, v1); | ||||
|   v3 = st.ReverseKeys4(d, v3); | ||||
|   v5 = st.ReverseKeys4(d, v5); | ||||
|   v7 = st.ReverseKeys4(d, v7); | ||||
|   v9 = st.ReverseKeys4(d, v9); | ||||
|   vb = st.ReverseKeys4(d, vb); | ||||
|   vd = st.ReverseKeys4(d, vd); | ||||
|   vf = st.ReverseKeys4(d, vf); | ||||
|   st.Sort2(d, v0, v1); | ||||
|   st.Sort2(d, v2, v3); | ||||
|   st.Sort2(d, v4, v5); | ||||
|   st.Sort2(d, v6, v7); | ||||
|   st.Sort2(d, v8, v9); | ||||
|   st.Sort2(d, va, vb); | ||||
|   st.Sort2(d, vc, vd); | ||||
|   st.Sort2(d, ve, vf); | ||||
|   v0 = st.SortPairsReverse4(d, v0); | ||||
|   v1 = st.SortPairsReverse4(d, v1); | ||||
|   v2 = st.SortPairsReverse4(d, v2); | ||||
|   v3 = st.SortPairsReverse4(d, v3); | ||||
|   v4 = st.SortPairsReverse4(d, v4); | ||||
|   v5 = st.SortPairsReverse4(d, v5); | ||||
|   v6 = st.SortPairsReverse4(d, v6); | ||||
|   v7 = st.SortPairsReverse4(d, v7); | ||||
|   v8 = st.SortPairsReverse4(d, v8); | ||||
|   v9 = st.SortPairsReverse4(d, v9); | ||||
|   va = st.SortPairsReverse4(d, va); | ||||
|   vb = st.SortPairsReverse4(d, vb); | ||||
|   vc = st.SortPairsReverse4(d, vc); | ||||
|   vd = st.SortPairsReverse4(d, vd); | ||||
|   ve = st.SortPairsReverse4(d, ve); | ||||
|   vf = st.SortPairsReverse4(d, vf); | ||||
|   v0 = st.SortPairsDistance1(d, v0); | ||||
|   v1 = st.SortPairsDistance1(d, v1); | ||||
|   v2 = st.SortPairsDistance1(d, v2); | ||||
|   v3 = st.SortPairsDistance1(d, v3); | ||||
|   v4 = st.SortPairsDistance1(d, v4); | ||||
|   v5 = st.SortPairsDistance1(d, v5); | ||||
|   v6 = st.SortPairsDistance1(d, v6); | ||||
|   v7 = st.SortPairsDistance1(d, v7); | ||||
|   v8 = st.SortPairsDistance1(d, v8); | ||||
|   v9 = st.SortPairsDistance1(d, v9); | ||||
|   va = st.SortPairsDistance1(d, va); | ||||
|   vb = st.SortPairsDistance1(d, vb); | ||||
|   vc = st.SortPairsDistance1(d, vc); | ||||
|   vd = st.SortPairsDistance1(d, vd); | ||||
|   ve = st.SortPairsDistance1(d, ve); | ||||
|   vf = st.SortPairsDistance1(d, vf); | ||||
| } | ||||
| 
 | ||||
// Merge step that SortingNetwork applies after Merge4 when each row holds at
// least 8 keys. Alternates ReverseKeys8 on selected vectors with Sort2
// compare-exchanges between vector pairs, then runs SortPairsReverse8,
// SortPairsDistance2 and SortPairsDistance1 within every vector. `st` supplies
// the traits operations; v0..vf are updated in place. The statement order is
// the generated network and must not be changed.
| template <class D, class Traits, class V = Vec<D>> | ||||
| HWY_INLINE void Merge8(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, V& v5, | ||||
|                        V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, V& vd, | ||||
|                        V& ve, V& vf) { | ||||
|   v8 = st.ReverseKeys8(d, v8); | ||||
|   v9 = st.ReverseKeys8(d, v9); | ||||
|   va = st.ReverseKeys8(d, va); | ||||
|   vb = st.ReverseKeys8(d, vb); | ||||
|   vc = st.ReverseKeys8(d, vc); | ||||
|   vd = st.ReverseKeys8(d, vd); | ||||
|   ve = st.ReverseKeys8(d, ve); | ||||
|   vf = st.ReverseKeys8(d, vf); | ||||
|   st.Sort2(d, v0, vf); | ||||
|   st.Sort2(d, v1, ve); | ||||
|   st.Sort2(d, v2, vd); | ||||
|   st.Sort2(d, v3, vc); | ||||
|   st.Sort2(d, v4, vb); | ||||
|   st.Sort2(d, v5, va); | ||||
|   st.Sort2(d, v6, v9); | ||||
|   st.Sort2(d, v7, v8); | ||||
|   v4 = st.ReverseKeys8(d, v4); | ||||
|   vc = st.ReverseKeys8(d, vc); | ||||
|   v5 = st.ReverseKeys8(d, v5); | ||||
|   vd = st.ReverseKeys8(d, vd); | ||||
|   v6 = st.ReverseKeys8(d, v6); | ||||
|   ve = st.ReverseKeys8(d, ve); | ||||
|   v7 = st.ReverseKeys8(d, v7); | ||||
|   vf = st.ReverseKeys8(d, vf); | ||||
|   st.Sort2(d, v0, v7); | ||||
|   st.Sort2(d, v8, vf); | ||||
|   st.Sort2(d, v1, v6); | ||||
|   st.Sort2(d, v9, ve); | ||||
|   st.Sort2(d, v2, v5); | ||||
|   st.Sort2(d, va, vd); | ||||
|   st.Sort2(d, v3, v4); | ||||
|   st.Sort2(d, vb, vc); | ||||
|   v2 = st.ReverseKeys8(d, v2); | ||||
|   v3 = st.ReverseKeys8(d, v3); | ||||
|   v6 = st.ReverseKeys8(d, v6); | ||||
|   v7 = st.ReverseKeys8(d, v7); | ||||
|   va = st.ReverseKeys8(d, va); | ||||
|   vb = st.ReverseKeys8(d, vb); | ||||
|   ve = st.ReverseKeys8(d, ve); | ||||
|   vf = st.ReverseKeys8(d, vf); | ||||
|   st.Sort2(d, v0, v3); | ||||
|   st.Sort2(d, v1, v2); | ||||
|   st.Sort2(d, v4, v7); | ||||
|   st.Sort2(d, v5, v6); | ||||
|   st.Sort2(d, v8, vb); | ||||
|   st.Sort2(d, v9, va); | ||||
|   st.Sort2(d, vc, vf); | ||||
|   st.Sort2(d, vd, ve); | ||||
|   v1 = st.ReverseKeys8(d, v1); | ||||
|   v3 = st.ReverseKeys8(d, v3); | ||||
|   v5 = st.ReverseKeys8(d, v5); | ||||
|   v7 = st.ReverseKeys8(d, v7); | ||||
|   v9 = st.ReverseKeys8(d, v9); | ||||
|   vb = st.ReverseKeys8(d, vb); | ||||
|   vd = st.ReverseKeys8(d, vd); | ||||
|   vf = st.ReverseKeys8(d, vf); | ||||
|   st.Sort2(d, v0, v1); | ||||
|   st.Sort2(d, v2, v3); | ||||
|   st.Sort2(d, v4, v5); | ||||
|   st.Sort2(d, v6, v7); | ||||
|   st.Sort2(d, v8, v9); | ||||
|   st.Sort2(d, va, vb); | ||||
|   st.Sort2(d, vc, vd); | ||||
|   st.Sort2(d, ve, vf); | ||||
|   v0 = st.SortPairsReverse8(d, v0); | ||||
|   v1 = st.SortPairsReverse8(d, v1); | ||||
|   v2 = st.SortPairsReverse8(d, v2); | ||||
|   v3 = st.SortPairsReverse8(d, v3); | ||||
|   v4 = st.SortPairsReverse8(d, v4); | ||||
|   v5 = st.SortPairsReverse8(d, v5); | ||||
|   v6 = st.SortPairsReverse8(d, v6); | ||||
|   v7 = st.SortPairsReverse8(d, v7); | ||||
|   v8 = st.SortPairsReverse8(d, v8); | ||||
|   v9 = st.SortPairsReverse8(d, v9); | ||||
|   va = st.SortPairsReverse8(d, va); | ||||
|   vb = st.SortPairsReverse8(d, vb); | ||||
|   vc = st.SortPairsReverse8(d, vc); | ||||
|   vd = st.SortPairsReverse8(d, vd); | ||||
|   ve = st.SortPairsReverse8(d, ve); | ||||
|   vf = st.SortPairsReverse8(d, vf); | ||||
|   v0 = st.SortPairsDistance2(d, v0); | ||||
|   v1 = st.SortPairsDistance2(d, v1); | ||||
|   v2 = st.SortPairsDistance2(d, v2); | ||||
|   v3 = st.SortPairsDistance2(d, v3); | ||||
|   v4 = st.SortPairsDistance2(d, v4); | ||||
|   v5 = st.SortPairsDistance2(d, v5); | ||||
|   v6 = st.SortPairsDistance2(d, v6); | ||||
|   v7 = st.SortPairsDistance2(d, v7); | ||||
|   v8 = st.SortPairsDistance2(d, v8); | ||||
|   v9 = st.SortPairsDistance2(d, v9); | ||||
|   va = st.SortPairsDistance2(d, va); | ||||
|   vb = st.SortPairsDistance2(d, vb); | ||||
|   vc = st.SortPairsDistance2(d, vc); | ||||
|   vd = st.SortPairsDistance2(d, vd); | ||||
|   ve = st.SortPairsDistance2(d, ve); | ||||
|   vf = st.SortPairsDistance2(d, vf); | ||||
|   v0 = st.SortPairsDistance1(d, v0); | ||||
|   v1 = st.SortPairsDistance1(d, v1); | ||||
|   v2 = st.SortPairsDistance1(d, v2); | ||||
|   v3 = st.SortPairsDistance1(d, v3); | ||||
|   v4 = st.SortPairsDistance1(d, v4); | ||||
|   v5 = st.SortPairsDistance1(d, v5); | ||||
|   v6 = st.SortPairsDistance1(d, v6); | ||||
|   v7 = st.SortPairsDistance1(d, v7); | ||||
|   v8 = st.SortPairsDistance1(d, v8); | ||||
|   v9 = st.SortPairsDistance1(d, v9); | ||||
|   va = st.SortPairsDistance1(d, va); | ||||
|   vb = st.SortPairsDistance1(d, vb); | ||||
|   vc = st.SortPairsDistance1(d, vc); | ||||
|   vd = st.SortPairsDistance1(d, vd); | ||||
|   ve = st.SortPairsDistance1(d, ve); | ||||
|   vf = st.SortPairsDistance1(d, vf); | ||||
| } | ||||
| 
 | ||||
| // Merge16 is unused on MSVC: SortingNetwork skips it there to avoid a build timeout.
 | ||||
| #if !HWY_COMPILER_MSVC | ||||
| 
 | ||||
// Merge step that SortingNetwork applies after Merge8 when each row holds at
// least 16 keys; only compiled when !HWY_COMPILER_MSVC. Alternates
// ReverseKeys16 on selected vectors with Sort2 compare-exchanges between
// vector pairs, then runs SortPairsReverse16 and SortPairsDistance4/2/1 within
// every vector. `st` supplies the traits operations; v0..vf are updated in
// place. The statement order is the generated network and must not be changed.
| template <class D, class Traits, class V = Vec<D>> | ||||
| HWY_INLINE void Merge16(D d, Traits st, V& v0, V& v1, V& v2, V& v3, V& v4, | ||||
|                         V& v5, V& v6, V& v7, V& v8, V& v9, V& va, V& vb, V& vc, | ||||
|                         V& vd, V& ve, V& vf) { | ||||
|   v8 = st.ReverseKeys16(d, v8); | ||||
|   v9 = st.ReverseKeys16(d, v9); | ||||
|   va = st.ReverseKeys16(d, va); | ||||
|   vb = st.ReverseKeys16(d, vb); | ||||
|   vc = st.ReverseKeys16(d, vc); | ||||
|   vd = st.ReverseKeys16(d, vd); | ||||
|   ve = st.ReverseKeys16(d, ve); | ||||
|   vf = st.ReverseKeys16(d, vf); | ||||
|   st.Sort2(d, v0, vf); | ||||
|   st.Sort2(d, v1, ve); | ||||
|   st.Sort2(d, v2, vd); | ||||
|   st.Sort2(d, v3, vc); | ||||
|   st.Sort2(d, v4, vb); | ||||
|   st.Sort2(d, v5, va); | ||||
|   st.Sort2(d, v6, v9); | ||||
|   st.Sort2(d, v7, v8); | ||||
|   v4 = st.ReverseKeys16(d, v4); | ||||
|   vc = st.ReverseKeys16(d, vc); | ||||
|   v5 = st.ReverseKeys16(d, v5); | ||||
|   vd = st.ReverseKeys16(d, vd); | ||||
|   v6 = st.ReverseKeys16(d, v6); | ||||
|   ve = st.ReverseKeys16(d, ve); | ||||
|   v7 = st.ReverseKeys16(d, v7); | ||||
|   vf = st.ReverseKeys16(d, vf); | ||||
|   st.Sort2(d, v0, v7); | ||||
|   st.Sort2(d, v8, vf); | ||||
|   st.Sort2(d, v1, v6); | ||||
|   st.Sort2(d, v9, ve); | ||||
|   st.Sort2(d, v2, v5); | ||||
|   st.Sort2(d, va, vd); | ||||
|   st.Sort2(d, v3, v4); | ||||
|   st.Sort2(d, vb, vc); | ||||
|   v2 = st.ReverseKeys16(d, v2); | ||||
|   v3 = st.ReverseKeys16(d, v3); | ||||
|   v6 = st.ReverseKeys16(d, v6); | ||||
|   v7 = st.ReverseKeys16(d, v7); | ||||
|   va = st.ReverseKeys16(d, va); | ||||
|   vb = st.ReverseKeys16(d, vb); | ||||
|   ve = st.ReverseKeys16(d, ve); | ||||
|   vf = st.ReverseKeys16(d, vf); | ||||
|   st.Sort2(d, v0, v3); | ||||
|   st.Sort2(d, v1, v2); | ||||
|   st.Sort2(d, v4, v7); | ||||
|   st.Sort2(d, v5, v6); | ||||
|   st.Sort2(d, v8, vb); | ||||
|   st.Sort2(d, v9, va); | ||||
|   st.Sort2(d, vc, vf); | ||||
|   st.Sort2(d, vd, ve); | ||||
|   v1 = st.ReverseKeys16(d, v1); | ||||
|   v3 = st.ReverseKeys16(d, v3); | ||||
|   v5 = st.ReverseKeys16(d, v5); | ||||
|   v7 = st.ReverseKeys16(d, v7); | ||||
|   v9 = st.ReverseKeys16(d, v9); | ||||
|   vb = st.ReverseKeys16(d, vb); | ||||
|   vd = st.ReverseKeys16(d, vd); | ||||
|   vf = st.ReverseKeys16(d, vf); | ||||
|   st.Sort2(d, v0, v1); | ||||
|   st.Sort2(d, v2, v3); | ||||
|   st.Sort2(d, v4, v5); | ||||
|   st.Sort2(d, v6, v7); | ||||
|   st.Sort2(d, v8, v9); | ||||
|   st.Sort2(d, va, vb); | ||||
|   st.Sort2(d, vc, vd); | ||||
|   st.Sort2(d, ve, vf); | ||||
|   v0 = st.SortPairsReverse16(d, v0); | ||||
|   v1 = st.SortPairsReverse16(d, v1); | ||||
|   v2 = st.SortPairsReverse16(d, v2); | ||||
|   v3 = st.SortPairsReverse16(d, v3); | ||||
|   v4 = st.SortPairsReverse16(d, v4); | ||||
|   v5 = st.SortPairsReverse16(d, v5); | ||||
|   v6 = st.SortPairsReverse16(d, v6); | ||||
|   v7 = st.SortPairsReverse16(d, v7); | ||||
|   v8 = st.SortPairsReverse16(d, v8); | ||||
|   v9 = st.SortPairsReverse16(d, v9); | ||||
|   va = st.SortPairsReverse16(d, va); | ||||
|   vb = st.SortPairsReverse16(d, vb); | ||||
|   vc = st.SortPairsReverse16(d, vc); | ||||
|   vd = st.SortPairsReverse16(d, vd); | ||||
|   ve = st.SortPairsReverse16(d, ve); | ||||
|   vf = st.SortPairsReverse16(d, vf); | ||||
|   v0 = st.SortPairsDistance4(d, v0); | ||||
|   v1 = st.SortPairsDistance4(d, v1); | ||||
|   v2 = st.SortPairsDistance4(d, v2); | ||||
|   v3 = st.SortPairsDistance4(d, v3); | ||||
|   v4 = st.SortPairsDistance4(d, v4); | ||||
|   v5 = st.SortPairsDistance4(d, v5); | ||||
|   v6 = st.SortPairsDistance4(d, v6); | ||||
|   v7 = st.SortPairsDistance4(d, v7); | ||||
|   v8 = st.SortPairsDistance4(d, v8); | ||||
|   v9 = st.SortPairsDistance4(d, v9); | ||||
|   va = st.SortPairsDistance4(d, va); | ||||
|   vb = st.SortPairsDistance4(d, vb); | ||||
|   vc = st.SortPairsDistance4(d, vc); | ||||
|   vd = st.SortPairsDistance4(d, vd); | ||||
|   ve = st.SortPairsDistance4(d, ve); | ||||
|   vf = st.SortPairsDistance4(d, vf); | ||||
|   v0 = st.SortPairsDistance2(d, v0); | ||||
|   v1 = st.SortPairsDistance2(d, v1); | ||||
|   v2 = st.SortPairsDistance2(d, v2); | ||||
|   v3 = st.SortPairsDistance2(d, v3); | ||||
|   v4 = st.SortPairsDistance2(d, v4); | ||||
|   v5 = st.SortPairsDistance2(d, v5); | ||||
|   v6 = st.SortPairsDistance2(d, v6); | ||||
|   v7 = st.SortPairsDistance2(d, v7); | ||||
|   v8 = st.SortPairsDistance2(d, v8); | ||||
|   v9 = st.SortPairsDistance2(d, v9); | ||||
|   va = st.SortPairsDistance2(d, va); | ||||
|   vb = st.SortPairsDistance2(d, vb); | ||||
|   vc = st.SortPairsDistance2(d, vc); | ||||
|   vd = st.SortPairsDistance2(d, vd); | ||||
|   ve = st.SortPairsDistance2(d, ve); | ||||
|   vf = st.SortPairsDistance2(d, vf); | ||||
|   v0 = st.SortPairsDistance1(d, v0); | ||||
|   v1 = st.SortPairsDistance1(d, v1); | ||||
|   v2 = st.SortPairsDistance1(d, v2); | ||||
|   v3 = st.SortPairsDistance1(d, v3); | ||||
|   v4 = st.SortPairsDistance1(d, v4); | ||||
|   v5 = st.SortPairsDistance1(d, v5); | ||||
|   v6 = st.SortPairsDistance1(d, v6); | ||||
|   v7 = st.SortPairsDistance1(d, v7); | ||||
|   v8 = st.SortPairsDistance1(d, v8); | ||||
|   v9 = st.SortPairsDistance1(d, v9); | ||||
|   va = st.SortPairsDistance1(d, va); | ||||
|   vb = st.SortPairsDistance1(d, vb); | ||||
|   vc = st.SortPairsDistance1(d, vc); | ||||
|   vd = st.SortPairsDistance1(d, vd); | ||||
|   ve = st.SortPairsDistance1(d, ve); | ||||
|   vf = st.SortPairsDistance1(d, vf); | ||||
| } | ||||
| 
 | ||||
| #endif  // !HWY_COMPILER_MSVC
 | ||||
| 
 | ||||
| // Reshapes `buf` into a matrix, sorts columns independently, and then merges
 | ||||
| // into a sorted 1D array without transposing.
 | ||||
| //
 | ||||
| // `st` is SharedTraits<LaneTraits/Traits128<Order*>>. This abstraction layer
 | ||||
| //   bridges differences in sort order and single-lane vs 128-bit keys.
 | ||||
| // `buf` ensures full vectors are aligned, and enables loads/stores without
 | ||||
| //   bounds checks.
 | ||||
| //
 | ||||
| // References:
 | ||||
| // https://drops.dagstuhl.de/opus/volltexte/2021/13775/pdf/LIPIcs-SEA-2021-3.pdf
 | ||||
| // https://github.com/simd-sorting/fast-and-robust/blob/master/avx2_sort_demo/avx2sort.h
 | ||||
| // "Entwurf und Implementierung vektorisierter Sortieralgorithmen" (M. Blacher)
 | ||||
| template <class Traits, typename T> | ||||
| HWY_INLINE void SortingNetwork(Traits st, T* HWY_RESTRICT buf, size_t cols) { | ||||
|   const CappedTag<T, Constants::kMaxCols> d; | ||||
|   using V = decltype(Zero(d)); | ||||
| 
 | ||||
|   HWY_DASSERT(cols <= Constants::kMaxCols); | ||||
| 
 | ||||
|   // The network width depends on the number of keys, not lanes, so both the runtime and compile-time bounds are divided by LanesPerKey.
 | ||||
|   constexpr size_t kLanesPerKey = st.LanesPerKey(); | ||||
|   const size_t keys = cols / kLanesPerKey; | ||||
|   constexpr size_t kMaxKeys = MaxLanes(d) / kLanesPerKey; | ||||
| 
 | ||||
|   // Row i starts at buf + i * cols. These are aligned iff cols == Lanes(d). We prefer unaligned/non-constexpr
 | ||||
|   // offsets to duplicating this code for every value of cols.
 | ||||
|   static_assert(Constants::kMaxRows == 16, "Update loads/stores/args"); | ||||
|   V v0 = LoadU(d, buf + 0x0 * cols); | ||||
|   V v1 = LoadU(d, buf + 0x1 * cols); | ||||
|   V v2 = LoadU(d, buf + 0x2 * cols); | ||||
|   V v3 = LoadU(d, buf + 0x3 * cols); | ||||
|   V v4 = LoadU(d, buf + 0x4 * cols); | ||||
|   V v5 = LoadU(d, buf + 0x5 * cols); | ||||
|   V v6 = LoadU(d, buf + 0x6 * cols); | ||||
|   V v7 = LoadU(d, buf + 0x7 * cols); | ||||
|   V v8 = LoadU(d, buf + 0x8 * cols); | ||||
|   V v9 = LoadU(d, buf + 0x9 * cols); | ||||
|   V va = LoadU(d, buf + 0xa * cols); | ||||
|   V vb = LoadU(d, buf + 0xb * cols); | ||||
|   V vc = LoadU(d, buf + 0xc * cols); | ||||
|   V vd = LoadU(d, buf + 0xd * cols); | ||||
|   V ve = LoadU(d, buf + 0xe * cols); | ||||
|   V vf = LoadU(d, buf + 0xf * cols); | ||||
| 
 | ||||
|   Sort16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, vf); | ||||
| 
 | ||||
|   // Checking the compile-time kMaxKeys (derived from MaxLanes) avoids generating HWY_ASSERT code for the unreachable
 | ||||
|   // code paths: if MaxLanes < 2, then keys <= cols < 2.
 | ||||
|   if (HWY_LIKELY(keys >= 2 && kMaxKeys >= 2)) { | ||||
|     Merge2(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, | ||||
|            vf); | ||||
| 
 | ||||
|     if (HWY_LIKELY(keys >= 4 && kMaxKeys >= 4)) { | ||||
|       Merge4(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, ve, | ||||
|              vf); | ||||
| 
 | ||||
|       if (HWY_LIKELY(keys >= 8 && kMaxKeys >= 8)) { | ||||
|         Merge8(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, | ||||
|                ve, vf); | ||||
| 
 | ||||
|         // The Merge16 step is compiled out on MSVC, which avoids a build timeout.
 | ||||
| #if !HWY_COMPILER_MSVC | ||||
|         if (HWY_LIKELY(keys >= 16 && kMaxKeys >= 16)) { | ||||
|           Merge16(d, st, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, va, vb, vc, vd, | ||||
|                   ve, vf); | ||||
| 
 | ||||
|           static_assert(Constants::kMaxCols <= 16, "Add more branches"); | ||||
|         } | ||||
| #endif | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   StoreU(v0, d, buf + 0x0 * cols); | ||||
|   StoreU(v1, d, buf + 0x1 * cols); | ||||
|   StoreU(v2, d, buf + 0x2 * cols); | ||||
|   StoreU(v3, d, buf + 0x3 * cols); | ||||
|   StoreU(v4, d, buf + 0x4 * cols); | ||||
|   StoreU(v5, d, buf + 0x5 * cols); | ||||
|   StoreU(v6, d, buf + 0x6 * cols); | ||||
|   StoreU(v7, d, buf + 0x7 * cols); | ||||
|   StoreU(v8, d, buf + 0x8 * cols); | ||||
|   StoreU(v9, d, buf + 0x9 * cols); | ||||
|   StoreU(va, d, buf + 0xa * cols); | ||||
|   StoreU(vb, d, buf + 0xb * cols); | ||||
|   StoreU(vc, d, buf + 0xc * cols); | ||||
|   StoreU(vd, d, buf + 0xd * cols); | ||||
|   StoreU(ve, d, buf + 0xe * cols); | ||||
|   StoreU(vf, d, buf + 0xf * cols); | ||||
| } | ||||
| 
 | ||||
| }  // namespace detail
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_SORTING_NETWORKS_TOGGLE
 | ||||
							
								
								
									
										324
									
								
								third_party/highway/hwy/contrib/sort/traits-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										324
									
								
								third_party/highway/hwy/contrib/sort/traits-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,324 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Per-target
 | ||||
| #if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE) == \ | ||||
|     defined(HWY_TARGET_TOGGLE) | ||||
| #ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE | ||||
| #undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE | ||||
| #else | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE | ||||
| #endif | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/shared-inl.h"  // SortConstants | ||||
| #include "hwy/contrib/sort/vqsort.h"      // SortDescending | ||||
| #include "hwy/highway.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| namespace detail { | ||||
| 
 | ||||
| // Highway does not provide a lane type for 128-bit keys, so we use uint64_t
 | ||||
| // along with an abstraction layer for single-lane vs. lane-pair, which is
 | ||||
| // independent of the order.
 | ||||
| struct KeyLane { | ||||
|   constexpr size_t LanesPerKey() const { return 1; } | ||||
| 
 | ||||
|   // For HeapSort
 | ||||
|   template <typename T> | ||||
|   HWY_INLINE void Swap(T* a, T* b) const { | ||||
|     const T temp = *a; | ||||
|     *a = *b; | ||||
|     *b = temp; | ||||
|   } | ||||
| 
 | ||||
|   // Broadcasts one key into a vector
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const { | ||||
|     return Set(d, *key); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const { | ||||
|     return Reverse(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys2(D d, Vec<D> v) const { | ||||
|     return Reverse2(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys4(D d, Vec<D> v) const { | ||||
|     return Reverse4(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys8(D d, Vec<D> v) const { | ||||
|     return Reverse8(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys16(D d, Vec<D> v) const { | ||||
|     static_assert(SortConstants::kMaxCols <= 16, "Assumes u32x16 = 512 bit"); | ||||
|     return ReverseKeys(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class V> | ||||
|   HWY_INLINE V OddEvenKeys(const V odd, const V even) const { | ||||
|     return OddEven(odd, even); | ||||
|   } | ||||
| 
 | ||||
|   template <class D, HWY_IF_LANE_SIZE_D(D, 2)> | ||||
|   HWY_INLINE Vec<D> SwapAdjacentPairs(D d, const Vec<D> v) const { | ||||
|     const Repartition<uint32_t, D> du32; | ||||
|     return BitCast(d, Shuffle2301(BitCast(du32, v))); | ||||
|   } | ||||
|   template <class D, HWY_IF_LANE_SIZE_D(D, 4)> | ||||
|   HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const { | ||||
|     return Shuffle1032(v); | ||||
|   } | ||||
|   template <class D, HWY_IF_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> SwapAdjacentPairs(D /* tag */, const Vec<D> v) const { | ||||
|     return SwapAdjacentBlocks(v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const { | ||||
| #if HWY_HAVE_FLOAT64  // in case D is float32
 | ||||
|     const RepartitionToWide<D> dw; | ||||
| #else | ||||
|     const RepartitionToWide<RebindToUnsigned<D>> dw; | ||||
| #endif | ||||
|     return BitCast(d, SwapAdjacentPairs(dw, BitCast(dw, v))); | ||||
|   } | ||||
|   template <class D, HWY_IF_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> SwapAdjacentQuads(D d, const Vec<D> v) const { | ||||
|     // Assumes max vector size = 512
 | ||||
|     return ConcatLowerUpper(d, v, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd, | ||||
|                                  const Vec<D> even) const { | ||||
| #if HWY_HAVE_FLOAT64  // in case D is float32
 | ||||
|     const RepartitionToWide<D> dw; | ||||
| #else | ||||
|     const RepartitionToWide<RebindToUnsigned<D>> dw; | ||||
| #endif | ||||
|     return BitCast(d, OddEven(BitCast(dw, odd), BitCast(dw, even))); | ||||
|   } | ||||
|   template <class D, HWY_IF_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> OddEvenPairs(D /* tag */, Vec<D> odd, Vec<D> even) const { | ||||
|     return OddEvenBlocks(odd, even); | ||||
|   } | ||||
| 
 | ||||
|   template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const { | ||||
| #if HWY_HAVE_FLOAT64  // in case D is float32
 | ||||
|     const RepartitionToWide<D> dw; | ||||
| #else | ||||
|     const RepartitionToWide<RebindToUnsigned<D>> dw; | ||||
| #endif | ||||
|     return BitCast(d, OddEvenPairs(dw, BitCast(dw, odd), BitCast(dw, even))); | ||||
|   } | ||||
|   template <class D, HWY_IF_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> OddEvenQuads(D d, Vec<D> odd, Vec<D> even) const { | ||||
|     return ConcatUpperLower(d, odd, even); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Anything order-related depends on the key traits *and* the order (see
 | ||||
| // FirstOfLanes). We cannot implement just one Compare function because Lt128
 | ||||
| // only compiles if the lane type is u64. Thus we need either overloaded
 | ||||
| // functions with a tag type, class specializations, or separate classes.
 | ||||
| // We avoid overloaded functions because we want all functions to be callable
 | ||||
| // from a SortTraits without per-function wrappers. Specializing would work, but
 | ||||
| // we are anyway going to specialize at a higher level.
 | ||||
| struct OrderAscending : public KeyLane { | ||||
|   using Order = SortAscending; | ||||
| 
 | ||||
|   template <typename T> | ||||
|   HWY_INLINE bool Compare1(const T* a, const T* b) { | ||||
|     return *a < *b; | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const { | ||||
|     return Lt(a, b); | ||||
|   } | ||||
| 
 | ||||
|   // Two halves of Sort2, used in ScanMinMax.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const { | ||||
|     return Min(a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const { | ||||
|     return Max(a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, | ||||
|                                  TFromD<D>* HWY_RESTRICT /* buf */) const { | ||||
|     return MinOfLanes(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, | ||||
|                                 TFromD<D>* HWY_RESTRICT /* buf */) const { | ||||
|     return MaxOfLanes(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> FirstValue(D d) const { | ||||
|     return Set(d, hwy::LowestValue<TFromD<D>>()); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> LastValue(D d) const { | ||||
|     return Set(d, hwy::HighestValue<TFromD<D>>()); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct OrderDescending : public KeyLane { | ||||
|   using Order = SortDescending; | ||||
| 
 | ||||
|   template <typename T> | ||||
|   HWY_INLINE bool Compare1(const T* a, const T* b) { | ||||
|     return *b < *a; | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Mask<D> Compare(D /* tag */, Vec<D> a, Vec<D> b) const { | ||||
|     return Lt(b, a); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> First(D /* tag */, const Vec<D> a, const Vec<D> b) const { | ||||
|     return Max(a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> Last(D /* tag */, const Vec<D> a, const Vec<D> b) const { | ||||
|     return Min(a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, | ||||
|                                  TFromD<D>* HWY_RESTRICT /* buf */) const { | ||||
|     return MaxOfLanes(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, | ||||
|                                 TFromD<D>* HWY_RESTRICT /* buf */) const { | ||||
|     return MinOfLanes(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> FirstValue(D d) const { | ||||
|     return Set(d, hwy::HighestValue<TFromD<D>>()); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> LastValue(D d) const { | ||||
|     return Set(d, hwy::LowestValue<TFromD<D>>()); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Shared code that depends on Order.
 | ||||
| template <class Base> | ||||
| struct LaneTraits : public Base { | ||||
|   constexpr bool Is128() const { return false; } | ||||
| 
 | ||||
|   // For each lane i: replaces a[i] with the first and b[i] with the second
 | ||||
|   // according to Base.
 | ||||
|   // Corresponds to a conditional swap, which is one "node" of a sorting
 | ||||
|   // network. Min/Max are cheaper than compare + blend at least for integers.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
| 
 | ||||
|     const Vec<D> a_copy = a; | ||||
|     // Prior to AVX3, there is no native 64-bit Min/Max, so they compile to 4
 | ||||
|     // instructions. We can reduce it to a compare + 2 IfThenElse.
 | ||||
| #if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3 | ||||
|     if (sizeof(TFromD<D>) == 8) { | ||||
|       const Mask<D> cmp = base->Compare(d, a, b); | ||||
|       a = IfThenElse(cmp, a, b); | ||||
|       b = IfThenElse(cmp, b, a_copy); | ||||
|       return; | ||||
|     } | ||||
| #endif | ||||
|     a = base->First(d, a, b); | ||||
|     b = base->Last(d, a_copy, b); | ||||
|   } | ||||
| 
 | ||||
|   // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
 | ||||
|   template <class D, HWY_IF_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     Vec<D> swapped = base->ReverseKeys2(d, v); | ||||
|     // Further to the above optimization, Sort2+OddEvenKeys compile to four
 | ||||
|     // instructions; we can save one by combining two blends.
 | ||||
| #if HWY_AVX3 < HWY_TARGET && HWY_TARGET <= HWY_SSSE3 | ||||
|     const Vec<D> cmp = VecFromMask(d, base->Compare(d, v, swapped)); | ||||
|     return IfVecThenElse(DupOdd(cmp), swapped, v); | ||||
| #else | ||||
|     Sort2(d, v, swapped); | ||||
|     return base->OddEvenKeys(swapped, v); | ||||
| #endif | ||||
|   } | ||||
| 
 | ||||
|   // (See above - we use Sort2 for non-64-bit types.)
 | ||||
|   template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)> | ||||
|   HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     Vec<D> swapped = base->ReverseKeys2(d, v); | ||||
|     Sort2(d, v, swapped); | ||||
|     return base->OddEvenKeys(swapped, v); | ||||
|   } | ||||
| 
 | ||||
|   // Swaps with the vector formed by reversing contiguous groups of 4 keys.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     Vec<D> swapped = base->ReverseKeys4(d, v); | ||||
|     Sort2(d, v, swapped); | ||||
|     return base->OddEvenPairs(d, swapped, v); | ||||
|   } | ||||
| 
 | ||||
|   // Conditionally swaps lane 0 with 4, 1 with 5 etc.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SortPairsDistance4(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     Vec<D> swapped = base->SwapAdjacentQuads(d, v); | ||||
|     // Only used in Merge16, so this will not be used on AVX2 (which only has 4
 | ||||
|     // u64 lanes), so skip the above optimization for 64-bit AVX2.
 | ||||
|     Sort2(d, v, swapped); | ||||
|     return base->OddEvenQuads(d, swapped, v); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| }  // namespace detail
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS_TOGGLE
 | ||||
							
								
								
									
										368
									
								
								third_party/highway/hwy/contrib/sort/traits128-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										368
									
								
								third_party/highway/hwy/contrib/sort/traits128-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,368 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Per-target
 | ||||
| #if defined(HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE) == \ | ||||
|     defined(HWY_TARGET_TOGGLE) | ||||
| #ifdef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE | ||||
| #undef HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE | ||||
| #else | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE | ||||
| #endif | ||||
| 
 | ||||
| #include "hwy/contrib/sort/vqsort.h"  // SortDescending | ||||
| #include "hwy/highway.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| namespace detail { | ||||
| 
 | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
| 
 | ||||
| struct OrderAscending128 { | ||||
|   using Order = SortAscending; | ||||
| 
 | ||||
|   template <typename T> | ||||
|   HWY_INLINE bool Compare1(const T* a, const T* b) { | ||||
|     return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1]; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct OrderDescending128 { | ||||
|   using Order = SortDescending; | ||||
| 
 | ||||
|   template <typename T> | ||||
|   HWY_INLINE bool Compare1(const T* a, const T* b) { | ||||
|     return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1]; | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
// Scalar-target traits for 128-bit keys: adds the key-size queries on top of
// the comparison provided by `Order`.
template <class Order>
struct Traits128 : public Order {
  // Each key consists of two lanes.
  constexpr size_t LanesPerKey() const { return 2; }

  constexpr bool Is128() const { return true; }
};
| 
 | ||||
| #else | ||||
| 
 | ||||
| // Highway does not provide a lane type for 128-bit keys, so we use uint64_t
 | ||||
| // along with an abstraction layer for single-lane vs. lane-pair, which is
 | ||||
| // independent of the order.
 | ||||
| struct Key128 { | ||||
|   constexpr size_t LanesPerKey() const { return 2; } | ||||
| 
 | ||||
|   template <typename T> | ||||
|   HWY_INLINE void Swap(T* a, T* b) const { | ||||
|     const FixedTag<T, 2> d; | ||||
|     const auto temp = LoadU(d, a); | ||||
|     StoreU(LoadU(d, b), d, a); | ||||
|     StoreU(temp, d, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SetKey(D d, const TFromD<D>* key) const { | ||||
|     return LoadDup128(d, key); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys(D d, Vec<D> v) const { | ||||
|     return ReverseBlocks(d, v); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys2(D /* tag */, const Vec<D> v) const { | ||||
|     return SwapAdjacentBlocks(v); | ||||
|   } | ||||
| 
 | ||||
|   // Only called for 4 keys because we do not support >512-bit vectors.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys4(D d, const Vec<D> v) const { | ||||
|     HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>)); | ||||
|     return ReverseKeys(d, v); | ||||
|   } | ||||
| 
 | ||||
|   // Only called for 4 keys because we do not support >512-bit vectors.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> OddEvenPairs(D d, const Vec<D> odd, | ||||
|                                  const Vec<D> even) const { | ||||
|     HWY_DASSERT(Lanes(d) <= 64 / sizeof(TFromD<D>)); | ||||
|     return ConcatUpperLower(d, odd, even); | ||||
|   } | ||||
| 
 | ||||
|   template <class V> | ||||
|   HWY_INLINE V OddEvenKeys(const V odd, const V even) const { | ||||
|     return OddEvenBlocks(odd, even); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys8(D, Vec<D>) const { | ||||
|     HWY_ASSERT(0);  // not supported: would require 1024-bit vectors
 | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> ReverseKeys16(D, Vec<D>) const { | ||||
|     HWY_ASSERT(0);  // not supported: would require 2048-bit vectors
 | ||||
|   } | ||||
| 
 | ||||
|   // This is only called for 8/16 col networks (not supported).
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SwapAdjacentPairs(D, Vec<D>) const { | ||||
|     HWY_ASSERT(0); | ||||
|   } | ||||
| 
 | ||||
|   // This is only called for 16 col networks (not supported).
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SwapAdjacentQuads(D, Vec<D>) const { | ||||
|     HWY_ASSERT(0); | ||||
|   } | ||||
| 
 | ||||
|   // This is only called for 8 col networks (not supported).
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> OddEvenQuads(D, Vec<D>, Vec<D>) const { | ||||
|     HWY_ASSERT(0); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Anything order-related depends on the key traits *and* the order (see
 | ||||
| // FirstOfLanes). We cannot implement just one Compare function because Lt128
 | ||||
| // only compiles if the lane type is u64. Thus we need either overloaded
 | ||||
| // functions with a tag type, class specializations, or separate classes.
 | ||||
| // We avoid overloaded functions because we want all functions to be callable
 | ||||
| // from a SortTraits without per-function wrappers. Specializing would work, but
 | ||||
| // we are anyway going to specialize at a higher level.
 | ||||
| struct OrderAscending128 : public Key128 { | ||||
|   using Order = SortAscending; | ||||
| 
 | ||||
|   template <typename T> | ||||
|   HWY_INLINE bool Compare1(const T* a, const T* b) { | ||||
|     return (a[1] == b[1]) ? a[0] < b[0] : a[1] < b[1]; | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { | ||||
|     return Lt128(d, a, b); | ||||
|   } | ||||
| 
 | ||||
|   // Used by CompareTop
 | ||||
|   template <class V> | ||||
|   HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { | ||||
|     return Lt(a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { | ||||
|     return Min128(d, a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { | ||||
|     return Max128(d, a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, | ||||
|                                  TFromD<D>* HWY_RESTRICT buf) const { | ||||
|     const size_t N = Lanes(d); | ||||
|     Store(v, d, buf); | ||||
|     v = SetKey(d, buf + 0);  // result must be broadcasted
 | ||||
|     for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) { | ||||
|       v = First(d, v, SetKey(d, buf + i)); | ||||
|     } | ||||
|     return v; | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, | ||||
|                                 TFromD<D>* HWY_RESTRICT buf) const { | ||||
|     const size_t N = Lanes(d); | ||||
|     Store(v, d, buf); | ||||
|     v = SetKey(d, buf + 0);  // result must be broadcasted
 | ||||
|     for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) { | ||||
|       v = Last(d, v, SetKey(d, buf + i)); | ||||
|     } | ||||
|     return v; | ||||
|   } | ||||
| 
 | ||||
|   // Same as for regular lanes because 128-bit lanes are u64.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> FirstValue(D d) const { | ||||
|     return Set(d, hwy::LowestValue<TFromD<D> >()); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> LastValue(D d) const { | ||||
|     return Set(d, hwy::HighestValue<TFromD<D> >()); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct OrderDescending128 : public Key128 { | ||||
|   using Order = SortDescending; | ||||
| 
 | ||||
|   template <typename T> | ||||
|   HWY_INLINE bool Compare1(const T* a, const T* b) { | ||||
|     return (a[1] == b[1]) ? b[0] < a[0] : b[1] < a[1]; | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Mask<D> Compare(D d, Vec<D> a, Vec<D> b) const { | ||||
|     return Lt128(d, b, a); | ||||
|   } | ||||
| 
 | ||||
|   // Used by CompareTop
 | ||||
|   template <class V> | ||||
|   HWY_INLINE Mask<DFromV<V> > CompareLanes(V a, V b) const { | ||||
|     return Lt(b, a); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> First(D d, const Vec<D> a, const Vec<D> b) const { | ||||
|     return Max128(d, a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> Last(D d, const Vec<D> a, const Vec<D> b) const { | ||||
|     return Min128(d, a, b); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> FirstOfLanes(D d, Vec<D> v, | ||||
|                                  TFromD<D>* HWY_RESTRICT buf) const { | ||||
|     const size_t N = Lanes(d); | ||||
|     Store(v, d, buf); | ||||
|     v = SetKey(d, buf + 0);  // result must be broadcasted
 | ||||
|     for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) { | ||||
|       v = First(d, v, SetKey(d, buf + i)); | ||||
|     } | ||||
|     return v; | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> LastOfLanes(D d, Vec<D> v, | ||||
|                                 TFromD<D>* HWY_RESTRICT buf) const { | ||||
|     const size_t N = Lanes(d); | ||||
|     Store(v, d, buf); | ||||
|     v = SetKey(d, buf + 0);  // result must be broadcasted
 | ||||
|     for (size_t i = LanesPerKey(); i < N; i += LanesPerKey()) { | ||||
|       v = Last(d, v, SetKey(d, buf + i)); | ||||
|     } | ||||
|     return v; | ||||
|   } | ||||
| 
 | ||||
|   // Same as for regular lanes because 128-bit lanes are u64.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> FirstValue(D d) const { | ||||
|     return Set(d, hwy::HighestValue<TFromD<D> >()); | ||||
|   } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> LastValue(D d) const { | ||||
|     return Set(d, hwy::LowestValue<TFromD<D> >()); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Shared code that depends on Order.
 | ||||
| template <class Base> | ||||
| class Traits128 : public Base { | ||||
| #if HWY_TARGET <= HWY_AVX2 | ||||
|   // Returns vector with only the top u64 lane valid. Useful when the next step
 | ||||
|   // is to replicate the mask anyway.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE HWY_MAYBE_UNUSED Vec<D> CompareTop(D d, Vec<D> a, Vec<D> b) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     const Vec<D> eqHL = VecFromMask(d, Eq(a, b)); | ||||
|     const Vec<D> ltHL = VecFromMask(d, base->CompareLanes(a, b)); | ||||
|     const Vec<D> ltLX = ShiftLeftLanes<1>(ltHL); | ||||
|     return OrAnd(ltHL, eqHL, ltLX); | ||||
|   } | ||||
| 
 | ||||
|   // We want to swap 2 u128, i.e. 4 u64 lanes, based on the 0 or FF..FF mask in
 | ||||
|   // the most-significant of those lanes (the result of CompareTop), so
 | ||||
|   // replicate it 4x. Only called for >= 256-bit vectors.
 | ||||
|   template <class V> | ||||
|   HWY_INLINE V ReplicateTop4x(V v) const { | ||||
| #if HWY_TARGET <= HWY_AVX3 | ||||
|     return V{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; | ||||
| #else  // AVX2
 | ||||
|     return V{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(3, 3, 3, 3))}; | ||||
| #endif | ||||
|   } | ||||
| #endif | ||||
| 
 | ||||
|  public: | ||||
|   constexpr bool Is128() const { return true; } | ||||
| 
 | ||||
|   template <class D> | ||||
|   HWY_INLINE void Sort2(D d, Vec<D>& a, Vec<D>& b) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
| 
 | ||||
|     const Vec<D> a_copy = a; | ||||
|     const auto lt = base->Compare(d, a, b); | ||||
|     a = IfThenElse(lt, a, b); | ||||
|     b = IfThenElse(lt, b, a_copy); | ||||
|   } | ||||
| 
 | ||||
|   // Conditionally swaps even-numbered lanes with their odd-numbered neighbor.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SortPairsDistance1(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     Vec<D> swapped = base->ReverseKeys2(d, v); | ||||
| 
 | ||||
| #if HWY_TARGET <= HWY_AVX2 | ||||
|     const Vec<D> select = ReplicateTop4x(CompareTop(d, v, swapped)); | ||||
|     return IfVecThenElse(select, swapped, v); | ||||
| #else | ||||
|     Sort2(d, v, swapped); | ||||
|     return base->OddEvenKeys(swapped, v); | ||||
| #endif | ||||
|   } | ||||
| 
 | ||||
|   // Swaps with the vector formed by reversing contiguous groups of 4 keys.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SortPairsReverse4(D d, Vec<D> v) const { | ||||
|     const Base* base = static_cast<const Base*>(this); | ||||
|     Vec<D> swapped = base->ReverseKeys4(d, v); | ||||
| 
 | ||||
|     // Only specialize for AVX3 because this requires 512-bit vectors.
 | ||||
| #if HWY_TARGET <= HWY_AVX3 | ||||
|     const Vec512<uint64_t> outHx = CompareTop(d, v, swapped); | ||||
|     // Similar to ReplicateTop4x, we want to gang together 2 comparison results
 | ||||
|     // (4 lanes). They are not contiguous, so use permute to replicate 4x.
 | ||||
|     alignas(64) uint64_t kIndices[8] = {7, 7, 5, 5, 5, 5, 7, 7}; | ||||
|     const Vec512<uint64_t> select = | ||||
|         TableLookupLanes(outHx, SetTableIndices(d, kIndices)); | ||||
|     return IfVecThenElse(select, swapped, v); | ||||
| #else | ||||
|     Sort2(d, v, swapped); | ||||
|     return base->OddEvenPairs(d, swapped, v); | ||||
| #endif | ||||
|   } | ||||
| 
 | ||||
|   // Conditionally swaps lane 0 with 4, 1 with 5 etc.
 | ||||
|   template <class D> | ||||
|   HWY_INLINE Vec<D> SortPairsDistance4(D, Vec<D>) const { | ||||
|     // Only used by Merge16, which would require 2048 bit vectors (unsupported).
 | ||||
|     HWY_ASSERT(0); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| #endif  // HWY_TARGET != HWY_SCALAR
 | ||||
| 
 | ||||
| }  // namespace detail
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_TRAITS128_TOGGLE
 | ||||
							
								
								
									
										722
									
								
								third_party/highway/hwy/contrib/sort/vqsort-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										722
									
								
								third_party/highway/hwy/contrib/sort/vqsort-inl.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,722 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Normal include guard for target-independent parts
 | ||||
| #ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_ | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_ | ||||
| 
 | ||||
| // Makes it harder for adversaries to predict our sampling locations, at the
 | ||||
| // cost of 1-2% increased runtime.
 | ||||
| #ifndef VQSORT_SECURE_RNG | ||||
| #define VQSORT_SECURE_RNG 0 | ||||
| #endif | ||||
| 
 | ||||
| #if VQSORT_SECURE_RNG | ||||
| #include "third_party/absl/random/random.h" | ||||
| #endif | ||||
| 
 | ||||
| #include <string.h>  // memcpy | ||||
| 
 | ||||
| #include "hwy/cache_control.h"  // Prefetch | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h"  // Fill24Bytes | ||||
| 
 | ||||
| #if HWY_IS_MSAN | ||||
| #include <sanitizer/msan_interface.h> | ||||
| #endif | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_INL_H_
 | ||||
| 
 | ||||
| // Per-target
 | ||||
| #if defined(HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE) == \ | ||||
|     defined(HWY_TARGET_TOGGLE) | ||||
| #ifdef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE | ||||
| #undef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE | ||||
| #else | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE | ||||
| #endif | ||||
| 
 | ||||
| #include "hwy/contrib/sort/shared-inl.h" | ||||
| #include "hwy/contrib/sort/sorting_networks-inl.h" | ||||
| #include "hwy/highway.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| namespace detail { | ||||
| 
 | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
| 
 | ||||
// Exchanges the values stored at `a` and `b`.
template <typename T>
void Swap(T* a, T* b) {
  const T old_a = *a;
  *a = *b;
  *b = old_a;
}
| 
 | ||||
| // Scalar version of HeapSort (see below)
 | ||||
| template <class Traits, typename T> | ||||
| void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) { | ||||
|   if (num < 2) return; | ||||
| 
 | ||||
|   // Build heap.
 | ||||
|   for (size_t i = 1; i < num; i += 1) { | ||||
|     size_t j = i; | ||||
|     while (j != 0) { | ||||
|       const size_t idx_parent = ((j - 1) / 1 / 2); | ||||
|       if (!st.Compare1(keys + idx_parent, keys + j)) { | ||||
|         break; | ||||
|       } | ||||
|       Swap(keys + j, keys + idx_parent); | ||||
|       j = idx_parent; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   for (size_t i = num - 1; i != 0; i -= 1) { | ||||
|     // Swap root with last
 | ||||
|     Swap(keys + 0, keys + i); | ||||
| 
 | ||||
|     // Sift down the new root.
 | ||||
|     size_t j = 0; | ||||
|     while (j < i) { | ||||
|       const size_t left = 2 * j + 1; | ||||
|       const size_t right = 2 * j + 2; | ||||
|       if (left >= i) break; | ||||
|       size_t idx_larger = j; | ||||
|       if (st.Compare1(keys + j, keys + left)) { | ||||
|         idx_larger = left; | ||||
|       } | ||||
|       if (right < i && st.Compare1(keys + idx_larger, keys + right)) { | ||||
|         idx_larger = right; | ||||
|       } | ||||
|       if (idx_larger == j) break; | ||||
|       Swap(keys + j, keys + idx_larger); | ||||
|       j = idx_larger; | ||||
|     } | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| using Constants = hwy::SortConstants; | ||||
| 
 | ||||
| // ------------------------------ HeapSort
 | ||||
| 
 | ||||
| // Heapsort: O(1) space, O(N*logN) worst-case comparisons.
 | ||||
| // Based on LLVM sanitizer_common.h, licensed under Apache-2.0.
 | ||||
| template <class Traits, typename T> | ||||
| void HeapSort(Traits st, T* HWY_RESTRICT keys, const size_t num) { | ||||
|   constexpr size_t N1 = st.LanesPerKey(); | ||||
|   const FixedTag<T, N1> d; | ||||
| 
 | ||||
|   if (num < 2 * N1) return; | ||||
| 
 | ||||
|   // Build heap.
 | ||||
|   for (size_t i = N1; i < num; i += N1) { | ||||
|     size_t j = i; | ||||
|     while (j != 0) { | ||||
|       const size_t idx_parent = ((j - N1) / N1 / 2) * N1; | ||||
|       if (AllFalse(d, st.Compare(d, st.SetKey(d, keys + idx_parent), | ||||
|                                  st.SetKey(d, keys + j)))) { | ||||
|         break; | ||||
|       } | ||||
|       st.Swap(keys + j, keys + idx_parent); | ||||
|       j = idx_parent; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|   for (size_t i = num - N1; i != 0; i -= N1) { | ||||
|     // Swap root with last
 | ||||
|     st.Swap(keys + 0, keys + i); | ||||
| 
 | ||||
|     // Sift down the new root.
 | ||||
|     size_t j = 0; | ||||
|     while (j < i) { | ||||
|       const size_t left = 2 * j + N1; | ||||
|       const size_t right = 2 * j + 2 * N1; | ||||
|       if (left >= i) break; | ||||
|       size_t idx_larger = j; | ||||
|       const auto key_j = st.SetKey(d, keys + j); | ||||
|       if (AllTrue(d, st.Compare(d, key_j, st.SetKey(d, keys + left)))) { | ||||
|         idx_larger = left; | ||||
|       } | ||||
|       if (right < i && AllTrue(d, st.Compare(d, st.SetKey(d, keys + idx_larger), | ||||
|                                              st.SetKey(d, keys + right)))) { | ||||
|         idx_larger = right; | ||||
|       } | ||||
|       if (idx_larger == j) break; | ||||
|       st.Swap(keys + j, keys + idx_larger); | ||||
|       j = idx_larger; | ||||
|     } | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ BaseCase
 | ||||
| 
 | ||||
| // Sorts `keys` within the range [0, num) via sorting network.
 | ||||
| template <class D, class Traits, typename T> | ||||
| HWY_NOINLINE void BaseCase(D d, Traits st, T* HWY_RESTRICT keys, size_t num, | ||||
|                            T* HWY_RESTRICT buf) { | ||||
|   const size_t N = Lanes(d); | ||||
|   using V = decltype(Zero(d)); | ||||
| 
 | ||||
|   // _Nonzero32 requires num - 1 != 0.
 | ||||
|   if (HWY_UNLIKELY(num <= 1)) return; | ||||
| 
 | ||||
|   // Reshape into a matrix with kMaxRows rows, and columns limited by the
 | ||||
|   // 1D `num`, which is upper-bounded by the vector width (see BaseCaseNum).
 | ||||
|   const size_t num_pow2 = size_t{1} | ||||
|                           << (32 - Num0BitsAboveMS1Bit_Nonzero32( | ||||
|                                        static_cast<uint32_t>(num - 1))); | ||||
|   HWY_DASSERT(num <= num_pow2 && num_pow2 <= Constants::BaseCaseNum(N)); | ||||
|   const size_t cols = | ||||
|       HWY_MAX(st.LanesPerKey(), num_pow2 >> Constants::kMaxRowsLog2); | ||||
|   HWY_DASSERT(cols <= N); | ||||
| 
 | ||||
|   // Copy `keys` to `buf`.
 | ||||
|   size_t i; | ||||
|   for (i = 0; i + N <= num; i += N) { | ||||
|     Store(LoadU(d, keys + i), d, buf + i); | ||||
|   } | ||||
|   for (; i < num; ++i) { | ||||
|     buf[i] = keys[i]; | ||||
|   } | ||||
| 
 | ||||
|   // Fill with padding - last in sort order, not copied to keys.
 | ||||
|   const V kPadding = st.LastValue(d); | ||||
|   // Initialize an extra vector because SortingNetwork loads full vectors,
 | ||||
|   // which may exceed cols*kMaxRows.
 | ||||
|   for (; i < (cols * Constants::kMaxRows + N); i += N) { | ||||
|     StoreU(kPadding, d, buf + i); | ||||
|   } | ||||
| 
 | ||||
|   SortingNetwork(st, buf, cols); | ||||
| 
 | ||||
|   for (i = 0; i + N <= num; i += N) { | ||||
|     StoreU(Load(d, buf + i), d, keys + i); | ||||
|   } | ||||
|   for (; i < num; ++i) { | ||||
|     keys[i] = buf[i]; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Partition
 | ||||
| 
 | ||||
| // Consumes from `left` until a multiple of kUnroll*N remains.
 | ||||
| // Temporarily stores the right side into `buf`, then moves behind `right`.
 | ||||
| template <class D, class Traits, class T> | ||||
| HWY_NOINLINE void PartitionToMultipleOfUnroll(D d, Traits st, | ||||
|                                               T* HWY_RESTRICT keys, | ||||
|                                               size_t& left, size_t& right, | ||||
|                                               const Vec<D> pivot, | ||||
|                                               T* HWY_RESTRICT buf) { | ||||
|   constexpr size_t kUnroll = Constants::kPartitionUnroll; | ||||
|   const size_t N = Lanes(d); | ||||
|   size_t readL = left; | ||||
|   size_t bufR = 0; | ||||
|   const size_t num = right - left; | ||||
|   // Partition requires both a multiple of kUnroll*N and at least
 | ||||
|   // 2*kUnroll*N for the initial loads. If less, consume all here.
 | ||||
|   const size_t num_rem = | ||||
|       (num < 2 * kUnroll * N) ? num : (num & (kUnroll * N - 1)); | ||||
|   size_t i = 0; | ||||
|   for (; i + N <= num_rem; i += N) { | ||||
|     const Vec<D> vL = LoadU(d, keys + readL); | ||||
|     readL += N; | ||||
| 
 | ||||
|     const auto comp = st.Compare(d, pivot, vL); | ||||
|     left += CompressBlendedStore(vL, Not(comp), d, keys + left); | ||||
|     bufR += CompressStore(vL, comp, d, buf + bufR); | ||||
|   } | ||||
|   // Last iteration: only use valid lanes.
 | ||||
|   if (HWY_LIKELY(i != num_rem)) { | ||||
|     const auto mask = FirstN(d, num_rem - i); | ||||
|     const Vec<D> vL = LoadU(d, keys + readL); | ||||
| 
 | ||||
|     const auto comp = st.Compare(d, pivot, vL); | ||||
|     left += CompressBlendedStore(vL, AndNot(comp, mask), d, keys + left); | ||||
|     bufR += CompressStore(vL, And(comp, mask), d, buf + bufR); | ||||
|   } | ||||
| 
 | ||||
|   // MSAN seems not to understand CompressStore. buf[0, bufR) are valid.
 | ||||
| #if HWY_IS_MSAN | ||||
|   __msan_unpoison(buf, bufR * sizeof(T)); | ||||
| #endif | ||||
| 
 | ||||
|   // Everything we loaded was put into buf, or behind the new `left`, after
 | ||||
|   // which there is space for bufR items. First move items from `right` to
 | ||||
|   // `left` to free up space, then copy `buf` into the vacated `right`.
 | ||||
|   // A loop with masked loads from `buf` is insufficient - we would also need to
 | ||||
|   // mask from `right`. Combining a loop with memcpy for the remainders is
 | ||||
|   // slower than just memcpy, so we use that for simplicity.
 | ||||
|   right -= bufR; | ||||
|   memcpy(keys + left, keys + right, bufR * sizeof(T)); | ||||
|   memcpy(keys + right, buf, bufR * sizeof(T)); | ||||
| } | ||||
| 
 | ||||
| template <class D, class Traits, typename T> | ||||
| HWY_INLINE void StoreLeftRight(D d, Traits st, const Vec<D> v, | ||||
|                                const Vec<D> pivot, T* HWY_RESTRICT keys, | ||||
|                                size_t& writeL, size_t& writeR) { | ||||
|   const size_t N = Lanes(d); | ||||
| 
 | ||||
|   const auto comp = st.Compare(d, pivot, v); | ||||
|   const size_t num_left = CompressBlendedStore(v, Not(comp), d, keys + writeL); | ||||
|   writeL += num_left; | ||||
| 
 | ||||
|   writeR -= (N - num_left); | ||||
|   (void)CompressBlendedStore(v, comp, d, keys + writeR); | ||||
| } | ||||
| 
 | ||||
// Convenience wrapper: calls StoreLeftRight for v0, v1, v2 and v3, in that
// order, with the same `pivot`, `keys`, `writeL` and `writeR`. Both write
// positions are passed by reference and are updated by each call.
| template <class D, class Traits, typename T> | ||||
| HWY_INLINE void StoreLeftRight4(D d, Traits st, const Vec<D> v0, | ||||
|                                 const Vec<D> v1, const Vec<D> v2, | ||||
|                                 const Vec<D> v3, const Vec<D> pivot, | ||||
|                                 T* HWY_RESTRICT keys, size_t& writeL, | ||||
|                                 size_t& writeR) { | ||||
|   StoreLeftRight(d, st, v0, pivot, keys, writeL, writeR); | ||||
|   StoreLeftRight(d, st, v1, pivot, keys, writeL, writeR); | ||||
|   StoreLeftRight(d, st, v2, pivot, keys, writeL, writeR); | ||||
|   StoreLeftRight(d, st, v3, pivot, keys, writeL, writeR); | ||||
| } | ||||
| 
 | ||||
| // Moves "<= pivot" keys to the front, and others to the back. pivot is
 | ||||
| // broadcasted. Time-critical!
 | ||||
| //
 | ||||
| // Aligned loads do not seem to be worthwhile (not bottlenecked by load ports).
 | ||||
//
// Operates in place on keys[left, right); `left` and `right` are taken by
// value, so the caller's indices are not modified. `buf` is scratch storage
// that is only handed on to PartitionToMultipleOfUnroll.
// Returns `writeL`: keys[left, result) are the keys for which
// st.Compare(d, pivot, key) is false, keys[result, right) are the others.
| template <class D, class Traits, typename T> | ||||
| HWY_NOINLINE size_t Partition(D d, Traits st, T* HWY_RESTRICT keys, size_t left, | ||||
|                               size_t right, const Vec<D> pivot, | ||||
|                               T* HWY_RESTRICT buf) { | ||||
|   using V = decltype(Zero(d)); | ||||
|   const size_t N = Lanes(d); | ||||
| 
 | ||||
|   // StoreLeftRight will CompressBlendedStore ending at `writeR`. Unless all
 | ||||
|   // lanes happen to be in the right-side partition, this will overrun `keys`,
 | ||||
|   // which triggers asan errors. Avoid by special-casing the last vector.
 | ||||
|   HWY_DASSERT(right - left > 2 * N);  // ensured by HandleSpecialCases
 | ||||
  // Set the last vector aside in a register; it is partitioned at the very end.
|   right -= N; | ||||
|   const size_t last = right; | ||||
|   const V vlast = LoadU(d, keys + last); | ||||
| 
 | ||||
  // `left` and `right` are passed by reference here and come back narrowed to
  // the part of the range that still has to be partitioned.
|   PartitionToMultipleOfUnroll(d, st, keys, left, right, pivot, buf); | ||||
|   constexpr size_t kUnroll = Constants::kPartitionUnroll; | ||||
| 
 | ||||
|   // Invariant: [left, writeL) and [writeR, right) are already partitioned.
 | ||||
|   size_t writeL = left; | ||||
|   size_t writeR = right; | ||||
| 
 | ||||
|   const size_t num = right - left; | ||||
|   // Cannot load if there were fewer than 2 * kUnroll * N.
 | ||||
|   if (HWY_LIKELY(num != 0)) { | ||||
|     HWY_DASSERT(num >= 2 * kUnroll * N); | ||||
|     HWY_DASSERT((num & (kUnroll * N - 1)) == 0); | ||||
| 
 | ||||
|     // Make space for writing in-place by reading from left and right.
 | ||||
    // NOTE(review): the four explicit loads per side presumably rely on
    // kUnroll (Constants::kPartitionUnroll) being 4 - confirm.
|     const V vL0 = LoadU(d, keys + left + 0 * N); | ||||
|     const V vL1 = LoadU(d, keys + left + 1 * N); | ||||
|     const V vL2 = LoadU(d, keys + left + 2 * N); | ||||
|     const V vL3 = LoadU(d, keys + left + 3 * N); | ||||
|     left += kUnroll * N; | ||||
|     right -= kUnroll * N; | ||||
|     const V vR0 = LoadU(d, keys + right + 0 * N); | ||||
|     const V vR1 = LoadU(d, keys + right + 1 * N); | ||||
|     const V vR2 = LoadU(d, keys + right + 2 * N); | ||||
|     const V vR3 = LoadU(d, keys + right + 3 * N); | ||||
| 
 | ||||
|     // The left/right updates may consume all inputs, so check before the loop.
 | ||||
    // From here on, [left, right) is the not-yet-read middle of the range.
|     while (left != right) { | ||||
|       V v0, v1, v2, v3; | ||||
| 
 | ||||
|       // Free up capacity for writing by loading from the side that has less.
 | ||||
|       // Data-dependent but branching is faster than forcing branch-free.
 | ||||
      // capacityL/capacityR: lanes already read but not yet overwritten on
      // the respective side.
|       const size_t capacityL = left - writeL; | ||||
|       const size_t capacityR = writeR - right; | ||||
|       HWY_DASSERT(capacityL <= num && capacityR <= num);  // >= 0
 | ||||
|       if (capacityR < capacityL) { | ||||
|         right -= kUnroll * N; | ||||
|         v0 = LoadU(d, keys + right + 0 * N); | ||||
|         v1 = LoadU(d, keys + right + 1 * N); | ||||
|         v2 = LoadU(d, keys + right + 2 * N); | ||||
|         v3 = LoadU(d, keys + right + 3 * N); | ||||
|         hwy::Prefetch(keys + right - 3 * kUnroll * N); | ||||
|       } else { | ||||
|         v0 = LoadU(d, keys + left + 0 * N); | ||||
|         v1 = LoadU(d, keys + left + 1 * N); | ||||
|         v2 = LoadU(d, keys + left + 2 * N); | ||||
|         v3 = LoadU(d, keys + left + 3 * N); | ||||
|         left += kUnroll * N; | ||||
|         hwy::Prefetch(keys + left + 3 * kUnroll * N); | ||||
|       } | ||||
| 
 | ||||
|       StoreLeftRight4(d, st, v0, v1, v2, v3, pivot, keys, writeL, writeR); | ||||
|     } | ||||
| 
 | ||||
|     // Now finish writing the initial left/right to the middle.
 | ||||
|     StoreLeftRight4(d, st, vL0, vL1, vL2, vL3, pivot, keys, writeL, writeR); | ||||
|     StoreLeftRight4(d, st, vR0, vR1, vR2, vR3, pivot, keys, writeL, writeR); | ||||
|   } | ||||
| 
 | ||||
|   // We have partitioned [left, right) such that writeL is the boundary.
 | ||||
|   HWY_DASSERT(writeL == writeR); | ||||
|   // Make space for inserting vlast: move up to N of the first right-side keys
 | ||||
|   // into the unused space starting at last. If we have fewer, ensure they are
 | ||||
|   // the last items in that vector by subtracting from the *load* address,
 | ||||
|   // which is safe because we have at least two vectors (checked above).
 | ||||
  // totalR: number of right-side keys currently stored in [writeL, last).
|   const size_t totalR = last - writeL; | ||||
|   const size_t startR = totalR < N ? writeL + totalR - N : writeL; | ||||
|   StoreU(LoadU(d, keys + startR), d, keys + last); | ||||
| 
 | ||||
|   // Partition vlast: write L, then R, into the single-vector gap at writeL.
 | ||||
|   const auto comp = st.Compare(d, pivot, vlast); | ||||
|   writeL += CompressBlendedStore(vlast, Not(comp), d, keys + writeL); | ||||
|   (void)CompressBlendedStore(vlast, comp, d, keys + writeL); | ||||
| 
 | ||||
|   return writeL; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Pivot
 | ||||
| 
 | ||||
| template <class Traits, class V> | ||||
| HWY_INLINE V MedianOf3(Traits st, V v0, V v1, V v2) { | ||||
|   const DFromV<V> d; | ||||
|   // Slightly faster for 128-bit, apparently because not serially dependent.
 | ||||
|   if (st.Is128()) { | ||||
|     // Median = XOR-sum 'minus' the first and last. Calling First twice is
 | ||||
|     // slightly faster than Compare + 2 IfThenElse or even IfThenElse + XOR.
 | ||||
|     const auto sum = Xor(Xor(v0, v1), v2); | ||||
|     const auto first = st.First(d, st.First(d, v0, v1), v2); | ||||
|     const auto last = st.Last(d, st.Last(d, v0, v1), v2); | ||||
|     return Xor(Xor(sum, first), last); | ||||
|   } | ||||
|   st.Sort2(d, v0, v2); | ||||
|   v1 = st.Last(d, v0, v1); | ||||
|   v1 = st.First(d, v1, v2); | ||||
|   return v1; | ||||
| } | ||||
| 
 | ||||
| // Replaces triplets with their median and recurses until less than 3 keys
 | ||||
| // remain. Ignores leftover values (non-whole triplets)!
 | ||||
| template <class D, class Traits, typename T> | ||||
| Vec<D> RecursiveMedianOf3(D d, Traits st, T* HWY_RESTRICT keys, size_t num, | ||||
|                           T* HWY_RESTRICT buf) { | ||||
|   const size_t N = Lanes(d); | ||||
|   constexpr size_t N1 = st.LanesPerKey(); | ||||
| 
 | ||||
|   if (num < 3 * N1) return st.SetKey(d, keys); | ||||
| 
 | ||||
|   size_t read = 0; | ||||
|   size_t written = 0; | ||||
| 
 | ||||
|   // Triplets of vectors
 | ||||
|   for (; read + 3 * N <= num; read += 3 * N) { | ||||
|     const auto v0 = Load(d, keys + read + 0 * N); | ||||
|     const auto v1 = Load(d, keys + read + 1 * N); | ||||
|     const auto v2 = Load(d, keys + read + 2 * N); | ||||
|     Store(MedianOf3(st, v0, v1, v2), d, buf + written); | ||||
|     written += N; | ||||
|   } | ||||
| 
 | ||||
|   // Triplets of keys
 | ||||
|   for (; read + 3 * N1 <= num; read += 3 * N1) { | ||||
|     const auto v0 = st.SetKey(d, keys + read + 0 * N1); | ||||
|     const auto v1 = st.SetKey(d, keys + read + 1 * N1); | ||||
|     const auto v2 = st.SetKey(d, keys + read + 2 * N1); | ||||
|     StoreU(MedianOf3(st, v0, v1, v2), d, buf + written); | ||||
|     written += N1; | ||||
|   } | ||||
| 
 | ||||
|   // Tail recursion; swap buffers
 | ||||
|   return RecursiveMedianOf3(d, st, buf, written, keys); | ||||
| } | ||||
| 
 | ||||
// Random generator passed to ChoosePivot and Recurse. With VQSORT_SECURE_RNG
// this is absl::BitGen; otherwise the small generator class defined below.
| #if VQSORT_SECURE_RNG | ||||
| using Generator = absl::BitGen; | ||||
| #else | ||||
| // Based on https://github.com/numpy/numpy/issues/16313#issuecomment-641897028
 | ||||
// The state consists of four 64-bit words. operator() returns 64 bits per call.
| #pragma pack(push, 1) | ||||
| class Generator { | ||||
|  public: | ||||
  // Seeds the generator: Sorter::Fill24Bytes receives the address of a_ and
  // the (heap, num) arguments as seed inputs.
  // NOTE(review): this presumably relies on a_, b_ and w_ being laid out
  // contiguously in declaration order (24 bytes in total) - confirm.
|   Generator(const void* heap, size_t num) { | ||||
|     Sorter::Fill24Bytes(heap, num, &a_); | ||||
|     k_ = 1;  // stream index: must be odd
 | ||||
|   } | ||||
| 
 | ||||
  // Advances the state and returns the next 64 random bits: the counter w_ is
  // incremented by k_, the result is a_ ^ w_, a_ is recomputed from the old
  // b_, and the new b_ is the old b_ rotated left by 24 bits plus the result.
|   uint64_t operator()() { | ||||
|     const uint64_t b = b_; | ||||
|     w_ += k_; | ||||
|     const uint64_t next = a_ ^ w_; | ||||
|     a_ = (b + (b << 3)) ^ (b >> 11); | ||||
|     const uint64_t rot = (b << 24) | (b >> 40); | ||||
|     b_ = rot + next; | ||||
|     return next; | ||||
|   } | ||||
| 
 | ||||
|  private: | ||||
  // Do not reorder: the constructor passes &a_ to Fill24Bytes.
|   uint64_t a_; | ||||
|   uint64_t b_; | ||||
|   uint64_t w_; | ||||
|   uint64_t k_;  // increment
 | ||||
| }; | ||||
| #pragma pack(pop) | ||||
| 
 | ||||
| #endif  // !VQSORT_SECURE_RNG
 | ||||
| 
 | ||||
| // Returns slightly biased random index of a chunk in [0, num_chunks).
 | ||||
| // See https://www.pcg-random.org/posts/bounded-rands.html.
 | ||||
| HWY_INLINE size_t RandomChunkIndex(const uint32_t num_chunks, uint32_t bits) { | ||||
|   const uint64_t chunk_index = (static_cast<uint64_t>(bits) * num_chunks) >> 32; | ||||
|   HWY_DASSERT(chunk_index < num_chunks); | ||||
|   return static_cast<size_t>(chunk_index); | ||||
| } | ||||
| 
 | ||||
| template <class D, class Traits, typename T> | ||||
| HWY_NOINLINE Vec<D> ChoosePivot(D d, Traits st, T* HWY_RESTRICT keys, | ||||
|                                 const size_t begin, const size_t end, | ||||
|                                 T* HWY_RESTRICT buf, Generator& rng) { | ||||
|   using V = decltype(Zero(d)); | ||||
|   const size_t N = Lanes(d); | ||||
| 
 | ||||
|   // Power of two
 | ||||
|   const size_t lanes_per_chunk = Constants::LanesPerChunk(sizeof(T), N); | ||||
| 
 | ||||
|   keys += begin; | ||||
|   size_t num = end - begin; | ||||
| 
 | ||||
|   // Align start of keys to chunks. We always have at least 2 chunks because the
 | ||||
|   // base case would have handled anything up to 16 vectors, i.e. >= 4 chunks.
 | ||||
|   HWY_DASSERT(num >= 2 * lanes_per_chunk); | ||||
|   const size_t misalign = | ||||
|       (reinterpret_cast<uintptr_t>(keys) / sizeof(T)) & (lanes_per_chunk - 1); | ||||
|   if (misalign != 0) { | ||||
|     const size_t consume = lanes_per_chunk - misalign; | ||||
|     keys += consume; | ||||
|     num -= consume; | ||||
|   } | ||||
| 
 | ||||
|   // Generate enough random bits for 9 uint32
 | ||||
|   uint64_t* bits64 = reinterpret_cast<uint64_t*>(buf); | ||||
|   for (size_t i = 0; i < 5; ++i) { | ||||
|     bits64[i] = rng(); | ||||
|   } | ||||
|   const uint32_t* bits = reinterpret_cast<const uint32_t*>(buf); | ||||
| 
 | ||||
|   const uint32_t lpc32 = static_cast<uint32_t>(lanes_per_chunk); | ||||
|   // Avoid division
 | ||||
|   const size_t log2_lpc = Num0BitsBelowLS1Bit_Nonzero32(lpc32); | ||||
|   const size_t num_chunks64 = num >> log2_lpc; | ||||
|   // Clamp to uint32 for RandomChunkIndex
 | ||||
|   const uint32_t num_chunks = | ||||
|       static_cast<uint32_t>(HWY_MIN(num_chunks64, 0xFFFFFFFFull)); | ||||
| 
 | ||||
|   const size_t offset0 = RandomChunkIndex(num_chunks, bits[0]) << log2_lpc; | ||||
|   const size_t offset1 = RandomChunkIndex(num_chunks, bits[1]) << log2_lpc; | ||||
|   const size_t offset2 = RandomChunkIndex(num_chunks, bits[2]) << log2_lpc; | ||||
|   const size_t offset3 = RandomChunkIndex(num_chunks, bits[3]) << log2_lpc; | ||||
|   const size_t offset4 = RandomChunkIndex(num_chunks, bits[4]) << log2_lpc; | ||||
|   const size_t offset5 = RandomChunkIndex(num_chunks, bits[5]) << log2_lpc; | ||||
|   const size_t offset6 = RandomChunkIndex(num_chunks, bits[6]) << log2_lpc; | ||||
|   const size_t offset7 = RandomChunkIndex(num_chunks, bits[7]) << log2_lpc; | ||||
|   const size_t offset8 = RandomChunkIndex(num_chunks, bits[8]) << log2_lpc; | ||||
|   for (size_t i = 0; i < lanes_per_chunk; i += N) { | ||||
|     const V v0 = Load(d, keys + offset0 + i); | ||||
|     const V v1 = Load(d, keys + offset1 + i); | ||||
|     const V v2 = Load(d, keys + offset2 + i); | ||||
|     const V medians0 = MedianOf3(st, v0, v1, v2); | ||||
|     Store(medians0, d, buf + i); | ||||
| 
 | ||||
|     const V v3 = Load(d, keys + offset3 + i); | ||||
|     const V v4 = Load(d, keys + offset4 + i); | ||||
|     const V v5 = Load(d, keys + offset5 + i); | ||||
|     const V medians1 = MedianOf3(st, v3, v4, v5); | ||||
|     Store(medians1, d, buf + i + lanes_per_chunk); | ||||
| 
 | ||||
|     const V v6 = Load(d, keys + offset6 + i); | ||||
|     const V v7 = Load(d, keys + offset7 + i); | ||||
|     const V v8 = Load(d, keys + offset8 + i); | ||||
|     const V medians2 = MedianOf3(st, v6, v7, v8); | ||||
|     Store(medians2, d, buf + i + lanes_per_chunk * 2); | ||||
|   } | ||||
| 
 | ||||
|   return RecursiveMedianOf3(d, st, buf, 3 * lanes_per_chunk, | ||||
|                             buf + 3 * lanes_per_chunk); | ||||
| } | ||||
| 
 | ||||
| // Compute exact min/max to detect all-equal partitions. Only called after a
 | ||||
| // degenerate Partition (none in the right partition).
 | ||||
| template <class D, class Traits, typename T> | ||||
| HWY_NOINLINE void ScanMinMax(D d, Traits st, const T* HWY_RESTRICT keys, | ||||
|                              size_t num, T* HWY_RESTRICT buf, Vec<D>& first, | ||||
|                              Vec<D>& last) { | ||||
|   const size_t N = Lanes(d); | ||||
| 
 | ||||
|   first = st.LastValue(d); | ||||
|   last = st.FirstValue(d); | ||||
| 
 | ||||
|   size_t i = 0; | ||||
|   for (; i + N <= num; i += N) { | ||||
|     const Vec<D> v = LoadU(d, keys + i); | ||||
|     first = st.First(d, v, first); | ||||
|     last = st.Last(d, v, last); | ||||
|   } | ||||
|   if (HWY_LIKELY(i != num)) { | ||||
|     HWY_DASSERT(num >= N);  // See HandleSpecialCases
 | ||||
|     const Vec<D> v = LoadU(d, keys + num - N); | ||||
|     first = st.First(d, v, first); | ||||
|     last = st.Last(d, v, last); | ||||
|   } | ||||
| 
 | ||||
|   first = st.FirstOfLanes(d, first, buf); | ||||
|   last = st.LastOfLanes(d, last, buf); | ||||
| } | ||||
| 
 | ||||
| template <class D, class Traits, typename T> | ||||
| void Recurse(D d, Traits st, T* HWY_RESTRICT keys, const size_t begin, | ||||
|              const size_t end, const Vec<D> pivot, T* HWY_RESTRICT buf, | ||||
|              Generator& rng, size_t remaining_levels) { | ||||
|   HWY_DASSERT(begin + 1 < end); | ||||
|   const size_t num = end - begin;  // >= 2
 | ||||
| 
 | ||||
|   // Too many degenerate partitions. This is extremely unlikely to happen
 | ||||
|   // because we select pivots from large (though still O(1)) samples.
 | ||||
|   if (HWY_UNLIKELY(remaining_levels == 0)) { | ||||
|     HeapSort(st, keys + begin, num);  // Slow but N*logN.
 | ||||
|     return; | ||||
|   } | ||||
| 
 | ||||
|   const ptrdiff_t base_case_num = | ||||
|       static_cast<ptrdiff_t>(Constants::BaseCaseNum(Lanes(d))); | ||||
|   const size_t bound = Partition(d, st, keys, begin, end, pivot, buf); | ||||
| 
 | ||||
|   const ptrdiff_t num_left = | ||||
|       static_cast<ptrdiff_t>(bound) - static_cast<ptrdiff_t>(begin); | ||||
|   const ptrdiff_t num_right = | ||||
|       static_cast<ptrdiff_t>(end) - static_cast<ptrdiff_t>(bound); | ||||
| 
 | ||||
|   // Check for degenerate partitions (i.e. Partition did not move any keys):
 | ||||
|   if (HWY_UNLIKELY(num_right == 0)) { | ||||
|     // Because the pivot is one of the keys, it must have been equal to the
 | ||||
|     // first or last key in sort order. Scan for the actual min/max:
 | ||||
|     // passing the current pivot as the new bound is insufficient because one of
 | ||||
|     // the partitions might not actually include that key.
 | ||||
|     Vec<D> first, last; | ||||
|     ScanMinMax(d, st, keys + begin, num, buf, first, last); | ||||
|     if (AllTrue(d, Eq(first, last))) return; | ||||
| 
 | ||||
|     // Separate recursion to make sure that we don't pick `last` as the
 | ||||
|     // pivot - that would again lead to a degenerate partition.
 | ||||
|     Recurse(d, st, keys, begin, end, first, buf, rng, remaining_levels - 1); | ||||
|     return; | ||||
|   } | ||||
| 
 | ||||
|   if (HWY_UNLIKELY(num_left <= base_case_num)) { | ||||
|     BaseCase(d, st, keys + begin, static_cast<size_t>(num_left), buf); | ||||
|   } else { | ||||
|     const Vec<D> next_pivot = ChoosePivot(d, st, keys, begin, bound, buf, rng); | ||||
|     Recurse(d, st, keys, begin, bound, next_pivot, buf, rng, | ||||
|             remaining_levels - 1); | ||||
|   } | ||||
|   if (HWY_UNLIKELY(num_right <= base_case_num)) { | ||||
|     BaseCase(d, st, keys + bound, static_cast<size_t>(num_right), buf); | ||||
|   } else { | ||||
|     const Vec<D> next_pivot = ChoosePivot(d, st, keys, bound, end, buf, rng); | ||||
|     Recurse(d, st, keys, bound, end, next_pivot, buf, rng, | ||||
|             remaining_levels - 1); | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| // Returns true if sorting is finished.
 | ||||
| template <class D, class Traits, typename T> | ||||
| bool HandleSpecialCases(D d, Traits st, T* HWY_RESTRICT keys, size_t num, | ||||
|                         T* HWY_RESTRICT buf) { | ||||
|   const size_t N = Lanes(d); | ||||
|   const size_t base_case_num = Constants::BaseCaseNum(N); | ||||
| 
 | ||||
|   // 128-bit keys require vectors with at least two u64 lanes, which is always
 | ||||
|   // the case unless `d` requests partial vectors (e.g. fraction = 1/2) AND the
 | ||||
|   // hardware vector width is less than 128bit / fraction.
 | ||||
|   const bool partial_128 = N < 2 && st.Is128(); | ||||
|   // Partition assumes its input is at least two vectors. If vectors are huge,
 | ||||
|   // base_case_num may actually be smaller. If so, which is only possible on
 | ||||
|   // RVV, pass a capped or partial d (LMUL < 1).
 | ||||
|   constexpr bool kPotentiallyHuge = | ||||
|       HWY_MAX_BYTES / sizeof(T) > Constants::kMaxRows * Constants::kMaxCols; | ||||
|   const bool huge_vec = kPotentiallyHuge && (2 * N > base_case_num); | ||||
|   if (partial_128 || huge_vec) { | ||||
|     // PERFORMANCE WARNING: falling back to HeapSort.
 | ||||
|     HeapSort(st, keys, num); | ||||
|     return true; | ||||
|   } | ||||
| 
 | ||||
|   // Small arrays: use sorting network, no need for other checks.
 | ||||
|   if (HWY_UNLIKELY(num <= base_case_num)) { | ||||
|     BaseCase(d, st, keys, num, buf); | ||||
|     return true; | ||||
|   } | ||||
| 
 | ||||
|   // We could also check for already sorted/reverse/equal, but that's probably
 | ||||
|   // counterproductive if vqsort is used as a base case.
 | ||||
| 
 | ||||
|   return false;  // not finished sorting
 | ||||
| } | ||||
| 
 | ||||
| #endif  // HWY_TARGET != HWY_SCALAR
 | ||||
| }  // namespace detail
 | ||||
| 
 | ||||
| // Sorts `keys[0..num-1]` according to the order defined by `st.Compare`.
 | ||||
| // In-place i.e. O(1) additional storage. Worst-case N*logN comparisons.
 | ||||
| // Non-stable (order of equal keys may change), except for the common case where
 | ||||
| // the upper bits of T are the key, and the lower bits are a sequential or at
 | ||||
| // least unique ID.
 | ||||
| // There is no upper limit on `num`, but note that pivots may be chosen by
 | ||||
| // sampling only from the first 256 GiB.
 | ||||
| //
 | ||||
| // `d` is typically SortTag<T> (chooses between full and partial vectors).
 | ||||
| // `st` is SharedTraits<{LaneTraits|Traits128}<Order*>>. This abstraction layer
 | ||||
| //   bridges differences in sort order and single-lane vs 128-bit keys.
 | ||||
| template <class D, class Traits, typename T> | ||||
| void Sort(D d, Traits st, T* HWY_RESTRICT keys, size_t num, | ||||
|           T* HWY_RESTRICT buf) { | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
|   (void)d; | ||||
|   (void)buf; | ||||
|   // PERFORMANCE WARNING: vqsort is not enabled for the non-SIMD target
 | ||||
|   return detail::HeapSort(st, keys, num); | ||||
| #else | ||||
|   if (detail::HandleSpecialCases(d, st, keys, num, buf)) return; | ||||
| 
 | ||||
| #if HWY_MAX_BYTES > 64 | ||||
|   // sorting_networks-inl and traits assume no more than 512 bit vectors.
 | ||||
|   if (Lanes(d) > 64 / sizeof(T)) { | ||||
|     return Sort(CappedTag<T, 64 / sizeof(T)>(), st, keys, num, buf); | ||||
|   } | ||||
| #endif  // HWY_MAX_BYTES > 64
 | ||||
| 
 | ||||
|   // Pulled out of the recursion so we can special-case degenerate partitions.
 | ||||
|   detail::Generator rng(keys, num); | ||||
|   const Vec<D> pivot = detail::ChoosePivot(d, st, keys, 0, num, buf, rng); | ||||
| 
 | ||||
|   // Introspection: switch to worst-case N*logN heapsort after this many.
 | ||||
|   const size_t max_levels = 2 * hwy::CeilLog2(num) + 4; | ||||
| 
 | ||||
|   detail::Recurse(d, st, keys, 0, num, pivot, buf, rng, max_levels); | ||||
| #endif  // HWY_TARGET == HWY_SCALAR
 | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_TOGGLE
 | ||||
							
								
								
									
										148
									
								
								third_party/highway/hwy/contrib/sort/vqsort.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										148
									
								
								third_party/highway/hwy/contrib/sort/vqsort.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,148 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #include <string.h>  // memset
 | ||||
| 
 | ||||
| #include "hwy/aligned_allocator.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/shared-inl.h" | ||||
| 
 | ||||
| // Seed source for SFC generator: 1=getrandom, 2=CryptGenRandom
 | ||||
| // (not all Android support the getrandom wrapper)
 | ||||
| #ifndef VQSORT_SECURE_SEED | ||||
| 
 | ||||
| #if (defined(linux) || defined(__linux__)) && \ | ||||
|     !(defined(ANDROID) || defined(__ANDROID__) || HWY_ARCH_RVV) | ||||
| #define VQSORT_SECURE_SEED 1 | ||||
| #elif defined(_WIN32) || defined(_WIN64) | ||||
| #define VQSORT_SECURE_SEED 2 | ||||
| #else | ||||
| #define VQSORT_SECURE_SEED 0 | ||||
| #endif | ||||
| 
 | ||||
| #endif  // VQSORT_SECURE_SEED
 | ||||
| 
 | ||||
| #if !VQSORT_SECURE_RNG | ||||
| 
 | ||||
| #include <time.h> | ||||
| #if VQSORT_SECURE_SEED == 1 | ||||
| #include <sys/random.h> | ||||
| #elif VQSORT_SECURE_SEED == 2 | ||||
| #include <windows.h> | ||||
| #pragma comment(lib, "Advapi32.lib") | ||||
| // Must come after windows.h.
 | ||||
| #include <wincrypt.h> | ||||
| #endif  // VQSORT_SECURE_SEED
 | ||||
| 
 | ||||
| #endif  // !VQSORT_SECURE_RNG
 | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| size_t VectorSize() { return Lanes(ScalableTag<uint8_t, 3>()); } | ||||
| bool HaveFloat64() { return HWY_HAVE_FLOAT64; } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(VectorSize); | ||||
| HWY_EXPORT(HaveFloat64); | ||||
| 
 | ||||
| HWY_INLINE size_t PivotBufNum(size_t sizeof_t, size_t N) { | ||||
|   // 3 chunks of medians, 1 chunk of median medians plus two padding vectors.
 | ||||
|   const size_t lpc = SortConstants::LanesPerChunk(sizeof_t, N); | ||||
|   return (3 + 1) * lpc + 2 * N; | ||||
| } | ||||
| 
 | ||||
| }  // namespace
 | ||||
| 
 | ||||
| Sorter::Sorter() { | ||||
|   // Determine the largest buffer size required for any type by trying them all.
 | ||||
|   // (The capping of N in BaseCaseNum means that smaller N but larger sizeof_t
 | ||||
|   // may require a larger buffer.)
 | ||||
|   const size_t vector_size = HWY_DYNAMIC_DISPATCH(VectorSize)(); | ||||
|   size_t max_bytes = 0; | ||||
|   for (size_t sizeof_t : | ||||
|        {sizeof(uint16_t), sizeof(uint32_t), sizeof(uint64_t)}) { | ||||
|     const size_t N = vector_size / sizeof_t; | ||||
|     // One extra for padding plus another for full-vector loads.
 | ||||
|     const size_t base_case = SortConstants::BaseCaseNum(N) + 2 * N; | ||||
|     const size_t partition_num = SortConstants::PartitionBufNum(N); | ||||
|     const size_t buf_lanes = | ||||
|         HWY_MAX(base_case, HWY_MAX(partition_num, PivotBufNum(sizeof_t, N))); | ||||
|     max_bytes = HWY_MAX(max_bytes, buf_lanes * sizeof_t); | ||||
|   } | ||||
| 
 | ||||
|   ptr_ = hwy::AllocateAlignedBytes(max_bytes, nullptr, nullptr); | ||||
| 
 | ||||
|   // Prevent msan errors by initializing.
 | ||||
|   memset(ptr_, 0, max_bytes); | ||||
| } | ||||
| 
 | ||||
| void Sorter::Delete() { | ||||
|   FreeAlignedBytes(ptr_, nullptr, nullptr); | ||||
|   ptr_ = nullptr; | ||||
| } | ||||
| 
 | ||||
| #if !VQSORT_SECURE_RNG | ||||
| 
 | ||||
| void Sorter::Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes) { | ||||
| #if VQSORT_SECURE_SEED == 1 | ||||
|   // May block if urandom is not yet initialized.
 | ||||
|   const ssize_t ret = getrandom(bytes, 24, /*flags=*/0); | ||||
|   if (ret == 24) return; | ||||
| #elif VQSORT_SECURE_SEED == 2 | ||||
|   HCRYPTPROV hProvider{}; | ||||
|   if (CryptAcquireContextA(&hProvider, nullptr, nullptr, PROV_RSA_FULL, | ||||
|                            CRYPT_VERIFYCONTEXT)) { | ||||
|     const BOOL ok = | ||||
|         CryptGenRandom(hProvider, 24, reinterpret_cast<BYTE*>(bytes)); | ||||
|     CryptReleaseContext(hProvider, 0); | ||||
|     if (ok) return; | ||||
|   } | ||||
| #endif | ||||
| 
 | ||||
|   // VQSORT_SECURE_SEED == 0, or one of the above failed. Get some entropy from
 | ||||
|   // stack/heap/code addresses and the clock() timer.
 | ||||
|   uint64_t* words = reinterpret_cast<uint64_t*>(bytes); | ||||
|   uint64_t** seed_stack = &words; | ||||
|   void (*seed_code)(const void*, size_t, void*) = &Fill24Bytes; | ||||
|   const uintptr_t bits_stack = reinterpret_cast<uintptr_t>(seed_stack); | ||||
|   const uintptr_t bits_heap = reinterpret_cast<uintptr_t>(seed_heap); | ||||
|   const uintptr_t bits_code = reinterpret_cast<uintptr_t>(seed_code); | ||||
|   const uint64_t bits_time = static_cast<uint64_t>(clock()); | ||||
|   words[0] = bits_stack ^ bits_time ^ seed_num; | ||||
|   words[1] = bits_heap ^ bits_time ^ seed_num; | ||||
|   words[2] = bits_code ^ bits_time ^ seed_num; | ||||
| } | ||||
| 
 | ||||
| #endif  // !VQSORT_SECURE_RNG
 | ||||
| 
 | ||||
| bool Sorter::HaveFloat64() { return HWY_DYNAMIC_DISPATCH(HaveFloat64)(); } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										104
									
								
								third_party/highway/hwy/contrib/sort/vqsort.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								third_party/highway/hwy/contrib/sort/vqsort.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,104 @@ | |||
| // Copyright 2022 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| // Interface to vectorized quicksort with dynamic dispatch.
 | ||||
| 
 | ||||
| #ifndef HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ | ||||
| #define HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_ | ||||
| 
 | ||||
| #include "hwy/base.h" | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
| // Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align it:
 | ||||
| // https://reviews.llvm.org/D86310
 | ||||
#pragma pack(push, 1)
// 128-bit key stored as two 64-bit halves, aligned to 16 bytes.
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};
#pragma pack(pop)

// The 128-bit sort entry points reinterpret an array of uint128_t as an array
// of uint64_t with twice as many elements, which is only valid if the struct
// is exactly two 64-bit lanes with no padding.
static_assert(sizeof(uint128_t) == 2 * sizeof(uint64_t),
              "uint128_t must consist of exactly two 64-bit lanes");
| 
 | ||||
| // Tag arguments that determine the sort order.
 | ||||
// Order tag: pass to Sorter::operator() to request ascending order.
struct SortAscending {
  // Always true for this tag.
  constexpr bool IsAscending() const {
    return true;
  }
};
// Order tag: pass to Sorter::operator() to request descending order.
struct SortDescending {
  // Always false for this tag.
  constexpr bool IsAscending() const {
    return false;
  }
};
| 
 | ||||
| // Allocates O(1) space. Type-erased RAII wrapper over hwy/aligned_allocator.h.
 | ||||
| // This allows amortizing the allocation over multiple sorts.
 | ||||
| class HWY_CONTRIB_DLLEXPORT Sorter { | ||||
|  public: | ||||
|   Sorter(); | ||||
|   ~Sorter() { Delete(); } | ||||
| 
 | ||||
|   // Move-only
 | ||||
|   Sorter(const Sorter&) = delete; | ||||
|   Sorter& operator=(const Sorter&) = delete; | ||||
|   Sorter(Sorter&& other) { | ||||
|     Delete(); | ||||
|     ptr_ = other.ptr_; | ||||
|     other.ptr_ = nullptr; | ||||
|   } | ||||
|   Sorter& operator=(Sorter&& other) { | ||||
|     Delete(); | ||||
|     ptr_ = other.ptr_; | ||||
|     other.ptr_ = nullptr; | ||||
|     return *this; | ||||
|   } | ||||
| 
 | ||||
|   // Sorts keys[0, n). Dispatches to the best available instruction set,
 | ||||
|   // and does not allocate memory.
 | ||||
|   void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(uint16_t* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
|   void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(uint32_t* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
|   void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(uint64_t* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
| 
 | ||||
|   void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(int16_t* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
|   void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(int32_t* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
|   void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(int64_t* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
| 
 | ||||
|   void operator()(float* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(float* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
|   void operator()(double* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(double* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
| 
 | ||||
|   void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortAscending) const; | ||||
|   void operator()(uint128_t* HWY_RESTRICT keys, size_t n, SortDescending) const; | ||||
| 
 | ||||
|   // For internal use only
 | ||||
|   static void Fill24Bytes(const void* seed_heap, size_t seed_num, void* bytes); | ||||
|   static bool HaveFloat64(); | ||||
| 
 | ||||
|  private: | ||||
|   void Delete(); | ||||
| 
 | ||||
|   template <typename T> | ||||
|   T* Get() const { | ||||
|     return static_cast<T*>(ptr_); | ||||
|   } | ||||
| 
 | ||||
|   void* ptr_ = nullptr; | ||||
| }; | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| #endif  // HIGHWAY_HWY_CONTRIB_SORT_VQSORT_H_
 | ||||
							
								
								
									
										55
									
								
								third_party/highway/hwy/contrib/sort/vqsort_128a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								third_party/highway/hwy/contrib/sort/vqsort_128a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,55 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits128-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void Sort128Asc(uint64_t* HWY_RESTRICT keys, size_t num, | ||||
|                 uint64_t* HWY_RESTRICT buf) { | ||||
|   SortTag<uint64_t> d; | ||||
|   detail::SharedTraits<detail::Traits128<detail::OrderAscending128>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(Sort128Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(Sort128Asc) | ||||
|   (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										55
									
								
								third_party/highway/hwy/contrib/sort/vqsort_128d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										55
									
								
								third_party/highway/hwy/contrib/sort/vqsort_128d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,55 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_128d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits128-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void Sort128Desc(uint64_t* HWY_RESTRICT keys, size_t num, | ||||
|                  uint64_t* HWY_RESTRICT buf) { | ||||
|   SortTag<uint64_t> d; | ||||
|   detail::SharedTraits<detail::Traits128<detail::OrderDescending128>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(Sort128Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(uint128_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(Sort128Desc) | ||||
|   (reinterpret_cast<uint64_t*>(keys), n * 2, Get<uint64_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										53
									
								
								third_party/highway/hwy/contrib/sort/vqsort_f32a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								third_party/highway/hwy/contrib/sort/vqsort_f32a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,53 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void SortF32Asc(float* HWY_RESTRICT keys, size_t num, float* HWY_RESTRICT buf) { | ||||
|   SortTag<float> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(SortF32Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(float* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortF32Asc)(keys, n, Get<float>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_f32d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_f32d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f32d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void SortF32Desc(float* HWY_RESTRICT keys, size_t num, | ||||
|                  float* HWY_RESTRICT buf) { | ||||
|   SortTag<float> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(SortF32Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(float* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortF32Desc)(keys, n, Get<float>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										61
									
								
								third_party/highway/hwy/contrib/sort/vqsort_f64a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								third_party/highway/hwy/contrib/sort/vqsort_f64a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,61 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void SortF64Asc(double* HWY_RESTRICT keys, size_t num, | ||||
|                 double* HWY_RESTRICT buf) { | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   SortTag<double> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| #else | ||||
|   (void)keys; | ||||
|   (void)num; | ||||
|   (void)buf; | ||||
|   HWY_ASSERT(0); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(SortF64Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(double* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortF64Asc)(keys, n, Get<double>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										61
									
								
								third_party/highway/hwy/contrib/sort/vqsort_f64d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								third_party/highway/hwy/contrib/sort/vqsort_f64d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,61 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_f64d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void SortF64Desc(double* HWY_RESTRICT keys, size_t num, | ||||
|                  double* HWY_RESTRICT buf) { | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   SortTag<double> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| #else | ||||
|   (void)keys; | ||||
|   (void)num; | ||||
|   (void)buf; | ||||
|   HWY_ASSERT(0); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(SortF64Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(double* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortF64Desc)(keys, n, Get<double>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										59
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i16a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i16a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,59 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| // Workaround for build timeout
 | ||||
| #if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void SortI16Asc(int16_t* HWY_RESTRICT keys, size_t num, | ||||
|                 int16_t* HWY_RESTRICT buf) { | ||||
|   SortTag<int16_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(SortI16Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortI16Asc)(keys, n, Get<int16_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
| 
 | ||||
| #endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
 | ||||
							
								
								
									
										59
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i16d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i16d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,59 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i16d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| // Workaround for build timeout
 | ||||
| #if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void SortI16Desc(int16_t* HWY_RESTRICT keys, size_t num, | ||||
|                  int16_t* HWY_RESTRICT buf) { | ||||
|   SortTag<int16_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(SortI16Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(int16_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortI16Desc)(keys, n, Get<int16_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
| 
 | ||||
| #endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i32a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i32a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| void SortI32Asc(int32_t* HWY_RESTRICT keys, size_t num, | ||||
|                 int32_t* HWY_RESTRICT buf) { | ||||
|   SortTag<int32_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
| namespace { | ||||
| HWY_EXPORT(SortI32Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
| void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortI32Asc)(keys, n, Get<int32_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i32d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i32d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i32d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<int32_t> and descending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortI32Desc(int32_t* HWY_RESTRICT keys, size_t num, | ||||
|                  int32_t* HWY_RESTRICT buf) { | ||||
|   SortTag<int32_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortI32Desc; the
// HWY_DYNAMIC_DISPATCH(SortI32Desc) call later in this file uses the same
// name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortI32Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for int32_t keys, selected by the SortDescending tag
// argument. Calls SortI32Desc through HWY_DYNAMIC_DISPATCH, passing
// Get<int32_t>() as the third (`buf`) argument.
| void Sorter::operator()(int32_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortI32Desc)(keys, n, Get<int32_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i64a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i64a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<int64_t> and ascending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortI64Asc(int64_t* HWY_RESTRICT keys, size_t num, | ||||
|                 int64_t* HWY_RESTRICT buf) { | ||||
|   SortTag<int64_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortI64Asc; the
// HWY_DYNAMIC_DISPATCH(SortI64Asc) call later in this file uses the same name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortI64Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for int64_t keys, selected by the SortAscending tag
// argument. Calls SortI64Asc through HWY_DYNAMIC_DISPATCH, passing
// Get<int64_t>() as the third (`buf`) argument.
| void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortI64Asc)(keys, n, Get<int64_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i64d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_i64d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_i64d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<int64_t> and descending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortI64Desc(int64_t* HWY_RESTRICT keys, size_t num, | ||||
|                  int64_t* HWY_RESTRICT buf) { | ||||
|   SortTag<int64_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortI64Desc; the
// HWY_DYNAMIC_DISPATCH(SortI64Desc) call later in this file uses the same
// name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortI64Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for int64_t keys, selected by the SortDescending tag
// argument. Calls SortI64Desc through HWY_DYNAMIC_DISPATCH, passing
// Get<int64_t>() as the third (`buf`) argument.
| void Sorter::operator()(int64_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortI64Desc)(keys, n, Get<int64_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										59
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u16a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u16a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,59 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| // Workaround for build timeout
 | ||||
| #if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<uint16_t> and ascending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// Only compiled when !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD; the enclosing
// #if is labelled in this file as a workaround for a build timeout.
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortU16Asc(uint16_t* HWY_RESTRICT keys, size_t num, | ||||
|                 uint16_t* HWY_RESTRICT buf) { | ||||
|   SortTag<uint16_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortU16Asc; the
// HWY_DYNAMIC_DISPATCH(SortU16Asc) call later in this file uses the same name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortU16Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for uint16_t keys, selected by the SortAscending tag
// argument. Calls SortU16Asc through HWY_DYNAMIC_DISPATCH, passing
// Get<uint16_t>() as the third (`buf`) argument.
// NOTE(review): this definition sits inside
// #if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD, so it is omitted when that
// condition is false; whether callers of the uint16_t overload are guarded the
// same way is not visible here - confirm against vqsort.h.
| void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortU16Asc)(keys, n, Get<uint16_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
| 
 | ||||
| #endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
 | ||||
							
								
								
									
										59
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u16d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										59
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u16d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,59 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u16d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| // Workaround for build timeout
 | ||||
| #if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<uint16_t> and descending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// Only compiled when !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD; the enclosing
// #if is labelled in this file as a workaround for a build timeout.
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortU16Desc(uint16_t* HWY_RESTRICT keys, size_t num, | ||||
|                  uint16_t* HWY_RESTRICT buf) { | ||||
|   SortTag<uint16_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortU16Desc; the
// HWY_DYNAMIC_DISPATCH(SortU16Desc) call later in this file uses the same
// name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortU16Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for uint16_t keys, selected by the SortDescending tag
// argument. Calls SortU16Desc through HWY_DYNAMIC_DISPATCH, passing
// Get<uint16_t>() as the third (`buf`) argument.
// NOTE(review): this definition sits inside
// #if !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD, so it is omitted when that
// condition is false; whether callers of the uint16_t overload are guarded the
// same way is not visible here - confirm against vqsort.h.
| void Sorter::operator()(uint16_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortU16Desc)(keys, n, Get<uint16_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
| 
 | ||||
| #endif  // !HWY_COMPILER_MSVC || HWY_IS_DEBUG_BUILD
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u32a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u32a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<uint32_t> and ascending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortU32Asc(uint32_t* HWY_RESTRICT keys, size_t num, | ||||
|                 uint32_t* HWY_RESTRICT buf) { | ||||
|   SortTag<uint32_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortU32Asc; the
// HWY_DYNAMIC_DISPATCH(SortU32Asc) call later in this file uses the same name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortU32Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for uint32_t keys, selected by the SortAscending tag
// argument. Calls SortU32Asc through HWY_DYNAMIC_DISPATCH, passing
// Get<uint32_t>() as the third (`buf`) argument.
| void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortU32Asc)(keys, n, Get<uint32_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u32d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u32d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u32d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<uint32_t> and descending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortU32Desc(uint32_t* HWY_RESTRICT keys, size_t num, | ||||
|                  uint32_t* HWY_RESTRICT buf) { | ||||
|   SortTag<uint32_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortU32Desc; the
// HWY_DYNAMIC_DISPATCH(SortU32Desc) call later in this file uses the same
// name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortU32Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for uint32_t keys, selected by the SortDescending tag
// argument. Calls SortU32Desc through HWY_DYNAMIC_DISPATCH, passing
// Get<uint32_t>() as the third (`buf`) argument.
| void Sorter::operator()(uint32_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortU32Desc)(keys, n, Get<uint32_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u64a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u64a.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64a.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<uint64_t> and ascending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortU64Asc(uint64_t* HWY_RESTRICT keys, size_t num, | ||||
|                 uint64_t* HWY_RESTRICT buf) { | ||||
|   SortTag<uint64_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderAscending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortU64Asc; the
// HWY_DYNAMIC_DISPATCH(SortU64Asc) call later in this file uses the same name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortU64Asc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for uint64_t keys, selected by the SortAscending tag
// argument. Calls SortU64Asc through HWY_DYNAMIC_DISPATCH, passing
// Get<uint64_t>() as the third (`buf`) argument.
| void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortAscending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortU64Asc)(keys, n, Get<uint64_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u64d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										54
									
								
								third_party/highway/hwy/contrib/sort/vqsort_u64d.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,54 @@ | |||
| // Copyright 2021 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include "hwy/contrib/sort/disabled_targets.h" | ||||
| #include "hwy/contrib/sort/vqsort.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "hwy/contrib/sort/vqsort_u64d.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // After foreach_target
 | ||||
| #include "hwy/contrib/sort/traits-inl.h" | ||||
| #include "hwy/contrib/sort/vqsort-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
// Target-specific implementation (defined inside namespace HWY_NAMESPACE):
// builds a SortTag<uint64_t> and descending-order lane traits, then forwards
// `keys`, `num` and `buf` unchanged to Sort().
// NOTE(review): `buf` looks like caller-provided working storage for Sort();
// its size/alignment requirements are not visible here - confirm in
// vqsort-inl.h.
| void SortU64Desc(uint64_t* HWY_RESTRICT keys, size_t num, | ||||
|                  uint64_t* HWY_RESTRICT buf) { | ||||
|   SortTag<uint64_t> d; | ||||
|   detail::SharedTraits<detail::LaneTraits<detail::OrderDescending>> st; | ||||
|   Sort(d, st, keys, num, buf); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| namespace hwy { | ||||
// File-local (unnamed namespace) HWY_EXPORT of SortU64Desc; the
// HWY_DYNAMIC_DISPATCH(SortU64Desc) call later in this file uses the same
// name.
// NOTE(review): presumably this declares the per-target function table -
// confirm against the Highway dispatch documentation.
| namespace { | ||||
| HWY_EXPORT(SortU64Desc); | ||||
| }  // namespace
 | ||||
| 
 | ||||
// Sorter overload for uint64_t keys, selected by the SortDescending tag
// argument. Calls SortU64Desc through HWY_DYNAMIC_DISPATCH, passing
// Get<uint64_t>() as the third (`buf`) argument.
| void Sorter::operator()(uint64_t* HWY_RESTRICT keys, size_t n, | ||||
|                         SortDescending) const { | ||||
|   HWY_DYNAMIC_DISPATCH(SortU64Desc)(keys, n, Get<uint64_t>()); | ||||
| } | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| #endif  // HWY_ONCE
 | ||||
							
								
								
									
										16
									
								
								third_party/highway/hwy/detect_compiler_arch.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										16
									
								
								third_party/highway/hwy/detect_compiler_arch.h
									
									
									
									
										vendored
									
									
								
							|  | @ -106,20 +106,6 @@ | |||
| //------------------------------------------------------------------------------
 | ||||
| // Architecture
 | ||||
| 
 | ||||
| #if defined(HWY_EMULATE_SVE) | ||||
| 
 | ||||
| #define HWY_ARCH_X86_32 0 | ||||
| #define HWY_ARCH_X86_64 0 | ||||
| #define HWY_ARCH_X86 0 | ||||
| #define HWY_ARCH_PPC 0 | ||||
| #define HWY_ARCH_ARM_A64 1 | ||||
| #define HWY_ARCH_ARM_V7 0 | ||||
| #define HWY_ARCH_ARM 1 | ||||
| #define HWY_ARCH_WASM 0 | ||||
| #define HWY_ARCH_RVV 0 | ||||
| 
 | ||||
| #else | ||||
| 
 | ||||
| #if defined(__i386__) || defined(_M_IX86) | ||||
| #define HWY_ARCH_X86_32 1 | ||||
| #else | ||||
|  | @ -182,8 +168,6 @@ | |||
| #define HWY_ARCH_RVV 0 | ||||
| #endif | ||||
| 
 | ||||
| #endif // defined(HWY_EMULATE_SVE)
 | ||||
| 
 | ||||
| // It is an error to detect multiple architectures at the same time, but OK to
 | ||||
| // detect none of the above.
 | ||||
| #if (HWY_ARCH_X86 + HWY_ARCH_PPC + HWY_ARCH_ARM + HWY_ARCH_WASM + \ | ||||
|  |  | |||
							
								
								
									
										9
									
								
								third_party/highway/hwy/detect_targets.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										9
									
								
								third_party/highway/hwy/detect_targets.h
									
									
									
									
										vendored
									
									
								
							|  | @ -161,11 +161,6 @@ | |||
| // user to override this without any guarantee of success.
 | ||||
| #ifndef HWY_BASELINE_TARGETS | ||||
| 
 | ||||
| #if defined(HWY_EMULATE_SVE) | ||||
| #define HWY_BASELINE_TARGETS HWY_SVE  // does not support SVE2
 | ||||
| #define HWY_BASELINE_AVX3_DL 0 | ||||
| #else | ||||
| 
 | ||||
| // Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
 | ||||
| // HWY_TARGET == HWY_SCALAR.
 | ||||
| 
 | ||||
|  | @ -186,7 +181,7 @@ | |||
| #define HWY_BASELINE_PPC8 0 | ||||
| #endif | ||||
| 
 | ||||
| // SVE compiles, but is not yet tested.
 | ||||
| // SVE2 compiles, but is not yet tested.
 | ||||
| #if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2) | ||||
| #define HWY_BASELINE_SVE2 HWY_SVE2 | ||||
| #else | ||||
|  | @ -307,8 +302,6 @@ | |||
|    HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 |              \ | ||||
|    HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV) | ||||
| 
 | ||||
| #endif  // HWY_EMULATE_SVE
 | ||||
| 
 | ||||
| #else | ||||
| // User already defined HWY_BASELINE_TARGETS, but we still need to define
 | ||||
| // HWY_BASELINE_AVX3 (matching user's definition) for HWY_CHECK_AVX3_DL.
 | ||||
|  |  | |||
							
								
								
									
										66
									
								
								third_party/highway/hwy/examples/benchmark.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										66
									
								
								third_party/highway/hwy/examples/benchmark.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -25,15 +25,17 @@ | |||
| #include <numeric>  // iota
 | ||||
| 
 | ||||
| #include "hwy/aligned_allocator.h" | ||||
| // Must come after foreach_target.h to avoid redefinition errors.
 | ||||
| #include "hwy/highway.h" | ||||
| #include "hwy/nanobenchmark.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| // These templates are not found via ADL.
 | ||||
| #if HWY_TARGET != HWY_SCALAR | ||||
| using hwy::HWY_NAMESPACE::CombineShiftRightBytes; | ||||
| using hwy::HWY_NAMESPACE::CombineShiftRightLanes; | ||||
| #endif | ||||
| 
 | ||||
| class TwoArray { | ||||
|  | @ -87,14 +89,14 @@ void RunBenchmark(const char* caption) { | |||
| } | ||||
| 
 | ||||
| void Intro() { | ||||
|   HWY_ALIGN const float in[16] = {1, 2, 3, 4, 5, 6}; | ||||
|   HWY_ALIGN float out[16]; | ||||
|   const float in[16] = {1, 2, 3, 4, 5, 6}; | ||||
|   float out[16]; | ||||
|   const ScalableTag<float> d;  // largest possible vector
 | ||||
|   for (size_t i = 0; i < 16; i += Lanes(d)) { | ||||
|     const auto vec = Load(d, in + i);  // aligned!
 | ||||
|     auto result = vec * vec; | ||||
|     result += result;  // can update if not const
 | ||||
|     Store(result, d, out + i); | ||||
|     const auto vec = LoadU(d, in + i);  // no alignment requirement
 | ||||
|     auto result = Mul(vec, vec); | ||||
|     result = Add(result, result);  // can update if not const
 | ||||
|     StoreU(result, d, out + i); | ||||
|   } | ||||
|   printf("\nF(x)->2*x^2, F(%.0f) = %.1f\n", in[2], out[2]); | ||||
| } | ||||
|  | @ -109,32 +111,34 @@ class BenchmarkDot : public TwoArray { | |||
|     const ScalableTag<float> d; | ||||
|     const size_t N = Lanes(d); | ||||
|     using V = decltype(Zero(d)); | ||||
|     constexpr size_t unroll = 8; | ||||
|     // Compiler doesn't make independent sum* accumulators, so unroll manually.
 | ||||
|     // Some older compilers might not be able to fit the 8 arrays in registers,
 | ||||
| // so manual unrolling can be helpful if you run into this issue.
 | ||||
|     // 2 FMA ports * 4 cycle latency = 8x unrolled.
 | ||||
|     V sum[unroll]; | ||||
|     for (size_t i = 0; i < unroll; ++i) { | ||||
|       sum[i] = Zero(d); | ||||
|     } | ||||
|     // We cannot use an array because V might be a sizeless type. For reasonable
 | ||||
|     // code, we unroll 4x, but 8x might help (2 FMA ports * 4 cycle latency).
 | ||||
|     V sum0 = Zero(d); | ||||
|     V sum1 = Zero(d); | ||||
|     V sum2 = Zero(d); | ||||
|     V sum3 = Zero(d); | ||||
|     const float* const HWY_RESTRICT pa = &a_[0]; | ||||
|     const float* const HWY_RESTRICT pb = b_; | ||||
|     for (size_t i = 0; i < num_items; i += unroll * N) { | ||||
|       for (size_t j = 0; j < unroll; ++j) { | ||||
|         const auto a = Load(d, pa + i + j * N); | ||||
|         const auto b = Load(d, pb + i + j * N); | ||||
|         sum[j] = MulAdd(a, b, sum[j]); | ||||
|     for (size_t i = 0; i < num_items; i += 4 * N) { | ||||
|       const auto a0 = Load(d, pa + i + 0 * N); | ||||
|       const auto b0 = Load(d, pb + i + 0 * N); | ||||
|       sum0 = MulAdd(a0, b0, sum0); | ||||
|       const auto a1 = Load(d, pa + i + 1 * N); | ||||
|       const auto b1 = Load(d, pb + i + 1 * N); | ||||
|       sum1 = MulAdd(a1, b1, sum1); | ||||
|       const auto a2 = Load(d, pa + i + 2 * N); | ||||
|       const auto b2 = Load(d, pb + i + 2 * N); | ||||
|       sum2 = MulAdd(a2, b2, sum2); | ||||
|       const auto a3 = Load(d, pa + i + 3 * N); | ||||
|       const auto b3 = Load(d, pb + i + 3 * N); | ||||
|       sum3 = MulAdd(a3, b3, sum3); | ||||
|     } | ||||
|     } | ||||
|     // Reduction tree: sum of all accumulators by pairs into sum[0], then the
 | ||||
|     // lanes.
 | ||||
|     for (size_t power = 1; power < unroll; power *= 2) { | ||||
|       for (size_t i = 0; i < unroll; i += 2 * power) { | ||||
|         sum[i] += sum[i + power]; | ||||
|       } | ||||
|     } | ||||
|     dot_ = GetLane(SumOfLanes(d, sum[0])); | ||||
|     // Reduction tree: sum of all accumulators by pairs into sum0.
 | ||||
|     sum0 = Add(sum0, sum1); | ||||
|     sum2 = Add(sum2, sum3); | ||||
|     sum0 = Add(sum0, sum2); | ||||
|     dot_ = GetLane(SumOfLanes(d, sum0)); | ||||
|     return static_cast<FuncOutput>(dot_); | ||||
|   } | ||||
|   void Verify(size_t num_items) { | ||||
|  | @ -193,9 +197,9 @@ struct BenchmarkDelta : public TwoArray { | |||
|     auto prev = Load(df, &a_[0]); | ||||
|     for (; i < num_items; i += Lanes(df)) { | ||||
|       const auto a = Load(df, &a_[i]); | ||||
|       const auto shifted = CombineShiftRightLanes<3>(a, prev); | ||||
|       const auto shifted = CombineShiftRightLanes<3>(df, a, prev); | ||||
|       prev = a; | ||||
|       Store(a - shifted, df, &b_[i]); | ||||
|       Store(Sub(a, shifted), df, &b_[i]); | ||||
|     } | ||||
| #endif | ||||
|     return static_cast<FuncOutput>(b_[num_items - 1]); | ||||
|  |  | |||
							
								
								
									
										22
									
								
								third_party/highway/hwy/examples/skeleton.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										22
									
								
								third_party/highway/hwy/examples/skeleton.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -24,11 +24,14 @@ | |||
| // Generates code for each enabled target by re-including this source file.
 | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // Must come after foreach_target.h to avoid redefinition errors.
 | ||||
| #include "hwy/highway.h" | ||||
| 
 | ||||
| // Optional, can instead add HWY_ATTR to all functions.
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace skeleton { | ||||
| // This namespace name is unique per target, which allows code for multiple
 | ||||
| // targets to co-exist in the same translation unit.
 | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| // Highway ops reside here; ADL does not find templates nor builtins.
 | ||||
|  | @ -47,7 +50,7 @@ template <class DF> | |||
| ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values, | ||||
|                             uint8_t* HWY_RESTRICT log2) { | ||||
|   // Type tags for converting to other element types (Rebind = same count).
 | ||||
|   const Rebind<int32_t, DF> d32; | ||||
|   const RebindToSigned<DF> d32; | ||||
|   const Rebind<uint8_t, DF> d8; | ||||
| 
 | ||||
|   const auto u8 = Load(d8, values); | ||||
|  | @ -59,7 +62,7 @@ ATTR_MSAN void OneFloorLog2(const DF df, const uint8_t* HWY_RESTRICT values, | |||
| void CodepathDemo() { | ||||
|   // Highway defaults to portability, but per-target codepaths may be selected
 | ||||
|   // via #if HWY_TARGET == HWY_SSE4 or by testing capability macros:
 | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   const char* gather = "Has int64"; | ||||
| #else | ||||
|   const char* gather = "No int64"; | ||||
|  | @ -71,20 +74,16 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count, | |||
|                uint8_t* HWY_RESTRICT log2) { | ||||
|   CodepathDemo(); | ||||
| 
 | ||||
|   // Second argument is necessary on RVV until it supports fractional lengths.
 | ||||
|   const ScalableTag<float, 2> df; | ||||
| 
 | ||||
|   const ScalableTag<float> df; | ||||
|   const size_t N = Lanes(df); | ||||
|   size_t i = 0; | ||||
|   for (; i + N <= count; i += N) { | ||||
|     OneFloorLog2(df, values + i, log2 + i); | ||||
|   } | ||||
|   // TODO(janwas): implement
 | ||||
| #if HWY_TARGET != HWY_RVV | ||||
|   for (; i < count; ++i) { | ||||
|     OneFloorLog2(HWY_CAPPED(float, 1)(), values + i, log2 + i); | ||||
|     CappedTag<float, 1> d1; | ||||
|     OneFloorLog2(d1, values + i, log2 + i); | ||||
|   } | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
|  | @ -92,6 +91,9 @@ void FloorLog2(const uint8_t* HWY_RESTRICT values, size_t count, | |||
| }  // namespace skeleton
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| // The table of pointers to the various implementations in HWY_NAMESPACE must
 | ||||
| // be compiled only once (foreach_target #includes this file multiple times).
 | ||||
| // HWY_ONCE is true for only one of these 'compilation passes'.
 | ||||
| #if HWY_ONCE | ||||
| 
 | ||||
| namespace skeleton { | ||||
|  | @ -105,6 +107,8 @@ HWY_EXPORT(FloorLog2); | |||
| // is equivalent to inlining this function.
 | ||||
| void CallFloorLog2(const uint8_t* HWY_RESTRICT in, const size_t count, | ||||
|                    uint8_t* HWY_RESTRICT out) { | ||||
|   // This must reside outside of HWY_NAMESPACE because it references (calls the
 | ||||
|   // appropriate one from) the per-target implementations there.
 | ||||
|   return HWY_DYNAMIC_DISPATCH(FloorLog2)(in, count, out); | ||||
| } | ||||
| 
 | ||||
|  |  | |||
|  | @ -21,10 +21,13 @@ | |||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "examples/skeleton_test.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| // Must come after foreach_target.h to avoid redefinition errors.
 | ||||
| #include "hwy/highway.h" | ||||
| #include "hwy/tests/test_util-inl.h" | ||||
| 
 | ||||
| // Optional: factor out parts of the implementation into *-inl.h
 | ||||
| // (must also come after foreach_target.h to avoid redefinition errors)
 | ||||
| #include "hwy/examples/skeleton-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
|  | @ -50,10 +53,7 @@ struct TestFloorLog2 { | |||
|     CallFloorLog2(in.get(), count, out.get()); | ||||
|     int sum = 0; | ||||
|     for (size_t i = 0; i < count; ++i) { | ||||
|       // TODO(janwas): implement
 | ||||
| #if HWY_TARGET != HWY_RVV | ||||
|       HWY_ASSERT_EQ(expected[i], out[i]); | ||||
| #endif | ||||
|       sum += out[i]; | ||||
|     } | ||||
|     hwy::PreventElision(sum); | ||||
|  |  | |||
							
								
								
									
										22
									
								
								third_party/highway/hwy/foreach_target.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										22
									
								
								third_party/highway/hwy/foreach_target.h
									
									
									
									
										vendored
									
									
								
							|  | @ -74,6 +74,28 @@ | |||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #if (HWY_TARGETS & HWY_SVE) && (HWY_STATIC_TARGET != HWY_SVE) | ||||
| #undef HWY_TARGET | ||||
| #define HWY_TARGET HWY_SVE | ||||
| #include HWY_TARGET_INCLUDE | ||||
| #ifdef HWY_TARGET_TOGGLE | ||||
| #undef HWY_TARGET_TOGGLE | ||||
| #else | ||||
| #define HWY_TARGET_TOGGLE | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #if (HWY_TARGETS & HWY_SVE2) && (HWY_STATIC_TARGET != HWY_SVE2) | ||||
| #undef HWY_TARGET | ||||
| #define HWY_TARGET HWY_SVE2 | ||||
| #include HWY_TARGET_INCLUDE | ||||
| #ifdef HWY_TARGET_TOGGLE | ||||
| #undef HWY_TARGET_TOGGLE | ||||
| #else | ||||
| #define HWY_TARGET_TOGGLE | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #if (HWY_TARGETS & HWY_SSSE3) && (HWY_STATIC_TARGET != HWY_SSSE3) | ||||
| #undef HWY_TARGET | ||||
| #define HWY_TARGET HWY_SSSE3 | ||||
|  |  | |||
							
								
								
									
										25
									
								
								third_party/highway/hwy/highway.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										25
									
								
								third_party/highway/hwy/highway.h
									
									
									
									
										vendored
									
									
								
							|  | @ -27,7 +27,7 @@ namespace hwy { | |||
| 
 | ||||
| // API version (https://semver.org/); keep in sync with CMakeLists.txt.
 | ||||
| #define HWY_MAJOR 0 | ||||
| #define HWY_MINOR 15 | ||||
| #define HWY_MINOR 16 | ||||
| #define HWY_PATCH 0 | ||||
| 
 | ||||
| //------------------------------------------------------------------------------
 | ||||
|  | @ -37,7 +37,9 @@ namespace hwy { | |||
| 
 | ||||
| // HWY_FULL(T[,LMUL=1]) is a native vector/group. LMUL is the number of
 | ||||
| // registers in the group, and is ignored on targets that do not support groups.
 | ||||
| #define HWY_FULL1(T) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)> | ||||
| #define HWY_FULL1(T) hwy::HWY_NAMESPACE::ScalableTag<T> | ||||
| #define HWY_FULL2(T, LMUL) \ | ||||
|   hwy::HWY_NAMESPACE::ScalableTag<T, CeilLog2(HWY_MAX(0, LMUL))> | ||||
| #define HWY_3TH_ARG(arg1, arg2, arg3, ...) arg3 | ||||
| // Workaround for MSVC grouping __VA_ARGS__ into a single argument
 | ||||
| #define HWY_FULL_RECOMPOSER(args_with_paren) HWY_3TH_ARG args_with_paren | ||||
|  | @ -46,9 +48,9 @@ namespace hwy { | |||
|   HWY_FULL_RECOMPOSER((__VA_ARGS__, HWY_FULL2, HWY_FULL1, )) | ||||
| #define HWY_FULL(...) HWY_CHOOSE_FULL(__VA_ARGS__())(__VA_ARGS__) | ||||
| 
 | ||||
| // Vector of up to MAX_N lanes. Discouraged, when possible, use Half<> instead.
 | ||||
| // Vector of up to MAX_N lanes. It's better to use full vectors where possible.
 | ||||
| #define HWY_CAPPED(T, MAX_N) \ | ||||
|   hwy::HWY_NAMESPACE::Simd<T, HWY_MIN(MAX_N, HWY_LANES(T))> | ||||
|   hwy::HWY_NAMESPACE::CappedTag<T, HWY_MIN(MAX_N, HWY_LANES(T))> | ||||
| 
 | ||||
| //------------------------------------------------------------------------------
 | ||||
| // Export user functions for static/dynamic dispatch
 | ||||
|  | @ -109,6 +111,7 @@ struct FunctionCache { | |||
|   template <FunctionType* const table[]> | ||||
|   static RetType ChooseAndCall(Args... args) { | ||||
|     // If we are running here it means we need to update the chosen target.
 | ||||
|     ChosenTarget& chosen_target = GetChosenTarget(); | ||||
|     chosen_target.Update(); | ||||
|     return (table[chosen_target.GetIndex()])(args...); | ||||
|   } | ||||
|  | @ -263,10 +266,15 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) { | |||
|           HWY_CHOOSE_SCALAR(FUNC_NAME),                                    \ | ||||
|   } | ||||
| #define HWY_DYNAMIC_DISPATCH(FUNC_NAME) \ | ||||
|   (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::chosen_target.GetIndex()])) | ||||
|   (*(HWY_DISPATCH_TABLE(FUNC_NAME)[hwy::GetChosenTarget().GetIndex()])) | ||||
| 
 | ||||
| #endif  // HWY_IDE || ((HWY_TARGETS & (HWY_TARGETS - 1)) == 0)
 | ||||
| 
 | ||||
| // DEPRECATED names; please use HWY_HAVE_* instead.
 | ||||
| #define HWY_CAP_INTEGER64 HWY_HAVE_INTEGER64 | ||||
| #define HWY_CAP_FLOAT16 HWY_HAVE_FLOAT16 | ||||
| #define HWY_CAP_FLOAT64 HWY_HAVE_FLOAT64 | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| #endif  // HWY_HIGHWAY_INCLUDED
 | ||||
|  | @ -283,13 +291,6 @@ FunctionCache<RetType, Args...> FunctionCacheFactory(RetType (*)(Args...)) { | |||
| #define HWY_HIGHWAY_PER_TARGET | ||||
| #endif | ||||
| 
 | ||||
| #undef HWY_FULL2 | ||||
| #if HWY_TARGET == HWY_RVV | ||||
| #define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T) * (LMUL)> | ||||
| #else | ||||
| #define HWY_FULL2(T, LMUL) hwy::HWY_NAMESPACE::Simd<T, HWY_LANES(T)> | ||||
| #endif | ||||
| 
 | ||||
| // These define ops inside namespace hwy::HWY_NAMESPACE.
 | ||||
| #if HWY_TARGET == HWY_SSSE3 || HWY_TARGET == HWY_SSE4 | ||||
| #include "hwy/ops/x86_128-inl.h" | ||||
|  |  | |||
							
								
								
									
										106
									
								
								third_party/highway/hwy/highway_export.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										106
									
								
								third_party/highway/hwy/highway_export.h
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,106 @@ | |||
| // Pseudo-generated file to handle both the cmake & bazel build systems.
 | ||||
| 
 | ||||
| // Initial generation done using cmake code:
 | ||||
| // include(GenerateExportHeader)
 | ||||
| // generate_export_header(hwy EXPORT_MACRO_NAME HWY_DLLEXPORT EXPORT_FILE_NAME
 | ||||
| // hwy/highway_export.h)
 | ||||
| // code reformatted using clang-format --style=Google
 | ||||
| 
 | ||||
| #ifndef HWY_DLLEXPORT_H | ||||
| #define HWY_DLLEXPORT_H | ||||
| 
 | ||||
| // Bazel builds are always static:
 | ||||
| #if !defined(HWY_SHARED_DEFINE) && !defined(HWY_STATIC_DEFINE) | ||||
| #define HWY_STATIC_DEFINE | ||||
| #endif | ||||
| 
 | ||||
| #ifdef HWY_STATIC_DEFINE | ||||
| #define HWY_DLLEXPORT | ||||
| #define HWY_NO_EXPORT | ||||
| #define HWY_CONTRIB_DLLEXPORT | ||||
| #define HWY_CONTRIB_NO_EXPORT | ||||
| #define HWY_TEST_DLLEXPORT | ||||
| #define HWY_TEST_NO_EXPORT | ||||
| #else | ||||
| 
 | ||||
| #ifndef HWY_DLLEXPORT | ||||
| #if defined(hwy_EXPORTS) | ||||
| /* We are building this library */ | ||||
| #ifdef _WIN32 | ||||
| #define HWY_DLLEXPORT __declspec(dllexport) | ||||
| #else | ||||
| #define HWY_DLLEXPORT __attribute__((visibility("default"))) | ||||
| #endif | ||||
| #else | ||||
| /* We are using this library */ | ||||
| #ifdef _WIN32 | ||||
| #define HWY_DLLEXPORT __declspec(dllimport) | ||||
| #else | ||||
| #define HWY_DLLEXPORT __attribute__((visibility("default"))) | ||||
| #endif | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #ifndef HWY_NO_EXPORT | ||||
| #ifdef _WIN32 | ||||
| #define HWY_NO_EXPORT | ||||
| #else | ||||
| #define HWY_NO_EXPORT __attribute__((visibility("hidden"))) | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #ifndef HWY_CONTRIB_DLLEXPORT | ||||
| #if defined(hwy_contrib_EXPORTS) | ||||
| /* We are building this library */ | ||||
| #ifdef _WIN32 | ||||
| #define HWY_CONTRIB_DLLEXPORT __declspec(dllexport) | ||||
| #else | ||||
| #define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default"))) | ||||
| #endif | ||||
| #else | ||||
| /* We are using this library */ | ||||
| #ifdef _WIN32 | ||||
| #define HWY_CONTRIB_DLLEXPORT __declspec(dllimport) | ||||
| #else | ||||
| #define HWY_CONTRIB_DLLEXPORT __attribute__((visibility("default"))) | ||||
| #endif | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #ifndef HWY_CONTRIB_NO_EXPORT | ||||
| #ifdef _WIN32 | ||||
| #define HWY_CONTRIB_NO_EXPORT | ||||
| #else | ||||
| #define HWY_CONTRIB_NO_EXPORT __attribute__((visibility("hidden"))) | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #ifndef HWY_TEST_DLLEXPORT | ||||
| #if defined(hwy_test_EXPORTS) | ||||
| /* We are building this library */ | ||||
| #ifdef _WIN32 | ||||
| #define HWY_TEST_DLLEXPORT __declspec(dllexport) | ||||
| #else | ||||
| #define HWY_TEST_DLLEXPORT __attribute__((visibility("default"))) | ||||
| #endif | ||||
| #else | ||||
| /* We are using this library */ | ||||
| #ifdef _WIN32 | ||||
| #define HWY_TEST_DLLEXPORT __declspec(dllimport) | ||||
| #else | ||||
| #define HWY_TEST_DLLEXPORT __attribute__((visibility("default"))) | ||||
| #endif | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #ifndef HWY_TEST_NO_EXPORT | ||||
| #ifdef _WIN32 | ||||
| #define HWY_TEST_NO_EXPORT | ||||
| #else | ||||
| #define HWY_TEST_NO_EXPORT __attribute__((visibility("hidden"))) | ||||
| #endif | ||||
| #endif | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| #endif /* HWY_DLLEXPORT_H */ | ||||
							
								
								
									
										50
									
								
								third_party/highway/hwy/highway_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										50
									
								
								third_party/highway/hwy/highway_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -15,6 +15,8 @@ | |||
| #include <stddef.h> | ||||
| #include <stdint.h> | ||||
| 
 | ||||
| #include <bitset> | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "highway_test.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
|  | @ -26,6 +28,53 @@ HWY_BEFORE_NAMESPACE(); | |||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| // For testing that ForPartialVectors reaches every possible size:
 | ||||
| using NumLanesSet = std::bitset<HWY_MAX_BYTES + 1>; | ||||
| 
 | ||||
| // Monostate pattern because ForPartialVectors takes a template argument, not a
 | ||||
| // functor by reference.
 | ||||
| static NumLanesSet* NumLanesForSize(size_t sizeof_t) { | ||||
|   HWY_ASSERT(sizeof_t <= sizeof(uint64_t)); | ||||
|   static NumLanesSet num_lanes[sizeof(uint64_t) + 1]; | ||||
|   return num_lanes + sizeof_t; | ||||
| } | ||||
| static size_t* MaxLanesForSize(size_t sizeof_t) { | ||||
|   HWY_ASSERT(sizeof_t <= sizeof(uint64_t)); | ||||
|   static size_t num_lanes[sizeof(uint64_t) + 1] = {0}; | ||||
|   return num_lanes + sizeof_t; | ||||
| } | ||||
| 
 | ||||
| struct TestMaxLanes { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     const size_t kMax = MaxLanes(d); | ||||
|     HWY_ASSERT(N <= kMax); | ||||
|     HWY_ASSERT(kMax <= (HWY_MAX_BYTES / sizeof(T))); | ||||
| 
 | ||||
|     NumLanesForSize(sizeof(T))->set(N); | ||||
|     *MaxLanesForSize(sizeof(T)) = HWY_MAX(*MaxLanesForSize(sizeof(T)), N); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllMaxLanes() { | ||||
|   ForAllTypes(ForPartialVectors<TestMaxLanes>()); | ||||
| 
 | ||||
|   // Ensure ForPartialVectors visited all powers of two [1, N].
 | ||||
|   for (size_t sizeof_t : {sizeof(uint8_t), sizeof(uint16_t), sizeof(uint32_t), | ||||
|                           sizeof(uint64_t)}) { | ||||
|     const size_t N = *MaxLanesForSize(sizeof_t); | ||||
|     for (size_t i = 1; i <= N; i += i) { | ||||
|       if (!NumLanesForSize(sizeof_t)->test(i)) { | ||||
|         fprintf(stderr, "T=%d: did not visit for N=%d, max=%d\n", | ||||
|                 static_cast<int>(sizeof_t), static_cast<int>(i), | ||||
|                 static_cast<int>(N)); | ||||
|         HWY_ASSERT(false); | ||||
|       } | ||||
|     } | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| struct TestSet { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|  | @ -322,6 +371,7 @@ HWY_AFTER_NAMESPACE(); | |||
| 
 | ||||
| namespace hwy { | ||||
| HWY_BEFORE_TEST(HighwayTest); | ||||
| HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllMaxLanes); | ||||
| HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllSet); | ||||
| HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllOverflow); | ||||
| HWY_EXPORT_AND_TEST_P(HighwayTest, TestAllClamp); | ||||
|  |  | |||
							
								
								
									
										19
									
								
								third_party/highway/hwy/hwy.version
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								third_party/highway/hwy/hwy.version
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,19 @@ | |||
| HWY_0 { | ||||
|   global: | ||||
|     extern "C++" { | ||||
|       *hwy::*; | ||||
|     }; | ||||
| 
 | ||||
|   local: | ||||
|     # Hide all the std namespace symbols. std namespace is explicitly marked | ||||
|     # as visibility(default) and header-only functions or methods (such as those | ||||
|     # from templates) should be exposed in shared libraries as weak symbols but | ||||
|     # this is only needed when we expose those types in the shared library API | ||||
|     # in any way. We don't use C++ std types in the API and we also don't | ||||
|     # support exceptions in the library. | ||||
|     # See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=36022 for a discussion | ||||
|     # about this. | ||||
|     extern "C++" { | ||||
|       *std::*; | ||||
|     }; | ||||
| }; | ||||
							
								
								
									
										19
									
								
								third_party/highway/hwy/nanobenchmark.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										19
									
								
								third_party/highway/hwy/nanobenchmark.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -37,7 +37,7 @@ | |||
| #include <windows.h> | ||||
| #endif | ||||
| 
 | ||||
| #if defined(__MACH__) | ||||
| #if defined(__APPLE__) | ||||
| #include <mach/mach.h> | ||||
| #include <mach/mach_time.h> | ||||
| #endif | ||||
|  | @ -148,7 +148,7 @@ inline Ticks Start() { | |||
|   LARGE_INTEGER counter; | ||||
|   (void)QueryPerformanceCounter(&counter); | ||||
|   t = counter.QuadPart; | ||||
| #elif defined(__MACH__) | ||||
| #elif defined(__APPLE__) | ||||
|   t = mach_absolute_time(); | ||||
| #elif defined(__HAIKU__) | ||||
|   t = system_time_nsecs();  // since boot
 | ||||
|  | @ -405,7 +405,7 @@ double NominalClockRate() { | |||
| 
 | ||||
| }  // namespace
 | ||||
| 
 | ||||
| double InvariantTicksPerSecond() { | ||||
| HWY_DLLEXPORT double InvariantTicksPerSecond() { | ||||
| #if HWY_ARCH_PPC && defined(__GLIBC__) | ||||
|   return double(__ppc_get_timebase_freq()); | ||||
| #elif HWY_ARCH_X86 | ||||
|  | @ -415,7 +415,7 @@ double InvariantTicksPerSecond() { | |||
|   LARGE_INTEGER freq; | ||||
|   (void)QueryPerformanceFrequency(&freq); | ||||
|   return double(freq.QuadPart); | ||||
| #elif defined(__MACH__) | ||||
| #elif defined(__APPLE__) | ||||
|   // https://developer.apple.com/library/mac/qa/qa1398/_index.html
 | ||||
|   mach_timebase_info_data_t timebase; | ||||
|   (void)mach_timebase_info(&timebase); | ||||
|  | @ -426,12 +426,12 @@ double InvariantTicksPerSecond() { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| double Now() { | ||||
| HWY_DLLEXPORT double Now() { | ||||
|   static const double mul = 1.0 / InvariantTicksPerSecond(); | ||||
|   return static_cast<double>(timer::Start()) * mul; | ||||
| } | ||||
| 
 | ||||
| uint64_t TimerResolution() { | ||||
| HWY_DLLEXPORT uint64_t TimerResolution() { | ||||
|   // Nested loop avoids exceeding stack/L1 capacity.
 | ||||
|   timer::Ticks repetitions[Params::kTimerSamples]; | ||||
|   for (size_t rep = 0; rep < Params::kTimerSamples; ++rep) { | ||||
|  | @ -656,10 +656,11 @@ timer::Ticks Overhead(const uint8_t* arg, const InputVec* inputs, | |||
| 
 | ||||
| }  // namespace
 | ||||
| 
 | ||||
| int Unpredictable1() { return timer::Start() != ~0ULL; } | ||||
| HWY_DLLEXPORT int Unpredictable1() { return timer::Start() != ~0ULL; } | ||||
| 
 | ||||
| size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, | ||||
|                const size_t num_inputs, Result* results, const Params& p) { | ||||
| HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, | ||||
|                              const FuncInput* inputs, const size_t num_inputs, | ||||
|                              Result* results, const Params& p) { | ||||
|   NANOBENCHMARK_CHECK(num_inputs != 0); | ||||
| 
 | ||||
| #if HWY_ARCH_X86 | ||||
|  |  | |||
							
								
								
									
										16
									
								
								third_party/highway/hwy/nanobenchmark.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										16
									
								
								third_party/highway/hwy/nanobenchmark.h
									
									
									
									
										vendored
									
									
								
							|  | @ -47,6 +47,8 @@ | |||
| #include <stddef.h> | ||||
| #include <stdint.h> | ||||
| 
 | ||||
| #include "hwy/highway_export.h" | ||||
| 
 | ||||
| // Enables sanity checks that verify correct operation at the cost of
 | ||||
| // longer benchmark runs.
 | ||||
| #ifndef NANOBENCHMARK_ENABLE_CHECKS | ||||
|  | @ -72,23 +74,23 @@ namespace platform { | |||
| // Returns tick rate, useful for converting measurements to seconds. Invariant
 | ||||
| // means the tick counter frequency is independent of CPU throttling or sleep.
 | ||||
| // This call may be expensive, callers should cache the result.
 | ||||
| double InvariantTicksPerSecond(); | ||||
| HWY_DLLEXPORT double InvariantTicksPerSecond(); | ||||
| 
 | ||||
| // Returns current timestamp [in seconds] relative to an unspecified origin.
 | ||||
| // Features: monotonic (no negative elapsed time), steady (unaffected by system
 | ||||
| // time changes), high-resolution (on the order of microseconds).
 | ||||
| double Now(); | ||||
| HWY_DLLEXPORT double Now(); | ||||
| 
 | ||||
| // Returns ticks elapsed in back to back timer calls, i.e. a function of the
 | ||||
| // timer resolution (minimum measurable difference) and overhead.
 | ||||
| // This call is expensive, callers should cache the result.
 | ||||
| uint64_t TimerResolution(); | ||||
| HWY_DLLEXPORT uint64_t TimerResolution(); | ||||
| 
 | ||||
| }  // namespace platform
 | ||||
| 
 | ||||
| // Returns 1, but without the compiler knowing what the value is. This prevents
 | ||||
| // optimizing out code.
 | ||||
| int Unpredictable1(); | ||||
| HWY_DLLEXPORT int Unpredictable1(); | ||||
| 
 | ||||
| // Input influencing the function being measured (e.g. number of bytes to copy).
 | ||||
| using FuncInput = size_t; | ||||
|  | @ -164,9 +166,9 @@ struct Result { | |||
| //   uniform distribution over [0, 4) could be represented as {3,0,2,1}.
 | ||||
| // Returns how many Result were written to "results": one per unique input, or
 | ||||
| //   zero if the measurement failed (an error message goes to stderr).
 | ||||
| size_t Measure(const Func func, const uint8_t* arg, const FuncInput* inputs, | ||||
|                const size_t num_inputs, Result* results, | ||||
|                const Params& p = Params()); | ||||
| HWY_DLLEXPORT size_t Measure(const Func func, const uint8_t* arg, | ||||
|                              const FuncInput* inputs, const size_t num_inputs, | ||||
|                              Result* results, const Params& p = Params()); | ||||
| 
 | ||||
| // Calls operator() of the given closure (lambda function).
 | ||||
| template <class Closure> | ||||
|  |  | |||
							
								
								
									
										1530
									
								
								third_party/highway/hwy/ops/arm_neon-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1530
									
								
								third_party/highway/hwy/ops/arm_neon-inl.h
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										902
									
								
								third_party/highway/hwy/ops/arm_sve-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										902
									
								
								third_party/highway/hwy/ops/arm_sve-inl.h
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										65
									
								
								third_party/highway/hwy/ops/generic_ops-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										65
									
								
								third_party/highway/hwy/ops/generic_ops-inl.h
									
									
									
									
										vendored
									
									
								
							|  | @ -19,14 +19,14 @@ HWY_BEFORE_NAMESPACE(); | |||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| // The lane type of a vector type, e.g. float for Vec<Simd<float, 4>>.
 | ||||
| // The lane type of a vector type, e.g. float for Vec<ScalableTag<float>>.
 | ||||
| template <class V> | ||||
| using LaneType = decltype(GetLane(V())); | ||||
| 
 | ||||
| // Vector type, e.g. Vec128<float> for Simd<float, 4>. Useful as the return type
 | ||||
| // of functions that do not take a vector argument, or as an argument type if
 | ||||
| // the function only has a template argument for D, or for explicit type names
 | ||||
| // instead of auto. This may be a built-in type.
 | ||||
| // Vector type, e.g. Vec128<float> for CappedTag<float, 4>. Useful as the return
 | ||||
| // type of functions that do not take a vector argument, or as an argument type
 | ||||
| // if the function only has a template argument for D, or for explicit type
 | ||||
| // names instead of auto. This may be a built-in type.
 | ||||
| template <class D> | ||||
| using Vec = decltype(Zero(D())); | ||||
| 
 | ||||
|  | @ -53,12 +53,6 @@ HWY_API V CombineShiftRightLanes(D d, const V hi, const V lo) { | |||
|   return CombineShiftRightBytes<kBytes>(d, hi, lo); | ||||
| } | ||||
| 
 | ||||
| // DEPRECATED
 | ||||
| template <size_t kLanes, class V> | ||||
| HWY_API V CombineShiftRightLanes(const V hi, const V lo) { | ||||
|   return CombineShiftRightLanes<kLanes>(DFromV<V>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| // Returns lanes with the most significant bit set and all other bits zero.
 | ||||
|  | @ -208,6 +202,15 @@ HWY_API V AESRound(V state, const V round_key) { | |||
|   return state; | ||||
| } | ||||
| 
 | ||||
| template <class V>  // u8
 | ||||
| HWY_API V AESLastRound(V state, const V round_key) { | ||||
|   // Like AESRound, but without MixColumns.
 | ||||
|   state = detail::SubBytes(state); | ||||
|   state = detail::ShiftRows(state); | ||||
|   state = Xor(state, round_key);  // AddRoundKey
 | ||||
|   return state; | ||||
| } | ||||
| 
 | ||||
| // Constant-time implementation inspired by
 | ||||
| // https://www.bearssl.org/constanttime.html, but about half the cost because we
 | ||||
| // use 64x64 multiplies and 128-bit XORs.
 | ||||
|  | @ -278,23 +281,47 @@ HWY_API V CLMulUpper(V a, V b) { | |||
| #define HWY_NATIVE_POPCNT | ||||
| #endif | ||||
| 
 | ||||
| template <typename V, HWY_IF_LANES_ARE(uint8_t, V)> | ||||
| #if HWY_TARGET == HWY_RVV | ||||
| #define HWY_MIN_POW2_FOR_128 1 | ||||
| #else | ||||
| // All other targets except HWY_SCALAR (which is excluded by HWY_IF_GE128_D)
 | ||||
| // guarantee 128 bits anyway.
 | ||||
| #define HWY_MIN_POW2_FOR_128 0 | ||||
| #endif | ||||
| 
 | ||||
| // This algorithm requires vectors to be at least 16 bytes, which is the case
 | ||||
| // for LMUL >= 2. If not, use the fallback below.
 | ||||
| template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_GE128_D(DFromV<V>), | ||||
|           HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)> | ||||
| HWY_API V PopulationCount(V v) { | ||||
|   constexpr DFromV<V> d; | ||||
|   const DFromV<V> d; | ||||
|   HWY_ALIGN constexpr uint8_t kLookup[16] = { | ||||
|       0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, | ||||
|   }; | ||||
|   auto lo = And(v, Set(d, 0xF)); | ||||
|   auto hi = ShiftRight<4>(v); | ||||
|   auto lookup = LoadDup128(Simd<uint8_t, HWY_MAX(16, MaxLanes(d))>(), kLookup); | ||||
|   const auto lo = And(v, Set(d, 0xF)); | ||||
|   const auto hi = ShiftRight<4>(v); | ||||
|   const auto lookup = LoadDup128(d, kLookup); | ||||
|   return Add(TableLookupBytes(lookup, hi), TableLookupBytes(lookup, lo)); | ||||
| } | ||||
| 
 | ||||
| // RVV has a specialization that avoids the Set().
 | ||||
| #if HWY_TARGET != HWY_RVV | ||||
| // Slower fallback for capped vectors.
 | ||||
| template <typename V, HWY_IF_LANES_ARE(uint8_t, V), HWY_IF_LT128_D(DFromV<V>)> | ||||
| HWY_API V PopulationCount(V v) { | ||||
|   const DFromV<V> d; | ||||
|   // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
 | ||||
|   v = Sub(v, And(ShiftRight<1>(v), Set(d, 0x55))); | ||||
|   v = Add(And(ShiftRight<2>(v), Set(d, 0x33)), And(v, Set(d, 0x33))); | ||||
|   return And(Add(v, ShiftRight<4>(v)), Set(d, 0x0F)); | ||||
| } | ||||
| #endif  // HWY_TARGET != HWY_RVV
 | ||||
| 
 | ||||
| template <typename V, HWY_IF_LANES_ARE(uint16_t, V)> | ||||
| HWY_API V PopulationCount(V v) { | ||||
|   const DFromV<V> d; | ||||
|   Repartition<uint8_t, decltype(d)> d8; | ||||
|   auto vals = BitCast(d, PopulationCount(BitCast(d8, v))); | ||||
|   const Repartition<uint8_t, decltype(d)> d8; | ||||
|   const auto vals = BitCast(d, PopulationCount(BitCast(d8, v))); | ||||
|   return Add(ShiftRight<8>(vals), And(vals, Set(d, 0xFF))); | ||||
| } | ||||
| 
 | ||||
|  | @ -306,7 +333,7 @@ HWY_API V PopulationCount(V v) { | |||
|   return Add(ShiftRight<16>(vals), And(vals, Set(d, 0xFF))); | ||||
| } | ||||
| 
 | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
| template <typename V, HWY_IF_LANES_ARE(uint64_t, V)> | ||||
| HWY_API V PopulationCount(V v) { | ||||
|   const DFromV<V> d; | ||||
|  |  | |||
							
								
								
									
										1627
									
								
								third_party/highway/hwy/ops/rvv-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1627
									
								
								third_party/highway/hwy/ops/rvv-inl.h
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										120
									
								
								third_party/highway/hwy/ops/scalar-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										120
									
								
								third_party/highway/hwy/ops/scalar-inl.h
									
									
									
									
										vendored
									
									
								
							|  | @ -27,7 +27,7 @@ namespace HWY_NAMESPACE { | |||
| 
 | ||||
| // Single instruction, single data.
 | ||||
| template <typename T> | ||||
| using Sisd = Simd<T, 1>; | ||||
| using Sisd = Simd<T, 1, 0>; | ||||
| 
 | ||||
| // (Wrapper class required for overloading comparison operators.)
 | ||||
| template <typename T> | ||||
|  | @ -187,6 +187,20 @@ HWY_API Vec1<T> operator^(const Vec1<T> a, const Vec1<T> b) { | |||
|   return Xor(a, b); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ OrAnd
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> OrAnd(const Vec1<T> o, const Vec1<T> a1, const Vec1<T> a2) { | ||||
|   return Or(o, And(a1, a2)); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ IfVecThenElse
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> IfVecThenElse(Vec1<T> mask, Vec1<T> yes, Vec1<T> no) { | ||||
|   return IfThenElse(MaskFromVec(mask), yes, no); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ CopySign
 | ||||
| 
 | ||||
| template <typename T> | ||||
|  | @ -275,6 +289,11 @@ HWY_API Vec1<T> IfThenZeroElse(const Mask1<T> mask, const Vec1<T> no) { | |||
|   return mask.bits ? Vec1<T>(0) : no; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> IfNegativeThenElse(Vec1<T> v, Vec1<T> yes, Vec1<T> no) { | ||||
|   return v.raw < 0 ? yes : no; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> ZeroIfNegative(const Vec1<T> v) { | ||||
|   return v.raw < 0 ? Vec1<T>(0) : v; | ||||
|  | @ -423,7 +442,13 @@ HWY_API Vec1<double> operator-(const Vec1<double> a, const Vec1<double> b) { | |||
|   return Vec1<double>(a.raw - b.raw); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Saturating addition
 | ||||
| // ------------------------------ SumsOf8
 | ||||
| 
 | ||||
| HWY_API Vec1<uint64_t> SumsOf8(const Vec1<uint8_t> v) { | ||||
|   return Vec1<uint64_t>(v.raw); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ SaturatedAdd
 | ||||
| 
 | ||||
| // Returns a + b clamped to the destination range.
 | ||||
| 
 | ||||
|  | @ -931,21 +956,30 @@ HWY_API Vec1<ToT> PromoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) { | |||
|   return Vec1<ToT>(static_cast<ToT>(from.raw)); | ||||
| } | ||||
| 
 | ||||
| template <typename FromT, typename ToT, HWY_IF_FLOAT(FromT)> | ||||
| HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) { | ||||
|   static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting"); | ||||
| 
 | ||||
| // MSVC 19.10 cannot deduce the argument type if HWY_IF_FLOAT(FromT) is here,
 | ||||
| // so we overload for FromT=double and ToT={float,int32_t}.
 | ||||
| HWY_API Vec1<float> DemoteTo(Sisd<float> /* tag */, Vec1<double> from) { | ||||
|   // Prevent ubsan errors when converting float to narrower integer/float
 | ||||
|   if (std::isinf(from.raw) || | ||||
|       std::fabs(from.raw) > static_cast<FromT>(HighestValue<ToT>())) { | ||||
|     return Vec1<ToT>(std::signbit(from.raw) ? LowestValue<ToT>() | ||||
|                                             : HighestValue<ToT>()); | ||||
|       std::fabs(from.raw) > static_cast<double>(HighestValue<float>())) { | ||||
|     return Vec1<float>(std::signbit(from.raw) ? LowestValue<float>() | ||||
|                                               : HighestValue<float>()); | ||||
|   } | ||||
|   return Vec1<ToT>(static_cast<ToT>(from.raw)); | ||||
|   return Vec1<float>(static_cast<float>(from.raw)); | ||||
| } | ||||
| HWY_API Vec1<int32_t> DemoteTo(Sisd<int32_t> /* tag */, Vec1<double> from) { | ||||
|   // Prevent ubsan errors when converting an out-of-range double to int32_t
 | ||||
|   if (std::isinf(from.raw) || | ||||
|       std::fabs(from.raw) > static_cast<double>(HighestValue<int32_t>())) { | ||||
|     return Vec1<int32_t>(std::signbit(from.raw) ? LowestValue<int32_t>() | ||||
|                                                 : HighestValue<int32_t>()); | ||||
|   } | ||||
|   return Vec1<int32_t>(static_cast<int32_t>(from.raw)); | ||||
| } | ||||
| 
 | ||||
| template <typename FromT, typename ToT, HWY_IF_NOT_FLOAT(FromT)> | ||||
| template <typename FromT, typename ToT> | ||||
| HWY_API Vec1<ToT> DemoteTo(Sisd<ToT> /* tag */, Vec1<FromT> from) { | ||||
|   static_assert(!IsFloat<FromT>(), "FromT=double are handled above"); | ||||
|   static_assert(sizeof(ToT) < sizeof(FromT), "Not demoting"); | ||||
| 
 | ||||
|   // Int to int: choose closest value in ToT to `from` (avoids UB)
 | ||||
|  | @ -1083,6 +1117,12 @@ HWY_API T GetLane(const Vec1<T> v) { | |||
|   return v.raw; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> DupEven(Vec1<T> v) { | ||||
|   return v; | ||||
| } | ||||
| // DupOdd is unsupported.
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> OddEven(Vec1<T> /* odd */, Vec1<T> even) { | ||||
|   return even; | ||||
|  | @ -1125,6 +1165,14 @@ HWY_API Vec1<T> TableLookupLanes(const Vec1<T> v, const Indices1<T> /* idx */) { | |||
|   return v; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ ReverseBlocks
 | ||||
| 
 | ||||
| // Single block: no change
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> ReverseBlocks(Sisd<T> /* tag */, const Vec1<T> v) { | ||||
|   return v; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Reverse
 | ||||
| 
 | ||||
| template <typename T> | ||||
|  | @ -1132,6 +1180,21 @@ HWY_API Vec1<T> Reverse(Sisd<T> /* tag */, const Vec1<T> v) { | |||
|   return v; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> Reverse2(Sisd<T> /* tag */, const Vec1<T> v) { | ||||
|   return v; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> Reverse4(Sisd<T> /* tag */, const Vec1<T> v) { | ||||
|   return v; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> Reverse8(Sisd<T> /* tag */, const Vec1<T> v) { | ||||
|   return v; | ||||
| } | ||||
| 
 | ||||
| // ================================================== BLOCKWISE
 | ||||
| // Shift*Bytes, CombineShiftRightBytes, Interleave*, Shuffle* are unsupported.
 | ||||
| 
 | ||||
|  | @ -1308,41 +1371,6 @@ HWY_API Vec1<T> MaxOfLanes(Sisd<T> /* tag */, const Vec1<T> v) { | |||
|   return v; | ||||
| } | ||||
| 
 | ||||
| // ================================================== DEPRECATED
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API size_t StoreMaskBits(const Mask1<T> mask, uint8_t* bits) { | ||||
|   return StoreMaskBits(Sisd<T>(), mask, bits); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API bool AllTrue(const Mask1<T> mask) { | ||||
|   return AllTrue(Sisd<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API bool AllFalse(const Mask1<T> mask) { | ||||
|   return AllFalse(Sisd<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API size_t CountTrue(const Mask1<T> mask) { | ||||
|   return CountTrue(Sisd<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> SumOfLanes(const Vec1<T> v) { | ||||
|   return SumOfLanes(Sisd<T>(), v); | ||||
| } | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> MinOfLanes(const Vec1<T> v) { | ||||
|   return MinOfLanes(Sisd<T>(), v); | ||||
| } | ||||
| template <typename T> | ||||
| HWY_API Vec1<T> MaxOfLanes(const Vec1<T> v) { | ||||
|   return MaxOfLanes(Sisd<T>(), v); | ||||
| } | ||||
| 
 | ||||
| // ================================================== Operator wrapper
 | ||||
| 
 | ||||
| template <class V> | ||||
|  |  | |||
							
								
								
									
										110
									
								
								third_party/highway/hwy/ops/set_macros-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										110
									
								
								third_party/highway/hwy/ops/set_macros-inl.h
									
									
									
									
										vendored
									
									
								
							|  | @ -32,9 +32,10 @@ | |||
| #undef HWY_MAX_BYTES | ||||
| #undef HWY_LANES | ||||
| 
 | ||||
| #undef HWY_CAP_INTEGER64 | ||||
| #undef HWY_CAP_FLOAT16 | ||||
| #undef HWY_CAP_FLOAT64 | ||||
| #undef HWY_HAVE_SCALABLE | ||||
| #undef HWY_HAVE_INTEGER64 | ||||
| #undef HWY_HAVE_FLOAT16 | ||||
| #undef HWY_HAVE_FLOAT64 | ||||
| #undef HWY_CAP_GE256 | ||||
| #undef HWY_CAP_GE512 | ||||
| 
 | ||||
|  | @ -79,9 +80,10 @@ | |||
| #define HWY_MAX_BYTES 16 | ||||
| #define HWY_LANES(T) (16 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #define HWY_CAP_AES 0 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
|  | @ -96,9 +98,10 @@ | |||
| #define HWY_MAX_BYTES 16 | ||||
| #define HWY_LANES(T) (16 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
|  | @ -113,9 +116,10 @@ | |||
| #define HWY_MAX_BYTES 32 | ||||
| #define HWY_LANES(T) (32 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #define HWY_CAP_GE256 1 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
|  | @ -129,9 +133,10 @@ | |||
| #define HWY_MAX_BYTES 64 | ||||
| #define HWY_LANES(T) (64 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #define HWY_CAP_GE256 1 | ||||
| #define HWY_CAP_GE512 1 | ||||
| 
 | ||||
|  | @ -159,9 +164,10 @@ | |||
| #define HWY_MAX_BYTES 16 | ||||
| #define HWY_LANES(T) (16 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT16 0 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 0 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
|  | @ -177,15 +183,16 @@ | |||
| #define HWY_MAX_BYTES 16 | ||||
| #define HWY_LANES(T) (16 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
| #if HWY_ARCH_ARM_A64 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #else | ||||
| #define HWY_CAP_FLOAT64 0 | ||||
| #define HWY_HAVE_FLOAT64 0 | ||||
| #endif | ||||
| 
 | ||||
| #define HWY_NAMESPACE N_NEON | ||||
|  | @ -196,23 +203,19 @@ | |||
| // SVE[2]
 | ||||
| #elif HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE | ||||
| 
 | ||||
| #if defined(HWY_EMULATE_SVE) && !defined(__F16C__) | ||||
| #error "Disable HWY_CAP_FLOAT16 or ensure farm_sve actually converts to f16" | ||||
| #endif | ||||
| 
 | ||||
| // SVE only requires lane alignment, not natural alignment of the entire vector.
 | ||||
| #define HWY_ALIGN alignas(8) | ||||
| 
 | ||||
| #define HWY_MAX_BYTES 256 | ||||
| 
 | ||||
| // <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
 | ||||
| // 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
 | ||||
| // div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
 | ||||
| #define HWY_LANES(T) (32768 / sizeof(T)) | ||||
| // Value ensures MaxLanes() is the tightest possible upper bound to reduce
 | ||||
| // overallocation.
 | ||||
| #define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_SCALABLE 1 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
|  | @ -232,9 +235,10 @@ | |||
| #define HWY_MAX_BYTES 16 | ||||
| #define HWY_LANES(T) (16 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 0 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_CAP_FLOAT64 0 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT64 0 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
|  | @ -250,9 +254,10 @@ | |||
| #define HWY_MAX_BYTES 32 | ||||
| #define HWY_LANES(T) (32 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 0 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_CAP_FLOAT64 0 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT64 0 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
|  | @ -271,20 +276,20 @@ | |||
| // The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
 | ||||
| #define HWY_MAX_BYTES 65536 | ||||
| 
 | ||||
| // <= HWY_MAX_BYTES / sizeof(T): exact size. Otherwise a fraction 1/div (div =
 | ||||
| // 1,2,4,8) is encoded as HWY_LANES(T) / div. This value leaves enough room for
 | ||||
| // div=8 and demoting to 1/8 the lane width while still exceeding HWY_MAX_BYTES.
 | ||||
| #define HWY_LANES(T) (8388608 / sizeof(T)) | ||||
| // = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
 | ||||
| // LMUL. This is the tightest possible upper bound.
 | ||||
| #define HWY_LANES(T) (8192 / sizeof(T)) | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_SCALABLE 1 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
| #if defined(__riscv_zfh) | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #else | ||||
| #define HWY_CAP_FLOAT16 0 | ||||
| #define HWY_HAVE_FLOAT16 0 | ||||
| #endif | ||||
| 
 | ||||
| #define HWY_NAMESPACE N_RVV | ||||
|  | @ -300,9 +305,10 @@ | |||
| #define HWY_MAX_BYTES 8 | ||||
| #define HWY_LANES(T) 1 | ||||
| 
 | ||||
| #define HWY_CAP_INTEGER64 1 | ||||
| #define HWY_CAP_FLOAT16 1 | ||||
| #define HWY_CAP_FLOAT64 1 | ||||
| #define HWY_HAVE_SCALABLE 0 | ||||
| #define HWY_HAVE_INTEGER64 1 | ||||
| #define HWY_HAVE_FLOAT16 1 | ||||
| #define HWY_HAVE_FLOAT64 1 | ||||
| #define HWY_CAP_GE256 0 | ||||
| #define HWY_CAP_GE512 0 | ||||
| 
 | ||||
|  | @ -344,7 +350,3 @@ | |||
| #else | ||||
| #define HWY_ATTR | ||||
| #endif | ||||
| 
 | ||||
| // DEPRECATED
 | ||||
| #undef HWY_GATHER_LANES | ||||
| #define HWY_GATHER_LANES(T) HWY_LANES(T) | ||||
|  |  | |||
							
								
								
									
										184
									
								
								third_party/highway/hwy/ops/shared-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										184
									
								
								third_party/highway/hwy/ops/shared-inl.h
									
									
									
									
										vendored
									
									
								
							|  | @ -26,65 +26,117 @@ HWY_BEFORE_NAMESPACE(); | |||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| // SIMD operations are implemented as overloaded functions selected using a tag
 | ||||
| // type D := Simd<T, N>. T is the lane type, N an opaque integer for internal
 | ||||
| // use only. Users create D via aliases ScalableTag<T>() (a full vector),
 | ||||
| // CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes
 | ||||
| // (always a power of two) is Lanes(D()).
 | ||||
| template <typename Lane, size_t N> | ||||
| // Highway operations are implemented as overloaded functions selected using an
 | ||||
| // internal-only tag type D := Simd<T, N, kPow2>. T is the lane type. kPow2 is a
 | ||||
| // shift count applied to scalable vectors. Instead of referring to Simd<>
 | ||||
| // directly, users create D via aliases ScalableTag<T[, kPow2]>() (defaults to a
 | ||||
| // full vector, or fractions/groups if the argument is negative/positive),
 | ||||
| // CappedTag<T, kLimit> or FixedTag<T, kNumLanes>. The actual number of lanes is
 | ||||
| // Lanes(D()), a power of two. For scalable vectors, N is either HWY_LANES or a
 | ||||
| // cap. For constexpr-size vectors, N is the actual number of lanes. This
 | ||||
| // ensures Half<Full512<T>> is the same type as Full256<T>, as required by x86.
 | ||||
| template <typename Lane, size_t N, int kPow2> | ||||
| struct Simd { | ||||
|   constexpr Simd() = default; | ||||
|   using T = Lane; | ||||
|   static_assert((N & (N - 1)) == 0 && N != 0, "N must be a power of two"); | ||||
| 
 | ||||
|   // Only for use by MaxLanes, required by MSVC. Cannot be enum because GCC
 | ||||
|   // warns when using enums and non-enums in the same expression. Cannot be
 | ||||
|   // static constexpr function (another MSVC limitation).
 | ||||
|   static constexpr size_t kPrivateN = N; | ||||
|   static constexpr int kPrivatePow2 = kPow2; | ||||
| 
 | ||||
|   template <typename NewT> | ||||
|   static constexpr size_t NewN() { | ||||
|     // Round up to correctly handle scalars with N=1.
 | ||||
|     return (N * sizeof(T) + sizeof(NewT) - 1) / sizeof(NewT); | ||||
|   } | ||||
| 
 | ||||
| #if HWY_HAVE_SCALABLE | ||||
|   template <typename NewT> | ||||
|   static constexpr int Pow2Ratio() { | ||||
|     return (sizeof(NewT) > sizeof(T)) | ||||
|                ? static_cast<int>(CeilLog2(sizeof(NewT) / sizeof(T))) | ||||
|                : -static_cast<int>(CeilLog2(sizeof(T) / sizeof(NewT))); | ||||
|   } | ||||
| #endif | ||||
| 
 | ||||
|   // Widening/narrowing ops change the number of lanes and/or their type.
 | ||||
|   // To initialize such vectors, we need the corresponding tag types:
 | ||||
| 
 | ||||
|   // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
 | ||||
|   template <typename NewLane> | ||||
|   using Rebind = Simd<NewLane, N>; | ||||
| // PromoteTo/DemoteTo() with another lane type, but same number of lanes.
 | ||||
| #if HWY_HAVE_SCALABLE | ||||
|   template <typename NewT> | ||||
|   using Rebind = Simd<NewT, N, kPow2 + Pow2Ratio<NewT>()>; | ||||
| #else | ||||
|   template <typename NewT> | ||||
|   using Rebind = Simd<NewT, N, kPow2>; | ||||
| #endif | ||||
| 
 | ||||
|   // MulEven() with another lane type, but same total size.
 | ||||
|   // Round up to correctly handle scalars with N=1.
 | ||||
|   template <typename NewLane> | ||||
|   using Repartition = | ||||
|       Simd<NewLane, (N * sizeof(Lane) + sizeof(NewLane) - 1) / sizeof(NewLane)>; | ||||
|   // Change lane type while keeping the same vector size, e.g. for MulEven.
 | ||||
|   template <typename NewT> | ||||
|   using Repartition = Simd<NewT, NewN<NewT>(), kPow2>; | ||||
| 
 | ||||
|   // LowerHalf() with the same lane type, but half the lanes.
 | ||||
|   // Round up to correctly handle scalars with N=1.
 | ||||
|   using Half = Simd<T, (N + 1) / 2>; | ||||
| // Half the lanes while keeping the same lane type, e.g. for LowerHalf.
 | ||||
| // Round up to correctly handle scalars with N=1.
 | ||||
| #if HWY_HAVE_SCALABLE | ||||
|   // Reducing the cap (N) is required for SVE - if N is the limiter for f32xN,
 | ||||
|   // then we expect Half<Rebind<u16>> to have N/2 lanes (rounded up).
 | ||||
|   using Half = Simd<T, (N + 1) / 2, kPow2 - 1>; | ||||
| #else | ||||
|   using Half = Simd<T, (N + 1) / 2, kPow2>; | ||||
| #endif | ||||
| 
 | ||||
|   // Combine() with the same lane type, but twice the lanes.
 | ||||
|   using Twice = Simd<T, 2 * N>; | ||||
| // Twice the lanes while keeping the same lane type, e.g. for Combine.
 | ||||
| #if HWY_HAVE_SCALABLE | ||||
|   using Twice = Simd<T, 2 * N, kPow2 + 1>; | ||||
| #else | ||||
|   using Twice = Simd<T, 2 * N, kPow2>; | ||||
| #endif | ||||
| }; | ||||
| 
 | ||||
| namespace detail { | ||||
| 
 | ||||
| // Given N from HWY_LANES(T), returns N for use in Simd<T, N> to describe:
 | ||||
| // - a full vector (pow2 = 0);
 | ||||
| // - 2,4,8 regs on RVV, otherwise a full vector (pow2 [1,3]);
 | ||||
| // - a fraction of a register from 1/8 to 1/2 (pow2 [-3,-1]).
 | ||||
| constexpr size_t ScaleByPower(size_t N, int pow2) { | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|   // For fractions, if N == 1 ensure we still return at least one lane.
 | ||||
|   return pow2 >= 0 ? (N << pow2) : HWY_MAX(1, (N >> (-pow2))); | ||||
| #else | ||||
|   // If pow2 > 0, replace it with 0 (there is nothing wider than a full vector).
 | ||||
|   return HWY_MAX(1, N >> HWY_MAX(-pow2, 0)); | ||||
| #if HWY_HAVE_SCALABLE | ||||
| 
 | ||||
| template <typename T, size_t N, int kPow2> | ||||
| constexpr bool IsFull(Simd<T, N, kPow2> /* d */) { | ||||
|   return N == HWY_LANES(T) && kPow2 == 0; | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| // Returns the number of lanes (possibly zero) after applying a shift:
 | ||||
| // - 0: no change;
 | ||||
| // - [1,3]: a group of 2,4,8 [fractional] vectors;
 | ||||
| // - [-3,-1]: a fraction of a vector from 1/8 to 1/2.
 | ||||
| constexpr size_t ScaleByPower(size_t N, int pow2) { | ||||
|   return pow2 >= 0 ? (N << pow2) : (N >> (-pow2)); | ||||
| } | ||||
| 
 | ||||
| // Struct wrappers enable validation of arguments via static_assert.
 | ||||
| template <typename T, int kPow2> | ||||
| struct ScalableTagChecker { | ||||
|   static_assert(-3 <= kPow2 && kPow2 <= 3, "Fraction must be 1/8 to 8"); | ||||
|   using type = Simd<T, ScaleByPower(HWY_LANES(T), kPow2)>; | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|   // Only RVV supports register groups.
 | ||||
|   using type = Simd<T, HWY_LANES(T), kPow2>; | ||||
| #elif HWY_HAVE_SCALABLE | ||||
|   // For SVE[2], only allow full or fractions.
 | ||||
|   using type = Simd<T, HWY_LANES(T), HWY_MIN(kPow2, 0)>; | ||||
| #elif HWY_TARGET == HWY_SCALAR | ||||
|   using type = Simd<T, /*N=*/1, 0>; | ||||
| #else | ||||
|   // Only allow full or fractions.
 | ||||
|   using type = Simd<T, ScaleByPower(HWY_LANES(T), HWY_MIN(kPow2, 0)), 0>; | ||||
| #endif | ||||
| }; | ||||
| 
 | ||||
| template <typename T, size_t kLimit> | ||||
| struct CappedTagChecker { | ||||
|   static_assert(kLimit != 0, "Does not make sense to have zero lanes"); | ||||
|   using type = Simd<T, HWY_MIN(kLimit, HWY_LANES(T))>; | ||||
|   using type = Simd<T, HWY_MIN(kLimit, HWY_MAX_BYTES / sizeof(T)), 0>; | ||||
| }; | ||||
| 
 | ||||
| template <typename T, size_t kNumLanes> | ||||
|  | @ -95,7 +147,7 @@ struct FixedTagChecker { | |||
|   // HWY_MAX_BYTES would still allow uint8x8, which is not supported.
 | ||||
|   static_assert(kNumLanes == 1, "Scalar only supports one lane"); | ||||
| #endif | ||||
|   using type = Simd<T, kNumLanes>; | ||||
|   using type = Simd<T, kNumLanes, 0>; | ||||
| }; | ||||
| 
 | ||||
| }  // namespace detail
 | ||||
|  | @ -114,15 +166,14 @@ using ScalableTag = typename detail::ScalableTagChecker<T, kPow2>::type; | |||
| // typically used for 1D loops with a relatively low application-defined upper
 | ||||
| // bound, e.g. for 8x8 DCTs. However, it is better if data structures are
 | ||||
| // designed to be vector-length-agnostic (e.g. a hybrid SoA where there are
 | ||||
| // chunks of say 256 DC components followed by 256 AC1 and finally 256 AC63;
 | ||||
| // chunks of `M >= MaxLanes(d)` DC components followed by M AC1, .., and M AC63;
 | ||||
| // this would enable vector-length-agnostic loops using ScalableTag).
 | ||||
| template <typename T, size_t kLimit> | ||||
| using CappedTag = typename detail::CappedTagChecker<T, kLimit>::type; | ||||
| 
 | ||||
| // Alias for a tag describing a vector with *exactly* kNumLanes active lanes,
 | ||||
| // even on targets with scalable vectors. All targets except HWY_SCALAR support
 | ||||
| // up to 16 / sizeof(T). Other targets may allow larger kNumLanes, but relying
 | ||||
| // on that is non-portable and discouraged.
 | ||||
| // even on targets with scalable vectors. HWY_SCALAR only supports one lane.
 | ||||
| // All other targets allow kNumLanes up to HWY_MAX_BYTES / sizeof(T).
 | ||||
| //
 | ||||
| // NOTE: if the application does not need to support HWY_SCALAR (+), use this
 | ||||
| // instead of CappedTag to emphasize that there will be exactly kNumLanes lanes.
 | ||||
|  | @ -163,11 +214,11 @@ using RepartitionToNarrow = Repartition<MakeNarrow<TFromD<D>>, D>; | |||
| template <class D> | ||||
| using Half = typename D::Half; | ||||
| 
 | ||||
| // Descriptor for the same lane type as D, but twice the lanes.
 | ||||
| // Tag for the same lane type as D, but twice the lanes.
 | ||||
| template <class D> | ||||
| using Twice = typename D::Twice; | ||||
| 
 | ||||
| // Same as base.h macros but with a Simd<T, N> argument instead of T.
 | ||||
| // Same as base.h macros but with a Simd<T, N, kPow2> argument instead of T.
 | ||||
| #define HWY_IF_UNSIGNED_D(D) HWY_IF_UNSIGNED(TFromD<D>) | ||||
| #define HWY_IF_SIGNED_D(D) HWY_IF_SIGNED(TFromD<D>) | ||||
| #define HWY_IF_FLOAT_D(D) HWY_IF_FLOAT(TFromD<D>) | ||||
|  | @ -175,6 +226,12 @@ using Twice = typename D::Twice; | |||
| #define HWY_IF_LANE_SIZE_D(D, bytes) HWY_IF_LANE_SIZE(TFromD<D>, bytes) | ||||
| #define HWY_IF_NOT_LANE_SIZE_D(D, bytes) HWY_IF_NOT_LANE_SIZE(TFromD<D>, bytes) | ||||
| 
 | ||||
| // MSVC workaround: use kPrivateN directly instead of MaxLanes().
 | ||||
| #define HWY_IF_LT128_D(D) \ | ||||
|   hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) < 16>* = nullptr | ||||
| #define HWY_IF_GE128_D(D) \ | ||||
|   hwy::EnableIf<D::kPrivateN * sizeof(TFromD<D>) >= 16>* = nullptr | ||||
| 
 | ||||
| // Same, but with a vector argument.
 | ||||
| #define HWY_IF_UNSIGNED_V(V) HWY_IF_UNSIGNED(TFromV<V>) | ||||
| #define HWY_IF_SIGNED_V(V) HWY_IF_SIGNED(TFromV<V>) | ||||
|  | @ -183,42 +240,59 @@ using Twice = typename D::Twice; | |||
| 
 | ||||
| // For implementing functions for a specific type.
 | ||||
| // IsSame<...>() in template arguments is broken on MSVC2015.
 | ||||
| #define HWY_IF_LANES_ARE(T, V) \ | ||||
|   EnableIf<IsSameT<T, TFromD<DFromV<V>>>::value>* = nullptr | ||||
| #define HWY_IF_LANES_ARE(T, V) EnableIf<IsSameT<T, TFromV<V>>::value>* = nullptr | ||||
| 
 | ||||
| // Compile-time-constant, (typically but not guaranteed) an upper bound on the
 | ||||
| // number of lanes.
 | ||||
| // Prefer instead using Lanes() and dynamic allocation, or Rebind, or
 | ||||
| // `#if HWY_CAP_GE*`.
 | ||||
| template <typename T, size_t N> | ||||
| HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(Simd<T, N>) { | ||||
|   return N; | ||||
| template <class D> | ||||
| HWY_INLINE HWY_MAYBE_UNUSED constexpr int Pow2(D /* d */) { | ||||
|   return D::kPrivatePow2; | ||||
| } | ||||
| 
 | ||||
| // Targets with non-constexpr Lanes define this themselves.
 | ||||
| #if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SVE2 && HWY_TARGET != HWY_SVE | ||||
| // MSVC requires the explicit <D>.
 | ||||
| #define HWY_IF_POW2_GE(D, MIN) hwy::EnableIf<Pow2<D>(D()) >= (MIN)>* = nullptr | ||||
| 
 | ||||
| #if HWY_HAVE_SCALABLE | ||||
| 
 | ||||
| // Upper bound on the number of lanes. Intended for template arguments and
 | ||||
| // reducing code size (e.g. for SSE4, we know at compile-time that vectors will
 | ||||
| // not exceed 16 bytes). WARNING: this may be a loose bound, use Lanes() as the
 | ||||
| // actual size for allocating storage. WARNING: MSVC might not be able to deduce
 | ||||
| // arguments if this is used in EnableIf. See HWY_IF_LT128_D above.
 | ||||
| template <class D> | ||||
| HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) { | ||||
|   return detail::ScaleByPower(HWY_MIN(D::kPrivateN, HWY_LANES(TFromD<D>)), | ||||
|                               D::kPrivatePow2); | ||||
| } | ||||
| 
 | ||||
| #else | ||||
| // Workaround for MSVC 2017: T,N,kPow2 argument deduction fails, so returning N
 | ||||
| // is not an option, nor does a member function work.
 | ||||
| template <class D> | ||||
| HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D) { | ||||
|   return D::kPrivateN; | ||||
| } | ||||
| 
 | ||||
| // (Potentially) non-constant actual size of the vector at runtime, subject to
 | ||||
| // the limit imposed by the Simd. Useful for advancing loop counters.
 | ||||
| template <typename T, size_t N> | ||||
| HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N>) { | ||||
| // Targets with scalable vectors define this themselves.
 | ||||
| template <typename T, size_t N, int kPow2> | ||||
| HWY_INLINE HWY_MAYBE_UNUSED size_t Lanes(Simd<T, N, kPow2>) { | ||||
|   return N; | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
| #endif  // !HWY_HAVE_SCALABLE
 | ||||
| 
 | ||||
| // NOTE: GCC generates incorrect code for vector arguments to non-inlined
 | ||||
| // functions in two situations:
 | ||||
| // - on Windows and GCC 10.3, passing by value crashes due to unaligned loads:
 | ||||
| //   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412.
 | ||||
| // - on ARM64 and GCC 9.3.0 or 11.2.1, passing by const& causes many (but not
 | ||||
| // - on ARM64 and GCC 9.3.0 or 11.2.1, passing by value causes many (but not
 | ||||
| //   all) tests to fail.
 | ||||
| //
 | ||||
| // We therefore pass by const& only on GCC and (Windows or ARM64). This alias
 | ||||
| // must be used for all vector/mask parameters of functions marked HWY_NOINLINE,
 | ||||
| // and possibly also other functions that are not inlined.
 | ||||
| #if HWY_COMPILER_GCC && !HWY_COMPILER_CLANG && \ | ||||
|     ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM64) | ||||
|     ((defined(_WIN32) || defined(_WIN64)) || HWY_ARCH_ARM_A64) | ||||
| template <class V> | ||||
| using VecArg = const V&; | ||||
| #else | ||||
|  |  | |||
							
								
								
									
										1055
									
								
								third_party/highway/hwy/ops/wasm_128-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1055
									
								
								third_party/highway/hwy/ops/wasm_128-inl.h
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										2155
									
								
								third_party/highway/hwy/ops/wasm_256-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2155
									
								
								third_party/highway/hwy/ops/wasm_256-inl.h
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										1095
									
								
								third_party/highway/hwy/ops/x86_128-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										1095
									
								
								third_party/highway/hwy/ops/x86_128-inl.h
									
									
									
									
										vendored
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							
							
								
								
									
										329
									
								
								third_party/highway/hwy/ops/x86_256-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										329
									
								
								third_party/highway/hwy/ops/x86_256-inl.h
									
									
									
									
										vendored
									
									
								
							|  | @ -44,10 +44,6 @@ | |||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| template <typename T> | ||||
| using Full256 = Simd<T, 32 / sizeof(T)>; | ||||
| 
 | ||||
| namespace detail { | ||||
| 
 | ||||
| template <typename T> | ||||
|  | @ -326,6 +322,38 @@ HWY_API Vec256<T> Not(const Vec256<T> v) { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ OrAnd
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> OrAnd(Vec256<T> o, Vec256<T> a1, Vec256<T> a2) { | ||||
| #if HWY_TARGET <= HWY_AVX3 | ||||
|   const Full256<T> d; | ||||
|   const RebindToUnsigned<decltype(d)> du; | ||||
|   using VU = VFromD<decltype(du)>; | ||||
|   const __m256i ret = _mm256_ternarylogic_epi64( | ||||
|       BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); | ||||
|   return BitCast(d, VU{ret}); | ||||
| #else | ||||
|   return Or(o, And(a1, a2)); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ IfVecThenElse
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> IfVecThenElse(Vec256<T> mask, Vec256<T> yes, Vec256<T> no) { | ||||
| #if HWY_TARGET <= HWY_AVX3 | ||||
|   const Full256<T> d; | ||||
|   const RebindToUnsigned<decltype(d)> du; | ||||
|   using VU = VFromD<decltype(du)>; | ||||
|   return BitCast(d, VU{_mm256_ternarylogic_epi64(BitCast(du, mask).raw, | ||||
|                                                  BitCast(du, yes).raw, | ||||
|                                                  BitCast(du, no).raw, 0xCA)}); | ||||
| #else | ||||
|   return IfThenElse(MaskFromVec(mask), yes, no); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Operator overloads (internal-only if float)
 | ||||
| 
 | ||||
| template <typename T> | ||||
|  | @ -785,6 +813,7 @@ HWY_API Vec256<T> IfThenZeroElse(Mask256<T> mask, Vec256<T> no) { | |||
| template <typename T, HWY_IF_FLOAT(T)> | ||||
| HWY_API Vec256<T> ZeroIfNegative(Vec256<T> v) { | ||||
|   const auto zero = Zero(Full256<T>()); | ||||
|   // AVX2 IfThenElse only looks at the MSB for 32/64-bit lanes
 | ||||
|   return IfThenElse(MaskFromVec(v), zero, v); | ||||
| } | ||||
| 
 | ||||
|  | @ -1395,7 +1424,12 @@ HWY_API Vec256<double> operator-(const Vec256<double> a, | |||
|   return Vec256<double>{_mm256_sub_pd(a.raw, b.raw)}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Saturating addition
 | ||||
| // ------------------------------ SumsOf8
 | ||||
| HWY_API Vec256<uint64_t> SumsOf8(const Vec256<uint8_t> v) { | ||||
|   return Vec256<uint64_t>{_mm256_sad_epu8(v.raw, _mm256_setzero_si256())}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ SaturatedAdd
 | ||||
| 
 | ||||
| // Returns a + b clamped to the destination range.
 | ||||
| 
 | ||||
|  | @ -1419,7 +1453,7 @@ HWY_API Vec256<int16_t> SaturatedAdd(const Vec256<int16_t> a, | |||
|   return Vec256<int16_t>{_mm256_adds_epi16(a.raw, b.raw)}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Saturating subtraction
 | ||||
| // ------------------------------ SaturatedSub
 | ||||
| 
 | ||||
| // Returns a - b clamped to the destination range.
 | ||||
| 
 | ||||
|  | @ -1685,6 +1719,35 @@ HWY_API Vec256<int64_t> Abs(const Vec256<int64_t> v) { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ IfNegativeThenElse (BroadcastSignBit)
 | ||||
| HWY_API Vec256<int8_t> IfNegativeThenElse(Vec256<int8_t> v, Vec256<int8_t> yes, | ||||
|                                           Vec256<int8_t> no) { | ||||
|   // int8: AVX2 IfThenElse only looks at the MSB.
 | ||||
|   return IfThenElse(MaskFromVec(v), yes, no); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 2)> | ||||
| HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) { | ||||
|   static_assert(IsSigned<T>(), "Only works for signed/float"); | ||||
|   const Full256<T> d; | ||||
|   const RebindToSigned<decltype(d)> di; | ||||
| 
 | ||||
|   // 16-bit: no native blendv, so copy sign to lower byte's MSB.
 | ||||
|   v = BitCast(d, BroadcastSignBit(BitCast(di, v))); | ||||
|   return IfThenElse(MaskFromVec(v), yes, no); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_NOT_LANE_SIZE(T, 2)> | ||||
| HWY_API Vec256<T> IfNegativeThenElse(Vec256<T> v, Vec256<T> yes, Vec256<T> no) { | ||||
|   static_assert(IsSigned<T>(), "Only works for signed/float"); | ||||
|   const Full256<T> d; | ||||
|   const RebindToFloat<decltype(d)> df; | ||||
| 
 | ||||
|   // 32/64-bit: use float IfThenElse, which only looks at the MSB.
 | ||||
|   const MFromD<decltype(df)> msb = MaskFromVec(BitCast(df, v)); | ||||
|   return BitCast(d, IfThenElse(msb, BitCast(df, yes), BitCast(df, no))); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ ShiftLeftSame
 | ||||
| 
 | ||||
| HWY_API Vec256<uint16_t> ShiftLeftSame(const Vec256<uint16_t> v, | ||||
|  | @ -2234,7 +2297,7 @@ HWY_API void ScatterOffset(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base, | |||
|   Store(v, d, lanes); | ||||
| 
 | ||||
|   alignas(32) Offset offset_lanes[N]; | ||||
|   Store(offset, Simd<Offset, N>(), offset_lanes); | ||||
|   Store(offset, Full256<Offset>(), offset_lanes); | ||||
| 
 | ||||
|   uint8_t* base_bytes = reinterpret_cast<uint8_t*>(base); | ||||
|   for (size_t i = 0; i < N; ++i) { | ||||
|  | @ -2252,7 +2315,7 @@ HWY_API void ScatterIndex(Vec256<T> v, Full256<T> d, T* HWY_RESTRICT base, | |||
|   Store(v, d, lanes); | ||||
| 
 | ||||
|   alignas(32) Index index_lanes[N]; | ||||
|   Store(index, Simd<Index, N>(), index_lanes); | ||||
|   Store(index, Full256<Index>(), index_lanes); | ||||
| 
 | ||||
|   for (size_t i = 0; i < N; ++i) { | ||||
|     base[index_lanes[i]] = lanes[i]; | ||||
|  | @ -2473,7 +2536,7 @@ HWY_API Vec256<T> ShiftRightBytes(Full256<T> /* tag */, const Vec256<T> v) { | |||
| template <int kLanes, typename T> | ||||
| HWY_API Vec256<T> ShiftRightLanes(Full256<T> d, const Vec256<T> v) { | ||||
|   const Repartition<uint8_t, decltype(d)> d8; | ||||
|   return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v))); | ||||
|   return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v))); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ CombineShiftRightBytes
 | ||||
|  | @ -2733,6 +2796,81 @@ HWY_API Vec256<T> Reverse(Full256<T> d, const Vec256<T> v) { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Reverse2
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 2)> | ||||
| HWY_API Vec256<T> Reverse2(Full256<T> d, const Vec256<T> v) { | ||||
|   const Full256<uint32_t> du32; | ||||
|   return BitCast(d, RotateRight<16>(BitCast(du32, v))); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) { | ||||
|   return Shuffle2301(v); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec256<T> Reverse2(Full256<T> /* tag */, const Vec256<T> v) { | ||||
|   return Shuffle01(v); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Reverse4
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 2)> | ||||
| HWY_API Vec256<T> Reverse4(Full256<T> d, const Vec256<T> v) { | ||||
| #if HWY_TARGET <= HWY_AVX3 | ||||
|   const RebindToSigned<decltype(d)> di; | ||||
|   alignas(32) constexpr int16_t kReverse4[16] = {3,  2,  1, 0, 7,  6,  5,  4, | ||||
|                                                  11, 10, 9, 8, 15, 14, 13, 12}; | ||||
|   const Vec256<int16_t> idx = Load(di, kReverse4); | ||||
|   return BitCast(d, Vec256<int16_t>{ | ||||
|                         _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); | ||||
| #else | ||||
|   const RepartitionToWide<decltype(d)> dw; | ||||
|   return Reverse2(d, BitCast(d, Shuffle2301(BitCast(dw, v)))); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) { | ||||
|   return Shuffle0123(v); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec256<T> Reverse4(Full256<T> /* tag */, const Vec256<T> v) { | ||||
|   return Vec256<T>{_mm256_permute4x64_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; | ||||
| } | ||||
| HWY_API Vec256<double> Reverse4(Full256<double> /* tag */, Vec256<double> v) { | ||||
|   return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Reverse8
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 2)> | ||||
| HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) { | ||||
| #if HWY_TARGET <= HWY_AVX3 | ||||
|   const RebindToSigned<decltype(d)> di; | ||||
|   alignas(32) constexpr int16_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0, | ||||
|                                                  15, 14, 13, 12, 11, 10, 9, 8}; | ||||
|   const Vec256<int16_t> idx = Load(di, kReverse8); | ||||
|   return BitCast(d, Vec256<int16_t>{ | ||||
|                         _mm256_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); | ||||
| #else | ||||
|   const RepartitionToWide<decltype(d)> dw; | ||||
|   return Reverse2(d, BitCast(d, Shuffle0123(BitCast(dw, v)))); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec256<T> Reverse8(Full256<T> d, const Vec256<T> v) { | ||||
|   return Reverse(d, v); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec256<T> Reverse8(Full256<T> /* tag */, const Vec256<T> /* v */) { | ||||
|   HWY_ASSERT(0);  // AVX2 does not have 8 64-bit lanes
 | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ InterleaveLower
 | ||||
| 
 | ||||
| // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
 | ||||
|  | @ -2782,12 +2920,6 @@ HWY_API Vec256<double> InterleaveLower(const Vec256<double> a, | |||
|   return Vec256<double>{_mm256_unpacklo_pd(a.raw, b.raw)}; | ||||
| } | ||||
| 
 | ||||
| // Additional overload for the optional Simd<> tag.
 | ||||
| template <typename T, class V = Vec256<T>> | ||||
| HWY_API V InterleaveLower(Full256<T> /* tag */, V a, V b) { | ||||
|   return InterleaveLower(a, b); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ InterleaveUpper
 | ||||
| 
 | ||||
| // All functions inside detail lack the required D parameter.
 | ||||
|  | @ -2849,11 +2981,11 @@ HWY_API V InterleaveUpper(Full256<T> /* tag */, V a, V b) { | |||
| // this is necessary because the single-lane scalar cannot return two values.
 | ||||
| template <typename T, typename TW = MakeWide<T>> | ||||
| HWY_API Vec256<TW> ZipLower(Vec256<T> a, Vec256<T> b) { | ||||
|   return BitCast(Full256<TW>(), InterleaveLower(Full256<T>(), a, b)); | ||||
|   return BitCast(Full256<TW>(), InterleaveLower(a, b)); | ||||
| } | ||||
| template <typename T, typename TW = MakeWide<T>> | ||||
| HWY_API Vec256<TW> ZipLower(Full256<TW> dw, Vec256<T> a, Vec256<T> b) { | ||||
|   return BitCast(dw, InterleaveLower(Full256<T>(), a, b)); | ||||
|   return BitCast(dw, InterleaveLower(a, b)); | ||||
| } | ||||
| 
 | ||||
| template <typename T, typename TW = MakeWide<T>> | ||||
|  | @ -3063,6 +3195,38 @@ HWY_API Vec256<double> ConcatEven(Full256<double> d, Vec256<double> hi, | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ DupEven (InterleaveLower)
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec256<T> DupEven(Vec256<T> v) { | ||||
|   return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; | ||||
| } | ||||
| HWY_API Vec256<float> DupEven(Vec256<float> v) { | ||||
|   return Vec256<float>{ | ||||
|       _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(2, 2, 0, 0))}; | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec256<T> DupEven(const Vec256<T> v) { | ||||
|   return InterleaveLower(Full256<T>(), v, v); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ DupOdd (InterleaveUpper)
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec256<T> DupOdd(Vec256<T> v) { | ||||
|   return Vec256<T>{_mm256_shuffle_epi32(v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; | ||||
| } | ||||
| HWY_API Vec256<float> DupOdd(Vec256<float> v) { | ||||
|   return Vec256<float>{ | ||||
|       _mm256_shuffle_ps(v.raw, v.raw, _MM_SHUFFLE(3, 3, 1, 1))}; | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec256<T> DupOdd(const Vec256<T> v) { | ||||
|   return InterleaveUpper(Full256<T>(), v, v); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ OddEven
 | ||||
| 
 | ||||
| namespace detail { | ||||
|  | @ -3140,6 +3304,13 @@ HWY_API Vec256<double> SwapAdjacentBlocks(Vec256<double> v) { | |||
|   return Vec256<double>{_mm256_permute4x64_pd(v.raw, _MM_SHUFFLE(1, 0, 3, 2))}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ ReverseBlocks (ConcatLowerUpper)
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> ReverseBlocks(Full256<T> d, Vec256<T> v) { | ||||
|   return ConcatLowerUpper(d, v, v); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ TableLookupBytes (ZeroExtendVector)
 | ||||
| 
 | ||||
| // Both full
 | ||||
|  | @ -3436,7 +3607,7 @@ HWY_API Vec128<int16_t> DemoteTo(Full128<int16_t> /* tag */, | |||
|       _mm256_castsi256_si128(_mm256_permute4x64_epi64(i16, 0x88))}; | ||||
| } | ||||
| 
 | ||||
| HWY_API Vec128<uint8_t, 8> DemoteTo(Simd<uint8_t, 8> /* tag */, | ||||
| HWY_API Vec128<uint8_t, 8> DemoteTo(Full64<uint8_t> /* tag */, | ||||
|                                     const Vec256<int32_t> v) { | ||||
|   const __m256i u16_blocks = _mm256_packus_epi32(v.raw, v.raw); | ||||
|   // Concatenate lower 64 bits of each 128-bit block
 | ||||
|  | @ -3455,7 +3626,7 @@ HWY_API Vec128<uint8_t> DemoteTo(Full128<uint8_t> /* tag */, | |||
|       _mm256_castsi256_si128(_mm256_permute4x64_epi64(u8, 0x88))}; | ||||
| } | ||||
| 
 | ||||
| HWY_API Vec128<int8_t, 8> DemoteTo(Simd<int8_t, 8> /* tag */, | ||||
| HWY_API Vec128<int8_t, 8> DemoteTo(Full64<int8_t> /* tag */, | ||||
|                                    const Vec256<int32_t> v) { | ||||
|   const __m256i i16_blocks = _mm256_packs_epi32(v.raw, v.raw); | ||||
|   // Concatenate lower 64 bits of each 128-bit block
 | ||||
|  | @ -3553,7 +3724,7 @@ HWY_API Vec128<uint8_t, 8> U8FromU32(const Vec256<uint32_t> v) { | |||
|   const auto lo = LowerHalf(quad); | ||||
|   const auto hi = UpperHalf(Full128<uint32_t>(), quad); | ||||
|   const auto pair = LowerHalf(lo | hi); | ||||
|   return BitCast(Simd<uint8_t, 8>(), pair); | ||||
|   return BitCast(Full64<uint8_t>(), pair); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Integer <=> fp (ShiftRight, OddEven)
 | ||||
|  | @ -3691,6 +3862,19 @@ HWY_API Vec256<uint8_t> AESRound(Vec256<uint8_t> state, | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| HWY_API Vec256<uint8_t> AESLastRound(Vec256<uint8_t> state, | ||||
|                                      Vec256<uint8_t> round_key) { | ||||
| #if HWY_TARGET == HWY_AVX3_DL | ||||
|   return Vec256<uint8_t>{_mm256_aesenclast_epi128(state.raw, round_key.raw)}; | ||||
| #else | ||||
|   const Full256<uint8_t> d; | ||||
|   const Half<decltype(d)> d2; | ||||
|   return Combine(d, | ||||
|                  AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), | ||||
|                  AESLastRound(LowerHalf(state), LowerHalf(round_key))); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| HWY_API Vec256<uint64_t> CLMulLower(Vec256<uint64_t> a, Vec256<uint64_t> b) { | ||||
| #if HWY_TARGET == HWY_AVX3_DL | ||||
|   return Vec256<uint64_t>{_mm256_clmulepi64_epi128(a.raw, b.raw, 0x00)}; | ||||
|  | @ -4019,7 +4203,7 @@ HWY_API size_t CompressBlendedStore(Vec256<T> v, Mask256<T> m, Full256<T> d, | |||
| #if HWY_TARGET <= HWY_AVX3_DL | ||||
|   return CompressStore(v, m, d, unaligned);  // also native
 | ||||
| #else | ||||
|   const size_t count = CountTrue(m); | ||||
|   const size_t count = CountTrue(d, m); | ||||
|   const Vec256<T> compressed = Compress(v, m); | ||||
|   // There is no 16-bit MaskedStore, so blend.
 | ||||
|   const Vec256<T> prev = LoadU(d, unaligned); | ||||
|  | @ -4244,7 +4428,7 @@ HWY_API intptr_t FindFirstTrue(const Full256<T> /* tag */, | |||
| namespace detail { | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 8> d, | ||||
| HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d, | ||||
|                                                 uint64_t mask_bits) { | ||||
|   const RebindToUnsigned<decltype(d)> d32; | ||||
|   // We need a masked Iota(). With 8 lanes, there are 256 combinations and a LUT
 | ||||
|  | @ -4307,7 +4491,7 @@ HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 8> d, | |||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_INLINE Indices256<uint32_t> IndicesFromBits(Simd<T, 4> d, | ||||
| HWY_INLINE Indices256<uint32_t> IndicesFromBits(Full256<T> d, | ||||
|                                                 uint64_t mask_bits) { | ||||
|   const Repartition<uint32_t, decltype(d)> d32; | ||||
| 
 | ||||
|  | @ -4353,8 +4537,8 @@ HWY_INLINE Vec256<T> Compress(Vec256<T> v, const uint64_t mask_bits) { | |||
|   const auto compressed1 = Compress(promoted1, mask_bits1); | ||||
| 
 | ||||
|   const Half<decltype(du)> dh; | ||||
|   const auto demoted0 = ZeroExtendVector(DemoteTo(dh, compressed0)); | ||||
|   const auto demoted1 = ZeroExtendVector(DemoteTo(dh, compressed1)); | ||||
|   const auto demoted0 = ZeroExtendVector(du, DemoteTo(dh, compressed0)); | ||||
|   const auto demoted1 = ZeroExtendVector(du, DemoteTo(dh, compressed1)); | ||||
| 
 | ||||
|   const size_t count0 = PopCount(mask_bits0); | ||||
|   // Now combine by shifting demoted1 up. AVX2 lacks VPERMW, so start with
 | ||||
|  | @ -4625,101 +4809,6 @@ HWY_API Vec256<T> MaxOfLanes(Full256<T> d, const Vec256<T> vHL) { | |||
|   return detail::MaxOfLanes(hwy::SizeTag<sizeof(T)>(), Max(vLH, vHL)); | ||||
| } | ||||
| 
 | ||||
| // ================================================== DEPRECATED
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API size_t StoreMaskBits(const Mask256<T> mask, uint8_t* bits) { | ||||
|   return StoreMaskBits(Full256<T>(), mask, bits); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API bool AllTrue(const Mask256<T> mask) { | ||||
|   return AllTrue(Full256<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API bool AllFalse(const Mask256<T> mask) { | ||||
|   return AllFalse(Full256<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API size_t CountTrue(const Mask256<T> mask) { | ||||
|   return CountTrue(Full256<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> SumOfLanes(const Vec256<T> vHL) { | ||||
|   return SumOfLanes(Full256<T>(), vHL); | ||||
| } | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> MinOfLanes(const Vec256<T> vHL) { | ||||
|   return MinOfLanes(Full256<T>(), vHL); | ||||
| } | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> MaxOfLanes(const Vec256<T> vHL) { | ||||
|   return MaxOfLanes(Full256<T>(), vHL); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec128<T> UpperHalf(Vec256<T> v) { | ||||
|   return UpperHalf(Full128<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <int kBytes, typename T> | ||||
| HWY_API Vec256<T> ShiftRightBytes(const Vec256<T> v) { | ||||
|   return ShiftRightBytes<kBytes>(Full256<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <int kLanes, typename T> | ||||
| HWY_API Vec256<T> ShiftRightLanes(const Vec256<T> v) { | ||||
|   return ShiftRightLanes<kLanes>(Full256<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <size_t kBytes, typename T> | ||||
| HWY_API Vec256<T> CombineShiftRightBytes(Vec256<T> hi, Vec256<T> lo) { | ||||
|   return CombineShiftRightBytes<kBytes>(Full256<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> InterleaveUpper(Vec256<T> a, Vec256<T> b) { | ||||
|   return InterleaveUpper(Full256<T>(), a, b); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<MakeWide<T>> ZipUpper(Vec256<T> a, Vec256<T> b) { | ||||
|   return InterleaveUpper(Full256<MakeWide<T>>(), a, b); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> Combine(Vec128<T> hi, Vec128<T> lo) { | ||||
|   return Combine(Full256<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> ZeroExtendVector(Vec128<T> lo) { | ||||
|   return ZeroExtendVector(Full256<T>(), lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> ConcatLowerLower(Vec256<T> hi, Vec256<T> lo) { | ||||
|   return ConcatLowerLower(Full256<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> ConcatLowerUpper(Vec256<T> hi, Vec256<T> lo) { | ||||
|   return ConcatLowerUpper(Full256<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> ConcatUpperLower(Vec256<T> hi, Vec256<T> lo) { | ||||
|   return ConcatUpperLower(Full256<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> ConcatUpperUpper(Vec256<T> hi, Vec256<T> lo) { | ||||
|   return ConcatUpperUpper(Full256<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
|  |  | |||
							
								
								
									
										340
									
								
								third_party/highway/hwy/ops/x86_512-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										340
									
								
								third_party/highway/hwy/ops/x86_512-inl.h
									
									
									
									
										vendored
									
									
								
							|  | @ -57,9 +57,6 @@ HWY_BEFORE_NAMESPACE(); | |||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| template <typename T> | ||||
| using Full512 = Simd<T, 64 / sizeof(T)>; | ||||
| 
 | ||||
| namespace detail { | ||||
| 
 | ||||
| template <typename T> | ||||
|  | @ -313,6 +310,30 @@ HWY_API Vec512<double> Xor(const Vec512<double> a, const Vec512<double> b) { | |||
|   return Vec512<double>{_mm512_xor_pd(a.raw, b.raw)}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ OrAnd
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> OrAnd(Vec512<T> o, Vec512<T> a1, Vec512<T> a2) { | ||||
|   const Full512<T> d; | ||||
|   const RebindToUnsigned<decltype(d)> du; | ||||
|   using VU = VFromD<decltype(du)>; | ||||
|   const __m512i ret = _mm512_ternarylogic_epi64( | ||||
|       BitCast(du, o).raw, BitCast(du, a1).raw, BitCast(du, a2).raw, 0xF8); | ||||
|   return BitCast(d, VU{ret}); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ IfVecThenElse
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> IfVecThenElse(Vec512<T> mask, Vec512<T> yes, Vec512<T> no) { | ||||
|   const Full512<T> d; | ||||
|   const RebindToUnsigned<decltype(d)> du; | ||||
|   using VU = VFromD<decltype(du)>; | ||||
|   return BitCast(d, VU{_mm512_ternarylogic_epi64(BitCast(du, mask).raw, | ||||
|                                                  BitCast(du, yes).raw, | ||||
|                                                  BitCast(du, no).raw, 0xCA)}); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Operator overloads (internal-only if float)
 | ||||
| 
 | ||||
| template <typename T> | ||||
|  | @ -579,6 +600,13 @@ HWY_API Vec512<double> IfThenZeroElse(const Mask512<double> mask, | |||
|   return Vec512<double>{_mm512_mask_xor_pd(no.raw, mask.raw, no.raw, no.raw)}; | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> IfNegativeThenElse(Vec512<T> v, Vec512<T> yes, Vec512<T> no) { | ||||
|   static_assert(IsSigned<T>(), "Only works for signed/float"); | ||||
|   // AVX3 MaskFromVec only looks at the MSB
 | ||||
|   return IfThenElse(MaskFromVec(v), yes, no); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_FLOAT(T)> | ||||
| HWY_API Vec512<T> ZeroIfNegative(const Vec512<T> v) { | ||||
|   // AVX3 MaskFromVec only looks at the MSB
 | ||||
|  | @ -681,7 +709,12 @@ HWY_API Vec512<double> operator-(const Vec512<double> a, | |||
|   return Vec512<double>{_mm512_sub_pd(a.raw, b.raw)}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Saturating addition
 | ||||
| // ------------------------------ SumsOf8
 | ||||
| HWY_API Vec512<uint64_t> SumsOf8(const Vec512<uint8_t> v) { | ||||
|   return Vec512<uint64_t>{_mm512_sad_epu8(v.raw, _mm512_setzero_si512())}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ SaturatedAdd
 | ||||
| 
 | ||||
| // Returns a + b clamped to the destination range.
 | ||||
| 
 | ||||
|  | @ -705,7 +738,7 @@ HWY_API Vec512<int16_t> SaturatedAdd(const Vec512<int16_t> a, | |||
|   return Vec512<int16_t>{_mm512_adds_epi16(a.raw, b.raw)}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Saturating subtraction
 | ||||
| // ------------------------------ SaturatedSub
 | ||||
| 
 | ||||
| // Returns a - b clamped to the destination range.
 | ||||
| 
 | ||||
|  | @ -1820,7 +1853,7 @@ HWY_API Vec512<T> LoadDup128(Full512<T> /* tag */, | |||
|   // https://gcc.godbolt.org/z/-Jt_-F
 | ||||
| #if HWY_LOADDUP_ASM | ||||
|   __m512i out; | ||||
|   asm("vbroadcasti128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0])); | ||||
|   asm("vbroadcasti128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0])); | ||||
|   return Vec512<T>{out}; | ||||
| #else | ||||
|   const auto x4 = LoadU(Full128<T>(), p); | ||||
|  | @ -1831,7 +1864,7 @@ HWY_API Vec512<float> LoadDup128(Full512<float> /* tag */, | |||
|                                  const float* const HWY_RESTRICT p) { | ||||
| #if HWY_LOADDUP_ASM | ||||
|   __m512 out; | ||||
|   asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0])); | ||||
|   asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0])); | ||||
|   return Vec512<float>{out}; | ||||
| #else | ||||
|   const __m128 x4 = _mm_loadu_ps(p); | ||||
|  | @ -1843,7 +1876,7 @@ HWY_API Vec512<double> LoadDup128(Full512<double> /* tag */, | |||
|                                   const double* const HWY_RESTRICT p) { | ||||
| #if HWY_LOADDUP_ASM | ||||
|   __m512d out; | ||||
|   asm("vbroadcastf128 %1, %[reg]" : [ reg ] "=x"(out) : "m"(p[0])); | ||||
|   asm("vbroadcastf128 %1, %[reg]" : [reg] "=x"(out) : "m"(p[0])); | ||||
|   return Vec512<double>{out}; | ||||
| #else | ||||
|   const __m128d x2 = _mm_loadu_pd(p); | ||||
|  | @ -2007,7 +2040,7 @@ HWY_INLINE Vec512<T> GatherIndex(hwy::SizeTag<8> /* tag */, | |||
| template <typename T, typename Offset> | ||||
| HWY_API Vec512<T> GatherOffset(Full512<T> d, const T* HWY_RESTRICT base, | ||||
|                                const Vec512<Offset> offset) { | ||||
| static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); | ||||
|   static_assert(sizeof(T) == sizeof(Offset), "Must match for portability"); | ||||
|   return detail::GatherOffset(hwy::SizeTag<sizeof(T)>(), d, base, offset); | ||||
| } | ||||
| template <typename T, typename Index> | ||||
|  | @ -2173,7 +2206,7 @@ HWY_API Vec512<T> ShiftRightBytes(Full512<T> /* tag */, const Vec512<T> v) { | |||
| template <int kLanes, typename T> | ||||
| HWY_API Vec512<T> ShiftRightLanes(Full512<T> d, const Vec512<T> v) { | ||||
|   const Repartition<uint8_t, decltype(d)> d8; | ||||
|   return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(BitCast(d8, v))); | ||||
|   return BitCast(d, ShiftRightBytes<kLanes * sizeof(T)>(d8, BitCast(d8, v))); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ CombineShiftRightBytes
 | ||||
|  | @ -2396,6 +2429,78 @@ HWY_API Vec512<T> Reverse(Full512<T> d, const Vec512<T> v) { | |||
|   return TableLookupLanes(v, SetTableIndices(d, kReverse)); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Reverse2
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 2)> | ||||
| HWY_API Vec512<T> Reverse2(Full512<T> d, const Vec512<T> v) { | ||||
|   const Full512<uint32_t> du32; | ||||
|   return BitCast(d, RotateRight<16>(BitCast(du32, v))); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) { | ||||
|   return Shuffle2301(v); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec512<T> Reverse2(Full512<T> /* tag */, const Vec512<T> v) { | ||||
|   return Shuffle01(v); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Reverse4
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 2)> | ||||
| HWY_API Vec512<T> Reverse4(Full512<T> d, const Vec512<T> v) { | ||||
|   const RebindToSigned<decltype(d)> di; | ||||
|   alignas(64) constexpr int16_t kReverse4[32] = { | ||||
|       3,  2,  1,  0,  7,  6,  5,  4,  11, 10, 9,  8,  15, 14, 13, 12, | ||||
|       19, 18, 17, 16, 23, 22, 21, 20, 27, 26, 25, 24, 31, 30, 29, 28}; | ||||
|   const Vec512<int16_t> idx = Load(di, kReverse4); | ||||
|   return BitCast(d, Vec512<int16_t>{ | ||||
|                         _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) { | ||||
|   return Shuffle0123(v); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec512<T> Reverse4(Full512<T> /* tag */, const Vec512<T> v) { | ||||
|   return Vec512<T>{_mm512_permutex_epi64(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; | ||||
| } | ||||
| HWY_API Vec512<double> Reverse4(Full512<double> /* tag */, Vec512<double> v) { | ||||
|   return Vec512<double>{_mm512_permutex_pd(v.raw, _MM_SHUFFLE(0, 1, 2, 3))}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ Reverse8
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 2)> | ||||
| HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) { | ||||
|   const RebindToSigned<decltype(d)> di; | ||||
|   alignas(64) constexpr int16_t kReverse8[32] = { | ||||
|       7,  6,  5,  4,  3,  2,  1,  0,  15, 14, 13, 12, 11, 10, 9,  8, | ||||
|       23, 22, 21, 20, 19, 18, 17, 16, 31, 30, 29, 28, 27, 26, 25, 24}; | ||||
|   const Vec512<int16_t> idx = Load(di, kReverse8); | ||||
|   return BitCast(d, Vec512<int16_t>{ | ||||
|                         _mm512_permutexvar_epi16(idx.raw, BitCast(di, v).raw)}); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) { | ||||
|   const RebindToSigned<decltype(d)> di; | ||||
|   alignas(64) constexpr int32_t kReverse8[16] = {7,  6,  5,  4,  3,  2,  1, 0, | ||||
|                                                  15, 14, 13, 12, 11, 10, 9, 8}; | ||||
|   const Vec512<int32_t> idx = Load(di, kReverse8); | ||||
|   return BitCast(d, Vec512<int32_t>{ | ||||
|                         _mm512_permutexvar_epi32(idx.raw, BitCast(di, v).raw)}); | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec512<T> Reverse8(Full512<T> d, const Vec512<T> v) { | ||||
|   return Reverse(d, v); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ InterleaveLower
 | ||||
| 
 | ||||
| // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
 | ||||
|  | @ -2445,12 +2550,6 @@ HWY_API Vec512<double> InterleaveLower(const Vec512<double> a, | |||
|   return Vec512<double>{_mm512_unpacklo_pd(a.raw, b.raw)}; | ||||
| } | ||||
| 
 | ||||
| // Additional overload for the optional Simd<> tag.
 | ||||
| template <typename T, class V = Vec512<T>> | ||||
| HWY_API V InterleaveLower(Full512<T> /* tag */, V a, V b) { | ||||
|   return InterleaveLower(a, b); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ InterleaveUpper
 | ||||
| 
 | ||||
| // All functions inside detail lack the required D parameter.
 | ||||
|  | @ -2515,8 +2614,8 @@ HWY_API Vec512<TW> ZipLower(Vec512<T> a, Vec512<T> b) { | |||
|   return BitCast(Full512<TW>(), InterleaveLower(a, b)); | ||||
| } | ||||
| template <typename T, typename TW = MakeWide<T>> | ||||
| HWY_API Vec512<TW> ZipLower(Full512<TW> d, Vec512<T> a, Vec512<T> b) { | ||||
|   return BitCast(Full512<TW>(), InterleaveLower(d, a, b)); | ||||
| HWY_API Vec512<TW> ZipLower(Full512<TW> /* d */, Vec512<T> a, Vec512<T> b) { | ||||
|   return BitCast(Full512<TW>(), InterleaveLower(a, b)); | ||||
| } | ||||
| 
 | ||||
| template <typename T, typename TW = MakeWide<T>> | ||||
|  | @ -2564,17 +2663,17 @@ HWY_API Vec512<double> ConcatUpperUpper(Full512<double> /* tag */, | |||
| template <typename T> | ||||
| HWY_API Vec512<T> ConcatLowerUpper(Full512<T> /* tag */, const Vec512<T> hi, | ||||
|                                    const Vec512<T> lo) { | ||||
|   return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, 0x4E)}; | ||||
|   return Vec512<T>{_mm512_shuffle_i32x4(lo.raw, hi.raw, _MM_PERM_BADC)}; | ||||
| } | ||||
| HWY_API Vec512<float> ConcatLowerUpper(Full512<float> /* tag */, | ||||
|                                        const Vec512<float> hi, | ||||
|                                        const Vec512<float> lo) { | ||||
|   return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, 0x4E)}; | ||||
|   return Vec512<float>{_mm512_shuffle_f32x4(lo.raw, hi.raw, _MM_PERM_BADC)}; | ||||
| } | ||||
| HWY_API Vec512<double> ConcatLowerUpper(Full512<double> /* tag */, | ||||
|                                         const Vec512<double> hi, | ||||
|                                         const Vec512<double> lo) { | ||||
|   return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, 0x4E)}; | ||||
|   return Vec512<double>{_mm512_shuffle_f64x2(lo.raw, hi.raw, _MM_PERM_BADC)}; | ||||
| } | ||||
| 
 | ||||
| // hiH,hiL loH,loL |-> hiH,loL (= outer halves)
 | ||||
|  | @ -2675,6 +2774,36 @@ HWY_API Vec512<double> ConcatEven(Full512<double> d, Vec512<double> hi, | |||
|                                                      __mmask8{0xFF}, hi.raw)}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ DupEven (InterleaveLower)
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec512<T> DupEven(Vec512<T> v) { | ||||
|   return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_CCAA)}; | ||||
| } | ||||
| HWY_API Vec512<float> DupEven(Vec512<float> v) { | ||||
|   return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_CCAA)}; | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec512<T> DupEven(const Vec512<T> v) { | ||||
|   return InterleaveLower(Full512<T>(), v, v); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ DupOdd (InterleaveUpper)
 | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 4)> | ||||
| HWY_API Vec512<T> DupOdd(Vec512<T> v) { | ||||
|   return Vec512<T>{_mm512_shuffle_epi32(v.raw, _MM_PERM_DDBB)}; | ||||
| } | ||||
| HWY_API Vec512<float> DupOdd(Vec512<float> v) { | ||||
|   return Vec512<float>{_mm512_shuffle_ps(v.raw, v.raw, _MM_PERM_DDBB)}; | ||||
| } | ||||
| 
 | ||||
| template <typename T, HWY_IF_LANE_SIZE(T, 8)> | ||||
| HWY_API Vec512<T> DupOdd(const Vec512<T> v) { | ||||
|   return InterleaveUpper(Full512<T>(), v, v); | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ OddEven
 | ||||
| 
 | ||||
| template <typename T> | ||||
|  | @ -2705,17 +2834,29 @@ HWY_API Vec512<double> OddEvenBlocks(Vec512<double> odd, Vec512<double> even) { | |||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> SwapAdjacentBlocks(Vec512<T> v) { | ||||
|   return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))}; | ||||
|   return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_CDAB)}; | ||||
| } | ||||
| 
 | ||||
| HWY_API Vec512<float> SwapAdjacentBlocks(Vec512<float> v) { | ||||
|   return Vec512<float>{ | ||||
|       _mm512_shuffle_f32x4(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))}; | ||||
|   return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_CDAB)}; | ||||
| } | ||||
| 
 | ||||
| HWY_API Vec512<double> SwapAdjacentBlocks(Vec512<double> v) { | ||||
|   return Vec512<double>{ | ||||
|       _mm512_shuffle_f64x2(v.raw, v.raw, _MM_SHUFFLE(2, 3, 0, 1))}; | ||||
|   return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_CDAB)}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ ReverseBlocks
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> ReverseBlocks(Full512<T> /* tag */, Vec512<T> v) { | ||||
|   return Vec512<T>{_mm512_shuffle_i32x4(v.raw, v.raw, _MM_PERM_ABCD)}; | ||||
| } | ||||
| HWY_API Vec512<float> ReverseBlocks(Full512<float> /* tag */, Vec512<float> v) { | ||||
|   return Vec512<float>{_mm512_shuffle_f32x4(v.raw, v.raw, _MM_PERM_ABCD)}; | ||||
| } | ||||
| HWY_API Vec512<double> ReverseBlocks(Full512<double> /* tag */, | ||||
|                                      Vec512<double> v) { | ||||
|   return Vec512<double>{_mm512_shuffle_f64x2(v.raw, v.raw, _MM_PERM_ABCD)}; | ||||
| } | ||||
| 
 | ||||
| // ------------------------------ TableLookupBytes (ZeroExtendVector)
 | ||||
|  | @ -3012,17 +3153,23 @@ HWY_API Vec512<uint8_t> AESRound(Vec512<uint8_t> state, | |||
| #if HWY_TARGET == HWY_AVX3_DL | ||||
|   return Vec512<uint8_t>{_mm512_aesenc_epi128(state.raw, round_key.raw)}; | ||||
| #else | ||||
|   alignas(64) uint8_t a[64]; | ||||
|   alignas(64) uint8_t b[64]; | ||||
|   const Full512<uint8_t> d; | ||||
|   const Full128<uint8_t> d128; | ||||
|   Store(state, d, a); | ||||
|   Store(round_key, d, b); | ||||
|   for (size_t i = 0; i < 64; i += 16) { | ||||
|     const auto enc = AESRound(Load(d128, a + i), Load(d128, b + i)); | ||||
|     Store(enc, d128, a + i); | ||||
|   } | ||||
|   return Load(d, a); | ||||
|   const Half<decltype(d)> d2; | ||||
|   return Combine(d, AESRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), | ||||
|                  AESRound(LowerHalf(state), LowerHalf(round_key))); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| HWY_API Vec512<uint8_t> AESLastRound(Vec512<uint8_t> state, | ||||
|                                      Vec512<uint8_t> round_key) { | ||||
| #if HWY_TARGET == HWY_AVX3_DL | ||||
|   return Vec512<uint8_t>{_mm512_aesenclast_epi128(state.raw, round_key.raw)}; | ||||
| #else | ||||
|   const Full512<uint8_t> d; | ||||
|   const Half<decltype(d)> d2; | ||||
|   return Combine(d, | ||||
|                  AESLastRound(UpperHalf(d2, state), UpperHalf(d2, round_key)), | ||||
|                  AESLastRound(LowerHalf(state), LowerHalf(round_key))); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
|  | @ -3264,8 +3411,8 @@ HWY_API Vec512<T> Compress(Vec512<T> v, const Mask512<T> mask) { | |||
|   const auto compressed0 = Compress(promoted0, mask0); | ||||
|   const auto compressed1 = Compress(promoted1, mask1); | ||||
| 
 | ||||
|   const auto demoted0 = ZeroExtendVector(DemoteTo(duh, compressed0)); | ||||
|   const auto demoted1 = ZeroExtendVector(DemoteTo(duh, compressed1)); | ||||
|   const auto demoted0 = ZeroExtendVector(du, DemoteTo(duh, compressed0)); | ||||
|   const auto demoted1 = ZeroExtendVector(du, DemoteTo(duh, compressed1)); | ||||
| 
 | ||||
|   // Concatenate into single vector by shifting upper with writemask.
 | ||||
|   const size_t num0 = CountTrue(dw, mask0); | ||||
|  | @ -3363,7 +3510,7 @@ HWY_API size_t CompressBlendedStore(Vec512<T> v, Mask512<T> m, Full512<T> d, | |||
|   if (HWY_TARGET == HWY_AVX3_DL || sizeof(T) != 2) { | ||||
|     return CompressStore(v, m, d, unaligned); | ||||
|   } else { | ||||
|     const size_t count = CountTrue(m); | ||||
|     const size_t count = CountTrue(d, m); | ||||
|     const Vec512<T> compressed = Compress(v, m); | ||||
|     const Vec512<T> prev = LoadU(d, unaligned); | ||||
|     StoreU(IfThenElse(FirstN(d, count), compressed, prev), d, unaligned); | ||||
|  | @ -3422,9 +3569,9 @@ HWY_API void StoreInterleaved3(const Vec512<uint8_t> a, const Vec512<uint8_t> b, | |||
|   const auto k = (r2 | g2 | b2).raw;  // low byte in each 128bit: 3A 2A 1A 0A
 | ||||
| 
 | ||||
|   // To obtain 10 0A 05 00 in one vector, transpose "rows" into "columns".
 | ||||
|   const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_SHUFFLE(3, 0, 3, 0)); | ||||
|   const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_SHUFFLE(1, 2, 0, 1)); | ||||
|   const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_SHUFFLE(2, 3, 1, 2)); | ||||
|   const auto k3_k0_i3_i0 = _mm512_shuffle_i64x2(i, k, _MM_PERM_DADA); | ||||
|   const auto i1_i2_j0_j1 = _mm512_shuffle_i64x2(j, i, _MM_PERM_BCAB); | ||||
|   const auto j2_j3_k1_k2 = _mm512_shuffle_i64x2(k, j, _MM_PERM_CDBC); | ||||
| 
 | ||||
|   // Alternating order, most-significant 128 bits from the second arg.
 | ||||
|   const __mmask8 m = 0xCC; | ||||
|  | @ -3456,12 +3603,12 @@ HWY_API void StoreInterleaved4(const Vec512<uint8_t> v0, | |||
|   const auto k = ZipLower(d32, ba8, dc8).raw;  // 4x128bit: d..aB d..a8
 | ||||
|   const auto l = ZipUpper(d32, ba8, dc8).raw;  // 4x128bit: d..aF d..aC
 | ||||
|   // 128-bit blocks were independent until now; transpose 4x4.
 | ||||
|   const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(1, 0, 1, 0)); | ||||
|   const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(1, 0, 1, 0)); | ||||
|   const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_SHUFFLE(3, 2, 3, 2)); | ||||
|   const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_SHUFFLE(3, 2, 3, 2)); | ||||
|   constexpr int k20 = _MM_SHUFFLE(2, 0, 2, 0); | ||||
|   constexpr int k31 = _MM_SHUFFLE(3, 1, 3, 1); | ||||
|   const auto j1_j0_i1_i0 = _mm512_shuffle_i64x2(i, j, _MM_PERM_BABA); | ||||
|   const auto l1_l0_k1_k0 = _mm512_shuffle_i64x2(k, l, _MM_PERM_BABA); | ||||
|   const auto j3_j2_i3_i2 = _mm512_shuffle_i64x2(i, j, _MM_PERM_DCDC); | ||||
|   const auto l3_l2_k3_k2 = _mm512_shuffle_i64x2(k, l, _MM_PERM_DCDC); | ||||
|   constexpr _MM_PERM_ENUM k20 = _MM_PERM_CACA; | ||||
|   constexpr _MM_PERM_ENUM k31 = _MM_PERM_DBDB; | ||||
|   const auto l0_k0_j0_i0 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k20); | ||||
|   const auto l1_k1_j1_i1 = _mm512_shuffle_i64x2(j1_j0_i1_i0, l1_l0_k1_k0, k31); | ||||
|   const auto l2_k2_j2_i2 = _mm512_shuffle_i64x2(j3_j2_i3_i2, l3_l2_k3_k2, k20); | ||||
|  | @ -3631,103 +3778,6 @@ HWY_API Vec512<T> MaxOfLanes(Full512<T> d, Vec512<T> v) { | |||
|   return BitCast(d, Or(min, ShiftLeft<16>(min))); | ||||
| } | ||||
| 
 | ||||
| // ================================================== DEPRECATED
 | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API size_t StoreMaskBits(const Mask512<T> mask, uint8_t* bits) { | ||||
|   return StoreMaskBits(Full512<T>(), mask, bits); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API bool AllTrue(const Mask512<T> mask) { | ||||
|   return AllTrue(Full512<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API bool AllFalse(const Mask512<T> mask) { | ||||
|   return AllFalse(Full512<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API size_t CountTrue(const Mask512<T> mask) { | ||||
|   return CountTrue(Full512<T>(), mask); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> SumOfLanes(Vec512<T> v) { | ||||
|   return SumOfLanes(Full512<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> MinOfLanes(Vec512<T> v) { | ||||
|   return MinOfLanes(Full512<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> MaxOfLanes(Vec512<T> v) { | ||||
|   return MaxOfLanes(Full512<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec256<T> UpperHalf(Vec512<T> v) { | ||||
|   return UpperHalf(Full256<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <int kBytes, typename T> | ||||
| HWY_API Vec512<T> ShiftRightBytes(const Vec512<T> v) { | ||||
|   return ShiftRightBytes<kBytes>(Full512<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <int kLanes, typename T> | ||||
| HWY_API Vec512<T> ShiftRightLanes(const Vec512<T> v) { | ||||
|   return ShiftRightBytes<kLanes>(Full512<T>(), v); | ||||
| } | ||||
| 
 | ||||
| template <size_t kBytes, typename T> | ||||
| HWY_API Vec512<T> CombineShiftRightBytes(Vec512<T> hi, Vec512<T> lo) { | ||||
|   return CombineShiftRightBytes<kBytes>(Full512<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> InterleaveUpper(Vec512<T> a, Vec512<T> b) { | ||||
|   return InterleaveUpper(Full512<T>(), a, b); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<MakeWide<T>> ZipUpper(Vec512<T> a, Vec512<T> b) { | ||||
|   return InterleaveUpper(Full512<MakeWide<T>>(), a, b); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> Combine(Vec256<T> hi, Vec256<T> lo) { | ||||
|   return Combine(Full512<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> ZeroExtendVector(Vec256<T> lo) { | ||||
|   return ZeroExtendVector(Full512<T>(), lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> ConcatLowerLower(Vec512<T> hi, Vec512<T> lo) { | ||||
|   return ConcatLowerLower(Full512<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> ConcatLowerUpper(Vec512<T> hi, Vec512<T> lo) { | ||||
|   return ConcatLowerUpper(Full512<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> ConcatUpperLower(Vec512<T> hi, Vec512<T> lo) { | ||||
|   return ConcatUpperLower(Full512<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| template <typename T> | ||||
| HWY_API Vec512<T> ConcatUpperUpper(Vec512<T> hi, Vec512<T> lo) { | ||||
|   return ConcatUpperUpper(Full512<T>(), hi, lo); | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
|  |  | |||
							
								
								
									
										49
									
								
								third_party/highway/hwy/targets.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										49
									
								
								third_party/highway/hwy/targets.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -15,23 +15,25 @@ | |||
| #include "hwy/targets.h" | ||||
| 
 | ||||
| #include <stdarg.h> | ||||
| #include <stddef.h> | ||||
| #include <stdint.h> | ||||
| #include <stdio.h> | ||||
| 
 | ||||
| #include <atomic> | ||||
| #include <cstddef> | ||||
| #include <limits> | ||||
| 
 | ||||
| #if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ | ||||
|     defined(THREAD_SANITIZER) | ||||
| #include "hwy/base.h" | ||||
| 
 | ||||
| #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN | ||||
| #include "sanitizer/common_interface_defs.h"  // __sanitizer_print_stack_trace
 | ||||
| #endif                                        // defined(*_SANITIZER)
 | ||||
| #endif | ||||
| 
 | ||||
| #include <stdlib.h>  // abort / exit
 | ||||
| 
 | ||||
| #if HWY_ARCH_X86 | ||||
| #include <xmmintrin.h> | ||||
| #if HWY_COMPILER_MSVC | ||||
| #include <intrin.h> | ||||
| #else  // HWY_COMPILER_MSVC
 | ||||
| #else  // !HWY_COMPILER_MSVC
 | ||||
| #include <cpuid.h> | ||||
| #endif  // HWY_COMPILER_MSVC
 | ||||
| #endif  // HWY_ARCH_X86
 | ||||
|  | @ -93,7 +95,7 @@ std::atomic<uint32_t> supported_{0};  // Not yet initialized | |||
| uint32_t supported_targets_for_test_ = 0; | ||||
| 
 | ||||
| // Mask of targets disabled at runtime with DisableTargets.
 | ||||
| uint32_t supported_mask_{std::numeric_limits<uint32_t>::max()}; | ||||
| uint32_t supported_mask_{LimitsMax<uint32_t>()}; | ||||
| 
 | ||||
| #if HWY_ARCH_X86 | ||||
| // Arbritrary bit indices indicating which instruction set extensions are
 | ||||
|  | @ -190,21 +192,22 @@ HWY_NORETURN void HWY_FORMAT(3, 4) | |||
|   va_end(args); | ||||
| 
 | ||||
|   fprintf(stderr, "Abort at %s:%d: %s\n", file, line, buf); | ||||
| #if defined(ADDRESS_SANITIZER) || defined(MEMORY_SANITIZER) || \ | ||||
|     defined(THREAD_SANITIZER) | ||||
|   // If compiled with any sanitizer print a stack trace. This call doesn't crash
 | ||||
|   // the program, instead the trap below will crash it also allowing gdb to
 | ||||
|   // break there.
 | ||||
| 
 | ||||
| // If compiled with any sanitizer, they can also print a stack trace.
 | ||||
| #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN | ||||
|   __sanitizer_print_stack_trace(); | ||||
| #endif  // defined(*_SANITIZER)
 | ||||
| #endif  // HWY_IS_*
 | ||||
|   fflush(stderr); | ||||
| 
 | ||||
| #if HWY_COMPILER_MSVC | ||||
|   abort();  // Compile error without this due to HWY_NORETURN.
 | ||||
| #elif HWY_ARCH_RVV | ||||
|   exit(1);  // trap/abort just freeze Spike
 | ||||
| #else | ||||
| // Now terminate the program:
 | ||||
| #if HWY_ARCH_RVV | ||||
|   exit(1);  // trap/abort just freeze Spike.
 | ||||
| #elif HWY_IS_DEBUG_BUILD && !HWY_COMPILER_MSVC | ||||
|   // Facilitates breaking into a debugger, but don't use this in non-debug
 | ||||
|   // builds because it looks like "illegal instruction", which is misleading.
 | ||||
|   __builtin_trap(); | ||||
| #else | ||||
|   abort();  // Compile error without this due to HWY_NORETURN.
 | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
|  | @ -213,7 +216,7 @@ void DisableTargets(uint32_t disabled_targets) { | |||
|   // We can call Update() here to initialize the mask but that will trigger a
 | ||||
|   // call to SupportedTargets() which we use in tests to tell whether any of the
 | ||||
|   // highway dynamic dispatch functions were used.
 | ||||
|   chosen_target.DeInit(); | ||||
|   GetChosenTarget().DeInit(); | ||||
| } | ||||
| 
 | ||||
| void SetSupportedTargetsForTest(uint32_t targets) { | ||||
|  | @ -222,7 +225,7 @@ void SetSupportedTargetsForTest(uint32_t targets) { | |||
|   // if not zero.
 | ||||
|   supported_.store(0, std::memory_order_release); | ||||
|   supported_targets_for_test_ = targets; | ||||
|   chosen_target.DeInit(); | ||||
|   GetChosenTarget().DeInit(); | ||||
| } | ||||
| 
 | ||||
| bool SupportedTargetsCalledForTest() { | ||||
|  | @ -344,8 +347,10 @@ uint32_t SupportedTargets() { | |||
|   return bits & supported_mask_; | ||||
| } | ||||
| 
 | ||||
| // Declared in targets.h
 | ||||
| ChosenTarget chosen_target; | ||||
| HWY_DLLEXPORT ChosenTarget& GetChosenTarget() { | ||||
|   static ChosenTarget chosen_target; | ||||
|   return chosen_target; | ||||
| } | ||||
| 
 | ||||
| void ChosenTarget::Update() { | ||||
|   // The supported variable contains the current CPU supported targets shifted
 | ||||
|  |  | |||
							
								
								
									
										16
									
								
								third_party/highway/hwy/targets.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										16
									
								
								third_party/highway/hwy/targets.h
									
									
									
									
										vendored
									
									
								
							|  | @ -22,6 +22,7 @@ | |||
| 
 | ||||
| #include "hwy/base.h" | ||||
| #include "hwy/detect_targets.h" | ||||
| #include "hwy/highway_export.h" | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
|  | @ -29,7 +30,7 @@ namespace hwy { | |||
| // Implemented in targets.cc; unconditionally compiled to support the use case
 | ||||
| // of binary-only distributions. The HWY_SUPPORTED_TARGETS wrapper may allow
 | ||||
| // eliding calls to this function.
 | ||||
| uint32_t SupportedTargets(); | ||||
| HWY_DLLEXPORT uint32_t SupportedTargets(); | ||||
| 
 | ||||
| // Evaluates to a function call, or literal if there is a single target.
 | ||||
| #if (HWY_TARGETS & (HWY_TARGETS - 1)) == 0 | ||||
|  | @ -44,7 +45,7 @@ uint32_t SupportedTargets(); | |||
| // lower target is desired. For this reason, attempts to disable targets which
 | ||||
| // are in HWY_ENABLED_BASELINE have no effect so SupportedTargets() always
 | ||||
| // returns at least the baseline target.
 | ||||
| void DisableTargets(uint32_t disabled_targets); | ||||
| HWY_DLLEXPORT void DisableTargets(uint32_t disabled_targets); | ||||
| 
 | ||||
| // Set the mock mask of CPU supported targets instead of the actual CPU
 | ||||
| // supported targets computed in SupportedTargets(). The return value of
 | ||||
|  | @ -52,11 +53,11 @@ void DisableTargets(uint32_t disabled_targets); | |||
| // regardless of this mock, to prevent accidentally adding targets that are
 | ||||
| // known to be buggy in the current CPU. Call with a mask of 0 to disable the
 | ||||
| // mock and use the actual CPU supported targets instead.
 | ||||
| void SetSupportedTargetsForTest(uint32_t targets); | ||||
| HWY_DLLEXPORT void SetSupportedTargetsForTest(uint32_t targets); | ||||
| 
 | ||||
| // Returns whether the SupportedTargets() function was called since the last
 | ||||
| // SetSupportedTargetsForTest() call.
 | ||||
| bool SupportedTargetsCalledForTest(); | ||||
| HWY_DLLEXPORT bool SupportedTargetsCalledForTest(); | ||||
| 
 | ||||
| // Return the list of targets in HWY_TARGETS supported by the CPU as a list of
 | ||||
| // individual HWY_* target macros such as HWY_SCALAR or HWY_NEON. This list
 | ||||
|  | @ -225,7 +226,7 @@ struct ChosenTarget { | |||
|  public: | ||||
|   // Update the ChosenTarget mask based on the current CPU supported
 | ||||
|   // targets.
 | ||||
|   void Update(); | ||||
|   HWY_DLLEXPORT void Update(); | ||||
| 
 | ||||
|   // Reset the ChosenTarget to the uninitialized state.
 | ||||
|   void DeInit() { mask_.store(1); } | ||||
|  | @ -245,11 +246,12 @@ struct ChosenTarget { | |||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   // Initialized to 1 so GetChosenTargetIndex() returns 0.
 | ||||
|   // Initialized to 1 so GetIndex() returns 0.
 | ||||
|   std::atomic<uint32_t> mask_{1}; | ||||
| }; | ||||
| 
 | ||||
| extern ChosenTarget chosen_target; | ||||
| // For internal use (e.g. by FunctionCache and DisableTargets).
 | ||||
| HWY_DLLEXPORT ChosenTarget& GetChosenTarget(); | ||||
| 
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										4
									
								
								third_party/highway/hwy/targets_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								third_party/highway/hwy/targets_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -44,11 +44,11 @@ void CheckFakeFunction() { | |||
|     hwy::SetSupportedTargetsForTest(HWY_##TGT);                             \ | ||||
|     /* Calling Update() first to make &HWY_DYNAMIC_DISPATCH() return */     \ | ||||
|     /* the pointer to the already cached function. */                       \ | ||||
|     hwy::chosen_target.Update();                                            \ | ||||
|     hwy::GetChosenTarget().Update();                                        \ | ||||
|     EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ | ||||
|     /* Calling DeInit() will test that the initializer function */          \ | ||||
|     /* also calls the right function. */                                    \ | ||||
|     hwy::chosen_target.DeInit();                                            \ | ||||
|     hwy::GetChosenTarget().DeInit();                                        \ | ||||
|     EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ | ||||
|     /* Second call uses the cached value from the previous call. */         \ | ||||
|     EXPECT_EQ(uint32_t(HWY_##TGT), HWY_DYNAMIC_DISPATCH(FakeFunction)(42)); \ | ||||
|  |  | |||
							
								
								
									
										505
									
								
								third_party/highway/hwy/tests/arithmetic_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										505
									
								
								third_party/highway/hwy/tests/arithmetic_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -177,387 +177,6 @@ HWY_NOINLINE void TestAllAbs() { | |||
|   ForFloatTypes(ForPartialVectors<TestFloatAbs>()); | ||||
| } | ||||
| 
 | ||||
| template <bool kSigned> | ||||
| struct TestLeftShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T t, D d) { | ||||
|     if (kSigned) { | ||||
|       // Also test positive values
 | ||||
|       TestLeftShifts</*kSigned=*/false>()(t, d); | ||||
|     } | ||||
| 
 | ||||
|     using TI = MakeSigned<T>; | ||||
|     using TU = MakeUnsigned<T>; | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
 | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
| 
 | ||||
|     // 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); | ||||
| 
 | ||||
|     // 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(T(i) - T(N)) : T(i); | ||||
|       expected[i] = T(TU(value) << 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); | ||||
| 
 | ||||
|     // max
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(T(i) - T(N)) : T(i); | ||||
|       expected[i] = T(TU(value) << kMaxShift); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <bool kSigned> | ||||
| struct TestVariableLeftShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T t, D d) { | ||||
|     if (kSigned) { | ||||
|       // Also test positive values
 | ||||
|       TestVariableLeftShifts</*kSigned=*/false>()(t, d); | ||||
|     } | ||||
| 
 | ||||
|     using TI = MakeSigned<T>; | ||||
|     using TU = MakeUnsigned<T>; | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto v1 = Set(d, 1); | ||||
|     const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
 | ||||
| 
 | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
|     const auto max_shift = Set(d, kMaxShift); | ||||
|     const auto small_shifts = And(Iota(d, 0), max_shift); | ||||
|     const auto large_shifts = max_shift - small_shifts; | ||||
| 
 | ||||
|     // Same: 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0)); | ||||
| 
 | ||||
|     // Same: 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(i) - T(N) : T(i); | ||||
|       expected[i] = T(TU(value) << 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1)); | ||||
| 
 | ||||
|     // Same: max
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(i) - T(N) : T(i); | ||||
|       expected[i] = T(TU(value) << kMaxShift); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift)); | ||||
| 
 | ||||
|     // Variable: small
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(i) - T(N) : T(i); | ||||
|       expected[i] = T(TU(value) << (i & kMaxShift)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts)); | ||||
| 
 | ||||
|     // Variable: large
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift))); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestUnsignedRightShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     const auto values = Iota(d, 0); | ||||
| 
 | ||||
|     const T kMax = LimitsMax<T>(); | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
| 
 | ||||
|     // Shift by 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); | ||||
| 
 | ||||
|     // Shift by 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); | ||||
| 
 | ||||
|     // max
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> kMaxShift); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestRotateRight { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     constexpr size_t kBits = sizeof(T) * 8; | ||||
|     const auto mask_shift = Set(d, T{kBits}); | ||||
|     // Cover as many bit positions as possible to test shifting out
 | ||||
|     const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift)); | ||||
| 
 | ||||
|     // Rotate by 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values)); | ||||
| 
 | ||||
|     // Rotate by 1
 | ||||
|     Store(values, d, expected.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values)); | ||||
| 
 | ||||
|     // Rotate by half
 | ||||
|     Store(values, d, expected.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values)); | ||||
| 
 | ||||
|     // Rotate by max
 | ||||
|     Store(values, d, expected.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestVariableUnsignedRightShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto v1 = Set(d, 1); | ||||
|     const auto values = Iota(d, 0); | ||||
| 
 | ||||
|     const T kMax = LimitsMax<T>(); | ||||
|     const auto max = Set(d, kMax); | ||||
| 
 | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
|     const auto max_shift = Set(d, kMaxShift); | ||||
|     const auto small_shifts = And(Iota(d, 0), max_shift); | ||||
|     const auto large_shifts = max_shift - small_shifts; | ||||
| 
 | ||||
|     // Same: 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0)); | ||||
| 
 | ||||
|     // Same: 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1)); | ||||
| 
 | ||||
|     // Same: max
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift)); | ||||
| 
 | ||||
|     // Variable: small
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(i) >> (i & kMaxShift); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts)); | ||||
| 
 | ||||
|     // Variable: Large
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = kMax >> (kMaxShift - (i & kMaxShift)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <int kAmount, typename T> | ||||
| T RightShiftNegative(T val) { | ||||
|   // C++ shifts are implementation-defined for negative numbers, and we have
 | ||||
|   // seen divisions replaced with shifts, so resort to bit operations.
 | ||||
|   using TU = hwy::MakeUnsigned<T>; | ||||
|   TU bits; | ||||
|   CopyBytes<sizeof(T)>(&val, &bits); | ||||
| 
 | ||||
|   const TU shifted = TU(bits >> kAmount); | ||||
| 
 | ||||
|   const TU all = TU(~TU(0)); | ||||
|   const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount; | ||||
|   const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>()); | ||||
| 
 | ||||
|   bits = shifted | sign_extended; | ||||
|   CopyBytes<sizeof(T)>(&bits, &val); | ||||
|   return val; | ||||
| } | ||||
| 
 | ||||
| class TestSignedRightShifts { | ||||
|  public: | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
|     constexpr T kMin = LimitsMin<T>(); | ||||
|     constexpr T kMax = LimitsMax<T>(); | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
| 
 | ||||
|     // First test positive values, negative are checked below.
 | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto values = And(Iota(d, 0), Set(d, kMax)); | ||||
| 
 | ||||
|     // Shift by 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); | ||||
| 
 | ||||
|     // Shift by 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); | ||||
| 
 | ||||
|     // max
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); | ||||
| 
 | ||||
|     // Even negative value
 | ||||
|     Test<0>(kMin, d, __LINE__); | ||||
|     Test<1>(kMin, d, __LINE__); | ||||
|     Test<2>(kMin, d, __LINE__); | ||||
|     Test<kMaxShift>(kMin, d, __LINE__); | ||||
| 
 | ||||
|     const T odd = static_cast<T>(kMin + 1); | ||||
|     Test<0>(odd, d, __LINE__); | ||||
|     Test<1>(odd, d, __LINE__); | ||||
|     Test<2>(odd, d, __LINE__); | ||||
|     Test<kMaxShift>(odd, d, __LINE__); | ||||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   template <int kAmount, typename T, class D> | ||||
|   void Test(T val, D d, int line) { | ||||
|     const auto expected = Set(d, RightShiftNegative<kAmount>(val)); | ||||
|     const auto in = Set(d, val); | ||||
|     const char* file = __FILE__; | ||||
|     AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line); | ||||
|     AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestVariableSignedRightShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     using TU = MakeUnsigned<T>; | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     constexpr T kMin = LimitsMin<T>(); | ||||
|     constexpr T kMax = LimitsMax<T>(); | ||||
| 
 | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
| 
 | ||||
|     // First test positive values, negative are checked below.
 | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto positive = Iota(d, 0) & Set(d, kMax); | ||||
| 
 | ||||
|     // Shift by 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive)); | ||||
|     HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0)); | ||||
| 
 | ||||
|     // Shift by 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1)); | ||||
| 
 | ||||
|     // max
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift)); | ||||
| 
 | ||||
|     const auto max_shift = Set(d, kMaxShift); | ||||
|     const auto small_shifts = And(Iota(d, 0), max_shift); | ||||
|     const auto large_shifts = max_shift - small_shifts; | ||||
| 
 | ||||
|     const auto negative = Iota(d, kMin); | ||||
| 
 | ||||
|     // Test varying negative to shift
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1))); | ||||
| 
 | ||||
|     // Shift MSB right by small amounts
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const size_t amount = i & kMaxShift; | ||||
|       const TU shifted = ~((1ull << (kMaxShift - amount)) - 1); | ||||
|       CopyBytes<sizeof(T)>(&shifted, &expected[i]); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts)); | ||||
| 
 | ||||
|     // Shift MSB right by large amounts
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const size_t amount = kMaxShift - (i & kMaxShift); | ||||
|       const TU shifted = ~((1ull << (kMaxShift - amount)) - 1); | ||||
|       CopyBytes<sizeof(T)>(&shifted, &expected[i]); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllShifts() { | ||||
|   ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>()); | ||||
|   ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>()); | ||||
|   ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>()); | ||||
|   ForSignedTypes(ForPartialVectors<TestSignedRightShifts>()); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllVariableShifts() { | ||||
|   const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u; | ||||
|   const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s; | ||||
|   const ForPartialVectors<TestUnsignedRightShifts> shr_u; | ||||
|   const ForPartialVectors<TestSignedRightShifts> shr_s; | ||||
| 
 | ||||
|   shl_u(uint16_t()); | ||||
|   shr_u(uint16_t()); | ||||
| 
 | ||||
|   shl_u(uint32_t()); | ||||
|   shr_u(uint32_t()); | ||||
| 
 | ||||
|   shl_s(int16_t()); | ||||
|   shr_s(int16_t()); | ||||
| 
 | ||||
|   shl_s(int32_t()); | ||||
|   shr_s(int32_t()); | ||||
| 
 | ||||
| #if HWY_CAP_INTEGER64 | ||||
|   shl_u(uint64_t()); | ||||
|   shr_u(uint64_t()); | ||||
| 
 | ||||
|   shl_s(int64_t()); | ||||
|   shr_s(int64_t()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllRotateRight() { | ||||
|   const ForPartialVectors<TestRotateRight> test; | ||||
|   test(uint32_t()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
|   test(uint64_t()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| struct TestUnsignedMinMax { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|  | @ -644,6 +263,84 @@ HWY_NOINLINE void TestAllMinMax() { | |||
|   ForFloatTypes(ForPartialVectors<TestFloatMinMax>()); | ||||
| } | ||||
| 
 | ||||
| class TestMinMax128 { | ||||
|   template <class D> | ||||
|   static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) { | ||||
|     alignas(16) uint64_t in[2]; | ||||
|     in[0] = lo; | ||||
|     in[1] = hi; | ||||
|     return LoadDup128(d, in); | ||||
|   } | ||||
| 
 | ||||
|  public: | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     using V = Vec<D>; | ||||
|     const size_t N = Lanes(d); | ||||
|     auto a_lanes = AllocateAligned<T>(N); | ||||
|     auto b_lanes = AllocateAligned<T>(N); | ||||
|     auto min_lanes = AllocateAligned<T>(N); | ||||
|     auto max_lanes = AllocateAligned<T>(N); | ||||
|     RandomState rng; | ||||
| 
 | ||||
|     const V v00 = Zero(d); | ||||
|     const V v01 = Make128(d, 0, 1); | ||||
|     const V v10 = Make128(d, 1, 0); | ||||
|     const V v11 = Add(v01, v10); | ||||
| 
 | ||||
|     // Same arg
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v00)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v01)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v10)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v11, Min128(d, v11, v11)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v00, Max128(d, v00, v00)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v01)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v10)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v11)); | ||||
| 
 | ||||
|     // First arg less
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v00, v01)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v01, v10)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v10, v11)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v00, v01)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v01, v10)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v10, v11)); | ||||
| 
 | ||||
|     // Second arg less
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v00, Min128(d, v01, v00)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v01, Min128(d, v10, v01)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v10, Min128(d, v11, v10)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v01, Max128(d, v01, v00)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v10, Max128(d, v10, v01)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v11, Max128(d, v11, v10)); | ||||
| 
 | ||||
|     // Also check 128-bit blocks are independent
 | ||||
|     for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { | ||||
|       for (size_t i = 0; i < N; ++i) { | ||||
|         a_lanes[i] = Random64(&rng); | ||||
|         b_lanes[i] = Random64(&rng); | ||||
|       } | ||||
|       const V a = Load(d, a_lanes.get()); | ||||
|       const V b = Load(d, b_lanes.get()); | ||||
|       for (size_t i = 0; i < N; i += 2) { | ||||
|         const bool lt = a_lanes[i + 1] == b_lanes[i + 1] | ||||
|                             ? (a_lanes[i] < b_lanes[i]) | ||||
|                             : (a_lanes[i + 1] < b_lanes[i + 1]); | ||||
|         min_lanes[i + 0] = lt ? a_lanes[i + 0] : b_lanes[i + 0]; | ||||
|         min_lanes[i + 1] = lt ? a_lanes[i + 1] : b_lanes[i + 1]; | ||||
|         max_lanes[i + 0] = lt ? b_lanes[i + 0] : a_lanes[i + 0]; | ||||
|         max_lanes[i + 1] = lt ? b_lanes[i + 1] : a_lanes[i + 1]; | ||||
|       } | ||||
|       HWY_ASSERT_VEC_EQ(d, min_lanes.get(), Min128(d, a, b)); | ||||
|       HWY_ASSERT_VEC_EQ(d, max_lanes.get(), Max128(d, a, b)); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllMinMax128() { | ||||
|   ForGEVectors<128, TestMinMax128>()(uint64_t()); | ||||
| } | ||||
| 
 | ||||
| struct TestUnsignedMul { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|  | @ -834,11 +531,11 @@ struct TestMulEvenOdd64 { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllMulEven() { | ||||
|   ForExtendableVectors<TestMulEven> test; | ||||
|   ForGEVectors<64, TestMulEven> test; | ||||
|   test(int32_t()); | ||||
|   test(uint32_t()); | ||||
| 
 | ||||
|   ForGE128Vectors<TestMulEvenOdd64>()(uint64_t()); | ||||
|   ForGEVectors<128, TestMulEvenOdd64>()(uint64_t()); | ||||
| } | ||||
| 
 | ||||
| struct TestMulAdd { | ||||
|  | @ -1113,7 +810,6 @@ AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) { | |||
|     // negative +/- epsilon
 | ||||
|     T(-1) + eps, | ||||
|     T(-1) - eps, | ||||
| #if !defined(HWY_EMULATE_SVE)  // these are not safe to just cast to int
 | ||||
|     // +/- huge (but still fits in float)
 | ||||
|     T(1E34), | ||||
|     T(-1E35), | ||||
|  | @ -1122,7 +818,6 @@ AlignedFreeUniquePtr<T[]> RoundTestCases(T /*unused*/, D d, size_t& padded) { | |||
|     -std::numeric_limits<T>::infinity(), | ||||
|     // qNaN
 | ||||
|     GetLane(NaN(d)) | ||||
| #endif | ||||
|   }; | ||||
|   const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); | ||||
|   const size_t N = Lanes(d); | ||||
|  | @ -1369,6 +1064,41 @@ HWY_NOINLINE void TestAllAbsDiff() { | |||
|   ForPartialVectors<TestAbsDiff>()(float()); | ||||
| } | ||||
| 
 | ||||
| struct TestSumsOf8 { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     RandomState rng; | ||||
| 
 | ||||
|     const size_t N = Lanes(d); | ||||
|     if (N < 8) return; | ||||
|     const Repartition<uint64_t, D> du64; | ||||
| 
 | ||||
|     auto in_lanes = AllocateAligned<T>(N); | ||||
|     auto sum_lanes = AllocateAligned<uint64_t>(N / 8); | ||||
| 
 | ||||
|     for (size_t rep = 0; rep < 100; ++rep) { | ||||
|       for (size_t i = 0; i < N; ++i) { | ||||
|         in_lanes[i] = Random64(&rng) & 0xFF; | ||||
|       } | ||||
| 
 | ||||
|       for (size_t idx_sum = 0; idx_sum < N / 8; ++idx_sum) { | ||||
|         uint64_t sum = 0; | ||||
|         for (size_t i = 0; i < 8; ++i) { | ||||
|           sum += in_lanes[idx_sum * 8 + i]; | ||||
|         } | ||||
|         sum_lanes[idx_sum] = sum; | ||||
|       } | ||||
| 
 | ||||
|       const Vec<D> in = Load(d, in_lanes.get()); | ||||
|       HWY_ASSERT_VEC_EQ(du64, sum_lanes.get(), SumsOf8(in)); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllSumsOf8() { | ||||
|   ForGEVectors<64, TestSumsOf8>()(uint8_t()); | ||||
| } | ||||
| 
 | ||||
| struct TestNeg { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|  | @ -1397,10 +1127,8 @@ namespace hwy { | |||
| HWY_BEFORE_TEST(HwyArithmeticTest); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllPlusMinus); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSaturatingArithmetic); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllShifts); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllVariableShifts); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllRotateRight); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMinMax128); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAverage); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbs); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllMul); | ||||
|  | @ -1420,6 +1148,7 @@ HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllTrunc); | |||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllCeil); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllFloor); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllAbsDiff); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllSumsOf8); | ||||
| HWY_EXPORT_AND_TEST_P(HwyArithmeticTest, TestAllNeg); | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										81
									
								
								third_party/highway/hwy/tests/blockwise_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										81
									
								
								third_party/highway/hwy/tests/blockwise_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -393,7 +393,7 @@ HWY_NOINLINE void TestAllZip() { | |||
|   lower_unsigned(uint8_t()); | ||||
| #endif | ||||
|   lower_unsigned(uint16_t()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   lower_unsigned(uint32_t());  // generates u64
 | ||||
| #endif | ||||
| 
 | ||||
|  | @ -402,7 +402,7 @@ HWY_NOINLINE void TestAllZip() { | |||
|   lower_signed(int8_t()); | ||||
| #endif | ||||
|   lower_signed(int16_t()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   lower_signed(int32_t());  // generates i64
 | ||||
| #endif | ||||
| 
 | ||||
|  | @ -411,7 +411,7 @@ HWY_NOINLINE void TestAllZip() { | |||
|   upper_unsigned(uint8_t()); | ||||
| #endif | ||||
|   upper_unsigned(uint16_t()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   upper_unsigned(uint32_t());  // generates u64
 | ||||
| #endif | ||||
| 
 | ||||
|  | @ -420,19 +420,20 @@ HWY_NOINLINE void TestAllZip() { | |||
|   upper_signed(int8_t()); | ||||
| #endif | ||||
|   upper_signed(int16_t()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   upper_signed(int32_t());  // generates i64
 | ||||
| #endif | ||||
| 
 | ||||
|   // No float - concatenating f32 does not result in a f64
 | ||||
| } | ||||
| 
 | ||||
| template <int kBytes> | ||||
| struct TestCombineShiftRightBytesR { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T t, D d) { | ||||
| // Scalar does not define CombineShiftRightBytes.
 | ||||
| #if HWY_TARGET != HWY_SCALAR || HWY_IDE | ||||
| 
 | ||||
| template <int kBytes> | ||||
| struct TestCombineShiftRightBytes { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T, D d) { | ||||
|     const size_t kBlockSize = 16; | ||||
|     static_assert(kBytes < kBlockSize, "Shift count is per block"); | ||||
|     const Repartition<uint8_t, D> d8; | ||||
|  | @ -461,21 +462,13 @@ struct TestCombineShiftRightBytesR { | |||
|       const auto expected = BitCast(d, Load(d8, expected_bytes.get())); | ||||
|       HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightBytes<kBytes>(d, hi, lo)); | ||||
|     } | ||||
| 
 | ||||
|     TestCombineShiftRightBytesR<kBytes - 1>()(t, d); | ||||
| #else | ||||
|     (void)t; | ||||
|     (void)d; | ||||
| #endif  // #if HWY_TARGET != HWY_SCALAR
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <int kLanes> | ||||
| struct TestCombineShiftRightLanesR { | ||||
| struct TestCombineShiftRightLanes { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T t, D d) { | ||||
| // Scalar does not define CombineShiftRightBytes (needed for *Lanes).
 | ||||
| #if HWY_TARGET != HWY_SCALAR || HWY_IDE | ||||
|   HWY_NOINLINE void operator()(T, D d) { | ||||
|     const Repartition<uint8_t, D> d8; | ||||
|     const size_t N8 = Lanes(d8); | ||||
|     if (N8 < 16) return; | ||||
|  | @ -505,33 +498,29 @@ struct TestCombineShiftRightLanesR { | |||
|       const auto expected = BitCast(d, Load(d8, expected_bytes.get())); | ||||
|       HWY_ASSERT_VEC_EQ(d, expected, CombineShiftRightLanes<kLanes>(d, hi, lo)); | ||||
|     } | ||||
| 
 | ||||
|     TestCombineShiftRightLanesR<kLanes - 1>()(t, d); | ||||
| #else | ||||
|     (void)t; | ||||
|     (void)d; | ||||
| #endif  // #if HWY_TARGET != HWY_SCALAR
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <> | ||||
| struct TestCombineShiftRightBytesR<0> { | ||||
|   template <class T, class D> | ||||
|   void operator()(T /*unused*/, D /*unused*/) {} | ||||
| }; | ||||
| 
 | ||||
| template <> | ||||
| struct TestCombineShiftRightLanesR<0> { | ||||
|   template <class T, class D> | ||||
|   void operator()(T /*unused*/, D /*unused*/) {} | ||||
| }; | ||||
| #endif  // #if HWY_TARGET != HWY_SCALAR
 | ||||
| 
 | ||||
| struct TestCombineShiftRight { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T t, D d) { | ||||
| // Scalar does not define CombineShiftRightBytes.
 | ||||
| #if HWY_TARGET != HWY_SCALAR || HWY_IDE | ||||
|     constexpr int kMaxBytes = HWY_MIN(16, int(MaxLanes(d) * sizeof(T))); | ||||
|     TestCombineShiftRightBytesR<kMaxBytes - 1>()(t, d); | ||||
|     TestCombineShiftRightLanesR<kMaxBytes / int(sizeof(T)) - 1>()(t, d); | ||||
|     constexpr int kMaxLanes = kMaxBytes / static_cast<int>(sizeof(T)); | ||||
|     TestCombineShiftRightBytes<kMaxBytes - 1>()(t, d); | ||||
|     TestCombineShiftRightBytes<HWY_MAX(kMaxBytes / 2, 1)>()(t, d); | ||||
|     TestCombineShiftRightBytes<1>()(t, d); | ||||
| 
 | ||||
|     TestCombineShiftRightLanes<kMaxLanes - 1>()(t, d); | ||||
|     TestCombineShiftRightLanes<HWY_MAX(kMaxLanes / 2, -1)>()(t, d); | ||||
|     TestCombineShiftRightLanes<1>()(t, d); | ||||
| #else | ||||
|     (void)t; | ||||
|     (void)d; | ||||
| #endif | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|  | @ -553,8 +542,10 @@ class TestSpecialShuffle32 { | |||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   // HWY_INLINE works around a Clang SVE compiler bug where all but the first
 | ||||
|   // 128 bits (the NEON register) of actual are zero.
 | ||||
|   template <class D, class V> | ||||
|   HWY_NOINLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3, | ||||
|   HWY_INLINE void VerifyLanes32(D d, VecArg<V> actual, const size_t i3, | ||||
|                                 const size_t i2, const size_t i1, | ||||
|                                 const size_t i0, const char* filename, | ||||
|                                 const int line) { | ||||
|  | @ -582,8 +573,10 @@ class TestSpecialShuffle64 { | |||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   // HWY_INLINE works around a Clang SVE compiler bug where all but the first
 | ||||
|   // 128 bits (the NEON register) of actual are zero.
 | ||||
|   template <class D, class V> | ||||
|   HWY_NOINLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1, | ||||
|   HWY_INLINE void VerifyLanes64(D d, VecArg<V> actual, const size_t i1, | ||||
|                                 const size_t i0, const char* filename, | ||||
|                                 const int line) { | ||||
|     using T = TFromD<D>; | ||||
|  | @ -600,19 +593,19 @@ class TestSpecialShuffle64 { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllSpecialShuffles() { | ||||
|   const ForGE128Vectors<TestSpecialShuffle32> test32; | ||||
|   const ForGEVectors<128, TestSpecialShuffle32> test32; | ||||
|   test32(uint32_t()); | ||||
|   test32(int32_t()); | ||||
|   test32(float()); | ||||
| 
 | ||||
| #if HWY_CAP_INTEGER64 | ||||
|   const ForGE128Vectors<TestSpecialShuffle64> test64; | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   const ForGEVectors<128, TestSpecialShuffle64> test64; | ||||
|   test64(uint64_t()); | ||||
|   test64(int64_t()); | ||||
| #endif | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT64 | ||||
|   const ForGE128Vectors<TestSpecialShuffle64> test_d; | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   const ForGEVectors<128, TestSpecialShuffle64> test_d; | ||||
|   test_d(double()); | ||||
| #endif | ||||
| } | ||||
|  |  | |||
							
								
								
									
										52
									
								
								third_party/highway/hwy/tests/combine_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										52
									
								
								third_party/highway/hwy/tests/combine_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -22,9 +22,6 @@ | |||
| #include "hwy/highway.h" | ||||
| #include "hwy/tests/test_util-inl.h" | ||||
| 
 | ||||
| // Not yet implemented
 | ||||
| #if HWY_TARGET != HWY_RVV | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
|  | @ -85,8 +82,8 @@ struct TestLowerQuarter { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllLowerHalf() { | ||||
|   ForAllTypes(ForDemoteVectors<TestLowerHalf>()); | ||||
|   ForAllTypes(ForDemoteVectors<TestLowerQuarter, 4>()); | ||||
|   ForAllTypes(ForHalfVectors<TestLowerHalf>()); | ||||
|   ForAllTypes(ForHalfVectors<TestLowerQuarter, 2>()); | ||||
| } | ||||
| 
 | ||||
| struct TestUpperHalf { | ||||
|  | @ -95,21 +92,14 @@ struct TestUpperHalf { | |||
|     // Scalar does not define UpperHalf.
 | ||||
| #if HWY_TARGET != HWY_SCALAR | ||||
|     const Half<D> d2; | ||||
| 
 | ||||
|     const auto v = Iota(d, 1); | ||||
|     const size_t N = Lanes(d); | ||||
|     auto lanes = AllocateAligned<T>(N); | ||||
|     std::fill(lanes.get(), lanes.get() + N, T(0)); | ||||
| 
 | ||||
|     Store(UpperHalf(d2, v), d2, lanes.get()); | ||||
|     const size_t N2 = Lanes(d2); | ||||
|     HWY_ASSERT(N2 * 2 == Lanes(d)); | ||||
|     auto expected = AllocateAligned<T>(N2); | ||||
|     size_t i = 0; | ||||
|     for (; i < Lanes(d2); ++i) { | ||||
|       HWY_ASSERT_EQ(T(Lanes(d2) + 1 + i), lanes[i]); | ||||
|     } | ||||
|     // Other half remains unchanged
 | ||||
|     for (; i < N; ++i) { | ||||
|       HWY_ASSERT_EQ(T(0), lanes[i]); | ||||
|     for (; i < N2; ++i) { | ||||
|       expected[i] = static_cast<T>(N2 + 1 + i); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d2, expected.get(), UpperHalf(d2, Iota(d, 1))); | ||||
| #else | ||||
|     (void)d; | ||||
| #endif | ||||
|  | @ -117,7 +107,7 @@ struct TestUpperHalf { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllUpperHalf() { | ||||
|   ForAllTypes(ForShrinkableVectors<TestUpperHalf>()); | ||||
|   ForAllTypes(ForHalfVectors<TestUpperHalf>()); | ||||
| } | ||||
| 
 | ||||
| struct TestZeroExtendVector { | ||||
|  | @ -126,23 +116,23 @@ struct TestZeroExtendVector { | |||
|     const Twice<D> d2; | ||||
| 
 | ||||
|     const auto v = Iota(d, 1); | ||||
|     const size_t N = Lanes(d); | ||||
|     const size_t N2 = Lanes(d2); | ||||
|     // If equal, then N was already MaxLanes(d) and it's not clear what
 | ||||
|     // Combine or ZeroExtendVector should return.
 | ||||
|     if (N2 == N) return; | ||||
|     HWY_ASSERT(N2 == 2 * N); | ||||
|     auto lanes = AllocateAligned<T>(N2); | ||||
|     Store(v, d, &lanes[0]); | ||||
|     Store(v, d, &lanes[N2 / 2]); | ||||
|     Store(v, d, &lanes[N]); | ||||
| 
 | ||||
|     const auto ext = ZeroExtendVector(d2, v); | ||||
|     Store(ext, d2, lanes.get()); | ||||
| 
 | ||||
|     size_t i = 0; | ||||
|     // Lower half is unchanged
 | ||||
|     for (; i < N2 / 2; ++i) { | ||||
|       HWY_ASSERT_EQ(T(1 + i), lanes[i]); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, v, Load(d, &lanes[0])); | ||||
|     // Upper half is zero
 | ||||
|     for (; i < N2; ++i) { | ||||
|       HWY_ASSERT_EQ(T(0), lanes[i]); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, Zero(d), Load(d, &lanes[N])); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|  | @ -158,7 +148,7 @@ struct TestCombine { | |||
|     auto lanes = AllocateAligned<T>(N2); | ||||
| 
 | ||||
|     const auto lo = Iota(d, 1); | ||||
|     const auto hi = Iota(d, N2 / 2 + 1); | ||||
|     const auto hi = Iota(d, static_cast<T>(N2 / 2 + 1)); | ||||
|     const auto combined = Combine(d2, hi, lo); | ||||
|     Store(combined, d2, lanes.get()); | ||||
| 
 | ||||
|  | @ -232,7 +222,7 @@ struct TestConcatOddEven { | |||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
| #if HWY_TARGET != HWY_RVV && HWY_TARGET != HWY_SCALAR | ||||
|     const size_t N = Lanes(d); | ||||
|     const auto hi = Iota(d, N); | ||||
|     const auto hi = Iota(d, static_cast<T>(N)); | ||||
|     const auto lo = Iota(d, 0); | ||||
|     const auto even = Add(Iota(d, 0), Iota(d, 0)); | ||||
|     const auto odd = Add(even, Set(d, 1)); | ||||
|  | @ -272,7 +262,3 @@ int main(int argc, char **argv) { | |||
| } | ||||
| 
 | ||||
| #endif  // HWY_ONCE
 | ||||
| 
 | ||||
| #else | ||||
| int main(int, char**) { return 0; } | ||||
| #endif  // HWY_TARGET != HWY_RVV
 | ||||
|  |  | |||
							
								
								
									
										58
									
								
								third_party/highway/hwy/tests/compare_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										58
									
								
								third_party/highway/hwy/tests/compare_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -218,6 +218,63 @@ HWY_NOINLINE void TestAllWeakFloat() { | |||
|   ForFloatTypes(ForPartialVectors<TestWeakFloat>()); | ||||
| } | ||||
| 
 | ||||
| class TestLt128 { | ||||
|   template <class D> | ||||
|   static HWY_NOINLINE Vec<D> Make128(D d, uint64_t hi, uint64_t lo) { | ||||
|     alignas(16) uint64_t in[2]; | ||||
|     in[0] = lo; | ||||
|     in[1] = hi; | ||||
|     return LoadDup128(d, in); | ||||
|   } | ||||
| 
 | ||||
|  public: | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     using V = Vec<D>; | ||||
|     const V v00 = Zero(d); | ||||
|     const V v01 = Make128(d, 0, 1); | ||||
|     const V v10 = Make128(d, 1, 0); | ||||
|     const V v11 = Add(v01, v10); | ||||
| 
 | ||||
|     const auto mask_false = MaskFalse(d); | ||||
|     const auto mask_true = MaskTrue(d); | ||||
| 
 | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v00, v00)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v01)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v10)); | ||||
| 
 | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, v01)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v10)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, v11)); | ||||
| 
 | ||||
|     // Reversed order
 | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v01, v00)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v10, v01)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, v11, v01)); | ||||
| 
 | ||||
|     // Also check 128-bit blocks are independent
 | ||||
|     const V iota = Iota(d, 1); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v01))); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, iota, Add(iota, v10))); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v01), iota)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, Add(iota, v10), iota)); | ||||
| 
 | ||||
|     // Max value
 | ||||
|     const V vm = Make128(d, LimitsMax<T>(), LimitsMax<T>()); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, vm)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v00)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v01)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v10)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_false, Lt128(d, vm, v11)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v00, vm)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v01, vm)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v10, vm)); | ||||
|     HWY_ASSERT_MASK_EQ(d, mask_true, Lt128(d, v11, vm)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllLt128() { ForGEVectors<128, TestLt128>()(uint64_t()); } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
|  | @ -232,6 +289,7 @@ HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictUnsigned); | |||
| HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictInt); | ||||
| HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllStrictFloat); | ||||
| HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllWeakFloat); | ||||
| HWY_EXPORT_AND_TEST_P(HwyCompareTest, TestAllLt128); | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| // Ought not to be necessary, but without this, no tests run on RVV.
 | ||||
|  |  | |||
							
								
								
									
										328
									
								
								third_party/highway/hwy/tests/convert_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										328
									
								
								third_party/highway/hwy/tests/convert_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -57,17 +57,17 @@ struct TestBitCastFrom { | |||
|     TestBitCast<uint8_t>()(t, d); | ||||
|     TestBitCast<uint16_t>()(t, d); | ||||
|     TestBitCast<uint32_t>()(t, d); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|     TestBitCast<uint64_t>()(t, d); | ||||
| #endif | ||||
|     TestBitCast<int8_t>()(t, d); | ||||
|     TestBitCast<int16_t>()(t, d); | ||||
|     TestBitCast<int32_t>()(t, d); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|     TestBitCast<int64_t>()(t, d); | ||||
| #endif | ||||
|     TestBitCast<float>()(t, d); | ||||
| #if HWY_CAP_FLOAT64 | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|     TestBitCast<double>()(t, d); | ||||
| #endif | ||||
|   } | ||||
|  | @ -103,39 +103,39 @@ HWY_NOINLINE void TestAllBitCast() { | |||
|   to_i32(int32_t()); | ||||
|   to_i32(float()); | ||||
| 
 | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   const ForPartialVectors<TestBitCast<uint64_t>> to_u64; | ||||
|   to_u64(uint64_t()); | ||||
|   to_u64(int64_t()); | ||||
| #if HWY_CAP_FLOAT64 | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   to_u64(double()); | ||||
| #endif | ||||
| 
 | ||||
|   const ForPartialVectors<TestBitCast<int64_t>> to_i64; | ||||
|   to_i64(uint64_t()); | ||||
|   to_i64(int64_t()); | ||||
| #if HWY_CAP_FLOAT64 | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   to_i64(double()); | ||||
| #endif | ||||
| #endif  // HWY_CAP_INTEGER64
 | ||||
| #endif  // HWY_HAVE_INTEGER64
 | ||||
| 
 | ||||
|   const ForPartialVectors<TestBitCast<float>> to_float; | ||||
|   to_float(uint32_t()); | ||||
|   to_float(int32_t()); | ||||
|   to_float(float()); | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT64 | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   const ForPartialVectors<TestBitCast<double>> to_double; | ||||
|   to_double(double()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   to_double(uint64_t()); | ||||
|   to_double(int64_t()); | ||||
| #endif  // HWY_CAP_INTEGER64
 | ||||
| #endif  // HWY_CAP_FLOAT64
 | ||||
| #endif  // HWY_HAVE_INTEGER64
 | ||||
| #endif  // HWY_HAVE_FLOAT64
 | ||||
| 
 | ||||
| #if HWY_TARGET != HWY_SCALAR | ||||
|   // For non-scalar vectors, we can cast all types to all.
 | ||||
|   ForAllTypes(ForGE64Vectors<TestBitCastFrom>()); | ||||
|   ForAllTypes(ForGEVectors<64, TestBitCastFrom>()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
|  | @ -165,39 +165,39 @@ struct TestPromoteTo { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllPromoteTo() { | ||||
|   const ForPromoteVectors<TestPromoteTo<uint16_t>, 2> to_u16div2; | ||||
|   const ForPromoteVectors<TestPromoteTo<uint16_t>, 1> to_u16div2; | ||||
|   to_u16div2(uint8_t()); | ||||
| 
 | ||||
|   const ForPromoteVectors<TestPromoteTo<uint32_t>, 4> to_u32div4; | ||||
|   const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div4; | ||||
|   to_u32div4(uint8_t()); | ||||
| 
 | ||||
|   const ForPromoteVectors<TestPromoteTo<uint32_t>, 2> to_u32div2; | ||||
|   const ForPromoteVectors<TestPromoteTo<uint32_t>, 1> to_u32div2; | ||||
|   to_u32div2(uint16_t()); | ||||
| 
 | ||||
|   const ForPromoteVectors<TestPromoteTo<int16_t>, 2> to_i16div2; | ||||
|   const ForPromoteVectors<TestPromoteTo<int16_t>, 1> to_i16div2; | ||||
|   to_i16div2(uint8_t()); | ||||
|   to_i16div2(int8_t()); | ||||
| 
 | ||||
|   const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div2; | ||||
|   const ForPromoteVectors<TestPromoteTo<int32_t>, 1> to_i32div2; | ||||
|   to_i32div2(uint16_t()); | ||||
|   to_i32div2(int16_t()); | ||||
| 
 | ||||
|   const ForPromoteVectors<TestPromoteTo<int32_t>, 4> to_i32div4; | ||||
|   const ForPromoteVectors<TestPromoteTo<int32_t>, 2> to_i32div4; | ||||
|   to_i32div4(uint8_t()); | ||||
|   to_i32div4(int8_t()); | ||||
| 
 | ||||
|   // Must test f16/bf16 separately because we can only load/store/convert them.
 | ||||
| 
 | ||||
| #if HWY_CAP_INTEGER64 | ||||
|   const ForPromoteVectors<TestPromoteTo<uint64_t>, 2> to_u64div2; | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   const ForPromoteVectors<TestPromoteTo<uint64_t>, 1> to_u64div2; | ||||
|   to_u64div2(uint32_t()); | ||||
| 
 | ||||
|   const ForPromoteVectors<TestPromoteTo<int64_t>, 2> to_i64div2; | ||||
|   const ForPromoteVectors<TestPromoteTo<int64_t>, 1> to_i64div2; | ||||
|   to_i64div2(int32_t()); | ||||
| #endif | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT64 | ||||
|   const ForPromoteVectors<TestPromoteTo<double>, 2> to_f64div2; | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   const ForPromoteVectors<TestPromoteTo<double>, 1> to_f64div2; | ||||
|   to_f64div2(int32_t()); | ||||
|   to_f64div2(float()); | ||||
| #endif | ||||
|  | @ -213,111 +213,6 @@ bool IsFinite(T /*unused*/) { | |||
|   return true; | ||||
| } | ||||
| 
 | ||||
| template <typename ToT> | ||||
| struct TestDemoteTo { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D from_d) { | ||||
|     static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output"); | ||||
|     static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); | ||||
|     const Rebind<ToT, D> to_d; | ||||
| 
 | ||||
|     const size_t N = Lanes(from_d); | ||||
|     auto from = AllocateAligned<T>(N); | ||||
|     auto expected = AllocateAligned<ToT>(N); | ||||
| 
 | ||||
|     // Narrower range in the wider type, for clamping before we cast
 | ||||
|     const T min = LimitsMin<ToT>(); | ||||
|     const T max = LimitsMax<ToT>(); | ||||
| 
 | ||||
|     const auto value_ok = [&](T& value) { | ||||
|       if (!IsFinite(value)) return false; | ||||
| #if HWY_EMULATE_SVE | ||||
|       // farm_sve just casts, which is undefined if the value is out of range.
 | ||||
|       value = HWY_MIN(HWY_MAX(min, value), max); | ||||
| #endif | ||||
|       return true; | ||||
|     }; | ||||
| 
 | ||||
|     RandomState rng; | ||||
|     for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { | ||||
|       for (size_t i = 0; i < N; ++i) { | ||||
|         do { | ||||
|           const uint64_t bits = rng(); | ||||
|           memcpy(&from[i], &bits, sizeof(T)); | ||||
|         } while (!value_ok(from[i])); | ||||
|         expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max)); | ||||
|       } | ||||
| 
 | ||||
|       HWY_ASSERT_VEC_EQ(to_d, expected.get(), | ||||
|                         DemoteTo(to_d, Load(from_d, from.get()))); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllDemoteToInt() { | ||||
|   ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t()); | ||||
|   ForDemoteVectors<TestDemoteTo<uint8_t>, 4>()(int32_t()); | ||||
| 
 | ||||
|   ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t()); | ||||
|   ForDemoteVectors<TestDemoteTo<int8_t>, 4>()(int32_t()); | ||||
| 
 | ||||
|   const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16; | ||||
|   to_u16(int32_t()); | ||||
| 
 | ||||
|   const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16; | ||||
|   to_i16(int32_t()); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllDemoteToMixed() { | ||||
| #if HWY_CAP_FLOAT64 | ||||
|   const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32; | ||||
|   to_i32(double()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| template <typename ToT> | ||||
| struct TestDemoteToFloat { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D from_d) { | ||||
|     // For floats, we clamp differently and cannot call LimitsMin.
 | ||||
|     static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output"); | ||||
|     static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); | ||||
|     const Rebind<ToT, D> to_d; | ||||
| 
 | ||||
|     const size_t N = Lanes(from_d); | ||||
|     auto from = AllocateAligned<T>(N); | ||||
|     auto expected = AllocateAligned<ToT>(N); | ||||
| 
 | ||||
|     RandomState rng; | ||||
|     for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { | ||||
|       for (size_t i = 0; i < N; ++i) { | ||||
|         do { | ||||
|           const uint64_t bits = rng(); | ||||
|           memcpy(&from[i], &bits, sizeof(T)); | ||||
|         } while (!IsFinite(from[i])); | ||||
|         const T magn = std::abs(from[i]); | ||||
|         const T max_abs = HighestValue<ToT>(); | ||||
|         // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
 | ||||
|         // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
 | ||||
|         const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]); | ||||
|         expected[i] = static_cast<ToT>(clipped); | ||||
|       } | ||||
| 
 | ||||
|       HWY_ASSERT_VEC_EQ(to_d, expected.get(), | ||||
|                         DemoteTo(to_d, Load(from_d, from.get()))); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllDemoteToFloat() { | ||||
|   // Must test f16 separately because we can only load/store/convert them.
 | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT64 | ||||
|   const ForDemoteVectors<TestDemoteToFloat<float>, 2> to_float; | ||||
|   to_float(double()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| template <class D> | ||||
| AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) { | ||||
|   const float test_cases[] = { | ||||
|  | @ -352,7 +247,7 @@ AlignedFreeUniquePtr<float[]> F16TestCases(D d, size_t& padded) { | |||
| struct TestF16 { | ||||
|   template <typename TF32, class DF32> | ||||
|   HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { | ||||
| #if HWY_CAP_FLOAT16 | ||||
| #if HWY_HAVE_FLOAT16 | ||||
|     size_t padded; | ||||
|     auto in = F16TestCases(d32, padded); | ||||
|     using TF16 = float16_t; | ||||
|  | @ -406,7 +301,7 @@ AlignedFreeUniquePtr<float[]> BF16TestCases(D d, size_t& padded) { | |||
| struct TestBF16 { | ||||
|   template <typename TF32, class DF32> | ||||
|   HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { | ||||
| #if HWY_TARGET != HWY_RVV | ||||
| #if !defined(HWY_EMULATE_SVE) | ||||
|     size_t padded; | ||||
|     auto in = BF16TestCases(d32, padded); | ||||
|     using TBF16 = bfloat16_t; | ||||
|  | @ -417,6 +312,7 @@ struct TestBF16 { | |||
| #endif | ||||
|     const Half<decltype(dbf16)> dbf16_half; | ||||
|     const size_t N = Lanes(d32); | ||||
|     HWY_ASSERT(Lanes(dbf16_half) <= N); | ||||
|     auto temp16 = AllocateAligned<TBF16>(N); | ||||
| 
 | ||||
|     for (size_t i = 0; i < padded; i += N) { | ||||
|  | @ -434,124 +330,6 @@ struct TestBF16 { | |||
| 
 | ||||
| HWY_NOINLINE void TestAllBF16() { ForShrinkableVectors<TestBF16>()(float()); } | ||||
| 
 | ||||
| template <class D> | ||||
| AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) { | ||||
|   const float test_cases[] = { | ||||
|       // Same as BF16TestCases:
 | ||||
|       // +/- 1
 | ||||
|       1.0f, | ||||
|       -1.0f, | ||||
|       // +/- 0
 | ||||
|       0.0f, | ||||
|       -0.0f, | ||||
|       // near 0
 | ||||
|       0.25f, | ||||
|       -0.25f, | ||||
|       // +/- integer
 | ||||
|       4.0f, | ||||
|       -32.0f, | ||||
|       // positive +/- delta
 | ||||
|       2.015625f, | ||||
|       3.984375f, | ||||
|       // negative +/- delta
 | ||||
|       -2.015625f, | ||||
|       -3.984375f, | ||||
| 
 | ||||
|       // No huge values - would interfere with sum. But add more to fill 2 * N:
 | ||||
|       -2.0f, | ||||
|       -10.0f, | ||||
|       0.03125f, | ||||
|       1.03125f, | ||||
|       1.5f, | ||||
|       2.0f, | ||||
|       4.0f, | ||||
|       5.0f, | ||||
|       6.0f, | ||||
|       8.0f, | ||||
|       10.0f, | ||||
|       256.0f, | ||||
|       448.0f, | ||||
|       2080.0f, | ||||
|   }; | ||||
|   const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); | ||||
|   const size_t N = Lanes(d); | ||||
|   padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
 | ||||
|   auto in = AllocateAligned<float>(padded); | ||||
|   auto expected = AllocateAligned<float>(padded); | ||||
|   std::copy(test_cases, test_cases + kNumTestCases, in.get()); | ||||
|   std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f); | ||||
|   return in; | ||||
| } | ||||
| 
 | ||||
| class TestReorderDemote2To { | ||||
|   // In-place N^2 selection sort to avoid dependencies
 | ||||
|   void Sort(float* p, size_t count) { | ||||
|     for (size_t i = 0; i < count - 1; ++i) { | ||||
|       // Find min_element
 | ||||
|       size_t idx_min = i; | ||||
|       for (size_t j = i + 1; j < count; j++) { | ||||
|         if (p[j] < p[idx_min]) { | ||||
|           idx_min = j; | ||||
|         } | ||||
|       } | ||||
| 
 | ||||
|       // Swap with current
 | ||||
|       const float tmp = p[i]; | ||||
|       p[i] = p[idx_min]; | ||||
|       p[idx_min] = tmp; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|  public: | ||||
|   template <typename TF32, class DF32> | ||||
|   HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { | ||||
| #if HWY_TARGET != HWY_SCALAR | ||||
|     size_t padded; | ||||
|     auto in = ReorderBF16TestCases(d32, padded); | ||||
| 
 | ||||
|     using TBF16 = bfloat16_t; | ||||
|     const Repartition<TBF16, DF32> dbf16; | ||||
|     const Half<decltype(dbf16)> dbf16_half; | ||||
|     const size_t N = Lanes(d32); | ||||
|     auto temp16 = AllocateAligned<TBF16>(2 * N); | ||||
|     auto expected = AllocateAligned<float>(2 * N); | ||||
|     auto actual = AllocateAligned<float>(2 * N); | ||||
| 
 | ||||
|     for (size_t i = 0; i < padded; i += 2 * N) { | ||||
|       const auto f0 = Load(d32, &in[i + 0]); | ||||
|       const auto f1 = Load(d32, &in[i + N]); | ||||
|       const auto v16 = ReorderDemote2To(dbf16, f0, f1); | ||||
|       Store(v16, dbf16, temp16.get()); | ||||
|       const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0)); | ||||
|       const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N)); | ||||
| 
 | ||||
|       // Smoke test: sum should be same (with tolerance for non-associativity)
 | ||||
|       const auto sum_expected = | ||||
|           GetLane(SumOfLanes(d32, Add(promoted0, promoted1))); | ||||
|       const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1))); | ||||
|       HWY_ASSERT(sum_actual - 1E-4 <= sum_actual && | ||||
|                  sum_expected <= sum_actual + 1E-4); | ||||
| 
 | ||||
|       // Ensure values are the same after sorting to undo the Reorder
 | ||||
|       Store(f0, d32, expected.get() + 0); | ||||
|       Store(f1, d32, expected.get() + N); | ||||
|       Store(promoted0, d32, actual.get() + 0); | ||||
|       Store(promoted1, d32, actual.get() + N); | ||||
|       Sort(expected.get(), 2 * N); | ||||
|       Sort(actual.get(), 2 * N); | ||||
|       HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0)); | ||||
|       HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N)); | ||||
|     } | ||||
| #else  // HWY_SCALAR
 | ||||
|     (void)d32; | ||||
| #endif | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllReorderDemote2To() { | ||||
|   ForShrinkableVectors<TestReorderDemote2To>()(float()); | ||||
| } | ||||
| 
 | ||||
| struct TestConvertU8 { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, const D du32) { | ||||
|  | @ -564,7 +342,7 @@ struct TestConvertU8 { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllConvertU8() { | ||||
|   ForDemoteVectors<TestConvertU8, 4>()(uint32_t()); | ||||
|   ForDemoteVectors<TestConvertU8, 2>()(uint32_t()); | ||||
| } | ||||
| 
 | ||||
| // Separate function to attempt to work around a compiler bug on ARM: when this
 | ||||
|  | @ -574,19 +352,23 @@ struct TestIntFromFloatHuge { | |||
|   HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { | ||||
|     // Still does not work, although ARMv7 manual says that float->int
 | ||||
|     // saturates, i.e. chooses the nearest representable value. Also causes
 | ||||
|     // out-of-memory for MSVC, and unsafe cast in farm_sve.
 | ||||
| #if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC && !defined(HWY_EMULATE_SVE) | ||||
|     // out-of-memory for MSVC.
 | ||||
| #if HWY_TARGET != HWY_NEON && !HWY_COMPILER_MSVC | ||||
|     using TI = MakeSigned<TF>; | ||||
|     const Rebind<TI, DF> di; | ||||
| 
 | ||||
|     // Huge positive (lvalue works around GCC bug, tested with 10.2.1, where
 | ||||
|     // the expected i32 value is otherwise 0x80..00).
 | ||||
|     const auto expected_max = Set(di, LimitsMax<TI>()); | ||||
|     HWY_ASSERT_VEC_EQ(di, expected_max, ConvertTo(di, Set(df, TF(1E20)))); | ||||
|     // Workaround for incorrect 32-bit GCC codegen for SSSE3 - Print-ing
 | ||||
|     // the expected lvalue also seems to prevent the issue.
 | ||||
|     const size_t N = Lanes(df); | ||||
|     auto expected = AllocateAligned<TI>(N); | ||||
| 
 | ||||
|     // Huge negative (also lvalue for safety, but GCC bug was not triggered)
 | ||||
|     const auto expected_min = Set(di, LimitsMin<TI>()); | ||||
|     HWY_ASSERT_VEC_EQ(di, expected_min, ConvertTo(di, Set(df, TF(-1E20)))); | ||||
|     // Huge positive
 | ||||
|     Store(Set(di, LimitsMax<TI>()), di, expected.get()); | ||||
|     HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(1E20)))); | ||||
| 
 | ||||
|     // Huge negative
 | ||||
|     Store(Set(di, LimitsMin<TI>()), di, expected.get()); | ||||
|     HWY_ASSERT_VEC_EQ(di, expected.get(), ConvertTo(di, Set(df, TF(-1E20)))); | ||||
| #else | ||||
|     (void)df; | ||||
| #endif | ||||
|  | @ -634,10 +416,6 @@ class TestIntFromFloat { | |||
|           const uint64_t bits = rng(); | ||||
|           memcpy(&from[i], &bits, sizeof(TF)); | ||||
|         } while (!std::isfinite(from[i])); | ||||
| #if defined(HWY_EMULATE_SVE) | ||||
|         // farm_sve just casts, which is undefined if the value is out of range.
 | ||||
|         from[i] = HWY_MIN(HWY_MAX(min / 2, from[i]), max / 2); | ||||
| #endif | ||||
|         if (from[i] >= max) { | ||||
|           expected[i] = LimitsMax<TI>(); | ||||
|         } else if (from[i] <= min) { | ||||
|  | @ -725,30 +503,21 @@ struct TestI32F64 { | |||
|     const size_t N = Lanes(df); | ||||
| 
 | ||||
|     // Integer positive
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0)))); | ||||
|     HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4)))); | ||||
| 
 | ||||
|     // Integer negative
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N)))); | ||||
|     HWY_ASSERT_VEC_EQ(df, Iota(df, -TF(N)), PromoteTo(df, Iota(di, -TI(N)))); | ||||
| 
 | ||||
|     // Above positive
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001)))); | ||||
|     HWY_ASSERT_VEC_EQ(df, Iota(df, TF(2.0)), PromoteTo(df, Iota(di, TI(2)))); | ||||
| 
 | ||||
|     // Below positive
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999)))); | ||||
|     HWY_ASSERT_VEC_EQ(df, Iota(df, TF(4.0)), PromoteTo(df, Iota(di, TI(4)))); | ||||
| 
 | ||||
|     const TF eps = static_cast<TF>(0.0001); | ||||
|     // Above negative
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), | ||||
|                       DemoteTo(di, Iota(df, -TF(N + 1) + eps))); | ||||
|     HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-4.0)), PromoteTo(df, Iota(di, TI(-4)))); | ||||
| 
 | ||||
|     // Below negative
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)), | ||||
|                       DemoteTo(di, Iota(df, -TF(N + 1) - eps))); | ||||
|     HWY_ASSERT_VEC_EQ(df, Iota(df, TF(-2.0)), PromoteTo(df, Iota(di, TI(-2)))); | ||||
| 
 | ||||
|     // Max positive int
 | ||||
|  | @ -758,22 +527,11 @@ struct TestI32F64 { | |||
|     // Min negative int
 | ||||
|     HWY_ASSERT_VEC_EQ(df, Set(df, TF(LimitsMin<TI>())), | ||||
|                       PromoteTo(df, Set(di, LimitsMin<TI>()))); | ||||
| 
 | ||||
|     // farm_sve just casts, which is undefined if the value is out of range.
 | ||||
| #if !defined(HWY_EMULATE_SVE) | ||||
|     // Huge positive float
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()), | ||||
|                       DemoteTo(di, Set(df, TF(1E12)))); | ||||
| 
 | ||||
|     // Huge negative float
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()), | ||||
|                       DemoteTo(di, Set(df, TF(-1E12)))); | ||||
| #endif | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllI32F64() { | ||||
| #if HWY_CAP_FLOAT64 | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   ForDemoteVectors<TestI32F64>()(double()); | ||||
| #endif | ||||
| } | ||||
|  | @ -790,12 +548,8 @@ namespace hwy { | |||
| HWY_BEFORE_TEST(HwyConvertTest); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBitCast); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllPromoteTo); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToInt); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToMixed); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllDemoteToFloat); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllF16); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllBF16); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllReorderDemote2To); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllConvertU8); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllIntFromFloat); | ||||
| HWY_EXPORT_AND_TEST_P(HwyConvertTest, TestAllFloatFromInt); | ||||
|  |  | |||
							
								
								
									
										25
									
								
								third_party/highway/hwy/tests/crypto_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										25
									
								
								third_party/highway/hwy/tests/crypto_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -74,7 +74,7 @@ class TestAES { | |||
|     } | ||||
| 
 | ||||
|     for (size_t i = 0; i < 256; i += N) { | ||||
|       const auto in = Iota(d, i); | ||||
|       const auto in = Iota(d, static_cast<T>(i)); | ||||
|       HWY_ASSERT_VEC_EQ(d, expected.get() + i, detail::SubBytes(in)); | ||||
|     } | ||||
|   } | ||||
|  | @ -89,11 +89,17 @@ class TestAES { | |||
|         0x42, 0xCA, 0x6B, 0x99, 0x7A, 0x5C, 0x58, 0x16}; | ||||
|     const auto test = LoadDup128(d, test_lanes); | ||||
| 
 | ||||
|     // = ShiftRow result
 | ||||
|     alignas(16) constexpr uint8_t expected_sr_lanes[16] = { | ||||
|         0x09, 0x28, 0x7F, 0x47, 0x6F, 0x74, 0x6A, 0xBF, | ||||
|         0x2C, 0x4A, 0x62, 0x04, 0xDA, 0x08, 0xE3, 0xEE}; | ||||
|     const auto expected_sr = LoadDup128(d, expected_sr_lanes); | ||||
| 
 | ||||
|     // = MixColumn result
 | ||||
|     alignas(16) constexpr uint8_t expected0_lanes[16] = { | ||||
|     alignas(16) constexpr uint8_t expected_mc_lanes[16] = { | ||||
|         0x52, 0x9F, 0x16, 0xC2, 0x97, 0x86, 0x15, 0xCA, | ||||
|         0xE0, 0x1A, 0xAE, 0x54, 0xBA, 0x1A, 0x26, 0x59}; | ||||
|     const auto expected0 = LoadDup128(d, expected0_lanes); | ||||
|     const auto expected_mc = LoadDup128(d, expected_mc_lanes); | ||||
| 
 | ||||
|     // = KeyAddition result
 | ||||
|     alignas(16) constexpr uint8_t expected_lanes[16] = { | ||||
|  | @ -103,17 +109,20 @@ class TestAES { | |||
| 
 | ||||
|     alignas(16) uint8_t key_lanes[16]; | ||||
|     for (size_t i = 0; i < 16; ++i) { | ||||
|       key_lanes[i] = expected0_lanes[i] ^ expected_lanes[i]; | ||||
|       key_lanes[i] = expected_mc_lanes[i] ^ expected_lanes[i]; | ||||
|     } | ||||
|     const auto round_key = LoadDup128(d, key_lanes); | ||||
| 
 | ||||
|     HWY_ASSERT_VEC_EQ(d, expected0, AESRound(test, Zero(d))); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected_mc, AESRound(test, Zero(d))); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected, AESRound(test, round_key)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected_sr, AESLastRound(test, Zero(d))); | ||||
|     HWY_ASSERT_VEC_EQ(d, Xor(expected_sr, round_key), | ||||
|                       AESLastRound(test, round_key)); | ||||
| 
 | ||||
|     TestSBox(t, d); | ||||
|   } | ||||
| }; | ||||
| HWY_NOINLINE void TestAllAES() { ForGE128Vectors<TestAES>()(uint8_t()); } | ||||
| HWY_NOINLINE void TestAllAES() { ForGEVectors<128, TestAES>()(uint8_t()); } | ||||
| 
 | ||||
| #else | ||||
| HWY_NOINLINE void TestAllAES() {} | ||||
|  | @ -123,7 +132,7 @@ struct TestCLMul { | |||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     // needs 64 bit lanes and 128-bit result
 | ||||
| #if HWY_TARGET != HWY_SCALAR && HWY_CAP_INTEGER64 | ||||
| #if HWY_TARGET != HWY_SCALAR && HWY_HAVE_INTEGER64 | ||||
|     const size_t N = Lanes(d); | ||||
|     if (N == 1) return; | ||||
| 
 | ||||
|  | @ -525,7 +534,7 @@ struct TestCLMul { | |||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllCLMul() { ForGE128Vectors<TestCLMul>()(uint64_t()); } | ||||
| HWY_NOINLINE void TestAllCLMul() { ForGEVectors<128, TestCLMul>()(uint64_t()); } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
|  |  | |||
							
								
								
									
										333
									
								
								third_party/highway/hwy/tests/demote_test.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										333
									
								
								third_party/highway/hwy/tests/demote_test.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,333 @@ | |||
| // Copyright 2019 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include <stddef.h> | ||||
| #include <stdint.h> | ||||
| #include <string.h> | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "tests/demote_test.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| 
 | ||||
| #include "hwy/highway.h" | ||||
| #include "hwy/tests/test_util-inl.h" | ||||
| 
 | ||||
| // Causes build timeout.
 | ||||
| #if !HWY_IS_MSAN | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| template <typename T, HWY_IF_FLOAT(T)> | ||||
| bool IsFinite(T t) { | ||||
|   return std::isfinite(t); | ||||
| } | ||||
| // Wrapper avoids calling std::isfinite for integer types (ambiguous).
 | ||||
| template <typename T, HWY_IF_NOT_FLOAT(T)> | ||||
| bool IsFinite(T /*unused*/) { | ||||
|   return true; | ||||
| } | ||||
| 
 | ||||
| template <typename ToT> | ||||
| struct TestDemoteTo { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D from_d) { | ||||
|     static_assert(!IsFloat<ToT>(), "Use TestDemoteToFloat for float output"); | ||||
|     static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); | ||||
|     const Rebind<ToT, D> to_d; | ||||
| 
 | ||||
|     const size_t N = Lanes(from_d); | ||||
|     auto from = AllocateAligned<T>(N); | ||||
|     auto expected = AllocateAligned<ToT>(N); | ||||
| 
 | ||||
|     // Narrower range in the wider type, for clamping before we cast
 | ||||
|     const T min = LimitsMin<ToT>(); | ||||
|     const T max = LimitsMax<ToT>(); | ||||
| 
 | ||||
|     const auto value_ok = [&](T& value) { | ||||
|       if (!IsFinite(value)) return false; | ||||
|       return true; | ||||
|     }; | ||||
| 
 | ||||
|     RandomState rng; | ||||
|     for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { | ||||
|       for (size_t i = 0; i < N; ++i) { | ||||
|         do { | ||||
|           const uint64_t bits = rng(); | ||||
|           memcpy(&from[i], &bits, sizeof(T)); | ||||
|         } while (!value_ok(from[i])); | ||||
|         expected[i] = static_cast<ToT>(HWY_MIN(HWY_MAX(min, from[i]), max)); | ||||
|       } | ||||
| 
 | ||||
|       const auto in = Load(from_d, from.get()); | ||||
|       HWY_ASSERT_VEC_EQ(to_d, expected.get(), DemoteTo(to_d, in)); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllDemoteToInt() { | ||||
|   ForDemoteVectors<TestDemoteTo<uint8_t>>()(int16_t()); | ||||
|   ForDemoteVectors<TestDemoteTo<uint8_t>, 2>()(int32_t()); | ||||
| 
 | ||||
|   ForDemoteVectors<TestDemoteTo<int8_t>>()(int16_t()); | ||||
|   ForDemoteVectors<TestDemoteTo<int8_t>, 2>()(int32_t()); | ||||
| 
 | ||||
|   const ForDemoteVectors<TestDemoteTo<uint16_t>> to_u16; | ||||
|   to_u16(int32_t()); | ||||
| 
 | ||||
|   const ForDemoteVectors<TestDemoteTo<int16_t>> to_i16; | ||||
|   to_i16(int32_t()); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllDemoteToMixed() { | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   const ForDemoteVectors<TestDemoteTo<int32_t>> to_i32; | ||||
|   to_i32(double()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| template <typename ToT> | ||||
| struct TestDemoteToFloat { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D from_d) { | ||||
|     // For floats, we clamp differently and cannot call LimitsMin.
 | ||||
|     static_assert(IsFloat<ToT>(), "Use TestDemoteTo for integer output"); | ||||
|     static_assert(sizeof(T) > sizeof(ToT), "Input type must be wider"); | ||||
|     const Rebind<ToT, D> to_d; | ||||
| 
 | ||||
|     const size_t N = Lanes(from_d); | ||||
|     auto from = AllocateAligned<T>(N); | ||||
|     auto expected = AllocateAligned<ToT>(N); | ||||
| 
 | ||||
|     RandomState rng; | ||||
|     for (size_t rep = 0; rep < AdjustedReps(1000); ++rep) { | ||||
|       for (size_t i = 0; i < N; ++i) { | ||||
|         do { | ||||
|           const uint64_t bits = rng(); | ||||
|           memcpy(&from[i], &bits, sizeof(T)); | ||||
|         } while (!IsFinite(from[i])); | ||||
|         const T magn = std::abs(from[i]); | ||||
|         const T max_abs = HighestValue<ToT>(); | ||||
|         // NOTE: std:: version from C++11 cmath is not defined in RVV GCC, see
 | ||||
|         // https://lists.freebsd.org/pipermail/freebsd-current/2014-January/048130.html
 | ||||
|         const T clipped = copysign(HWY_MIN(magn, max_abs), from[i]); | ||||
|         expected[i] = static_cast<ToT>(clipped); | ||||
|       } | ||||
| 
 | ||||
|       HWY_ASSERT_VEC_EQ(to_d, expected.get(), | ||||
|                         DemoteTo(to_d, Load(from_d, from.get()))); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllDemoteToFloat() { | ||||
|   // Must test f16 separately because we can only load/store/convert them.
 | ||||
| 
 | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   const ForDemoteVectors<TestDemoteToFloat<float>, 1> to_float; | ||||
|   to_float(double()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| template <class D> | ||||
| AlignedFreeUniquePtr<float[]> ReorderBF16TestCases(D d, size_t& padded) { | ||||
|   const float test_cases[] = { | ||||
|       // Same as BF16TestCases:
 | ||||
|       // +/- 1
 | ||||
|       1.0f, | ||||
|       -1.0f, | ||||
|       // +/- 0
 | ||||
|       0.0f, | ||||
|       -0.0f, | ||||
|       // near 0
 | ||||
|       0.25f, | ||||
|       -0.25f, | ||||
|       // +/- integer
 | ||||
|       4.0f, | ||||
|       -32.0f, | ||||
|       // positive +/- delta
 | ||||
|       2.015625f, | ||||
|       3.984375f, | ||||
|       // negative +/- delta
 | ||||
|       -2.015625f, | ||||
|       -3.984375f, | ||||
| 
 | ||||
|       // No huge values - would interfere with sum. But add more to fill 2 * N:
 | ||||
|       -2.0f, | ||||
|       -10.0f, | ||||
|       0.03125f, | ||||
|       1.03125f, | ||||
|       1.5f, | ||||
|       2.0f, | ||||
|       4.0f, | ||||
|       5.0f, | ||||
|       6.0f, | ||||
|       8.0f, | ||||
|       10.0f, | ||||
|       256.0f, | ||||
|       448.0f, | ||||
|       2080.0f, | ||||
|   }; | ||||
|   const size_t kNumTestCases = sizeof(test_cases) / sizeof(test_cases[0]); | ||||
|   const size_t N = Lanes(d); | ||||
|   padded = RoundUpTo(kNumTestCases, 2 * N);  // allow loading pairs of vectors
 | ||||
|   auto in = AllocateAligned<float>(padded); | ||||
|   auto expected = AllocateAligned<float>(padded); | ||||
|   std::copy(test_cases, test_cases + kNumTestCases, in.get()); | ||||
|   std::fill(in.get() + kNumTestCases, in.get() + padded, 0.0f); | ||||
|   return in; | ||||
| } | ||||
| 
 | ||||
| class TestReorderDemote2To { | ||||
|   // In-place N^2 selection sort to avoid dependencies
 | ||||
|   void Sort(float* p, size_t count) { | ||||
|     for (size_t i = 0; i < count - 1; ++i) { | ||||
|       // Find min_element
 | ||||
|       size_t idx_min = i; | ||||
|       for (size_t j = i + 1; j < count; j++) { | ||||
|         if (p[j] < p[idx_min]) { | ||||
|           idx_min = j; | ||||
|         } | ||||
|       } | ||||
| 
 | ||||
|       // Swap with current
 | ||||
|       const float tmp = p[i]; | ||||
|       p[i] = p[idx_min]; | ||||
|       p[idx_min] = tmp; | ||||
|     } | ||||
|   } | ||||
| 
 | ||||
|  public: | ||||
|   template <typename TF32, class DF32> | ||||
|   HWY_NOINLINE void operator()(TF32 /*t*/, DF32 d32) { | ||||
| #if HWY_TARGET != HWY_SCALAR | ||||
| 
 | ||||
|     size_t padded; | ||||
|     auto in = ReorderBF16TestCases(d32, padded); | ||||
| 
 | ||||
|     using TBF16 = bfloat16_t; | ||||
|     const Repartition<TBF16, DF32> dbf16; | ||||
|     const Half<decltype(dbf16)> dbf16_half; | ||||
|     const size_t N = Lanes(d32); | ||||
|     auto temp16 = AllocateAligned<TBF16>(2 * N); | ||||
|     auto expected = AllocateAligned<float>(2 * N); | ||||
|     auto actual = AllocateAligned<float>(2 * N); | ||||
| 
 | ||||
|     for (size_t i = 0; i < padded; i += 2 * N) { | ||||
|       const auto f0 = Load(d32, &in[i + 0]); | ||||
|       const auto f1 = Load(d32, &in[i + N]); | ||||
|       const auto v16 = ReorderDemote2To(dbf16, f0, f1); | ||||
|       Store(v16, dbf16, temp16.get()); | ||||
|       const auto promoted0 = PromoteTo(d32, Load(dbf16_half, temp16.get() + 0)); | ||||
|       const auto promoted1 = PromoteTo(d32, Load(dbf16_half, temp16.get() + N)); | ||||
| 
 | ||||
|       // Smoke test: sum should be same (with tolerance for non-associativity)
 | ||||
|       const auto sum_expected = | ||||
|           GetLane(SumOfLanes(d32, Add(promoted0, promoted1))); | ||||
|       const auto sum_actual = GetLane(SumOfLanes(d32, Add(f0, f1))); | ||||
|       HWY_ASSERT(sum_actual - 1E-4 <= sum_actual && | ||||
|                  sum_expected <= sum_actual + 1E-4); | ||||
| 
 | ||||
|       // Ensure values are the same after sorting to undo the Reorder
 | ||||
|       Store(f0, d32, expected.get() + 0); | ||||
|       Store(f1, d32, expected.get() + N); | ||||
|       Store(promoted0, d32, actual.get() + 0); | ||||
|       Store(promoted1, d32, actual.get() + N); | ||||
|       Sort(expected.get(), 2 * N); | ||||
|       Sort(actual.get(), 2 * N); | ||||
|       HWY_ASSERT_VEC_EQ(d32, expected.get() + 0, Load(d32, actual.get() + 0)); | ||||
|       HWY_ASSERT_VEC_EQ(d32, expected.get() + N, Load(d32, actual.get() + N)); | ||||
|     } | ||||
| #else  // HWY_SCALAR
 | ||||
|     (void)d32; | ||||
| #endif | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllReorderDemote2To() { | ||||
|   ForShrinkableVectors<TestReorderDemote2To>()(float()); | ||||
| } | ||||
| 
 | ||||
| struct TestI32F64 { | ||||
|   template <typename TF, class DF> | ||||
|   HWY_NOINLINE void operator()(TF /*unused*/, const DF df) { | ||||
|     using TI = int32_t; | ||||
|     const Rebind<TI, DF> di; | ||||
|     const size_t N = Lanes(df); | ||||
| 
 | ||||
|     // Integer positive
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, TI(4)), DemoteTo(di, Iota(df, TF(4.0)))); | ||||
| 
 | ||||
|     // Integer negative
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), DemoteTo(di, Iota(df, -TF(N)))); | ||||
| 
 | ||||
|     // Above positive
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, TI(2)), DemoteTo(di, Iota(df, TF(2.001)))); | ||||
| 
 | ||||
|     // Below positive
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, TI(3)), DemoteTo(di, Iota(df, TF(3.9999)))); | ||||
| 
 | ||||
|     const TF eps = static_cast<TF>(0.0001); | ||||
|     // Above negative
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N)), | ||||
|                       DemoteTo(di, Iota(df, -TF(N + 1) + eps))); | ||||
| 
 | ||||
|     // Below negative
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Iota(di, -TI(N + 1)), | ||||
|                       DemoteTo(di, Iota(df, -TF(N + 1) - eps))); | ||||
| 
 | ||||
|     // Huge positive float
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMax<TI>()), | ||||
|                       DemoteTo(di, Set(df, TF(1E12)))); | ||||
| 
 | ||||
|     // Huge negative float
 | ||||
|     HWY_ASSERT_VEC_EQ(di, Set(di, LimitsMin<TI>()), | ||||
|                       DemoteTo(di, Set(df, TF(-1E12)))); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllI32F64() { | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   ForDemoteVectors<TestI32F64>()(double()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #endif  //  !HWY_IS_MSAN
 | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| 
 | ||||
| namespace hwy { | ||||
| #if !HWY_IS_MSAN | ||||
| HWY_BEFORE_TEST(HwyDemoteTest); | ||||
| HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToInt); | ||||
| HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToMixed); | ||||
| HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllDemoteToFloat); | ||||
| HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllReorderDemote2To); | ||||
| HWY_EXPORT_AND_TEST_P(HwyDemoteTest, TestAllI32F64); | ||||
| #endif  //  !HWY_IS_MSAN
 | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| // Ought not to be necessary, but without this, no tests run on RVV.
 | ||||
| int main(int argc, char** argv) { | ||||
|   ::testing::InitGoogleTest(&argc, argv); | ||||
|   return RUN_ALL_TESTS(); | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
							
								
								
									
										81
									
								
								third_party/highway/hwy/tests/logical_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										81
									
								
								third_party/highway/hwy/tests/logical_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -17,7 +17,6 @@ | |||
| #include <string.h>  // memcmp
 | ||||
| 
 | ||||
| #include "hwy/aligned_allocator.h" | ||||
| #include "hwy/base.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "tests/logical_test.cc" | ||||
|  | @ -59,6 +58,15 @@ struct TestLogicalInteger { | |||
|     HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, v0)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, AndNot(vi, vi)); | ||||
| 
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, v0)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, vi, v0)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, OrAnd(v0, v0, vi)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vi, OrAnd(v0, vi, vi)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, v0)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, v0)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, v0, vi)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vi, OrAnd(vi, vi, vi)); | ||||
| 
 | ||||
|     auto v = vi; | ||||
|     v = And(v, vi); | ||||
|     HWY_ASSERT_VEC_EQ(d, vi, v); | ||||
|  | @ -156,6 +164,43 @@ struct TestCopySign { | |||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestIfVecThenElse { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     RandomState rng; | ||||
| 
 | ||||
|     using TU = MakeUnsigned<T>;  // For all-one mask
 | ||||
|     const Rebind<TU, D> du; | ||||
|     const size_t N = Lanes(d); | ||||
|     auto in1 = AllocateAligned<T>(N); | ||||
|     auto in2 = AllocateAligned<T>(N); | ||||
|     auto vec_lanes = AllocateAligned<TU>(N); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     // Each lane should have a chance of having mask=true.
 | ||||
|     for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { | ||||
|       for (size_t i = 0; i < N; ++i) { | ||||
|         in1[i] = static_cast<T>(Random32(&rng)); | ||||
|         in2[i] = static_cast<T>(Random32(&rng)); | ||||
|         vec_lanes[i] = (Random32(&rng) & 16) ? static_cast<TU>(~TU(0)) : TU(0); | ||||
|       } | ||||
| 
 | ||||
|       const auto v1 = Load(d, in1.get()); | ||||
|       const auto v2 = Load(d, in2.get()); | ||||
|       const auto vec = BitCast(d, Load(du, vec_lanes.get())); | ||||
| 
 | ||||
|       for (size_t i = 0; i < N; ++i) { | ||||
|         expected[i] = vec_lanes[i] ? in1[i] : in2[i]; | ||||
|       } | ||||
|       HWY_ASSERT_VEC_EQ(d, expected.get(), IfVecThenElse(vec, v1, v2)); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllIfVecThenElse() { | ||||
|   ForAllTypes(ForPartialVectors<TestIfVecThenElse>()); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllCopySign() { | ||||
|   ForFloatTypes(ForPartialVectors<TestCopySign>()); | ||||
| } | ||||
|  | @ -180,6 +225,31 @@ HWY_NOINLINE void TestAllZeroIfNegative() { | |||
|   ForFloatTypes(ForPartialVectors<TestZeroIfNegative>()); | ||||
| } | ||||
| 
 | ||||
| struct TestIfNegative { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto vp = Iota(d, 1); | ||||
|     const auto vn = Or(vp, SignBit(d)); | ||||
| 
 | ||||
|     // Zero and positive remain unchanged
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(v0, vn, v0)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(v0, v0, vn)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vp, vn, vp)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vp, vp, vn)); | ||||
| 
 | ||||
|     // Negative are replaced with 2nd arg
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, IfNegativeThenElse(vn, v0, vp)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vn, IfNegativeThenElse(vn, vn, v0)); | ||||
|     HWY_ASSERT_VEC_EQ(d, vp, IfNegativeThenElse(vn, vp, vn)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllIfNegative() { | ||||
|   ForFloatTypes(ForPartialVectors<TestIfNegative>()); | ||||
|   ForSignedTypes(ForPartialVectors<TestIfNegative>()); | ||||
| } | ||||
| 
 | ||||
| struct TestBroadcastSignBit { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|  | @ -234,16 +304,11 @@ HWY_NOINLINE void TestAllTestBit() { | |||
| struct TestPopulationCount { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
| #if HWY_TARGET == HWY_RVV || HWY_IS_DEBUG_BUILD | ||||
|     constexpr size_t kNumTests = 1 << 14; | ||||
| #else | ||||
|     constexpr size_t kNumTests = 1 << 20; | ||||
| #endif | ||||
|     RandomState rng; | ||||
|     size_t N = Lanes(d); | ||||
|     auto data = AllocateAligned<T>(N); | ||||
|     auto popcnt = AllocateAligned<T>(N); | ||||
|     for (size_t i = 0; i < kNumTests / N; i++) { | ||||
|     for (size_t i = 0; i < AdjustedReps(1 << 18) / N; i++) { | ||||
|       for (size_t i = 0; i < N; i++) { | ||||
|         data[i] = static_cast<T>(rng()); | ||||
|         popcnt[i] = static_cast<T>(PopCount(data[i])); | ||||
|  | @ -268,8 +333,10 @@ namespace hwy { | |||
| HWY_BEFORE_TEST(HwyLogicalTest); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalInteger); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllLogicalFloat); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfVecThenElse); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllCopySign); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllZeroIfNegative); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllIfNegative); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllBroadcastSignBit); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllTestBit); | ||||
| HWY_EXPORT_AND_TEST_P(HwyLogicalTest, TestAllPopulationCount); | ||||
|  |  | |||
							
								
								
									
										15
									
								
								third_party/highway/hwy/tests/mask_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										15
									
								
								third_party/highway/hwy/tests/mask_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -17,8 +17,6 @@ | |||
| #include <stdint.h> | ||||
| #include <string.h>  // memcmp
 | ||||
| 
 | ||||
| #include "hwy/base.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "tests/mask_test.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
|  | @ -55,13 +53,18 @@ struct TestFirstN { | |||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
| 
 | ||||
|     const RebindToSigned<D> di; | ||||
|     using TI = TFromD<decltype(di)>; | ||||
|     using TN = SignedFromSize<HWY_MIN(sizeof(size_t), sizeof(TI))>; | ||||
|     const size_t max_len = static_cast<size_t>(LimitsMax<TN>()); | ||||
| 
 | ||||
|     for (size_t len = 0; len <= HWY_MIN(2 * N, max_len); ++len) { | ||||
| // TODO(janwas): 8-bit FirstN (using SlideUp) causes spike to freeze.
 | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|     if (sizeof(T) == 1) return; | ||||
| #endif | ||||
| 
 | ||||
|     const size_t max_lanes = AdjustedReps(HWY_MIN(2 * N, size_t(64))); | ||||
|     for (size_t len = 0; len <= HWY_MIN(max_lanes, max_len); ++len) { | ||||
|       const auto expected = | ||||
|           RebindMask(d, Lt(Iota(di, 0), Set(di, static_cast<TI>(len)))); | ||||
|       const auto actual = FirstN(d, len); | ||||
|  | @ -368,7 +371,7 @@ struct TestFindFirstTrue { | |||
|     memset(bool_lanes.get(), 0, N * sizeof(TI)); | ||||
| 
 | ||||
|     // For all combinations of zero/nonzero state of subset of lanes:
 | ||||
|     const size_t max_lanes = HWY_MIN(N, size_t(10)); | ||||
|     const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(9))); | ||||
| 
 | ||||
|     HWY_ASSERT_EQ(intptr_t(-1), FindFirstTrue(d, MaskFalse(d))); | ||||
|     HWY_ASSERT_EQ(intptr_t(0), FindFirstTrue(d, MaskTrue(d))); | ||||
|  | @ -407,7 +410,7 @@ struct TestLogicalMask { | |||
|     HWY_ASSERT_MASK_EQ(d, m_all, Not(m0)); | ||||
| 
 | ||||
|     // For all combinations of zero/nonzero state of subset of lanes:
 | ||||
|     const size_t max_lanes = HWY_MIN(N, size_t(6)); | ||||
|     const size_t max_lanes = AdjustedLog2Reps(HWY_MIN(N, size_t(6))); | ||||
|     for (size_t code = 0; code < (1ull << max_lanes); ++code) { | ||||
|       for (size_t i = 0; i < max_lanes; ++i) { | ||||
|         bool_lanes[i] = (code & (1ull << i)) ? TI(1) : TI(0); | ||||
|  |  | |||
							
								
								
									
										12
									
								
								third_party/highway/hwy/tests/memory_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										12
									
								
								third_party/highway/hwy/tests/memory_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -36,7 +36,7 @@ struct TestLoadStore { | |||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     const auto hi = Iota(d, 1 + N); | ||||
|     const auto hi = Iota(d, static_cast<T>(1 + N)); | ||||
|     const auto lo = Iota(d, 1); | ||||
|     auto lanes = AllocateAligned<T>(2 * N); | ||||
|     Store(hi, d, &lanes[N]); | ||||
|  | @ -135,7 +135,7 @@ struct TestStoreInterleaved3 { | |||
| HWY_NOINLINE void TestAllStoreInterleaved3() { | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|   // Segments are limited to 8 registers, so we can only go up to LMUL=2.
 | ||||
|   const ForExtendableVectors<TestStoreInterleaved3, 4> test; | ||||
|   const ForExtendableVectors<TestStoreInterleaved3, 2> test; | ||||
| #else | ||||
|   const ForPartialVectors<TestStoreInterleaved3> test; | ||||
| #endif | ||||
|  | @ -198,7 +198,7 @@ struct TestStoreInterleaved4 { | |||
| HWY_NOINLINE void TestAllStoreInterleaved4() { | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|   // Segments are limited to 8 registers, so we can only go up to LMUL=2.
 | ||||
|   const ForExtendableVectors<TestStoreInterleaved4, 4> test; | ||||
|   const ForExtendableVectors<TestStoreInterleaved4, 2> test; | ||||
| #else | ||||
|   const ForPartialVectors<TestStoreInterleaved4> test; | ||||
| #endif | ||||
|  | @ -230,7 +230,7 @@ struct TestLoadDup128 { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllLoadDup128() { | ||||
|   ForAllTypes(ForGE128Vectors<TestLoadDup128>()); | ||||
|   ForAllTypes(ForGEVectors<128, TestLoadDup128>()); | ||||
| } | ||||
| 
 | ||||
| struct TestStream { | ||||
|  | @ -245,7 +245,7 @@ struct TestStream { | |||
|     std::fill(out.get(), out.get() + 2 * affected_lanes, T(0)); | ||||
| 
 | ||||
|     Stream(v, d, out.get()); | ||||
|     StoreFence(); | ||||
|     FlushStream(); | ||||
|     const auto actual = Load(d, out.get()); | ||||
|     HWY_ASSERT_VEC_EQ(d, v, actual); | ||||
|     // Ensure Stream didn't modify more memory than expected
 | ||||
|  | @ -386,7 +386,7 @@ HWY_NOINLINE void TestAllGather() { | |||
| 
 | ||||
| HWY_NOINLINE void TestAllCache() { | ||||
|   LoadFence(); | ||||
|   StoreFence(); | ||||
|   FlushStream(); | ||||
|   int test = 0; | ||||
|   Prefetch(&test); | ||||
|   FlushCacheline(&test); | ||||
|  |  | |||
							
								
								
									
										433
									
								
								third_party/highway/hwy/tests/shift_test.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										433
									
								
								third_party/highway/hwy/tests/shift_test.cc
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,433 @@ | |||
| // Copyright 2019 Google LLC
 | ||||
| //
 | ||||
| // Licensed under the Apache License, Version 2.0 (the "License");
 | ||||
| // you may not use this file except in compliance with the License.
 | ||||
| // You may obtain a copy of the License at
 | ||||
| //
 | ||||
| //      http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| //
 | ||||
| // Unless required by applicable law or agreed to in writing, software
 | ||||
| // distributed under the License is distributed on an "AS IS" BASIS,
 | ||||
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | ||||
| // See the License for the specific language governing permissions and
 | ||||
| // limitations under the License.
 | ||||
| 
 | ||||
| #include <inttypes.h> | ||||
| #include <stddef.h> | ||||
| #include <stdint.h> | ||||
| 
 | ||||
| #include <algorithm> | ||||
| #include <limits> | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "tests/shift_test.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
| #include "hwy/highway.h" | ||||
| #include "hwy/tests/test_util-inl.h" | ||||
| 
 | ||||
| HWY_BEFORE_NAMESPACE(); | ||||
| namespace hwy { | ||||
| namespace HWY_NAMESPACE { | ||||
| 
 | ||||
| template <bool kSigned> | ||||
| struct TestLeftShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T t, D d) { | ||||
|     if (kSigned) { | ||||
|       // Also test positive values
 | ||||
|       TestLeftShifts</*kSigned=*/false>()(t, d); | ||||
|     } | ||||
| 
 | ||||
|     using TI = MakeSigned<T>; | ||||
|     using TU = MakeUnsigned<T>; | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
 | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
| 
 | ||||
|     // 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftLeft<0>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftLeftSame(values, 0)); | ||||
| 
 | ||||
|     // 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(T(i) - T(N)) : T(i); | ||||
|       expected[i] = T(TU(value) << 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<1>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, 1)); | ||||
| 
 | ||||
|     // max
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(T(i) - T(N)) : T(i); | ||||
|       expected[i] = T(TU(value) << kMaxShift); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeft<kMaxShift>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftLeftSame(values, kMaxShift)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <bool kSigned> | ||||
| struct TestVariableLeftShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T t, D d) { | ||||
|     if (kSigned) { | ||||
|       // Also test positive values
 | ||||
|       TestVariableLeftShifts</*kSigned=*/false>()(t, d); | ||||
|     } | ||||
| 
 | ||||
|     using TI = MakeSigned<T>; | ||||
|     using TU = MakeUnsigned<T>; | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto v1 = Set(d, 1); | ||||
|     const auto values = Iota(d, kSigned ? -TI(N) : TI(0));  // value to shift
 | ||||
| 
 | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
|     const auto max_shift = Set(d, kMaxShift); | ||||
|     const auto small_shifts = And(Iota(d, 0), max_shift); | ||||
|     const auto large_shifts = max_shift - small_shifts; | ||||
| 
 | ||||
|     // Same: 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, Shl(values, v0)); | ||||
| 
 | ||||
|     // Same: 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(i) - T(N) : T(i); | ||||
|       expected[i] = T(TU(value) << 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, v1)); | ||||
| 
 | ||||
|     // Same: max
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(i) - T(N) : T(i); | ||||
|       expected[i] = T(TU(value) << kMaxShift); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, max_shift)); | ||||
| 
 | ||||
|     // Variable: small
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const T value = kSigned ? T(i) - T(N) : T(i); | ||||
|       expected[i] = T(TU(value) << (i & kMaxShift)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(values, small_shifts)); | ||||
| 
 | ||||
|     // Variable: large
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(TU(1) << (kMaxShift - (i & kMaxShift))); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shl(v1, large_shifts)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestUnsignedRightShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     const auto values = Iota(d, 0); | ||||
| 
 | ||||
|     const T kMax = LimitsMax<T>(); | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
| 
 | ||||
|     // Shift by 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); | ||||
| 
 | ||||
|     // Shift by 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); | ||||
| 
 | ||||
|     // max
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> kMaxShift); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<kMaxShift>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, kMaxShift)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestRotateRight { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     constexpr size_t kBits = sizeof(T) * 8; | ||||
|     const auto mask_shift = Set(d, T{kBits}); | ||||
|     // Cover as many bit positions as possible to test shifting out
 | ||||
|     const auto values = Shl(Set(d, T{1}), And(Iota(d, 0), mask_shift)); | ||||
| 
 | ||||
|     // Rotate by 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, RotateRight<0>(values)); | ||||
| 
 | ||||
|     // Rotate by 1
 | ||||
|     Store(values, d, expected.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = (expected[i] >> 1) | (expected[i] << (kBits - 1)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<1>(values)); | ||||
| 
 | ||||
|     // Rotate by half
 | ||||
|     Store(values, d, expected.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = (expected[i] >> (kBits / 2)) | (expected[i] << (kBits / 2)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits / 2>(values)); | ||||
| 
 | ||||
|     // Rotate by max
 | ||||
|     Store(values, d, expected.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = (expected[i] >> (kBits - 1)) | (expected[i] << 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), RotateRight<kBits - 1>(values)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestVariableUnsignedRightShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto v1 = Set(d, 1); | ||||
|     const auto values = Iota(d, 0); | ||||
| 
 | ||||
|     const T kMax = LimitsMax<T>(); | ||||
|     const auto max = Set(d, kMax); | ||||
| 
 | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
|     const auto max_shift = Set(d, kMaxShift); | ||||
|     const auto small_shifts = And(Iota(d, 0), max_shift); | ||||
|     const auto large_shifts = max_shift - small_shifts; | ||||
| 
 | ||||
|     // Same: 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, Shr(values, v0)); | ||||
| 
 | ||||
|     // Same: 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, v1)); | ||||
| 
 | ||||
|     // Same: max
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, Shr(values, max_shift)); | ||||
| 
 | ||||
|     // Variable: small
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(i) >> (i & kMaxShift); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(values, small_shifts)); | ||||
| 
 | ||||
|     // Variable: Large
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = kMax >> (kMaxShift - (i & kMaxShift)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(max, large_shifts)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| template <int kAmount, typename T> | ||||
| T RightShiftNegative(T val) { | ||||
|   // C++ shifts are implementation-defined for negative numbers, and we have
 | ||||
|   // seen divisions replaced with shifts, so resort to bit operations.
 | ||||
|   using TU = hwy::MakeUnsigned<T>; | ||||
|   TU bits; | ||||
|   CopyBytes<sizeof(T)>(&val, &bits); | ||||
| 
 | ||||
|   const TU shifted = TU(bits >> kAmount); | ||||
| 
 | ||||
|   const TU all = TU(~TU(0)); | ||||
|   const size_t num_zero = sizeof(TU) * 8 - 1 - kAmount; | ||||
|   const TU sign_extended = static_cast<TU>((all << num_zero) & LimitsMax<TU>()); | ||||
| 
 | ||||
|   bits = shifted | sign_extended; | ||||
|   CopyBytes<sizeof(T)>(&bits, &val); | ||||
|   return val; | ||||
| } | ||||
| 
 | ||||
| class TestSignedRightShifts { | ||||
|  public: | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
|     constexpr T kMin = LimitsMin<T>(); | ||||
|     constexpr T kMax = LimitsMax<T>(); | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
| 
 | ||||
|     // First test positive values, negative are checked below.
 | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto values = And(Iota(d, 0), Set(d, kMax)); | ||||
| 
 | ||||
|     // Shift by 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftRight<0>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, values, ShiftRightSame(values, 0)); | ||||
| 
 | ||||
|     // Shift by 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(values, 1)); | ||||
| 
 | ||||
|     // max
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(values)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(values, kMaxShift)); | ||||
| 
 | ||||
|     // Even negative value
 | ||||
|     Test<0>(kMin, d, __LINE__); | ||||
|     Test<1>(kMin, d, __LINE__); | ||||
|     Test<2>(kMin, d, __LINE__); | ||||
|     Test<kMaxShift>(kMin, d, __LINE__); | ||||
| 
 | ||||
|     const T odd = static_cast<T>(kMin + 1); | ||||
|     Test<0>(odd, d, __LINE__); | ||||
|     Test<1>(odd, d, __LINE__); | ||||
|     Test<2>(odd, d, __LINE__); | ||||
|     Test<kMaxShift>(odd, d, __LINE__); | ||||
|   } | ||||
| 
 | ||||
|  private: | ||||
|   template <int kAmount, typename T, class D> | ||||
|   void Test(T val, D d, int line) { | ||||
|     const auto expected = Set(d, RightShiftNegative<kAmount>(val)); | ||||
|     const auto in = Set(d, val); | ||||
|     const char* file = __FILE__; | ||||
|     AssertVecEqual(d, expected, ShiftRight<kAmount>(in), file, line); | ||||
|     AssertVecEqual(d, expected, ShiftRightSame(in, kAmount), file, line); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestVariableSignedRightShifts { | ||||
|   template <typename T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     using TU = MakeUnsigned<T>; | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     constexpr T kMin = LimitsMin<T>(); | ||||
|     constexpr T kMax = LimitsMax<T>(); | ||||
| 
 | ||||
|     constexpr size_t kMaxShift = (sizeof(T) * 8) - 1; | ||||
| 
 | ||||
|     // First test positive values, negative are checked below.
 | ||||
|     const auto v0 = Zero(d); | ||||
|     const auto positive = Iota(d, 0) & Set(d, kMax); | ||||
| 
 | ||||
|     // Shift by 0
 | ||||
|     HWY_ASSERT_VEC_EQ(d, positive, ShiftRight<0>(positive)); | ||||
|     HWY_ASSERT_VEC_EQ(d, positive, ShiftRightSame(positive, 0)); | ||||
| 
 | ||||
|     // Shift by 1
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = T(T(i & kMax) >> 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRight<1>(positive)); | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ShiftRightSame(positive, 1)); | ||||
| 
 | ||||
|     // max
 | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, ShiftRight<kMaxShift>(positive)); | ||||
|     HWY_ASSERT_VEC_EQ(d, v0, ShiftRightSame(positive, kMaxShift)); | ||||
| 
 | ||||
|     const auto max_shift = Set(d, kMaxShift); | ||||
|     const auto small_shifts = And(Iota(d, 0), max_shift); | ||||
|     const auto large_shifts = max_shift - small_shifts; | ||||
| 
 | ||||
|     const auto negative = Iota(d, kMin); | ||||
| 
 | ||||
|     // Test varying negative to shift
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = RightShiftNegative<1>(static_cast<T>(kMin + i)); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(negative, Set(d, 1))); | ||||
| 
 | ||||
|     // Shift MSB right by small amounts
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const size_t amount = i & kMaxShift; | ||||
|       const TU shifted = ~((1ull << (kMaxShift - amount)) - 1); | ||||
|       CopyBytes<sizeof(T)>(&shifted, &expected[i]); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), small_shifts)); | ||||
| 
 | ||||
|     // Shift MSB right by large amounts
 | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const size_t amount = kMaxShift - (i & kMaxShift); | ||||
|       const TU shifted = ~((1ull << (kMaxShift - amount)) - 1); | ||||
|       CopyBytes<sizeof(T)>(&shifted, &expected[i]); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Shr(Set(d, kMin), large_shifts)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllShifts() { | ||||
|   ForUnsignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/false>>()); | ||||
|   ForSignedTypes(ForPartialVectors<TestLeftShifts</*kSigned=*/true>>()); | ||||
|   ForUnsignedTypes(ForPartialVectors<TestUnsignedRightShifts>()); | ||||
|   ForSignedTypes(ForPartialVectors<TestSignedRightShifts>()); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllVariableShifts() { | ||||
|   const ForPartialVectors<TestLeftShifts</*kSigned=*/false>> shl_u; | ||||
|   const ForPartialVectors<TestLeftShifts</*kSigned=*/true>> shl_s; | ||||
|   const ForPartialVectors<TestUnsignedRightShifts> shr_u; | ||||
|   const ForPartialVectors<TestSignedRightShifts> shr_s; | ||||
| 
 | ||||
|   shl_u(uint16_t()); | ||||
|   shr_u(uint16_t()); | ||||
| 
 | ||||
|   shl_u(uint32_t()); | ||||
|   shr_u(uint32_t()); | ||||
| 
 | ||||
|   shl_s(int16_t()); | ||||
|   shr_s(int16_t()); | ||||
| 
 | ||||
|   shl_s(int32_t()); | ||||
|   shr_s(int32_t()); | ||||
| 
 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   shl_u(uint64_t()); | ||||
|   shr_u(uint64_t()); | ||||
| 
 | ||||
|   shl_s(int64_t()); | ||||
|   shr_s(int64_t()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllRotateRight() { | ||||
|   const ForPartialVectors<TestRotateRight> test; | ||||
|   test(uint32_t()); | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   test(uint64_t()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
| HWY_AFTER_NAMESPACE(); | ||||
| 
 | ||||
| #if HWY_ONCE | ||||
| 
 | ||||
| namespace hwy { | ||||
| HWY_BEFORE_TEST(HwyShiftTest); | ||||
| HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllShifts); | ||||
| HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllVariableShifts); | ||||
| HWY_EXPORT_AND_TEST_P(HwyShiftTest, TestAllRotateRight); | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
| // Ought not to be necessary, but without this, no tests run on RVV.
 | ||||
| int main(int argc, char** argv) { | ||||
|   ::testing::InitGoogleTest(&argc, argv); | ||||
|   return RUN_ALL_TESTS(); | ||||
| } | ||||
| 
 | ||||
| #endif | ||||
							
								
								
									
										178
									
								
								third_party/highway/hwy/tests/swizzle_test.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										178
									
								
								third_party/highway/hwy/tests/swizzle_test.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -19,6 +19,8 @@ | |||
| 
 | ||||
| #include <array>  // IWYU pragma: keep
 | ||||
| 
 | ||||
| #include "hwy/base.h" | ||||
| 
 | ||||
| #undef HWY_TARGET_INCLUDE | ||||
| #define HWY_TARGET_INCLUDE "tests/swizzle_test.cc" | ||||
| #include "hwy/foreach_target.h" | ||||
|  | @ -44,12 +46,48 @@ HWY_NOINLINE void TestAllGetLane() { | |||
|   ForAllTypes(ForPartialVectors<TestGetLane>()); | ||||
| } | ||||
| 
 | ||||
| struct TestDupEven { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 1); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), DupEven(Iota(d, 1))); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllDupEven() { | ||||
|   ForUIF3264(ForShrinkableVectors<TestDupEven>()); | ||||
| } | ||||
| 
 | ||||
| struct TestDupOdd { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
| #if HWY_TARGET != HWY_SCALAR | ||||
|     const size_t N = Lanes(d); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = static_cast<T>((static_cast<int>(i) & ~1) + 2); | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), DupOdd(Iota(d, 1))); | ||||
| #else | ||||
|     (void)d; | ||||
| #endif | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllDupOdd() { | ||||
|   ForUIF3264(ForShrinkableVectors<TestDupOdd>()); | ||||
| } | ||||
| 
 | ||||
| struct TestOddEven { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     const auto even = Iota(d, 1); | ||||
|     const auto odd = Iota(d, 1 + N); | ||||
|     const auto odd = Iota(d, static_cast<T>(1 + N)); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = static_cast<T>(1 + i + ((i & 1) ? N : 0)); | ||||
|  | @ -67,7 +105,7 @@ struct TestOddEvenBlocks { | |||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     const auto even = Iota(d, 1); | ||||
|     const auto odd = Iota(d, 1 + N); | ||||
|     const auto odd = Iota(d, static_cast<T>(1 + N)); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const size_t idx_block = i / (16 / sizeof(T)); | ||||
|  | @ -78,7 +116,7 @@ struct TestOddEvenBlocks { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllOddEvenBlocks() { | ||||
|   ForAllTypes(ForShrinkableVectors<TestOddEvenBlocks>()); | ||||
|   ForAllTypes(ForGEVectors<128, TestOddEvenBlocks>()); | ||||
| } | ||||
| 
 | ||||
| struct TestSwapAdjacentBlocks { | ||||
|  | @ -100,7 +138,7 @@ struct TestSwapAdjacentBlocks { | |||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllSwapAdjacentBlocks() { | ||||
|   ForAllTypes(ForPartialVectors<TestSwapAdjacentBlocks>()); | ||||
|   ForAllTypes(ForGEVectors<128, TestSwapAdjacentBlocks>()); | ||||
| } | ||||
| 
 | ||||
| struct TestTableLookupLanes { | ||||
|  | @ -197,23 +235,131 @@ struct TestReverse { | |||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestReverse2 { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     const RebindToUnsigned<D> du;  // Iota does not support float16_t.
 | ||||
|     const auto v = BitCast(d, Iota(du, 1)); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     // Can't set float16_t value directly, need to permute in memory.
 | ||||
|     auto copy = AllocateAligned<T>(N); | ||||
|     Store(v, d, copy.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = copy[i ^ 1]; | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse2(d, v)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestReverse4 { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     const RebindToUnsigned<D> du;  // Iota does not support float16_t.
 | ||||
|     const auto v = BitCast(d, Iota(du, 1)); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     // Can't set float16_t value directly, need to permute in memory.
 | ||||
|     auto copy = AllocateAligned<T>(N); | ||||
|     Store(v, d, copy.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = copy[i ^ 3]; | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse4(d, v)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| struct TestReverse8 { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     const RebindToUnsigned<D> du;  // Iota does not support float16_t.
 | ||||
|     const auto v = BitCast(d, Iota(du, 1)); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     // Can't set float16_t value directly, need to permute in memory.
 | ||||
|     auto copy = AllocateAligned<T>(N); | ||||
|     Store(v, d, copy.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       expected[i] = copy[i ^ 7]; | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), Reverse8(d, v)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllReverse() { | ||||
|   // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
 | ||||
|   // which requires 16 bits.
 | ||||
|   ForUIF163264(ForPartialVectors<TestReverse>()); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllReverse2() { | ||||
|   // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
 | ||||
|   // which requires 16 bits.
 | ||||
|   ForUIF64(ForGEVectors<128, TestReverse2>()); | ||||
|   ForUIF32(ForGEVectors<64, TestReverse2>()); | ||||
|   ForUIF16(ForGEVectors<32, TestReverse2>()); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllReverse4() { | ||||
|   // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
 | ||||
|   // which requires 16 bits.
 | ||||
|   ForUIF64(ForGEVectors<256, TestReverse4>()); | ||||
|   ForUIF32(ForGEVectors<128, TestReverse4>()); | ||||
|   ForUIF16(ForGEVectors<64, TestReverse4>()); | ||||
| } | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllReverse8() { | ||||
|   // 8-bit is not supported because Risc-V uses rgather of Lanes - Iota,
 | ||||
|   // which requires 16 bits.
 | ||||
|   ForUIF64(ForGEVectors<512, TestReverse8>()); | ||||
|   ForUIF32(ForGEVectors<256, TestReverse8>()); | ||||
|   ForUIF16(ForGEVectors<128, TestReverse8>()); | ||||
| } | ||||
| 
 | ||||
| struct TestReverseBlocks { | ||||
|   template <class T, class D> | ||||
|   HWY_NOINLINE void operator()(T /*unused*/, D d) { | ||||
|     const size_t N = Lanes(d); | ||||
|     const RebindToUnsigned<D> du;  // Iota does not support float16_t.
 | ||||
|     const auto v = BitCast(d, Iota(du, 1)); | ||||
|     auto expected = AllocateAligned<T>(N); | ||||
| 
 | ||||
|     constexpr size_t kLanesPerBlock = 16 / sizeof(T); | ||||
|     const size_t num_blocks = N / kLanesPerBlock; | ||||
|     HWY_ASSERT(num_blocks != 0); | ||||
| 
 | ||||
|     // Can't set float16_t value directly, need to permute in memory.
 | ||||
|     auto copy = AllocateAligned<T>(N); | ||||
|     Store(v, d, copy.get()); | ||||
|     for (size_t i = 0; i < N; ++i) { | ||||
|       const size_t idx_block = i / kLanesPerBlock; | ||||
|       const size_t base = (num_blocks - 1 - idx_block) * kLanesPerBlock; | ||||
|       expected[i] = copy[base + (i % kLanesPerBlock)]; | ||||
|     } | ||||
|     HWY_ASSERT_VEC_EQ(d, expected.get(), ReverseBlocks(d, v)); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| HWY_NOINLINE void TestAllReverseBlocks() { | ||||
|   ForAllTypes(ForGEVectors<128, TestReverseBlocks>()); | ||||
| } | ||||
| 
 | ||||
| class TestCompress { | ||||
|   template <typename T, typename TI, size_t N> | ||||
|   void CheckStored(Simd<T, N> d, Simd<TI, N> di, size_t expected_pos, | ||||
|                    size_t actual_pos, const AlignedFreeUniquePtr<T[]>& in, | ||||
|   template <class D, class DI, typename T = TFromD<D>, typename TI = TFromD<DI>> | ||||
|   void CheckStored(D d, DI di, size_t expected_pos, size_t actual_pos, | ||||
|                    const AlignedFreeUniquePtr<T[]>& in, | ||||
|                    const AlignedFreeUniquePtr<TI[]>& mask_lanes, | ||||
|                    const AlignedFreeUniquePtr<T[]>& expected, const T* actual_u, | ||||
|                    int line) { | ||||
|     if (expected_pos != actual_pos) { | ||||
|       hwy::Abort(__FILE__, line, | ||||
|       hwy::Abort( | ||||
|           __FILE__, line, | ||||
|           "Size mismatch for %s: expected %" PRIu64 ", actual %" PRIu64 "\n", | ||||
|                  TypeName(T(), N).c_str(), static_cast<uint64_t>(expected_pos), static_cast<uint64_t>(actual_pos)); | ||||
|           TypeName(T(), Lanes(d)).c_str(), static_cast<uint64_t>(expected_pos), | ||||
|           static_cast<uint64_t>(actual_pos)); | ||||
|     } | ||||
|     // Upper lanes are undefined. Modified from AssertVecEqual.
 | ||||
|     for (size_t i = 0; i < expected_pos; ++i) { | ||||
|  | @ -222,6 +368,7 @@ class TestCompress { | |||
|                 "Mismatch at i=%" PRIu64 " of %" PRIu64 ", line %d:\n\n", | ||||
|                 static_cast<uint64_t>(i), static_cast<uint64_t>(expected_pos), | ||||
|                 line); | ||||
|         const size_t N = Lanes(d); | ||||
|         Print(di, "mask", Load(di, mask_lanes.get()), 0, N); | ||||
|         Print(d, "in", Load(d, in.get()), 0, N); | ||||
|         Print(d, "expect", Load(d, expected.get()), 0, N); | ||||
|  | @ -251,7 +398,10 @@ class TestCompress { | |||
|       auto expected = AllocateAligned<T>(N); | ||||
|       auto actual_a = AllocateAligned<T>(misalign + N); | ||||
|       T* actual_u = actual_a.get() + misalign; | ||||
|       auto bits = AllocateAligned<uint8_t>(HWY_MAX(8, (N + 7) / 8)); | ||||
| 
 | ||||
|       const size_t bits_size = RoundUpTo((N + 7) / 8, 8); | ||||
|       auto bits = AllocateAligned<uint8_t>(bits_size); | ||||
|       memset(bits.get(), 0, bits_size);  // for MSAN
 | ||||
| 
 | ||||
|       // Each lane should have a chance of having mask=true.
 | ||||
|       for (size_t rep = 0; rep < AdjustedReps(200); ++rep) { | ||||
|  | @ -465,7 +615,7 @@ HWY_NOINLINE void TestAllCompress() { | |||
| 
 | ||||
|   test(uint16_t()); | ||||
|   test(int16_t()); | ||||
| #if HWY_CAP_FLOAT16 | ||||
| #if HWY_HAVE_FLOAT16 | ||||
|   test(float16_t()); | ||||
| #endif | ||||
| 
 | ||||
|  | @ -482,11 +632,17 @@ HWY_AFTER_NAMESPACE(); | |||
| namespace hwy { | ||||
| HWY_BEFORE_TEST(HwySwizzleTest); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllGetLane); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupEven); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllDupOdd); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEven); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllOddEvenBlocks); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllSwapAdjacentBlocks); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllTableLookupLanes); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse2); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse4); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverse8); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllReverseBlocks); | ||||
| HWY_EXPORT_AND_TEST_P(HwySwizzleTest, TestAllCompress); | ||||
| }  // namespace hwy
 | ||||
| 
 | ||||
|  |  | |||
							
								
								
									
										372
									
								
								third_party/highway/hwy/tests/test_util-inl.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										372
									
								
								third_party/highway/hwy/tests/test_util-inl.h
									
									
									
									
										vendored
									
									
								
							|  | @ -41,7 +41,7 @@ HWY_NOINLINE void PrintValue(T value) { | |||
|   fprintf(stderr, "0x%02X,", byte); | ||||
| } | ||||
| 
 | ||||
| #if HWY_CAP_FLOAT16 | ||||
| #if HWY_HAVE_FLOAT16 | ||||
| HWY_NOINLINE void PrintValue(float16_t value) { | ||||
|   uint16_t bits; | ||||
|   CopyBytes<2>(&value, &bits); | ||||
|  | @ -70,8 +70,10 @@ void Print(const D d, const char* caption, VecArg<V> v, size_t lane_u = 0, | |||
| } | ||||
| 
 | ||||
| // Compare expected vector to vector.
 | ||||
| // HWY_INLINE works around a Clang SVE compiler bug where all but the first
 | ||||
| // 128 bits (the NEON register) of actual are zero.
 | ||||
| template <class D, typename T = TFromD<D>, class V = Vec<D>> | ||||
| void AssertVecEqual(D d, const T* expected, VecArg<V> actual, | ||||
| HWY_INLINE void AssertVecEqual(D d, const T* expected, VecArg<V> actual, | ||||
|                                const char* filename, const int line) { | ||||
|   const size_t N = Lanes(d); | ||||
|   auto actual_lanes = AllocateAligned<T>(N); | ||||
|  | @ -84,8 +86,10 @@ void AssertVecEqual(D d, const T* expected, VecArg<V> actual, | |||
| } | ||||
| 
 | ||||
| // Compare expected lanes to vector.
 | ||||
| // HWY_INLINE works around a Clang SVE compiler bug where all but the first
 | ||||
| // 128 bits (the NEON register) of actual are zero.
 | ||||
| template <class D, typename T = TFromD<D>, class V = Vec<D>> | ||||
| HWY_NOINLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual, | ||||
| HWY_INLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual, | ||||
|                                const char* filename, int line) { | ||||
|   auto expected_lanes = AllocateAligned<T>(Lanes(d)); | ||||
|   Store(expected, d, expected_lanes.get()); | ||||
|  | @ -96,7 +100,10 @@ HWY_NOINLINE void AssertVecEqual(D d, VecArg<V> expected, VecArg<V> actual, | |||
| template <class D> | ||||
| HWY_NOINLINE void AssertMaskEqual(D d, VecArg<Mask<D>> a, VecArg<Mask<D>> b, | ||||
|                                   const char* filename, int line) { | ||||
|   AssertVecEqual(d, VecFromMask(d, a), VecFromMask(d, b), filename, line); | ||||
|   // lvalues prevented MSAN failure in farm_sve.
 | ||||
|   const Vec<D> va = VecFromMask(d, a); | ||||
|   const Vec<D> vb = VecFromMask(d, b); | ||||
|   AssertVecEqual(d, va, vb, filename, line); | ||||
| 
 | ||||
|   const char* target_name = hwy::TargetName(HWY_TARGET); | ||||
|   AssertEqual(CountTrue(d, a), CountTrue(d, b), target_name, filename, line); | ||||
|  | @ -178,169 +185,269 @@ HWY_INLINE Mask<D> MaskFalse(const D d) { | |||
| 
 | ||||
| // Helpers for instantiating tests with combinations of lane types / counts.
 | ||||
| 
 | ||||
| // For ensuring we do not call tests with D such that widening D results in 0
 | ||||
| // lanes. Example: assume T=u32, VLEN=256, and fraction=1/8: there is no 1/8th
 | ||||
| // of a u64 vector in this case.
 | ||||
| template <class D, HWY_IF_NOT_LANE_SIZE_D(D, 8)> | ||||
| HWY_INLINE size_t PromotedLanes(const D d) { | ||||
|   return Lanes(RepartitionToWide<decltype(d)>()); | ||||
| } | ||||
| // Already the widest possible T, cannot widen.
 | ||||
| template <class D, HWY_IF_LANE_SIZE_D(D, 8)> | ||||
| HWY_INLINE size_t PromotedLanes(const D d) { | ||||
|   return Lanes(d); | ||||
| } | ||||
| // Calls Test for each CappedTag<T, N> where N is in [kMinLanes, kMul * kMinArg]
 | ||||
| // and the resulting Lanes() is in [min_lanes, max_lanes]. The upper bound
 | ||||
| // is required to ensure capped vectors remain extendable. Implemented by
 | ||||
| // recursively halving kMul until it is zero.
 | ||||
| template <typename T, size_t kMul, size_t kMinArg, class Test> | ||||
| struct ForeachCappedR { | ||||
|   static void Do(size_t min_lanes, size_t max_lanes) { | ||||
|     const CappedTag<T, kMul * kMinArg> d; | ||||
| 
 | ||||
| // For all power of two N in [kMinLanes, kMul * kMinLanes] (so that recursion
 | ||||
| // stops at kMul == 0). Note that N may be capped or a fraction.
 | ||||
| template <typename T, size_t kMul, size_t kMinLanes, class Test, | ||||
|           bool kPromote = false> | ||||
| struct ForeachSizeR { | ||||
|   static void Do() { | ||||
|     const Simd<T, kMul * kMinLanes> d; | ||||
| 
 | ||||
|     // Skip invalid fractions (e.g. 1/8th of u32x4).
 | ||||
|     const size_t lanes = kPromote ? PromotedLanes(d) : Lanes(d); | ||||
|     if (lanes < kMinLanes) return; | ||||
|     // If we already don't have enough lanes, stop.
 | ||||
|     const size_t lanes = Lanes(d); | ||||
|     if (lanes < min_lanes) return; | ||||
| 
 | ||||
|     if (lanes <= max_lanes) { | ||||
|       Test()(T(), d); | ||||
| 
 | ||||
|     static_assert(kMul != 0, "Recursion should have ended already"); | ||||
|     ForeachSizeR<T, kMul / 2, kMinLanes, Test, kPromote>::Do(); | ||||
|     } | ||||
|     ForeachCappedR<T, kMul / 2, kMinArg, Test>::Do(min_lanes, max_lanes); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Base case to stop the recursion.
 | ||||
| template <typename T, size_t kMinLanes, class Test, bool kPromote> | ||||
| struct ForeachSizeR<T, 0, kMinLanes, Test, kPromote> { | ||||
|   static void Do() {} | ||||
| template <typename T, size_t kMinArg, class Test> | ||||
| struct ForeachCappedR<T, 0, kMinArg, Test> { | ||||
|   static void Do(size_t, size_t) {} | ||||
| }; | ||||
| 
 | ||||
| #if HWY_HAVE_SCALABLE | ||||
| 
 | ||||
// Smallest vector size the test adapters assume for the current target;
// presumably in bytes, given that callers divide it by sizeof(T).
constexpr int MinVectorSize() {
#if HWY_TARGET != HWY_RVV
  return 16;
#else
  // Actually 16 for the application processor profile, but the intrinsics are
  // defined as if VLEN might be only 64: there is no vuint64mf2_t.
  return 8;
#endif
}
| 
 | ||||
| template <typename T> | ||||
| constexpr int MinPow2() { | ||||
|   // Highway follows RVV LMUL in that the smallest fraction is 1/8th (encoded
 | ||||
|   // as kPow2 == -3). The fraction also must not result in zero lanes for the
 | ||||
|   // smallest possible vector size.
 | ||||
|   return HWY_MAX(-3, -static_cast<int>(CeilLog2(MinVectorSize() / sizeof(T)))); | ||||
| } | ||||
| 
 | ||||
| // Iterates kPow2 upward through +3.
 | ||||
| template <typename T, int kPow2, int kAddPow2, class Test> | ||||
| struct ForeachShiftR { | ||||
|   static void Do(size_t min_lanes) { | ||||
|     const ScalableTag<T, kPow2 + kAddPow2> d; | ||||
| 
 | ||||
|     // Precondition: [kPow2, 3] + kAddPow2 is a valid fraction of the minimum
 | ||||
|     // vector size, so we always have enough lanes, except ForGEVectors.
 | ||||
|     if (Lanes(d) >= min_lanes) { | ||||
|       Test()(T(), d); | ||||
|     } else { | ||||
|       fprintf(stderr, "%d lanes < %d: T=%d pow=%d\n", | ||||
|               static_cast<int>(Lanes(d)), static_cast<int>(min_lanes), | ||||
|               static_cast<int>(sizeof(T)), kPow2 + kAddPow2); | ||||
|       HWY_ASSERT(min_lanes != 1); | ||||
|     } | ||||
| 
 | ||||
|     ForeachShiftR<T, kPow2 + 1, kAddPow2, Test>::Do(min_lanes); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Base case to stop the recursion.
 | ||||
| template <typename T, int kAddPow2, class Test> | ||||
| struct ForeachShiftR<T, 4, kAddPow2, Test> { | ||||
|   static void Do(size_t) {} | ||||
| }; | ||||
| #else | ||||
| // ForeachCappedR already handled all possible sizes.
 | ||||
| #endif  // HWY_HAVE_SCALABLE
 | ||||
| 
 | ||||
| // These adapters may be called directly, or via For*Types:
 | ||||
| 
 | ||||
| // Calls Test for all power of two N in [1, Lanes(d) / kFactor]. This is for
 | ||||
| // Calls Test for all power of two N in [1, Lanes(d) >> kPow2]. This is for
 | ||||
| // ops that widen their input, e.g. Combine (not supported by HWY_SCALAR).
 | ||||
| template <class Test, size_t kFactor = 2> | ||||
| template <class Test, int kPow2 = 1> | ||||
| struct ForExtendableVectors { | ||||
|   template <typename T> | ||||
|   void operator()(T /*unused*/) const { | ||||
|     constexpr size_t kMaxCapped = HWY_LANES(T); | ||||
|     // Skip CappedTag that are already full vectors.
 | ||||
|     const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2; | ||||
|     (void)kMaxCapped; | ||||
|     (void)max_lanes; | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
|     // not supported
 | ||||
| #else | ||||
|     constexpr bool kPromote = true; | ||||
|     ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(1, max_lanes); | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|     ForeachSizeR<T, 8 / kFactor, HWY_LANES(T), Test, kPromote>::Do(); | ||||
|     // TODO(janwas): also capped
 | ||||
|     // ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do();
 | ||||
| #elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 | ||||
|     // Capped
 | ||||
|     ForeachSizeR<T, (16 / sizeof(T)) / kFactor, 1, Test, kPromote>::Do(); | ||||
|     // Fractions
 | ||||
|     ForeachSizeR<T, 8 / kFactor, HWY_LANES(T) / 8, Test, kPromote>::Do(); | ||||
| #else | ||||
|     ForeachSizeR<T, HWY_LANES(T) / kFactor, 1, Test, kPromote>::Do(); | ||||
|     // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2, -kPow2, Test>::Do(1); | ||||
| #elif HWY_HAVE_SCALABLE | ||||
|     // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -kPow2 - 3, Test>::Do(1); | ||||
| #endif | ||||
| #endif  // HWY_SCALAR
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Calls Test for all power of two N in [kFactor, Lanes(d)]. This is for ops
 | ||||
| // Calls Test for all power of two N in [1 << kPow2, Lanes(d)]. This is for ops
 | ||||
| // that narrow their input, e.g. UpperHalf.
 | ||||
| template <class Test, size_t kFactor = 2> | ||||
| template <class Test, int kPow2 = 1> | ||||
| struct ForShrinkableVectors { | ||||
|   template <typename T> | ||||
|   void operator()(T /*unused*/) const { | ||||
|     constexpr size_t kMinLanes = size_t{1} << kPow2; | ||||
|     constexpr size_t kMaxCapped = HWY_LANES(T); | ||||
|     // For shrinking, an upper limit is unnecessary.
 | ||||
|     constexpr size_t max_lanes = kMaxCapped; | ||||
| 
 | ||||
|     (void)kMinLanes; | ||||
|     (void)max_lanes; | ||||
|     (void)max_lanes; | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
|     // not supported
 | ||||
| #elif HWY_TARGET == HWY_RVV | ||||
|     ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T), Test>::Do(); | ||||
|     // TODO(janwas): also capped
 | ||||
| #elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 | ||||
|     // Capped
 | ||||
|     ForeachSizeR<T, (16 / sizeof(T)) / kFactor, kFactor, Test>::Do(); | ||||
|     // Fractions
 | ||||
|     ForeachSizeR<T, 8 / kFactor, kFactor * HWY_LANES(T) / 8, Test>::Do(); | ||||
| #elif HWY_TARGET == HWY_SCALAR | ||||
|     // not supported
 | ||||
| #else | ||||
|     ForeachSizeR<T, HWY_LANES(T) / kFactor, kFactor, Test>::Do(); | ||||
|     ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes, | ||||
|                                                                   max_lanes); | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|     // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes); | ||||
| #elif HWY_HAVE_SCALABLE | ||||
|     // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes); | ||||
| #endif | ||||
| #endif  // HWY_TARGET == HWY_SCALAR
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Calls Test for all power of two N in [16 / sizeof(T), Lanes(d)]. This is for
 | ||||
| // ops that require at least 128 bits, e.g. AES or 64x64 = 128 mul.
 | ||||
| template <class Test> | ||||
| struct ForGE128Vectors { | ||||
| // Calls Test for all supported power of two vectors of at least kMinBits.
 | ||||
| // Examples: AES or 64x64 require 128 bits, casts may require 64 bits.
 | ||||
| template <size_t kMinBits, class Test> | ||||
| struct ForGEVectors { | ||||
|   template <typename T> | ||||
|   void operator()(T /*unused*/) const { | ||||
|     constexpr size_t kMaxCapped = HWY_LANES(T); | ||||
|     constexpr size_t kMinLanes = kMinBits / 8 / sizeof(T); | ||||
|     // An upper limit is unnecessary.
 | ||||
|     constexpr size_t max_lanes = kMaxCapped; | ||||
|     (void)max_lanes; | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
|     // not supported
 | ||||
| #elif HWY_TARGET == HWY_RVV | ||||
|     ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do(); | ||||
|     // TODO(janwas): also capped
 | ||||
|     // ForeachSizeR<T, 1, (16 / sizeof(T)), Test>::Do();
 | ||||
| #elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 | ||||
|     // Capped
 | ||||
|     ForeachSizeR<T, 1, 16 / sizeof(T), Test>::Do(); | ||||
|     // Fractions
 | ||||
|     ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do(); | ||||
|     (void)kMinLanes;  // not supported
 | ||||
| #else | ||||
|     ForeachSizeR<T, HWY_LANES(T) / (16 / sizeof(T)), (16 / sizeof(T)), | ||||
|                  Test>::Do(); | ||||
|     ForeachCappedR<T, HWY_LANES(T) / kMinLanes, kMinLanes, Test>::Do(kMinLanes, | ||||
|                                                                      max_lanes); | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|     // Can be 0 (handled below) if kMinBits > 64.
 | ||||
|     constexpr size_t kRatio = MinVectorSize() * 8 / kMinBits; | ||||
|     constexpr int kMinPow2 = | ||||
|         kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio)); | ||||
|     // For each [kMinPow2, 3]; counter is [kMinPow2, 3].
 | ||||
|     ForeachShiftR<T, kMinPow2, 0, Test>::Do(kMinLanes); | ||||
| #elif HWY_HAVE_SCALABLE | ||||
|     // Can be 0 (handled below) if kMinBits > 128.
 | ||||
|     constexpr size_t kRatio = MinVectorSize() * 8 / kMinBits; | ||||
|     constexpr int kMinPow2 = | ||||
|         kRatio == 0 ? 0 : -static_cast<int>(CeilLog2(kRatio)); | ||||
|     // For each [kMinPow2, 0]; counter is [kMinPow2 + 3, 3].
 | ||||
|     ForeachShiftR<T, kMinPow2 + 3, -3, Test>::Do(kMinLanes); | ||||
| #endif | ||||
| #endif  // HWY_TARGET == HWY_SCALAR
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Calls Test for all power of two N in [8 / sizeof(T), Lanes(d)]. This is for
 | ||||
| // ops that require at least 64 bits, e.g. casts.
 | ||||
| template <class Test> | ||||
| struct ForGE64Vectors { | ||||
|   template <typename T> | ||||
|   void operator()(T /*unused*/) const { | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
|     // not supported
 | ||||
| #elif HWY_TARGET == HWY_RVV | ||||
|     ForeachSizeR<T, 8, HWY_LANES(T), Test>::Do(); | ||||
|     // TODO(janwas): also capped
 | ||||
|     // ForeachSizeR<T, 1, (8 / sizeof(T)), Test>::Do();
 | ||||
| #elif HWY_TARGET == HWY_SVE || HWY_TARGET == HWY_SVE2 | ||||
|     // Capped
 | ||||
|     ForeachSizeR<T, 1, 8 / sizeof(T), Test>::Do(); | ||||
|     // Fractions
 | ||||
|     ForeachSizeR<T, 8, HWY_LANES(T) / 8, Test>::Do(); | ||||
| #else | ||||
|     ForeachSizeR<T, HWY_LANES(T) / (8 / sizeof(T)), (8 / sizeof(T)), | ||||
|                  Test>::Do(); | ||||
| #endif | ||||
|   } | ||||
| }; | ||||
| using ForGE128Vectors = ForGEVectors<128, Test>; | ||||
| 
 | ||||
| // Calls Test for all N that can be promoted (not the same as Extendable because
 | ||||
| // HWY_SCALAR has one lane). Also used for ZipLower, but not ZipUpper.
 | ||||
| template <class Test, size_t kFactor = 2> | ||||
| template <class Test, int kPow2 = 1> | ||||
| struct ForPromoteVectors { | ||||
|   template <typename T> | ||||
|   void operator()(T /*unused*/) const { | ||||
|     constexpr size_t kFactor = size_t{1} << kPow2; | ||||
|     static_assert(kFactor >= 2 && kFactor * sizeof(T) <= sizeof(uint64_t), ""); | ||||
|     constexpr size_t kMaxCapped = HWY_LANES(T); | ||||
|     constexpr size_t kMinLanes = kFactor; | ||||
|     // Skip CappedTag that are already full vectors.
 | ||||
|     const size_t max_lanes = Lanes(ScalableTag<T>()) >> kPow2; | ||||
|     (void)kMaxCapped; | ||||
|     (void)kMinLanes; | ||||
|     (void)max_lanes; | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
|     ForeachSizeR<T, 1, 1, Test, /*kPromote=*/true>::Do(); | ||||
|     ForeachCappedR<T, 1, 1, Test>::Do(1, 1); | ||||
| #else | ||||
|     return ForExtendableVectors<Test, kFactor>()(T()); | ||||
|     // TODO(janwas): call Extendable if kMinLanes check not required?
 | ||||
|     ForeachCappedR<T, (kMaxCapped >> kPow2), 1, Test>::Do(kMinLanes, max_lanes); | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|     // For each [MinPow2, 3 - kPow2]; counter is [MinPow2 + kPow2, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2, -kPow2, Test>::Do(kMinLanes); | ||||
| #elif HWY_HAVE_SCALABLE | ||||
|     // For each [MinPow2, 0 - kPow2]; counter is [MinPow2 + kPow2 + 3, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -kPow2 - 3, Test>::Do(kMinLanes); | ||||
| #endif | ||||
| #endif  // HWY_SCALAR
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // Calls Test for all N than can be demoted (not the same as Shrinkable because
 | ||||
| // HWY_SCALAR has one lane). Also used for LowerHalf, but not UpperHalf.
 | ||||
| template <class Test, size_t kFactor = 2> | ||||
| // HWY_SCALAR has one lane).
 | ||||
| template <class Test, int kPow2 = 1> | ||||
| struct ForDemoteVectors { | ||||
|   template <typename T> | ||||
|   void operator()(T /*unused*/) const { | ||||
|     constexpr size_t kMinLanes = size_t{1} << kPow2; | ||||
|     constexpr size_t kMaxCapped = HWY_LANES(T); | ||||
|     // For shrinking, an upper limit is unnecessary.
 | ||||
|     constexpr size_t max_lanes = kMaxCapped; | ||||
| 
 | ||||
|     (void)kMinLanes; | ||||
|     (void)max_lanes; | ||||
|     (void)max_lanes; | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
|     ForeachSizeR<T, 1, 1, Test>::Do(); | ||||
|     ForeachCappedR<T, 1, 1, Test>::Do(1, 1); | ||||
| #else | ||||
|     return ForShrinkableVectors<Test, kFactor>()(T()); | ||||
|     ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes, | ||||
|                                                                   max_lanes); | ||||
| 
 | ||||
| // TODO(janwas): call Extendable if kMinLanes check not required?
 | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|     // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes); | ||||
| #elif HWY_HAVE_SCALABLE | ||||
|     // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes); | ||||
| #endif | ||||
| #endif  // HWY_TARGET == HWY_SCALAR
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
| // For LowerHalf/Quarter.
 | ||||
| template <class Test, int kPow2 = 1> | ||||
| struct ForHalfVectors { | ||||
|   template <typename T> | ||||
|   void operator()(T /*unused*/) const { | ||||
|     constexpr size_t kMinLanes = size_t{1} << kPow2; | ||||
|     constexpr size_t kMaxCapped = HWY_LANES(T); | ||||
|     // For shrinking, an upper limit is unnecessary.
 | ||||
|     constexpr size_t max_lanes = kMaxCapped; | ||||
| 
 | ||||
|     (void)kMinLanes; | ||||
|     (void)max_lanes; | ||||
|     (void)max_lanes; | ||||
| #if HWY_TARGET == HWY_SCALAR | ||||
|     ForeachCappedR<T, 1, 1, Test>::Do(1, 1); | ||||
| #else | ||||
| //    ForeachCappedR<T, (kMaxCapped >> kPow2), kMinLanes, Test>::Do(kMinLanes,
 | ||||
| //                                                                  max_lanes);
 | ||||
| 
 | ||||
| // TODO(janwas): call Extendable if kMinLanes check not required?
 | ||||
| #if HWY_TARGET == HWY_RVV | ||||
|     // For each [MinPow2 + kPow2, 3]; counter is [MinPow2 + kPow2, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2, 0, Test>::Do(kMinLanes); | ||||
| #elif HWY_HAVE_SCALABLE | ||||
|     // For each [MinPow2 + kPow2, 0]; counter is [MinPow2 + kPow2 + 3, 3].
 | ||||
|     ForeachShiftR<T, MinPow2<T>() + kPow2 + 3, -3, Test>::Do(kMinLanes); | ||||
| #endif | ||||
| #endif  // HWY_TARGET == HWY_SCALAR
 | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|  | @ -350,7 +457,7 @@ template <class Test> | |||
| struct ForPartialVectors { | ||||
|   template <typename T> | ||||
|   void operator()(T t) const { | ||||
|     ForExtendableVectors<Test, 1>()(t); | ||||
|     ForExtendableVectors<Test, 0>()(t); | ||||
|   } | ||||
| }; | ||||
| 
 | ||||
|  | @ -361,7 +468,7 @@ void ForSignedTypes(const Func& func) { | |||
|   func(int8_t()); | ||||
|   func(int16_t()); | ||||
|   func(int32_t()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   func(int64_t()); | ||||
| #endif | ||||
| } | ||||
|  | @ -371,7 +478,7 @@ void ForUnsignedTypes(const Func& func) { | |||
|   func(uint8_t()); | ||||
|   func(uint16_t()); | ||||
|   func(uint32_t()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   func(uint64_t()); | ||||
| #endif | ||||
| } | ||||
|  | @ -385,7 +492,7 @@ void ForIntegerTypes(const Func& func) { | |||
| template <class Func> | ||||
| void ForFloatTypes(const Func& func) { | ||||
|   func(float()); | ||||
| #if HWY_CAP_FLOAT64 | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   func(double()); | ||||
| #endif | ||||
| } | ||||
|  | @ -397,32 +504,49 @@ void ForAllTypes(const Func& func) { | |||
| } | ||||
| 
 | ||||
| template <class Func> | ||||
| void ForUIF3264(const Func& func) { | ||||
| void ForUIF16(const Func& func) { | ||||
|   func(uint16_t()); | ||||
|   func(int16_t()); | ||||
| #if HWY_HAVE_FLOAT16 | ||||
|   func(float16_t()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| template <class Func> | ||||
| void ForUIF32(const Func& func) { | ||||
|   func(uint32_t()); | ||||
|   func(int32_t()); | ||||
| #if HWY_CAP_INTEGER64 | ||||
|   func(float()); | ||||
| } | ||||
| 
 | ||||
| template <class Func> | ||||
| void ForUIF64(const Func& func) { | ||||
| #if HWY_HAVE_INTEGER64 | ||||
|   func(uint64_t()); | ||||
|   func(int64_t()); | ||||
| #endif | ||||
| #if HWY_HAVE_FLOAT64 | ||||
|   func(double()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
|   ForFloatTypes(func); | ||||
| template <class Func> | ||||
| void ForUIF3264(const Func& func) { | ||||
|   ForUIF32(func); | ||||
|   ForUIF64(func); | ||||
| } | ||||
| 
 | ||||
| template <class Func> | ||||
| void ForUIF163264(const Func& func) { | ||||
|   ForUIF16(func); | ||||
|   ForUIF3264(func); | ||||
|   func(uint16_t()); | ||||
|   func(int16_t()); | ||||
| #if HWY_CAP_FLOAT16 | ||||
|   func(float16_t()); | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // For tests that involve loops, adjust the trip count so that emulated tests
 | ||||
| // finish quickly (but always at least 2 iterations to ensure some diversity).
 | ||||
| constexpr size_t AdjustedReps(size_t max_reps) { | ||||
| #if HWY_ARCH_RVV | ||||
|   return HWY_MAX(max_reps / 16, 2); | ||||
|   return HWY_MAX(max_reps / 32, 2); | ||||
| #elif HWY_ARCH_ARM | ||||
|   return HWY_MAX(max_reps / 4, 2); | ||||
| #elif HWY_IS_DEBUG_BUILD | ||||
|  | @ -432,6 +556,20 @@ constexpr size_t AdjustedReps(size_t max_reps) { | |||
| #endif | ||||
| } | ||||
| 
 | ||||
| // Same as above, but the loop trip count will be 1 << max_pow2.
 | ||||
| constexpr size_t AdjustedLog2Reps(size_t max_pow2) { | ||||
|   // If "negative" (unsigned wraparound), use original.
 | ||||
| #if HWY_ARCH_RVV | ||||
|   return HWY_MIN(max_pow2 - 4, max_pow2); | ||||
| #elif HWY_ARCH_ARM | ||||
|   return HWY_MIN(max_pow2 - 1, max_pow2); | ||||
| #elif HWY_IS_DEBUG_BUILD | ||||
|   return HWY_MIN(max_pow2 - 1, max_pow2); | ||||
| #else | ||||
|   return max_pow2; | ||||
| #endif | ||||
| } | ||||
| 
 | ||||
| // NOLINTNEXTLINE(google-readability-namespace-comments)
 | ||||
| }  // namespace HWY_NAMESPACE
 | ||||
| }  // namespace hwy
 | ||||
|  |  | |||
							
								
								
									
										3
									
								
								third_party/highway/hwy/tests/test_util.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								third_party/highway/hwy/tests/test_util.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -30,9 +30,6 @@ bool BytesEqual(const void* p1, const void* p2, const size_t size, | |||
|   const uint8_t* bytes2 = reinterpret_cast<const uint8_t*>(p2); | ||||
|   for (size_t i = 0; i < size; ++i) { | ||||
|     if (bytes1[i] != bytes2[i]) { | ||||
|       fprintf(stderr, "Mismatch at byte %" PRIu64 " of %" PRIu64 ": %d != %d\n", | ||||
|               static_cast<uint64_t>(i), static_cast<uint64_t>(size), bytes1[i], | ||||
|               bytes2[i]); | ||||
|       if (pos != nullptr) { | ||||
|         *pos = i; | ||||
|       } | ||||
|  |  | |||
							
								
								
									
										35
									
								
								third_party/highway/hwy/tests/test_util.h
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										35
									
								
								third_party/highway/hwy/tests/test_util.h
									
									
									
									
										vendored
									
									
								
							|  | @ -26,6 +26,7 @@ | |||
| #include "hwy/aligned_allocator.h" | ||||
| #include "hwy/base.h" | ||||
| #include "hwy/highway.h" | ||||
| #include "hwy/highway_export.h" | ||||
| 
 | ||||
| namespace hwy { | ||||
| 
 | ||||
|  | @ -67,9 +68,7 @@ static HWY_INLINE uint32_t Random32(RandomState* rng) { | |||
|   return static_cast<uint32_t>((*rng)()); | ||||
| } | ||||
| 
 | ||||
| static HWY_INLINE uint64_t Random64(RandomState* rng) { | ||||
|   return (*rng)(); | ||||
| } | ||||
| static HWY_INLINE uint64_t Random64(RandomState* rng) { return (*rng)(); } | ||||
| 
 | ||||
| // Prevents the compiler from eliding the computations that led to "output".
 | ||||
| // Works by indicating to the compiler that "output" is being read and modified.
 | ||||
|  | @ -84,8 +83,8 @@ inline void PreventElision(T&& output) { | |||
| #endif  // HWY_COMPILER_MSVC
 | ||||
| } | ||||
| 
 | ||||
| bool BytesEqual(const void* p1, const void* p2, const size_t size, | ||||
|                 size_t* pos = nullptr); | ||||
| HWY_TEST_DLLEXPORT bool BytesEqual(const void* p1, const void* p2, | ||||
|                                    const size_t size, size_t* pos = nullptr); | ||||
| 
 | ||||
| void AssertStringEqual(const char* expected, const char* actual, | ||||
|                        const char* target_name, const char* filename, int line); | ||||
|  | @ -129,25 +128,25 @@ HWY_INLINE TypeInfo MakeTypeInfo() { | |||
|   return info; | ||||
| } | ||||
| 
 | ||||
| bool IsEqual(const TypeInfo& info, const void* expected_ptr, | ||||
| HWY_TEST_DLLEXPORT bool IsEqual(const TypeInfo& info, const void* expected_ptr, | ||||
|                                 const void* actual_ptr); | ||||
| 
 | ||||
| void TypeName(const TypeInfo& info, size_t N, char* string100); | ||||
| HWY_TEST_DLLEXPORT void TypeName(const TypeInfo& info, size_t N, char* string100); | ||||
| 
 | ||||
| void PrintArray(const TypeInfo& info, const char* caption, | ||||
|                 const void* array_void, size_t N, size_t lane_u = 0, | ||||
|                 size_t max_lanes = 7); | ||||
| HWY_TEST_DLLEXPORT void PrintArray(const TypeInfo& info, const char* caption, | ||||
|                                    const void* array_void, size_t N, | ||||
|                                    size_t lane_u = 0, size_t max_lanes = 7); | ||||
| 
 | ||||
| HWY_NORETURN void PrintMismatchAndAbort(const TypeInfo& info, | ||||
|                                         const void* expected_ptr, | ||||
|                                         const void* actual_ptr, | ||||
|                                         const char* target_name, | ||||
|                                         const char* filename, int line, | ||||
|                                         size_t lane = 0, size_t num_lanes = 1); | ||||
| HWY_TEST_DLLEXPORT HWY_NORETURN void PrintMismatchAndAbort( | ||||
|     const TypeInfo& info, const void* expected_ptr, const void* actual_ptr, | ||||
|     const char* target_name, const char* filename, int line, size_t lane = 0, | ||||
|     size_t num_lanes = 1); | ||||
| 
 | ||||
| void AssertArrayEqual(const TypeInfo& info, const void* expected_void, | ||||
| HWY_TEST_DLLEXPORT void AssertArrayEqual(const TypeInfo& info, | ||||
|                                          const void* expected_void, | ||||
|                                          const void* actual_void, size_t N, | ||||
|                       const char* target_name, const char* filename, int line); | ||||
|                                          const char* target_name, | ||||
|                                          const char* filename, int line); | ||||
| 
 | ||||
| }  // namespace detail
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -52,10 +52,10 @@ HWY_NOINLINE void TestAllName() { ForAllTypes(ForPartialVectors<TestName>()); } | |||
| struct TestEqualInteger { | ||||
|   template <class T> | ||||
|   HWY_NOINLINE void operator()(T /*t*/) const { | ||||
|     HWY_ASSERT(IsEqual(T(0), T(0))); | ||||
|     HWY_ASSERT(IsEqual(T(1), T(1))); | ||||
|     HWY_ASSERT(IsEqual(T(-1), T(-1))); | ||||
|     HWY_ASSERT(IsEqual(LimitsMin<T>(), LimitsMin<T>())); | ||||
|     HWY_ASSERT_EQ(T(0), T(0)); | ||||
|     HWY_ASSERT_EQ(T(1), T(1)); | ||||
|     HWY_ASSERT_EQ(T(-1), T(-1)); | ||||
|     HWY_ASSERT_EQ(LimitsMin<T>(), LimitsMin<T>()); | ||||
| 
 | ||||
|     HWY_ASSERT(!IsEqual(T(0), T(1))); | ||||
|     HWY_ASSERT(!IsEqual(T(1), T(0))); | ||||
|  |  | |||
							
								
								
									
										2
									
								
								third_party/highway/libhwy-contrib.pc.in
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								third_party/highway/libhwy-contrib.pc.in
									
									
									
									
										vendored
									
									
								
							|  | @ -4,7 +4,7 @@ libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ | |||
| includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ | ||||
| 
 | ||||
| Name: libhwy-contrib | ||||
| Description: Additions to Highway: image and math library | ||||
| Description: Additions to Highway: dot product, image, math, sort | ||||
| Version: @HWY_LIBRARY_VERSION@ | ||||
| Libs: -L${libdir} -lhwy_contrib | ||||
| Cflags: -I${includedir} | ||||
|  |  | |||
							
								
								
									
										2
									
								
								third_party/highway/libhwy.pc.in
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								third_party/highway/libhwy.pc.in
									
									
									
									
										vendored
									
									
								
							|  | @ -7,4 +7,4 @@ Name: libhwy | |||
| Description: Efficient and performance-portable SIMD wrapper | ||||
| Version: @HWY_LIBRARY_VERSION@ | ||||
| Libs: -L${libdir} -lhwy | ||||
| Cflags: -I${includedir} | ||||
| Cflags: -I${includedir} -D@DLLEXPORT_TO_DEFINE@ | ||||
|  |  | |||
							
								
								
									
										9
									
								
								third_party/highway/preamble.js.lds
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								third_party/highway/preamble.js.lds
									
									
									
									
										vendored
									
									
										Normal file
									
								
							|  | @ -0,0 +1,9 @@ | |||
| /* | ||||
|  * Copyright 2019 Google LLC | ||||
|  * | ||||
|  * This source code is licensed under the BSD-style license found in the | ||||
|  * LICENSE file in the root directory of this source tree. | ||||
|  */ | ||||
| 
 | ||||
| /* mock crypto module for benchmarks and unit tests or std::random_device fails at runtime */ | ||||
| var crypto = { getRandomValues: function(array) { for (var i = 0; i < array.length; i++) array[i] = (Math.random()*256)|0 } }; | ||||
							
								
								
									
										4
									
								
								third_party/highway/run_tests.sh
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										4
									
								
								third_party/highway/run_tests.sh
									
									
									
									
										vendored
									
									
								
							|  | @ -59,7 +59,7 @@ export QEMU_LD_PREFIX=/usr/arm-linux-gnueabihf | |||
| rm -rf build_arm7 | ||||
| mkdir build_arm7 | ||||
| cd build_arm7 | ||||
| CC=arm-linux-gnueabihf-gcc CXX=arm-linux-gnueabihf-g++ cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON | ||||
| CC=arm-linux-gnueabihf-gcc-11 CXX=arm-linux-gnueabihf-g++-11 cmake .. -DHWY_CMAKE_ARM7:BOOL=ON -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON | ||||
| make -j8 | ||||
| ctest | ||||
| cd .. | ||||
|  | @ -71,7 +71,7 @@ export QEMU_LD_PREFIX=/usr/aarch64-linux-gnu | |||
| rm -rf build_arm8 | ||||
| mkdir build_arm8 | ||||
| cd build_arm8 | ||||
| CC=aarch64-linux-gnu-gcc CXX=aarch64-linux-gnu-g++ cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON | ||||
| CC=aarch64-linux-gnu-gcc-11 CXX=aarch64-linux-gnu-g++-11 cmake .. -DHWY_WARNINGS_ARE_ERRORS:BOOL=ON | ||||
| make -j8 | ||||
| ctest | ||||
| cd .. | ||||
|  |  | |||
|  | @ -34,7 +34,7 @@ jobs: | |||
|             env_stack_size: 1 | ||||
|             max_stack: 3000 | ||||
|             # Conformance tooling test requires numpy. | ||||
|             apt_pkgs: python3-numpy | ||||
|             apt_pkgs: graphviz python3-numpy | ||||
|           - name: lowprecision | ||||
|             mode: release | ||||
|             test_in_pr: true | ||||
|  | @ -461,8 +461,8 @@ jobs: | |||
|     runs-on: ubuntu-latest | ||||
|     env: | ||||
|       CCACHE_DIR: ${{ github.workspace }}/.ccache | ||||
|       EM_VERSION: 2.0.23 | ||||
|       V8_VERSION: 9.3.22 | ||||
|       EM_VERSION: 3.1.4 | ||||
|       V8_VERSION: 9.8.177 | ||||
|       V8: ${{ github.workspace }}/.jsvu/v8 | ||||
|       BUILD_TARGET: wasm32 | ||||
| 
 | ||||
|  | @ -506,7 +506,7 @@ jobs: | |||
|           ${{ runner.os }}-${{ steps.git-env.outputs.parent }}-${{ matrix.variant }} | ||||
| 
 | ||||
|     - name: Install emsdk | ||||
|       uses: mymindstorm/setup-emsdk@v10 | ||||
|       uses: mymindstorm/setup-emsdk@v11 | ||||
|       # TODO(deymo): We could cache this action but it doesn't work when running | ||||
|       # in a matrix. | ||||
|       with: | ||||
|  |  | |||
							
								
								
									
										2
									
								
								third_party/jpeg-xl/deps.sh
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								third_party/jpeg-xl/deps.sh
									
									
									
									
										vendored
									
									
								
							|  | @ -14,7 +14,7 @@ MYDIR=$(dirname $(realpath "$0")) | |||
| # Git revisions we use for the given submodules. Update these whenever you | ||||
| # update a git submodule. | ||||
| THIRD_PARTY_GFLAGS="827c769e5fc98e0f2a34c47cef953cc6328abced" | ||||
| THIRD_PARTY_HIGHWAY="e69083a12a05caf037cabecdf1b248b7579705a5" | ||||
| THIRD_PARTY_HIGHWAY="f13e3b956eb226561ac79427893ec0afd66f91a8" | ||||
| THIRD_PARTY_SKCMS="64374756e03700d649f897dbd98c95e78c30c7da" | ||||
| THIRD_PARTY_SJPEG="868ab558fad70fcbe8863ba4e85179eeb81cc840" | ||||
| THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f" | ||||
|  |  | |||
							
								
								
									
										34
									
								
								third_party/jpeg-xl/lib/extras/codec.cc
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										34
									
								
								third_party/jpeg-xl/lib/extras/codec.cc
									
									
									
									
										vendored
									
									
								
							|  | @ -5,6 +5,12 @@ | |||
| 
 | ||||
| #include "lib/extras/codec.h" | ||||
| 
 | ||||
| #include "jxl/decode.h" | ||||
| #include "jxl/types.h" | ||||
| #include "lib/extras/packed_image.h" | ||||
| #include "lib/jxl/base/padded_bytes.h" | ||||
| #include "lib/jxl/base/status.h" | ||||
| 
 | ||||
| #if JPEGXL_ENABLE_APNG | ||||
| #include "lib/extras/enc/apng.h" | ||||
| #endif | ||||
|  | @ -68,6 +74,14 @@ Status Encode(const CodecInOut& io, const extras::Codec codec, | |||
|     JXL_WARNING("Writing JPEG data as pixels"); | ||||
|   } | ||||
| 
 | ||||
|   extras::PackedPixelFile ppf; | ||||
|   size_t num_channels = io.metadata.m.color_encoding.Channels(); | ||||
|   JxlPixelFormat format = { | ||||
|       static_cast<uint32_t>(num_channels), | ||||
|       bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16, | ||||
|       JXL_NATIVE_ENDIAN, 0}; | ||||
|   std::vector<uint8_t> bytes_vector; | ||||
|   const bool floating_point = bits_per_sample > 16; | ||||
|   switch (codec) { | ||||
|     case extras::Codec::kPNG: | ||||
| #if JPEGXL_ENABLE_APNG | ||||
|  | @ -87,8 +101,24 @@ Status Encode(const CodecInOut& io, const extras::Codec codec, | |||
|       return JXL_FAILURE("JPEG XL was built without JPEG support"); | ||||
| #endif | ||||
|     case extras::Codec::kPNM: | ||||
|       return extras::EncodeImagePNM(&io, c_desired, bits_per_sample, pool, | ||||
|                                     bytes); | ||||
| 
 | ||||
|       // Choose native for PFM; PGM/PPM require big-endian (N/A for PBM)
 | ||||
|       format.endianness = floating_point ? JXL_NATIVE_ENDIAN : JXL_BIG_ENDIAN; | ||||
|       if (floating_point) { | ||||
|         format.data_type = JXL_TYPE_FLOAT; | ||||
|       } | ||||
|       if (!c_desired.IsSRGB()) { | ||||
|         JXL_WARNING( | ||||
|             "PNM encoder cannot store custom ICC profile; decoder\n" | ||||
|             "will need hint key=color_space to get the same values"); | ||||
|       } | ||||
|       JXL_RETURN_IF_ERROR(extras::ConvertCodecInOutToPackedPixelFile( | ||||
|           io, format, c_desired, pool, &ppf)); | ||||
|       JXL_RETURN_IF_ERROR( | ||||
|           extras::EncodeImagePNM(ppf, bits_per_sample, pool, &bytes_vector)); | ||||
|       bytes->assign(bytes_vector.data(), | ||||
|                     bytes_vector.data() + bytes_vector.size()); | ||||
|       return true; | ||||
|     case extras::Codec::kPGX: | ||||
|       return extras::EncodeImagePGX(&io, c_desired, bits_per_sample, pool, | ||||
|                                     bytes); | ||||
|  |  | |||
Some files were not shown because too many files have changed in this diff Show more
		Loading…
	
		Reference in a new issue
	
	 Kagami Sascha Rosylight
						Kagami Sascha Rosylight