Bug 1877328 - Update libjxl to 07203da045f6b41f9b3b5b86023fd70b075137f6 r=saschanaz

Differential Revision: https://phabricator.services.mozilla.com/D199944
2024-01-31 16:50:40 +00:00 · 2024-01-31 16:50:40 +00:00 · f942f8df8e
commit f942f8df8e
parent dc85cacfa3
118 changed files with 1087 additions and 2510 deletions
--- a/media/libjxl/moz.build
+++ b/media/libjxl/moz.build
@ -21,7 +21,6 @@ SOURCES += [
    "/third_party/jpeg-xl/lib/jxl/color_encoding_internal.cc",
    "/third_party/jpeg-xl/lib/jxl/compressed_dc.cc",
    "/third_party/jpeg-xl/lib/jxl/convolve_separable5.cc",
-    "/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc",
    "/third_party/jpeg-xl/lib/jxl/convolve_slow.cc",
    "/third_party/jpeg-xl/lib/jxl/convolve_symmetric3.cc",
    "/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc",
@ -44,7 +43,6 @@ SOURCES += [
    "/third_party/jpeg-xl/lib/jxl/fast_dct.cc",
    "/third_party/jpeg-xl/lib/jxl/fields.cc",
    "/third_party/jpeg-xl/lib/jxl/frame_header.cc",
-    "/third_party/jpeg-xl/lib/jxl/gauss_blur.cc",
    "/third_party/jpeg-xl/lib/jxl/headers.cc",
    "/third_party/jpeg-xl/lib/jxl/huffman_table.cc",
    "/third_party/jpeg-xl/lib/jxl/icc_codec.cc",
--- a/media/libjxl/moz.yaml
+++ b/media/libjxl/moz.yaml
@ -10,9 +10,9 @@ origin:

  url: https://github.com/libjxl/libjxl

-  release: b26041c708d523ac53bb7d95d4f5c4a5d3b1ce30 (2024-01-08T13:55:50Z).
+  release: 07203da045f6b41f9b3b5b86023fd70b075137f6 (2024-01-29T17:41:05Z).

-  revision: b26041c708d523ac53bb7d95d4f5c4a5d3b1ce30
+  revision: 07203da045f6b41f9b3b5b86023fd70b075137f6

  license: Apache-2.0

--- a/third_party/jpeg-xl/AUTHORS
+++ b/third_party/jpeg-xl/AUTHORS
@ -79,6 +79,7 @@ Pieter Wuille
 roland-rollo
 Samuel Leong <wvvwvvvvwvvw@gmail.com>
 Sandro <sandro.jaeckel@gmail.com>
+sandstrom
 Sergey Fedorov <vital.had@gmail.com>
 Stephan T. Lavavej <stl@nuwen.net>
 StepSecurity Bot <bot@stepsecurity.io>
@ -87,7 +88,9 @@ Thomas Bonfort <thomas.bonfort@airbus.com>
 Timo Rothenpieler <timo@rothenpieler.org>
 tmkk <tmkkmac@gmail.com>
 Vincent Torri <vincent.torri@gmail.com>
+Wonwoo Choi <chwo9843@gmail.com>
 xiota
 Yonatan Nebenzhal <yonatan.nebenzhl@gmail.com>
 Ziemowit Zabawa <ziemek.zabawa@outlook.com>
 源文雨 <41315874+fumiama@users.noreply.github.com>
+oupson <oupson1er@gmail.com>
--- a/third_party/jpeg-xl/CMakeLists.txt
+++ b/third_party/jpeg-xl/CMakeLists.txt
@ -160,6 +160,8 @@ set(JPEGXL_ENABLE_AVX512_SPR false CACHE BOOL
    "Build with AVX-512FP16 support (faster on CPUs that support it, but larger binary size).")
 set(JPEGXL_ENABLE_AVX512_ZEN4 false CACHE BOOL
 "Build with Zen4-optimized AVX512 support (faster on CPUs that support it, but larger binary size).")
+set(JPEGXL_ENABLE_WASM_TRHEADS true CACHE BOOL
+    "Builds WASM modules with threads support")

 # Force system dependencies.
 set(JPEGXL_FORCE_SYSTEM_BROTLI false CACHE BOOL
@ -263,7 +265,7 @@ if(JPEGXL_STATIC)
  endif()
 endif()  # JPEGXL_STATIC

-if (EMSCRIPTEN)
+if (EMSCRIPTEN AND JPEGXL_ENABLE_WASM_TRHEADS)
  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -pthread")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
--- a/third_party/jpeg-xl/deps.sh
+++ b/third_party/jpeg-xl/deps.sh
@ -18,7 +18,7 @@ THIRD_PARTY_BROTLI="36533a866ed1ca4b75cf049f4521e4ec5fe24727"
 THIRD_PARTY_HIGHWAY="ba0900a4957b929390ab73827235557959234fea"
 THIRD_PARTY_SKCMS="42030a771244ba67f86b1c1c76a6493f873c5f91"
 THIRD_PARTY_SJPEG="e5ab13008bb214deb66d5f3e17ca2f8dbff150bf"
-THIRD_PARTY_ZLIB="cacf7f1d4e3d44d871b605da3b647f07d718623f"
+THIRD_PARTY_ZLIB="51b7f2abdade71cd9bb0e7a373ef2610ec6f9daf" # v1.3.1
 THIRD_PARTY_LIBPNG="f135775ad4e5d4408d2e12ffcc71bb36e6b48551" # v1.6.40
 THIRD_PARTY_LIBJPEG_TURBO="8ecba3647edb6dd940463fedf38ca33a8e2a73d1" # 2.1.5.1

--- a/third_party/jpeg-xl/examples/decode_oneshot.cc
+++ b/third_party/jpeg-xl/examples/decode_oneshot.cc
@ -12,10 +12,12 @@
 #endif

 #include <inttypes.h>
+#include <jxl/codestream_header.h>
 #include <jxl/decode.h>
 #include <jxl/decode_cxx.h>
 #include <jxl/resizable_parallel_runner.h>
 #include <jxl/resizable_parallel_runner_cxx.h>
+#include <jxl/types.h>
 #include <limits.h>
 #include <stdint.h>
 #include <stdio.h>
--- a/third_party/jpeg-xl/examples/encode_oneshot.cc
+++ b/third_party/jpeg-xl/examples/encode_oneshot.cc
@ -6,13 +6,18 @@
 // This example encodes a file containing a floating point image to another
 // file containing JPEG XL image with a single frame.

+#include <jxl/codestream_header.h>
+#include <jxl/color_encoding.h>
 #include <jxl/encode.h>
 #include <jxl/encode_cxx.h>
 #include <jxl/thread_parallel_runner.h>
 #include <jxl/thread_parallel_runner_cxx.h>
+#include <jxl/types.h>
 #include <limits.h>
 #include <string.h>

+#include <cstdint>
+#include <cstdio>
 #include <sstream>
 #include <string>
 #include <vector>
--- a/third_party/jpeg-xl/lib/BUILD
+++ b/third_party/jpeg-xl/lib/BUILD
@ -200,9 +200,6 @@ cc_library(
    compatible_with = DEFAULT_COMPATIBILITY,
    defines = [
        "JPEGXL_ENABLE_SKCMS=1",
-        "JPEGXL_MAJOR_VERSION=" + str(libjxl_major_version),
-        "JPEGXL_MINOR_VERSION=" + str(libjxl_minor_version),
-        "JPEGXL_PATCH_VERSION=" + str(libjxl_patch_version),
    ],
    deps = [
        ":base",
--- a/third_party/jpeg-xl/lib/CMakeLists.txt
+++ b/third_party/jpeg-xl/lib/CMakeLists.txt
@ -163,11 +163,10 @@ if (JPEGXL_ENABLE_JPEGLI)
  include(jpegli.cmake)
 endif()

-# Install all the library headers from the source and the generated ones. There
-# is no distinction on which libraries use which header since it is expected
-# that all developer libraries are available together at build time.
-install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/jxl
-  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")
+# For simplicity all the library headers, both source and generated ones, are
+# gathered in the binary folder. There is no distinction on which libraries use
+# which header since it is expected that all developer libraries are available
+# together at build time.
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/jxl
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")

--- a/third_party/jpeg-xl/lib/extras/codec.cc
+++ b/third_party/jpeg-xl/lib/extras/codec.cc
@ -40,23 +40,10 @@ Status SetFromBytes(const Span<const uint8_t> bytes,
  return JXL_FAILURE("Codecs failed to decode");
 }

-Status Encode(const CodecInOut& io, const extras::Codec codec,
-              const ColorEncoding& c_desired, size_t bits_per_sample,
+Status Encode(const extras::PackedPixelFile& ppf, const extras::Codec codec,
              std::vector<uint8_t>* bytes, ThreadPool* pool) {
  bytes->clear();
-  JXL_CHECK(!io.Main().c_current().ICC().empty());
-  JXL_CHECK(!c_desired.ICC().empty());
-  io.CheckMetadata();
-  if (io.Main().IsJPEG()) {
-    JXL_WARNING("Writing JPEG data as pixels");
-  }
-  JxlPixelFormat format = {
-      0,  // num_channels is ignored by the converter
-      bits_per_sample <= 8 ? JXL_TYPE_UINT8 : JXL_TYPE_UINT16, JXL_BIG_ENDIAN,
-      0};
-  const bool floating_point = bits_per_sample > 16;
  std::unique_ptr<extras::Encoder> encoder;
-  std::ostringstream os;
  switch (codec) {
    case extras::Codec::kPNG:
      encoder = extras::GetAPNGEncoder();
@ -66,25 +53,20 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
        return JXL_FAILURE("JPEG XL was built without (A)PNG support");
      }
    case extras::Codec::kJPG:
-      format.data_type = JXL_TYPE_UINT8;
      encoder = extras::GetJPEGEncoder();
      if (encoder) {
-        os << io.jpeg_quality;
-        encoder->SetOption("q", os.str());
        break;
      } else {
        return JXL_FAILURE("JPEG XL was built without JPEG support");
      }
    case extras::Codec::kPNM:
-      if (io.Main().HasAlpha()) {
+      if (ppf.info.alpha_bits > 0) {
        encoder = extras::GetPAMEncoder();
-      } else if (io.Main().IsGray()) {
+      } else if (ppf.info.num_color_channels == 1) {
        encoder = extras::GetPGMEncoder();
-      } else if (!floating_point) {
+      } else if (ppf.info.bits_per_sample <= 16) {
        encoder = extras::GetPPMEncoder();
      } else {
-        format.data_type = JXL_TYPE_FLOAT;
-        format.endianness = JXL_LITTLE_ENDIAN;
        encoder = extras::GetPFMEncoder();
      }
      break;
@ -94,7 +76,6 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
    case extras::Codec::kGIF:
      return JXL_FAILURE("Encoding to GIF is not implemented");
    case extras::Codec::kEXR:
-      format.data_type = JXL_TYPE_FLOAT;
      encoder = extras::GetEXREncoder();
      if (encoder) {
        break;
@ -112,15 +93,6 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
  if (!encoder) {
    return JXL_FAILURE("Invalid codec.");
  }
-
-  extras::PackedPixelFile ppf;
-  JXL_RETURN_IF_ERROR(
-      ConvertCodecInOutToPackedPixelFile(io, format, c_desired, pool, &ppf));
-  ppf.info.bits_per_sample = bits_per_sample;
-  if (format.data_type == JXL_TYPE_FLOAT) {
-    ppf.info.bits_per_sample = 32;
-    ppf.info.exponent_bits_per_sample = 8;
-  }
  extras::EncodedImage encoded_image;
  JXL_RETURN_IF_ERROR(encoder->Encode(ppf, &encoded_image, pool));
  JXL_ASSERT(encoded_image.bitstreams.size() == 1);
@ -129,45 +101,12 @@ Status Encode(const CodecInOut& io, const extras::Codec codec,
  return true;
 }

-Status Encode(const CodecInOut& io, const ColorEncoding& c_desired,
-              size_t bits_per_sample, const std::string& pathname,
+Status Encode(const extras::PackedPixelFile& ppf, const std::string& pathname,
              std::vector<uint8_t>* bytes, ThreadPool* pool) {
  std::string extension;
  const extras::Codec codec =
-      extras::CodecFromPath(pathname, &bits_per_sample, &extension);
-
-  // Warn about incorrect usage of PGM/PGX/PPM - only the latter supports
-  // color, but CodecFromPath lumps them all together.
-  if (codec == extras::Codec::kPNM && extension != ".pfm") {
-    if (io.Main().HasAlpha() && extension != ".pam") {
-      JXL_WARNING(
-          "For images with alpha, the filename should end with .pam.\n");
-    } else if (!io.Main().IsGray() && extension == ".pgm") {
-      JXL_WARNING("For color images, the filename should end with .ppm.\n");
-    } else if (io.Main().IsGray() && extension == ".ppm") {
-      JXL_WARNING(
-          "For grayscale images, the filename should not end with .ppm.\n");
-    }
-    if (bits_per_sample > 16) {
-      JXL_WARNING("PPM only supports up to 16 bits per sample");
-      bits_per_sample = 16;
-    }
-  } else if (codec == extras::Codec::kPGX && !io.Main().IsGray()) {
-    JXL_WARNING("Storing color image to PGX - use .ppm extension instead.\n");
-  }
-  if (bits_per_sample > 16 && codec == extras::Codec::kPNG) {
-    JXL_WARNING("PNG only supports up to 16 bits per sample");
-    bits_per_sample = 16;
-  }
-
-  return Encode(io, codec, c_desired, bits_per_sample, bytes, pool);
-}
-
-Status Encode(const CodecInOut& io, const std::string& pathname,
-              std::vector<uint8_t>* bytes, ThreadPool* pool) {
-  // TODO(lode): need to take the floating_point_sample field into account
-  return Encode(io, io.metadata.m.color_encoding,
-                io.metadata.m.bit_depth.bits_per_sample, pathname, bytes, pool);
+      extras::CodecFromPath(pathname, nullptr, &extension);
+  return Encode(ppf, codec, bytes, pool);
 }

 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/extras/codec.h
+++ b/third_party/jpeg-xl/lib/extras/codec.h
@ -43,18 +43,10 @@ JXL_INLINE Status SetFromBytes(const Span<const uint8_t> bytes, CodecInOut* io,
                      orig_codec);
 }

-// Replaces "bytes" with an encoding of pixels transformed from c_current
-// color space to c_desired.
-Status Encode(const CodecInOut& io, extras::Codec codec,
-              const ColorEncoding& c_desired, size_t bits_per_sample,
-              std::vector<uint8_t>* bytes, ThreadPool* pool = nullptr);
+Status Encode(const extras::PackedPixelFile& ppf, const extras::Codec codec,
+              std::vector<uint8_t>* bytes, ThreadPool* pool);

-// Deduces codec, calls Encode and writes to file.
-Status Encode(const CodecInOut& io, const ColorEncoding& c_desired,
-              size_t bits_per_sample, const std::string& pathname,
-              std::vector<uint8_t>* bytes, ThreadPool* pool = nullptr);
-// Same, but defaults to metadata.original color_encoding and bits_per_sample.
-Status Encode(const CodecInOut& io, const std::string& pathname,
+Status Encode(const extras::PackedPixelFile& ppf, const std::string& pathname,
              std::vector<uint8_t>* bytes, ThreadPool* pool = nullptr);

 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/extras/codec_test.cc
+++ b/third_party/jpeg-xl/lib/extras/codec_test.cc
@ -3,24 +3,33 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-#include "lib/extras/codec.h"
-
+#include <jxl/codestream_header.h>
+#include <jxl/color_encoding.h>
+#include <jxl/encode.h>
+#include <jxl/types.h>
 #include <stddef.h>

 #include <algorithm>
 #include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <memory>
 #include <sstream>
 #include <string>
 #include <utility>
 #include <vector>

 #include "lib/extras/common.h"
+#include "lib/extras/dec/color_hints.h"
 #include "lib/extras/dec/decode.h"
 #include "lib/extras/dec/pnm.h"
 #include "lib/extras/enc/encode.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/byte_order.h"
 #include "lib/jxl/base/random.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/test_utils.h"
 #include "lib/jxl/testing.h"

--- a/third_party/jpeg-xl/lib/extras/dec/decode.cc
+++ b/third_party/jpeg-xl/lib/extras/dec/decode.cc
@ -37,7 +37,6 @@ std::string GetExtension(const std::string& path) {

 Codec CodecFromPath(std::string path, size_t* JXL_RESTRICT bits_per_sample,
                    std::string* extension) {
-  std::string base;
  std::string ext = GetExtension(path);
  if (extension) {
    if (extension->empty()) {
--- a/third_party/jpeg-xl/lib/extras/dec/pnm.cc
+++ b/third_party/jpeg-xl/lib/extras/dec/pnm.cc
@ -390,7 +390,7 @@ StatusOr<ChunkedPNMDecoder> ChunkedPNMDecoder::Init(const char* path) {
  if (header.ysize * row_size + dec.data_start_ < size) {
    return JXL_FAILURE("Invalid ppm");
  }
-  return std::move(dec);
+  return dec;
 }

 jxl::Status ChunkedPNMDecoder::InitializePPF(const ColorHints& color_hints,
--- a/third_party/jpeg-xl/lib/extras/enc/apng.cc
+++ b/third_party/jpeg-xl/lib/extras/enc/apng.cc
@ -344,13 +344,11 @@ Status APNGEncoder::EncodePackedPixelFileToAPNG(
                 PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE,
                 PNG_FILTER_TYPE_BASE);
    if (count == 0) {
-      if (!MaybeAddSRGB(ppf.color_encoding, png_ptr, info_ptr)) {
+      if (!ppf.icc.empty()) {
+        png_set_benign_errors(png_ptr, 1);
+        png_set_iCCP(png_ptr, info_ptr, "1", 0, ppf.icc.data(), ppf.icc.size());
+      } else if (!MaybeAddSRGB(ppf.color_encoding, png_ptr, info_ptr)) {
        MaybeAddCICP(ppf.color_encoding, png_ptr, info_ptr);
-        if (!ppf.icc.empty()) {
-          png_set_benign_errors(png_ptr, 1);
-          png_set_iCCP(png_ptr, info_ptr, "1", 0, ppf.icc.data(),
-                       ppf.icc.size());
-        }
        MaybeAddCHRM(ppf.color_encoding, png_ptr, info_ptr);
        MaybeAddGAMA(ppf.color_encoding, png_ptr, info_ptr);
      }
--- a/third_party/jpeg-xl/lib/extras/enc/jpegli.cc
+++ b/third_party/jpeg-xl/lib/extras/enc/jpegli.cc
@ -7,12 +7,33 @@

 #include <jxl/cms.h>
 #include <jxl/codestream_header.h>
+#include <jxl/types.h>
 #include <setjmp.h>
 #include <stdint.h>

+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <hwy/aligned_allocator.h>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
 #include "lib/extras/enc/encode.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jpegli/common.h"
 #include "lib/jpegli/encode.h"
+#include "lib/jpegli/types.h"
+#include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image.h"

 namespace jxl {
 namespace extras {
--- a/third_party/jpeg-xl/lib/extras/enc/jpg.cc
+++ b/third_party/jpeg-xl/lib/extras/enc/jpg.cc
@ -186,14 +186,6 @@ Status SetJpegProgression(int progressive_id,
  return true;
 }

-bool IsSRGBEncoding(const JxlColorEncoding& c) {
-  return ((c.color_space == JXL_COLOR_SPACE_RGB ||
-           c.color_space == JXL_COLOR_SPACE_GRAY) &&
-          c.primaries == JXL_PRIMARIES_SRGB &&
-          c.white_point == JXL_WHITE_POINT_D65 &&
-          c.transfer_function == JXL_TRANSFER_FUNCTION_SRGB);
-}
-
 void WriteICCProfile(jpeg_compress_struct* const cinfo,
                     const std::vector<uint8_t>& icc) {
  constexpr size_t kMaxIccBytesInMarker =
@ -598,18 +590,14 @@ class JPEGEncoder : public Encoder {
      }
    }
    params.is_xyb = (ppf.color_encoding.color_space == JXL_COLOR_SPACE_XYB);
-    std::vector<uint8_t> icc;
-    if (!IsSRGBEncoding(ppf.color_encoding)) {
-      icc = ppf.icc;
-    }
    encoded_image->bitstreams.clear();
    encoded_image->bitstreams.reserve(ppf.frames.size());
    for (const auto& frame : ppf.frames) {
      JXL_RETURN_IF_ERROR(VerifyPackedImage(frame.color, ppf.info));
      encoded_image->bitstreams.emplace_back();
      JXL_RETURN_IF_ERROR(EncodeImageJPG(
-          frame.color, ppf.info, icc, ppf.metadata.exif, jpeg_encoder, params,
-          pool, &encoded_image->bitstreams.back()));
+          frame.color, ppf.info, ppf.icc, ppf.metadata.exif, jpeg_encoder,
+          params, pool, &encoded_image->bitstreams.back()));
    }
    return true;
  }
--- a/third_party/jpeg-xl/lib/extras/jpegli_test.cc
+++ b/third_party/jpeg-xl/lib/extras/jpegli_test.cc
@ -8,11 +8,18 @@
 #include "lib/extras/dec/jpegli.h"

 #include <jxl/color_encoding.h>
+#include <jxl/types.h>
 #include <stdint.h>

+#include <cstddef>
 #include <cstdint>
+#include <cstdio>
+#include <cstring>
 #include <memory>
+#include <ostream>
+#include <sstream>
 #include <string>
+#include <utility>
 #include <vector>

 #include "lib/extras/dec/color_hints.h"
--- a/third_party/jpeg-xl/lib/extras/packed_image.h
+++ b/third_party/jpeg-xl/lib/extras/packed_image.h
@ -260,6 +260,8 @@ class PackedPixelFile {
  size_t num_frames() const {
    return chunked_frames.empty() ? frames.size() : chunked_frames.size();
  }
+  size_t xsize() const { return info.xsize; }
+  size_t ysize() const { return info.ysize; }
 };

 }  // namespace extras
--- a/third_party/jpeg-xl/lib/extras/packed_image_convert.cc
+++ b/third_party/jpeg-xl/lib/extras/packed_image_convert.cc
@ -98,6 +98,8 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,

  io->metadata.m.SetAlphaBits(ppf.info.alpha_bits,
                              ppf.info.alpha_premultiplied);
+  ExtraChannelInfo* alpha = io->metadata.m.Find(ExtraChannel::kAlpha);
+  if (alpha) alpha->bit_depth = io->metadata.m.bit_depth;

  io->metadata.m.xyb_encoded = !ppf.info.uses_original_profile;
  JXL_ASSERT(ppf.info.orientation > 0 && ppf.info.orientation <= 8);
@ -193,6 +195,35 @@ Status ConvertPackedPixelFileToCodecInOut(const PackedPixelFile& ppf,
  return true;
 }

+PackedPixelFile ConvertImage3FToPackedPixelFile(const Image3F& image,
+                                                const ColorEncoding& c_enc,
+                                                JxlPixelFormat format,
+                                                ThreadPool* pool) {
+  PackedPixelFile ppf;
+  ppf.info.xsize = image.xsize();
+  ppf.info.ysize = image.ysize();
+  ppf.info.num_color_channels = 3;
+  ppf.info.bits_per_sample = PackedImage::BitsPerChannel(format.data_type);
+  ppf.info.exponent_bits_per_sample = format.data_type == JXL_TYPE_FLOAT ? 8
+                                      : format.data_type == JXL_TYPE_FLOAT16
+                                          ? 5
+                                          : 0;
+  ppf.color_encoding = c_enc.ToExternal();
+  ppf.frames.clear();
+  PackedFrame frame(image.xsize(), image.ysize(), format);
+  const ImageF* channels[3];
+  for (int c = 0; c < 3; ++c) {
+    channels[c] = &image.Plane(c);
+  }
+  bool float_samples = ppf.info.exponent_bits_per_sample > 0;
+  JXL_CHECK(ConvertChannelsToExternal(
+      channels, 3, ppf.info.bits_per_sample, float_samples, format.endianness,
+      frame.color.stride, pool, frame.color.pixels(0, 0, 0),
+      frame.color.pixels_size, PixelCallback(), Orientation::kIdentity));
+  ppf.frames.emplace_back(std::move(frame));
+  return ppf;
+}
+
 // Allows converting from internal CodecInOut to external PackedPixelFile
 Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
                                          const JxlPixelFormat& pixel_format,
@ -200,7 +231,6 @@ Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
                                          ThreadPool* pool,
                                          PackedPixelFile* ppf) {
  const bool has_alpha = io.metadata.m.HasAlpha();
-  bool alpha_premultiplied = false;
  JXL_ASSERT(!io.frames.empty());

  if (has_alpha) {
@ -209,7 +239,10 @@ Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
    const auto* alpha_channel = io.metadata.m.Find(ExtraChannel::kAlpha);
    JXL_ASSERT(alpha_channel->bit_depth.exponent_bits_per_sample ==
               io.metadata.m.bit_depth.exponent_bits_per_sample);
-    alpha_premultiplied = alpha_channel->alpha_associated;
+    ppf->info.alpha_bits = alpha_channel->bit_depth.bits_per_sample;
+    ppf->info.alpha_exponent_bits =
+        alpha_channel->bit_depth.exponent_bits_per_sample;
+    ppf->info.alpha_premultiplied = alpha_channel->alpha_associated;
  }

  // Convert the image metadata
@ -226,9 +259,6 @@ Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
  ppf->info.relative_to_max_display =
      io.metadata.m.tone_mapping.relative_to_max_display;

-  ppf->info.alpha_bits = io.metadata.m.GetAlphaBits();
-  ppf->info.alpha_premultiplied = alpha_premultiplied;
-
  ppf->info.uses_original_profile = !io.metadata.m.xyb_encoded;
  JXL_ASSERT(0 < io.metadata.m.orientation && io.metadata.m.orientation <= 8);
  ppf->info.orientation =
--- a/third_party/jpeg-xl/lib/extras/packed_image_convert.h
+++ b/third_party/jpeg-xl/lib/extras/packed_image_convert.h
@ -30,6 +30,11 @@ Status ConvertCodecInOutToPackedPixelFile(const CodecInOut& io,
                                          const ColorEncoding& c_desired,
                                          ThreadPool* pool,
                                          PackedPixelFile* ppf);
+
+PackedPixelFile ConvertImage3FToPackedPixelFile(const Image3F& image,
+                                                const ColorEncoding& c_enc,
+                                                JxlPixelFormat format,
+                                                ThreadPool* pool);
 }  // namespace extras
 }  // namespace jxl

--- a/third_party/jpeg-xl/lib/include/jxl/cms_interface.h
+++ b/third_party/jpeg-xl/lib/include/jxl/cms_interface.h
@ -20,6 +20,8 @@

 #include <jxl/color_encoding.h>
 #include <jxl/types.h>
+#include <stddef.h>
+#include <stdint.h>

 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
--- a/third_party/jpeg-xl/lib/include/jxl/decode.h
+++ b/third_party/jpeg-xl/lib/include/jxl/decode.h
@ -20,7 +20,7 @@
 #include <jxl/memory_manager.h>
 #include <jxl/parallel_runner.h>
 #include <jxl/types.h>
-#include <jxl/version.h>
+#include <jxl/version.h>  // TODO(eustas): remove before v1.0
 #include <stddef.h>
 #include <stdint.h>

@ -1359,15 +1359,7 @@ JXL_EXPORT JxlDecoderStatus JxlDecoderGetBoxType(JxlDecoder* dec,

 /**
 * Returns the size of a box as it appears in the container file, after the @ref
- * JXL_DEC_BOX event. For a non-compressed box, this is the size of the
- * contents, excluding the 4 bytes indicating the box type. For a compressed
- * "brob" box, this is the size of the compressed box contents plus the
- * additional 4 byte indicating the underlying box type, but excluding the 4
- * bytes indicating "brob". This function gives the size of the data that will
- * be written in the output buffer when getting boxes in the default raw
- * compressed mode. When @ref JxlDecoderSetDecompressBoxes is enabled, the
- * return value of function does not change, and the decompressed size is not
- * known before it has already been decompressed and output.
+ * JXL_DEC_BOX event. This includes all the box headers.
 *
 * @param dec decoder object
 * @param size raw size of the box in bytes
--- a/third_party/jpeg-xl/lib/include/jxl/decode_cxx.h
+++ b/third_party/jpeg-xl/lib/include/jxl/decode_cxx.h
@ -16,6 +16,7 @@
 #define JXL_DECODE_CXX_H_

 #include <jxl/decode.h>
+#include <jxl/memory_manager.h>

 #include <memory>

--- a/third_party/jpeg-xl/lib/include/jxl/encode.h
+++ b/third_party/jpeg-xl/lib/include/jxl/encode.h
@ -15,15 +15,16 @@

 #include <jxl/cms_interface.h>
 #include <jxl/codestream_header.h>
+#include <jxl/color_encoding.h>
 #include <jxl/jxl_export.h>
 #include <jxl/memory_manager.h>
 #include <jxl/parallel_runner.h>
 #include <jxl/stats.h>
-#include <jxl/version.h>
+#include <jxl/types.h>
+#include <jxl/version.h>  // TODO(eustas): remove before v1.0
+#include <stddef.h>
 #include <stdint.h>

-#include "jxl/types.h"
-
 #if defined(__cplusplus) || defined(c_plusplus)
 extern "C" {
 #endif
@ -378,6 +379,14 @@ typedef enum {
   */
  JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF = 37,

+  /** If this mode is disabled, the encoder will not make any image quality
+   * decisions that are computed based on the full image, but stored only once
+   * (e.g. the X quant multiplier in the frame header). Used mainly for testing
+   * equivalence of streaming and non-streaming code.
+   * 0 = disabled, 1 = enabled (default)
+   */
+  JXL_ENC_FRAME_SETTING_USE_FULL_IMAGE_HEURISTICS = 38,
+
  /** Enum value not to be used as an option. This value is added to force the
   * C compiler to have the enum to take a known size.
   */
@ -1193,8 +1202,8 @@ JXL_EXPORT JxlEncoderStatus JxlEncoderSetBasicInfo(JxlEncoder* enc,
 * JXL_ENC_ERROR or JXL_ENC_NOT_SUPPORTED otherwise
 */
 JXL_EXPORT JxlEncoderStatus JxlEncoderSetUpsamplingMode(JxlEncoder* enc,
-                                                        const int64_t factor,
-                                                        const int64_t mode);
+                                                        int64_t factor,
+                                                        int64_t mode);

 /**
 * Initializes a JxlExtraChannelInfo struct to default values.
--- a/third_party/jpeg-xl/lib/include/jxl/encode_cxx.h
+++ b/third_party/jpeg-xl/lib/include/jxl/encode_cxx.h
@ -16,6 +16,7 @@
 #define JXL_ENCODE_CXX_H_

 #include <jxl/encode.h>
+#include <jxl/memory_manager.h>

 #include <memory>

--- a/third_party/jpeg-xl/lib/include/jxl/resizable_parallel_runner_cxx.h
+++ b/third_party/jpeg-xl/lib/include/jxl/resizable_parallel_runner_cxx.h
@ -16,6 +16,7 @@
 #ifndef JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_
 #define JXL_RESIZABLE_PARALLEL_RUNNER_CXX_H_

+#include <jxl/memory_manager.h>
 #include <jxl/resizable_parallel_runner.h>

 #include <memory>
--- a/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner_cxx.h
+++ b/third_party/jpeg-xl/lib/include/jxl/thread_parallel_runner_cxx.h
@ -15,8 +15,10 @@
 #ifndef JXL_THREAD_PARALLEL_RUNNER_CXX_H_
 #define JXL_THREAD_PARALLEL_RUNNER_CXX_H_

+#include <jxl/memory_manager.h>
 #include <jxl/thread_parallel_runner.h>

+#include <cstddef>
 #include <memory>

 #if !(defined(__cplusplus) || defined(c_plusplus))
--- a/third_party/jpeg-xl/lib/jxl.cmake
+++ b/third_party/jpeg-xl/lib/jxl.cmake
@ -43,9 +43,6 @@ else()
 endif ()

 set(OBJ_COMPILE_DEFINITIONS
-  JPEGXL_MAJOR_VERSION=${JPEGXL_MAJOR_VERSION}
-  JPEGXL_MINOR_VERSION=${JPEGXL_MINOR_VERSION}
-  JPEGXL_PATCH_VERSION=${JPEGXL_PATCH_VERSION}
  # Used to determine if we are building the library when defined or just
  # including the library when not defined. This is public so libjxl shared
  # library gets this define too.
@ -55,6 +52,9 @@ set(OBJ_COMPILE_DEFINITIONS
 # Generate version.h
 configure_file("jxl/version.h.in" "include/jxl/version.h")

+list(APPEND JPEGXL_INTERNAL_PUBLIC_HEADERS
+  ${CMAKE_CURRENT_BINARY_DIR}/include/jxl/version.h)
+
 # Headers for exporting/importing public headers
 include(GenerateExportHeader)

@ -87,6 +87,16 @@ target_include_directories(jxl_base INTERFACE
  ${PROJECT_SOURCE_DIR}
  ${JXL_HWY_INCLUDE_DIRS}
 )
+
+# On android, link with log to use android-related log functions.
+if(CMAKE_SYSTEM_NAME STREQUAL "Android")
+  find_library(log-lib log)
+  if(log-lib)
+    target_link_libraries(jxl_base INTERFACE ${log-lib})
+    target_compile_definitions(jxl_base INTERFACE USE_ANDROID_LOGGER)
+  endif()
+endif()
+
 add_dependencies(jxl_base jxl_export)

 # Decoder-only object library
--- a/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
+++ b/third_party/jpeg-xl/lib/jxl/base/data_parallel.h
@ -13,7 +13,7 @@
 #include <stddef.h>
 #include <stdint.h>

-#include "lib/jxl/base/bits.h"
+#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
 #if JXL_COMPILER_MSVC
 // suppress warnings about the const & applied to function types
--- a/third_party/jpeg-xl/lib/jxl/base/float.h
+++ b/third_party/jpeg-xl/lib/jxl/base/float.h
@ -12,6 +12,7 @@
 #include <string.h>

 #include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"

 namespace jxl {
@ -61,7 +62,9 @@ static Status JXL_INLINE LoadFloatRow(const uint8_t* src, size_t count,

    case JXL_TYPE_UINT8:
      for (size_t i = 0; i < count; ++i) {
-        callback(i, src[stride * i] * scale);
+        // Integer multiply uint8 value before scaling so that the UINT8 value
+        // and the corresponding UINT16 value convert to the same float
+        callback(i, (src[stride * i] * 257) * scale);
      }
      return true;

--- a/third_party/jpeg-xl/lib/jxl/base/status.h
+++ b/third_party/jpeg-xl/lib/jxl/base/status.h
@ -74,14 +74,31 @@ namespace jxl {
 #define JXL_DEBUG_ON_ABORT JXL_DEBUG_ON_ERROR
 #endif  // JXL_DEBUG_ON_ABORT

-// Print a debug message on standard error. You should use the JXL_DEBUG macro
-// instead of calling Debug directly. This function returns false, so it can be
-// used as a return value in JXL_FAILURE.
+#ifdef USE_ANDROID_LOGGER
+#include <android/log.h>
+#define LIBJXL_ANDROID_LOG_TAG ("libjxl")
+inline void android_vprintf(const char* format, va_list args) {
+  char* message = nullptr;
+  int res = vasprintf(&message, format, args);
+  if (res != -1) {
+    __android_log_write(ANDROID_LOG_DEBUG, LIBJXL_ANDROID_LOG_TAG, message);
+    free(message);
+  }
+}
+#endif
+
+// Print a debug message on standard error or android logs. You should use the
+// JXL_DEBUG macro instead of calling Debug directly. This function returns
+// false, so it can be used as a return value in JXL_FAILURE.
 JXL_FORMAT(1, 2)
 inline JXL_NOINLINE bool Debug(const char* format, ...) {
  va_list args;
  va_start(args, format);
+#ifdef USE_ANDROID_LOGGER
+  android_vprintf(format, args);
+#else
  vfprintf(stderr, format, args);
+#endif
  va_end(args);
  return false;
 }
@ -110,8 +127,12 @@ inline JXL_NOINLINE bool Debug(const char* format, ...) {
 // JXL_DEBUG version that prints the debug message if the global verbose level
 // defined at compile time by JXL_DEBUG_V_LEVEL is greater or equal than the
 // passed level.
+#if JXL_DEBUG_V_LEVEL > 0
 #define JXL_DEBUG_V(level, format, ...) \
  JXL_DEBUG(level <= JXL_DEBUG_V_LEVEL, format, ##__VA_ARGS__)
+#else
+#define JXL_DEBUG_V(level, format, ...)
+#endif

 // Warnings (via JXL_WARNING) are enabled by default in debug builds (opt and
 // debug).
@ -329,7 +350,11 @@ inline JXL_FORMAT(2, 3) Status
      (JXL_DEBUG_ON_ALL_ERROR && !status)) {
    va_list args;
    va_start(args, format);
+#ifdef USE_ANDROID_LOGGER
+    android_vprintf(format, args);
+#else
    vfprintf(stderr, format, args);
+#endif
    va_end(args);
  }
 #ifdef JXL_CRASH_ON_ERROR
--- a/third_party/jpeg-xl/lib/jxl/blending_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/blending_test.cc
@ -3,11 +3,17 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+#include <jxl/types.h>
+
 #include <cstdint>
+#include <sstream>
+#include <utility>
 #include <vector>

-#include "lib/extras/codec.h"
-#include "lib/jxl/image_test_utils.h"
+#include "lib/extras/dec/decode.h"
+#include "lib/extras/dec/jxl.h"
+#include "lib/extras/packed_image.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/test_utils.h"
 #include "lib/jxl/testing.h"

@ -19,19 +25,30 @@ using ::testing::SizeIs;
 TEST(BlendingTest, Crops) {
  const std::vector<uint8_t> compressed =
      jxl::test::ReadTestData("jxl/blending/cropped_traffic_light.jxl");
-  CodecInOut decoded;
-  ASSERT_TRUE(test::DecodeFile({}, Bytes(compressed), &decoded));
+  extras::JXLDecompressParams dparams;
+  dparams.accepted_formats = {{3, JXL_TYPE_UINT16, JXL_LITTLE_ENDIAN, 0}};
+  extras::PackedPixelFile decoded;
+  ASSERT_TRUE(DecodeImageJXL(compressed.data(), compressed.size(), dparams,
+                             /*decoded_bytes=*/nullptr, &decoded));
  ASSERT_THAT(decoded.frames, SizeIs(4));

  int i = 0;
-  for (const ImageBundle& ib : decoded.frames) {
+  for (auto&& decoded_frame : decoded.frames) {
    std::ostringstream filename;
    filename << "jxl/blending/cropped_traffic_light_frame-" << i << ".png";
    const std::vector<uint8_t> compressed_frame =
        jxl::test::ReadTestData(filename.str());
-    CodecInOut frame;
-    ASSERT_TRUE(SetFromBytes(Bytes(compressed_frame), &frame));
-    JXL_EXPECT_OK(SamePixels(ib.color(), *frame.Main().color(), _));
+    extras::PackedPixelFile decoded_frame_ppf;
+    decoded_frame_ppf.info = decoded.info;
+    decoded_frame_ppf.icc = decoded.icc;
+    decoded_frame_ppf.color_encoding = decoded.color_encoding;
+    decoded_frame_ppf.extra_channels_info = decoded.extra_channels_info;
+    decoded_frame_ppf.frames.emplace_back(std::move(decoded_frame));
+    extras::PackedPixelFile expected_frame_ppf;
+    ASSERT_TRUE(extras::DecodeBytes(Bytes(compressed_frame),
+                                    extras::ColorHints(), &expected_frame_ppf));
+    EXPECT_EQ(0.0f,
+              test::ComputeDistance2(decoded_frame_ppf, expected_frame_ppf));
    ++i;
  }
 }
--- a/third_party/jpeg-xl/lib/jxl/box_content_decoder.h
+++ b/third_party/jpeg-xl/lib/jxl/box_content_decoder.h
@ -11,9 +11,6 @@
 #include <stdint.h>
 #include <stdlib.h>

-#include <memory>
-#include <vector>
-
 namespace jxl {

 /** Outputs the contents of a box in a streaming fashion, either directly, or
--- a/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli.cc
@ -41,7 +41,6 @@
 #include "lib/jxl/base/printf_macros.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/convolve.h"
-#include "lib/jxl/gauss_blur.h"
 #include "lib/jxl/image_ops.h"

 #ifndef JXL_BUTTERAUGLI_ONCE
--- a/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/butteraugli/butteraugli_test.cc
@ -9,12 +9,12 @@
 #include <stddef.h>

 #include <algorithm>
+#include <cstdint>
 #include <utility>

 #include "lib/extras/metrics.h"
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/base/random.h"
-#include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/enc_external_image.h"
 #include "lib/jxl/image.h"
@ -109,7 +109,7 @@ TEST(ButteraugliInPlaceTest, LargeImage) {
  EXPECT_TRUE(ButteraugliInterfaceInPlace(std::move(rgb0), std::move(rgb1), ba,
                                          diffmap2, diffval2));
  double distp2 = ComputeDistanceP(diffmap2, ba, 3.0);
-  EXPECT_NEAR(diffval, diffval2, 1e-10);
+  EXPECT_NEAR(diffval, diffval2, 5e-7);
  EXPECT_NEAR(distp, distp2, 1e-7);
 }

--- a/third_party/jpeg-xl/lib/jxl/cms/color_encoding_cms.h
+++ b/third_party/jpeg-xl/lib/jxl/cms/color_encoding_cms.h
@ -13,11 +13,9 @@
 #include <cmath>
 #include <cstdint>
 #include <cstring>
-#include <string>
 #include <utility>
 #include <vector>

-#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/status.h"

 namespace jxl {
--- a/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/color_encoding_internal_test.cc
@ -10,7 +10,6 @@
 #include <cstdlib>  // rand

 #include "lib/jxl/cms/color_encoding_cms.h"
-#include "lib/jxl/encode_internal.h"
 #include "lib/jxl/test_utils.h"
 #include "lib/jxl/testing.h"

--- a/third_party/jpeg-xl/lib/jxl/color_management_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/color_management_test.cc
@ -8,20 +8,26 @@
 #include <stdint.h>

 #include <algorithm>
+#include <cstddef>
 #include <cstdint>
-#include <new>
+#include <cstdio>
+#include <cstdlib>
+#include <ostream>
 #include <string>
 #include <utility>
+#include <vector>

 #include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/random.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/cms/color_encoding_cms.h"
 #include "lib/jxl/cms/opsin_params.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"
+#include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/test_utils.h"
 #include "lib/jxl/testing.h"
--- a/third_party/jpeg-xl/lib/jxl/convolve.h
+++ b/third_party/jpeg-xl/lib/jxl/convolve.h
@ -9,8 +9,8 @@
 // 2D convolution.

 #include <stddef.h>
-#include <stdint.h>

+#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/image.h"

@ -55,19 +55,6 @@ struct WeightsSeparable5 {
  float vert[3 * 4];
 };

-// Weights for separable 7x7 filters (typically but not necessarily the same
-// values for horizontal and vertical directions). The kernel must already be
-// normalized, but note that values for negative offsets are omitted, so the
-// given values do not sum to 1.
-//
-// NOTE: for >= 7x7 Gaussian kernels, it is faster to use FastGaussian instead,
-// at least when images exceed the L1 cache size.
-struct WeightsSeparable7 {
-  // Horizontal 1D, distances 0..3 (each replicated 4x)
-  float horz[4 * 4];
-  float vert[4 * 4];
-};
-
 const WeightsSymmetric3& WeightsSymmetric3Lowpass();
 const WeightsSeparable5& WeightsSeparable5Lowpass();
 const WeightsSymmetric5& WeightsSymmetric5Lowpass();
@ -80,10 +67,6 @@ void SlowSeparable5(const ImageF& in, const Rect& in_rect,
                    const WeightsSeparable5& weights, ThreadPool* pool,
                    ImageF* out, const Rect& out_rect);

-void SlowSeparable7(const ImageF& in, const Rect& in_rect,
-                    const WeightsSeparable7& weights, ThreadPool* pool,
-                    ImageF* out, const Rect& out_rect);
-
 void Symmetric3(const ImageF& in, const Rect& rect,
                const WeightsSymmetric3& weights, ThreadPool* pool,
                ImageF* out);
@ -100,10 +83,6 @@ void Separable5(const ImageF& in, const Rect& rect,
                const WeightsSeparable5& weights, ThreadPool* pool,
                ImageF* out);

-void Separable7(const ImageF& in, const Rect& rect,
-                const WeightsSeparable7& weights, ThreadPool* pool,
-                ImageF* out);
-
 }  // namespace jxl

 #endif  // LIB_JXL_CONVOLVE_H_
--- a/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_separable7.cc
@ -1,285 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/convolve.h"
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable7.cc"
-#include <hwy/foreach_target.h>
-#include <hwy/highway.h>
-
-#include "lib/jxl/convolve-inl.h"
-
-HWY_BEFORE_NAMESPACE();
-namespace jxl {
-namespace HWY_NAMESPACE {
-
-// These templates are not found via ADL.
-using hwy::HWY_NAMESPACE::Add;
-using hwy::HWY_NAMESPACE::Mul;
-using hwy::HWY_NAMESPACE::MulAdd;
-using hwy::HWY_NAMESPACE::Vec;
-
-// 7x7 convolution by separable kernel with a single scan through the input.
-// Extended version of Separable5, see documentation there.
-class Separable7Strategy {
-  using D = HWY_CAPPED(float, 16);
-  using V = Vec<D>;
-
- public:
-  static constexpr int64_t kRadius = 3;
-
-  template <size_t kSizeModN, class WrapRow>
-  static JXL_MAYBE_INLINE void ConvolveRow(
-      const float* const JXL_RESTRICT row_m, const size_t xsize,
-      const int64_t stride, const WrapRow& wrap_row,
-      const WeightsSeparable7& weights, float* const JXL_RESTRICT row_out) {
-    const D d;
-    const int64_t neg_stride = -stride;  // allows LEA addressing.
-    const float* const JXL_RESTRICT row_t3 =
-        wrap_row(row_m + 3 * neg_stride, stride);
-    const float* const JXL_RESTRICT row_t2 =
-        wrap_row(row_m + 2 * neg_stride, stride);
-    const float* const JXL_RESTRICT row_t1 =
-        wrap_row(row_m + 1 * neg_stride, stride);
-    const float* const JXL_RESTRICT row_b1 =
-        wrap_row(row_m + 1 * stride, stride);
-    const float* const JXL_RESTRICT row_b2 =
-        wrap_row(row_m + 2 * stride, stride);
-    const float* const JXL_RESTRICT row_b3 =
-        wrap_row(row_m + 3 * stride, stride);
-
-    const V wh0 = LoadDup128(d, weights.horz + 0 * 4);
-    const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
-    const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
-    const V wh3 = LoadDup128(d, weights.horz + 3 * 4);
-    const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
-    const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
-    const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
-    const V wv3 = LoadDup128(d, weights.vert + 3 * 4);
-
-    size_t x = 0;
-
-    // More than one iteration for scalars.
-    for (; x < kRadius; x += Lanes(d)) {
-      const V conv0 =
-          Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2, wh3), wv0);
-
-      const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
-
-      const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
-
-      const V conv3t = HorzConvolveFirst(row_t3, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv3b = HorzConvolveFirst(row_b3, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
-
-      Store(conv3, d, row_out + x);
-    }
-
-    // Main loop: load inputs without padding
-    for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
-      const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2, wh3), wv0);
-
-      const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2, wh3);
-      const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2, wh3);
-      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
-
-      const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2, wh3);
-      const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2, wh3);
-      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
-
-      const V conv3t = HorzConvolve(row_t3 + x, wh0, wh1, wh2, wh3);
-      const V conv3b = HorzConvolve(row_b3 + x, wh0, wh1, wh2, wh3);
-      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
-
-      Store(conv3, d, row_out + x);
-    }
-
-    // Last full vector to write (the above loop handled mod >= kRadius)
-#if HWY_TARGET == HWY_SCALAR
-    while (x < xsize) {
-#else
-    if (kSizeModN < kRadius) {
-#endif
-      const V conv0 =
-          Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2, wh3),
-              wv0);
-
-      const V conv1t =
-          HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv1b =
-          HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
-
-      const V conv2t =
-          HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv2b =
-          HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
-
-      const V conv3t =
-          HorzConvolveLast<kSizeModN>(row_t3, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv3b =
-          HorzConvolveLast<kSizeModN>(row_b3, x, xsize, wh0, wh1, wh2, wh3);
-      const V conv3 = MulAdd(Add(conv3t, conv3b), wv3, conv2);
-
-      Store(conv3, d, row_out + x);
-      x += Lanes(d);
-    }
-
-    // If mod = 0, the above vector was the last.
-    if (kSizeModN != 0) {
-      for (; x < xsize; ++x) {
-        float mul = 0.0f;
-        for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
-          const float wy = weights.vert[std::abs(dy) * 4];
-          const float* clamped_row = wrap_row(row_m + dy * stride, stride);
-          for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
-            const float wx = weights.horz[std::abs(dx) * 4];
-            const int64_t clamped_x = Mirror(x + dx, xsize);
-            mul += clamped_row[clamped_x] * wx * wy;
-          }
-        }
-        row_out[x] = mul;
-      }
-    }
-  }
-
- private:
-  // Same as HorzConvolve for the first/last vector in a row.
-  static JXL_MAYBE_INLINE V HorzConvolveFirst(
-      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
-      const V wh0, const V wh1, const V wh2, const V wh3) {
-    const D d;
-    const V c = LoadU(d, row + x);
-    const V mul0 = Mul(c, wh0);
-
-#if HWY_TARGET == HWY_SCALAR
-    const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
-    const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
-    const V l3 = LoadU(d, row + Mirror(x - 3, xsize));
-#else
-    (void)xsize;
-    const V l1 = Neighbors::FirstL1(c);
-    const V l2 = Neighbors::FirstL2(c);
-    const V l3 = Neighbors::FirstL3(c);
-#endif
-
-    const V r1 = LoadU(d, row + x + 1);
-    const V r2 = LoadU(d, row + x + 2);
-    const V r3 = LoadU(d, row + x + 3);
-
-    const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
-    const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
-    const V mul3 = MulAdd(Add(l3, r3), wh3, mul2);
-    return mul3;
-  }
-
-  template <size_t kSizeModN>
-  static JXL_MAYBE_INLINE V HorzConvolveLast(
-      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
-      const V wh0, const V wh1, const V wh2, const V wh3) {
-    const D d;
-    const V c = LoadU(d, row + x);
-    const V mul0 = Mul(c, wh0);
-
-    const V l1 = LoadU(d, row + x - 1);
-    const V l2 = LoadU(d, row + x - 2);
-    const V l3 = LoadU(d, row + x - 3);
-
-    V r1, r2, r3;
-#if HWY_TARGET == HWY_SCALAR
-    r1 = LoadU(d, row + Mirror(x + 1, xsize));
-    r2 = LoadU(d, row + Mirror(x + 2, xsize));
-    r3 = LoadU(d, row + Mirror(x + 3, xsize));
-#else
-    const size_t N = Lanes(d);
-    if (kSizeModN == 0) {
-      r3 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 3)));
-      r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
-      r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
-    } else if (kSizeModN == 1) {
-      const auto last = LoadU(d, row + xsize - N);
-      r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 2)));
-      r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
-      r1 = last;
-    } else /* kSizeModN >= 2 */ {
-      const auto last = LoadU(d, row + xsize - N);
-      r3 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
-      r2 = last;
-      r1 = LoadU(d, row + x + 1);
-    }
-#endif
-
-    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
-    const V sum1 = Add(l1, r1);
-    const V mul1 = MulAdd(sum1, wh1, mul0);
-    const V sum2 = Add(l2, r2);
-    const V mul2 = MulAdd(sum2, wh2, mul1);
-    const V sum3 = Add(l3, r3);
-    const V mul3 = MulAdd(sum3, wh3, mul2);
-    return mul3;
-  }
-
-  // Returns one vector of horizontal convolution results; lane i is the result
-  // for pixel pos + i. This is the fast path for interior pixels, i.e. kRadius
-  // valid pixels before/after pos.
-  static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
-                                         const V wh0, const V wh1, const V wh2,
-                                         const V wh3) {
-    const D d;
-    const V c = LoadU(d, pos);
-    const V mul0 = Mul(c, wh0);
-
-    // TODO(janwas): better to Combine
-    const V l1 = LoadU(d, pos - 1);
-    const V r1 = LoadU(d, pos + 1);
-    const V l2 = LoadU(d, pos - 2);
-    const V r2 = LoadU(d, pos + 2);
-    const V l3 = LoadU(d, pos - 3);
-    const V r3 = LoadU(d, pos + 3);
-    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
-    const V sum1 = Add(l1, r1);
-    const V mul1 = MulAdd(sum1, wh1, mul0);
-    const V sum2 = Add(l2, r2);
-    const V mul2 = MulAdd(sum2, wh2, mul1);
-    const V sum3 = Add(l3, r3);
-    const V mul3 = MulAdd(sum3, wh3, mul2);
-    return mul3;
-  }
-};
-
-void Separable7(const ImageF& in, const Rect& rect,
-                const WeightsSeparable7& weights, ThreadPool* pool,
-                ImageF* out) {
-  using Conv = ConvolveT<Separable7Strategy>;
-  if (rect.xsize() >= Conv::MinWidth()) {
-    return Conv::Run(in, rect, weights, pool, out);
-  }
-
-  return SlowSeparable7(in, rect, weights, pool, out, Rect(*out));
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace jxl
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace jxl {
-
-HWY_EXPORT(Separable7);
-void Separable7(const ImageF& in, const Rect& rect,
-                const WeightsSeparable7& weights, ThreadPool* pool,
-                ImageF* out) {
-  return HWY_DYNAMIC_DISPATCH(Separable7)(in, rect, weights, pool, out);
-}
-
-}  // namespace jxl
-#endif  // HWY_ONCE
--- a/third_party/jpeg-xl/lib/jxl/convolve_slow.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_slow.cc
@ -195,10 +195,4 @@ void SlowSeparable5(const ImageF& in, const Rect& in_rect,
  SlowSeparable<2>(in, in_rect, weights, pool, out, out_rect);
 }

-void SlowSeparable7(const ImageF& in, const Rect& in_rect,
-                    const WeightsSeparable7& weights, ThreadPool* pool,
-                    ImageF* out, const Rect& out_rect) {
-  SlowSeparable<3>(in, in_rect, weights, pool, out, out_rect);
-}
-
 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_symmetric5.cc
@ -39,7 +39,7 @@ static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y,
  const float sum_2 = wx2 * (in_m2 + in_p2);
  const float sum_1 = wx1 * (in_m1 + in_p1);
  const float sum_0 = wx0 * in_00;
-  return sum_2 + sum_1 + sum_0;
+  return sum_2 + (sum_1 + sum_0);
 }

 template <class WrapY, class V>
--- a/third_party/jpeg-xl/lib/jxl/convolve_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/convolve_test.cc
@ -151,28 +151,6 @@ void VerifySeparable5(const size_t xsize, const size_t ysize, ThreadPool* pool,
  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
 }

-void VerifySeparable7(const size_t xsize, const size_t ysize, ThreadPool* pool,
-                      Rng* rng) {
-  const Rect rect(0, 0, xsize, ysize);
-
-  ImageF in(xsize, ysize);
-  GenerateImage(*rng, &in, 0.0f, 1.0f);
-
-  ImageF out_expected(xsize, ysize);
-  ImageF out_actual(xsize, ysize);
-
-  // Gaussian sigma 1.0
-  const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f),
-                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)},
-                                     {HWY_REP4(0.383103f), HWY_REP4(0.241843f),
-                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)}};
-
-  SlowSeparable7(in, rect, weights, pool, &out_expected, rect);
-  Separable7(in, rect, weights, pool, &out_actual);
-
-  JXL_ASSERT_OK(VerifyRelativeError(out_expected, out_actual, 1E-5f, 1E-5f, _));
-}
-
 // For all xsize/ysize and kernels:
 void TestConvolve() {
  TestNeighbors();
@ -204,10 +182,6 @@ void TestConvolve() {
                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep5------------------");
                    VerifySeparable5(xsize, ysize, null_pool, &rng);
                    VerifySeparable5(xsize, ysize, &pool3, &rng);
-
-                    JXL_DEBUG(JXL_DEBUG_CONVOLVE, "Sep7------------------");
-                    VerifySeparable7(xsize, ysize, null_pool, &rng);
-                    VerifySeparable7(xsize, ysize, &pool3, &rng);
                  }
                },
                "TestConvolve"));
--- a/third_party/jpeg-xl/lib/jxl/dec_cache.h
+++ b/third_party/jpeg-xl/lib/jxl/dec_cache.h
@ -7,28 +7,36 @@
 #define LIB_JXL_DEC_CACHE_H_

 #include <jxl/decode.h>
+#include <jxl/types.h>
 #include <stdint.h>

+#include <algorithm>
 #include <atomic>
 #include <cmath>
 #include <hwy/base.h>  // HWY_ALIGN_MAX
+#include <memory>
+#include <vector>

+#include "hwy/aligned_allocator.h"
 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/base/common.h"  // kMaxNumPasses
+#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/coeff_order.h"
-#include "lib/jxl/convolve.h"
+#include "lib/jxl/common.h"
+#include "lib/jxl/dct_util.h"
 #include "lib/jxl/dec_ans.h"
-#include "lib/jxl/dec_group_border.h"
-#include "lib/jxl/dec_noise.h"
+#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"
 #include "lib/jxl/passes_state.h"
-#include "lib/jxl/quant_weights.h"
 #include "lib/jxl/render_pipeline/render_pipeline.h"
+#include "lib/jxl/render_pipeline/render_pipeline_stage.h"
 #include "lib/jxl/render_pipeline/stage_upsampling.h"
-#include "lib/jxl/sanitizers.h"

 namespace jxl {

--- a/third_party/jpeg-xl/lib/jxl/dec_external_image.cc
+++ b/third_party/jpeg-xl/lib/jxl/dec_external_image.cc
@ -42,8 +42,8 @@ void FloatToU32(const float* in, uint32_t* out, size_t num, float mul,
  const hwy::HWY_NAMESPACE::Rebind<uint32_t, decltype(d)> du;

  // Unpoison accessing partially-uninitialized vectors with memory sanitizer.
-  // This is because we run NearestInt() on the vector, which triggers msan even
-  // it it safe to do so since the values are not mixed between lanes.
+  // This is because we run NearestInt() on the vector, which triggers MSAN even
+  // though it is safe to do so since the values are not mixed between lanes.
  const size_t num_round_up = RoundUpTo(num, Lanes(d));
  msan::UnpoisonMemory(in + num, sizeof(in[0]) * (num_round_up - num));

--- a/third_party/jpeg-xl/lib/jxl/dec_external_image.h
+++ b/third_party/jpeg-xl/lib/jxl/dec_external_image.h
@ -8,17 +8,15 @@

 // Interleaved image for color transforms and Codec.

-#include <jxl/decode.h>
 #include <jxl/types.h>
 #include <stddef.h>
-#include <stdint.h>

 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/dec_cache.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"

 namespace jxl {

--- a/third_party/jpeg-xl/lib/jxl/dec_frame.cc
+++ b/third_party/jpeg-xl/lib/jxl/dec_frame.cc
@ -5,20 +5,19 @@

 #include "lib/jxl/dec_frame.h"

-#include <jxl/types.h>
+#include <jxl/decode.h>
 #include <stddef.h>
 #include <stdint.h>

 #include <algorithm>
 #include <atomic>
-#include <hwy/aligned_allocator.h>
-#include <numeric>
+#include <cstdlib>
+#include <memory>
 #include <utility>
 #include <vector>

 #include "lib/jxl/ac_context.h"
 #include "lib/jxl/ac_strategy.h"
-#include "lib/jxl/ans_params.h"
 #include "lib/jxl/base/bits.h"
 #include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
@ -30,26 +29,29 @@
 #include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/common.h"  // kMaxNumPasses
 #include "lib/jxl/compressed_dc.h"
+#include "lib/jxl/dct_util.h"
 #include "lib/jxl/dec_ans.h"
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_cache.h"
 #include "lib/jxl/dec_group.h"
 #include "lib/jxl/dec_modular.h"
+#include "lib/jxl/dec_noise.h"
 #include "lib/jxl/dec_patch_dictionary.h"
-#include "lib/jxl/dec_xyb.h"
+#include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/epf.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
 #include "lib/jxl/loop_filter.h"
 #include "lib/jxl/passes_state.h"
 #include "lib/jxl/quant_weights.h"
 #include "lib/jxl/quantizer.h"
-#include "lib/jxl/sanitizers.h"
+#include "lib/jxl/render_pipeline/render_pipeline.h"
 #include "lib/jxl/splines.h"
 #include "lib/jxl/toc.h"

--- a/third_party/jpeg-xl/lib/jxl/dec_frame.h
+++ b/third_party/jpeg-xl/lib/jxl/dec_frame.h
@ -10,17 +10,21 @@
 #include <jxl/types.h>
 #include <stdint.h>

+#include <algorithm>
+#include <cstddef>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/blending.h"
 #include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/dec_cache.h"
 #include "lib/jxl/dec_modular.h"
 #include "lib/jxl/frame_header.h"
-#include "lib/jxl/headers.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_metadata.h"

--- a/third_party/jpeg-xl/lib/jxl/dec_xyb.h
+++ b/third_party/jpeg-xl/lib/jxl/dec_xyb.h
@ -10,11 +10,13 @@

 #include <jxl/cms_interface.h>

+#include <cstddef>
+#include <cstdint>
+
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_metadata.h"

--- a/third_party/jpeg-xl/lib/jxl/decode.cc
+++ b/third_party/jpeg-xl/lib/jxl/decode.cc
@ -5,6 +5,7 @@

 #include <jxl/decode.h>
 #include <jxl/types.h>
+#include <jxl/version.h>

 #include <algorithm>
 #include <array>
--- a/third_party/jpeg-xl/lib/jxl/decode_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/decode_test.cc
@ -3,28 +3,45 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

+#include "lib/extras/dec/decode.h"
+
 #include <jxl/cms.h>
+#include <jxl/codestream_header.h>
 #include <jxl/color_encoding.h>
 #include <jxl/decode.h>
 #include <jxl/decode_cxx.h>
+#include <jxl/memory_manager.h>
+#include <jxl/parallel_runner.h>
+#include <jxl/resizable_parallel_runner.h>
 #include <jxl/resizable_parallel_runner_cxx.h>
+#include <jxl/thread_parallel_runner.h>
 #include <jxl/thread_parallel_runner_cxx.h>
 #include <jxl/types.h>

+#include <algorithm>
 #include <cstdint>
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <ostream>
+#include <set>
 #include <sstream>
 #include <string>
 #include <tuple>
 #include <utility>
 #include <vector>

-#include "lib/extras/codec.h"
 #include "lib/extras/dec/color_description.h"
+#include "lib/extras/enc/encode.h"
+#include "lib/extras/enc/jpg.h"
+#include "lib/extras/packed_image.h"
 #include "lib/jxl/base/byte_order.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/override.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
 #include "lib/jxl/cms/color_encoding_cms.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/dec_bit_reader.h"
@ -35,14 +52,17 @@
 #include "lib/jxl/enc_fields.h"
 #include "lib/jxl/enc_frame.h"
 #include "lib/jxl/enc_icc_codec.h"
+#include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_progressive_split.h"
 #include "lib/jxl/encode_internal.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/headers.h"
-#include "lib/jxl/icc_codec.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_metadata.h"
+#include "lib/jxl/image_ops.h"
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
 #include "lib/jxl/padded_bytes.h"
 #include "lib/jxl/test_image.h"
@ -275,9 +295,20 @@ std::vector<uint8_t> CreateTestJXLCodestream(
  if (params.jpeg_codestream != nullptr) {
    if (jxl::extras::CanDecode(jxl::extras::Codec::kJPG)) {
      std::vector<uint8_t> jpeg_bytes;
-      io.jpeg_quality = 70;
-      EXPECT_TRUE(Encode(io, extras::Codec::kJPG, io.metadata.m.color_encoding,
-                         /*bits_per_sample=*/8, &jpeg_bytes));
+      extras::PackedPixelFile ppf;
+      extras::PackedFrame frame(xsize, ysize, format);
+      JXL_ASSERT(frame.color.pixels_size == pixels.size());
+      memcpy(frame.color.pixels(0, 0, 0), pixels.data(), pixels.size());
+      ppf.frames.emplace_back(std::move(frame));
+      ppf.info.xsize = xsize;
+      ppf.info.ysize = ysize;
+      ppf.info.num_color_channels = grayscale ? 1 : 3;
+      ppf.info.bits_per_sample = 16;
+      auto encoder = extras::GetJPEGEncoder();
+      encoder->SetOption("quality", "70");
+      extras::EncodedImage encoded;
+      EXPECT_TRUE(encoder->Encode(ppf, &encoded));
+      jpeg_bytes = encoded.bitstreams[0];
      Bytes(jpeg_bytes).AppendTo(params.jpeg_codestream);
      EXPECT_TRUE(jxl::jpeg::DecodeImageJPG(
          jxl::Bytes(jpeg_bytes.data(), jpeg_bytes.size()), &io));
@ -1660,7 +1691,7 @@ TEST(DecodeTest, PixelTestWithICCProfileLossy) {
  EXPECT_THAT(
      ButteraugliDistance(io0.frames, io1.frames, ba, *JxlGetDefaultCms(),
                          /*distmap=*/nullptr, nullptr),
-      IsSlightlyBelow(0.55f));
+      IsSlightlyBelow(0.56f));

  JxlDecoderDestroy(dec);
 }
@ -2104,7 +2135,7 @@ TEST(DecodeTest, PixelTestOpaqueSrgbLossyNoise) {
    EXPECT_THAT(
        ButteraugliDistance(io0.frames, io1.frames, ba, *JxlGetDefaultCms(),
                            /*distmap=*/nullptr, nullptr),
-        IsSlightlyBelow(1.2222f));
+        IsSlightlyBelow(1.3f));

    JxlDecoderDestroy(dec);
  }
--- a/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc
+++ b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.cc
@ -5,7 +5,18 @@

 #include "lib/jxl/decode_to_jpeg.h"

+#include <jxl/decode.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/jpeg/dec_jpeg_data.h"
+#include "lib/jxl/jpeg/jpeg_data.h"

 namespace jxl {

--- a/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h
+++ b/third_party/jpeg-xl/lib/jxl/decode_to_jpeg.h
@ -14,13 +14,16 @@
 #include <stdint.h>
 #include <stdlib.h>

+#include <algorithm>
+#include <cstring>
 #include <memory>
+#include <utility>
 #include <vector>

 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
+#include "lib/jxl/common.h"
 #include "lib/jxl/image_bundle.h"
-#include "lib/jxl/jpeg/dec_jpeg_data.h"
+#include "lib/jxl/jpeg/jpeg_data.h"
 #if JPEGXL_ENABLE_TRANSCODE_JPEG
 #include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
 #endif  // JPEGXL_ENABLE_TRANSCODE_JPEG
@ -109,7 +112,7 @@ class JxlToJpegDecoder {
                                    jpeg_data_.get())) {
        return false;
      }
-      ib->jpeg_data.reset(jpeg_data_.release());
+      ib->jpeg_data = std::move(jpeg_data_);
    }
    return true;
  }
--- a/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_ac_strategy.h
@ -8,9 +8,13 @@

 #include <cstddef>

+#include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/status.h"
+#include "lib/jxl/chroma_from_luma.h"
 #include "lib/jxl/enc_cache.h"
+#include "lib/jxl/enc_params.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/quant_weights.h"

--- a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.cc
@ -41,7 +41,6 @@
 #include "lib/jxl/enc_transforms-inl.h"
 #include "lib/jxl/epf.h"
 #include "lib/jxl/frame_dimensions.h"
-#include "lib/jxl/gauss_blur.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
 #include "lib/jxl/image_ops.h"
@ -408,8 +407,10 @@ struct AdaptiveQuantizationImpl {
  void ComputeTile(float butteraugli_target, float scale, const Image3F& xyb,
                   const Rect& rect_in, const Rect& rect_out, const int thread,
                   ImageF* mask, ImageF* mask1x1) {
-    const size_t xsize = rect_in.xsize();
-    const size_t ysize = rect_in.ysize();
+    JXL_ASSERT(rect_in.x0() % 8 == 0);
+    JXL_ASSERT(rect_in.y0() % 8 == 0);
+    const size_t xsize = xyb.xsize();
+    const size_t ysize = xyb.ysize();

    // The XYB gamma is 3.0 to be able to decode faster with two muls.
    // Butteraugli's gamma is matching the gamma of human eye, around 2.6.
@ -420,21 +421,30 @@ struct AdaptiveQuantizationImpl {

    const HWY_FULL(float) df;

-    size_t y_start = rect_out.y0() * 8;
-    size_t y_end = y_start + rect_out.ysize() * 8;
+    size_t y_start_1x1 = rect_in.y0() + rect_out.y0() * 8;
+    size_t y_end_1x1 = y_start_1x1 + rect_out.ysize() * 8;

-    size_t x_start = rect_out.x0() * 8;
-    size_t x_end = x_start + rect_out.xsize() * 8;
+    size_t x_start_1x1 = rect_in.x0() + rect_out.x0() * 8;
+    size_t x_end_1x1 = x_start_1x1 + rect_out.xsize() * 8;
+
+    if (rect_in.x0() != 0 && rect_out.x0() == 0) x_start_1x1 -= 2;
+    if (rect_in.x1() < xsize && rect_out.x1() * 8 == rect_in.xsize()) {
+      x_end_1x1 += 2;
+    }
+    if (rect_in.y0() != 0 && rect_out.y0() == 0) y_start_1x1 -= 2;
+    if (rect_in.y1() < ysize && rect_out.y1() * 8 == rect_in.ysize()) {
+      y_end_1x1 += 2;
+    }

    // Computes image (padded to multiple of 8x8) of local pixel differences.
    // Subsample both directions by 4.
    // 1x1 Laplacian of intensity.
-    for (size_t y = y_start; y < y_end; ++y) {
+    for (size_t y = y_start_1x1; y < y_end_1x1; ++y) {
      const size_t y2 = y + 1 < ysize ? y + 1 : y;
      const size_t y1 = y > 0 ? y - 1 : y;
-      const float* row_in = rect_in.ConstPlaneRow(xyb, 1, y);
-      const float* row_in1 = rect_in.ConstPlaneRow(xyb, 1, y1);
-      const float* row_in2 = rect_in.ConstPlaneRow(xyb, 1, y2);
+      const float* row_in = xyb.ConstPlaneRow(1, y);
+      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
+      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
      float* mask1x1_out = mask1x1->Row(y);
      auto scalar_pixel1x1 = [&](size_t x) {
        const size_t x2 = x + 1 < xsize ? x + 1 : x;
@ -451,15 +461,21 @@ struct AdaptiveQuantizationImpl {
        static const float kOffset = 0.01;
        mask1x1_out[x] = kMul / (diff + kOffset);
      };
-      for (size_t x = x_start; x < x_end; ++x) {
+      for (size_t x = x_start_1x1; x < x_end_1x1; ++x) {
        scalar_pixel1x1(x);
      }
    }

+    size_t y_start = rect_in.y0() + rect_out.y0() * 8;
+    size_t y_end = y_start + rect_out.ysize() * 8;
+
+    size_t x_start = rect_in.x0() + rect_out.x0() * 8;
+    size_t x_end = x_start + rect_out.xsize() * 8;
+
    if (x_start != 0) x_start -= 4;
-    if (x_end != rect_in.xsize()) x_end += 4;
+    if (x_end != xsize) x_end += 4;
    if (y_start != 0) y_start -= 4;
-    if (y_end != rect_in.ysize()) y_end += 4;
+    if (y_end != ysize) y_end += 4;
    pre_erosion[thread].ShrinkTo((x_end - x_start) / 4, (y_end - y_start) / 4);

    static const float limit = 0.2f;
@ -467,9 +483,9 @@ struct AdaptiveQuantizationImpl {
      size_t y2 = y + 1 < ysize ? y + 1 : y;
      size_t y1 = y > 0 ? y - 1 : y;

-      const float* row_in = rect_in.ConstPlaneRow(xyb, 1, y);
-      const float* row_in1 = rect_in.ConstPlaneRow(xyb, 1, y1);
-      const float* row_in2 = rect_in.ConstPlaneRow(xyb, 1, y2);
+      const float* row_in = xyb.ConstPlaneRow(1, y);
+      const float* row_in1 = xyb.ConstPlaneRow(1, y1);
+      const float* row_in2 = xyb.ConstPlaneRow(1, y2);
      float* JXL_RESTRICT row_out = diff_buffer.Row(thread);

      auto scalar_pixel = [&](size_t x) {
@ -552,7 +568,8 @@ struct AdaptiveQuantizationImpl {
  ImageF diff_buffer;
 };

-static void Blur1x1Masking(ThreadPool* pool, ImageF* mask1x1) {
+static void Blur1x1Masking(ThreadPool* pool, ImageF* mask1x1,
+                           const Rect& rect) {
  // Blur the mask1x1 to obtain the masking image.
  // Before blurring it contains an image of absolute value of the
  // Laplacian of the intensity channel.
@ -578,10 +595,9 @@ static void Blur1x1Masking(ThreadPool* pool, ImageF* mask1x1) {
                        {HWY_REP4(normalize_mul * kFilterMask1x1[1])},
                        {HWY_REP4(normalize_mul * kFilterMask1x1[4])},
                        {HWY_REP4(normalize_mul * kFilterMask1x1[3])}};
-  Rect from_rect(0, 0, mask1x1->xsize(), mask1x1->ysize());
-  ImageF temp(mask1x1->xsize(), mask1x1->ysize());
-  Symmetric5(*mask1x1, from_rect, weights, pool, &temp);
-  CopyImageTo(temp, mask1x1);  // TODO: make it a swap
+  ImageF temp(rect.xsize(), rect.ysize());
+  Symmetric5(*mask1x1, rect, weights, pool, &temp);
+  *mask1x1 = std::move(temp);
 }

 ImageF AdaptiveQuantizationMap(const float butteraugli_target,
@ -595,7 +611,7 @@ ImageF AdaptiveQuantizationMap(const float butteraugli_target,
  const size_t ysize_blocks = rect.ysize() / kBlockDim;
  impl.aq_map = ImageF(xsize_blocks, ysize_blocks);
  *mask = ImageF(xsize_blocks, ysize_blocks);
-  *mask1x1 = ImageF(rect.xsize(), rect.ysize());
+  *mask1x1 = ImageF(xyb.xsize(), xyb.ysize());
  JXL_CHECK(RunOnPool(
      pool, 0,
      DivCeil(xsize_blocks, kEncTileDimInBlocks) *
@ -618,7 +634,7 @@ ImageF AdaptiveQuantizationMap(const float butteraugli_target,
      },
      "AQ DiffPrecompute"));

-  Blur1x1Masking(pool, mask1x1);
+  Blur1x1Masking(pool, mask1x1, rect);
  return std::move(impl).aq_map;
 }

--- a/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_adaptive_quantization.h
@ -6,19 +6,14 @@
 #ifndef LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_
 #define LIB_JXL_ENC_ADAPTIVE_QUANTIZATION_H_

+#include <jxl/cms_interface.h>
 #include <stddef.h>

 #include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/enc_cache.h"
-#include "lib/jxl/enc_params.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/image_bundle.h"
-#include "lib/jxl/loop_filter.h"
-#include "lib/jxl/quant_weights.h"
-#include "lib/jxl/quantizer.h"
-#include "lib/jxl/splines.h"

 // Heuristics to find a good quantizer for a given image. InitialQuantField
 // produces a quantization field (i.e. relative quantization amounts for each
--- a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.cc
@ -45,6 +45,10 @@ void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
                 const ImageF& quant_field, const AcStrategyImage& ac_strategy,
                 ImageB* epf_sharpness, const Rect& rect,
                 ArControlFieldHeuristics::TempImages* temp_image) {
+  JXL_ASSERT(opsin_rect.x0() % 8 == 0);
+  JXL_ASSERT(opsin_rect.y0() % 8 == 0);
+  JXL_ASSERT(opsin_rect.xsize() % 8 == 0);
+  JXL_ASSERT(opsin_rect.ysize() % 8 == 0);
  constexpr size_t N = kBlockDim;
  if (cparams.butteraugli_distance < kMinButteraugliForDynamicAR ||
      cparams.speed_tier > SpeedTier::kWombat ||
@ -62,73 +66,65 @@ void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
  const size_t sharpness_stride =
      static_cast<size_t>(epf_sharpness->PixelsPerRow());

-  size_t by0 = rect.y0();
-  size_t by1 = rect.y0() + rect.ysize();
-  size_t bx0 = rect.x0();
-  size_t bx1 = rect.x0() + rect.xsize();
+  size_t by0 = opsin_rect.y0() / 8 + rect.y0();
+  size_t by1 = by0 + rect.ysize();
+  size_t bx0 = opsin_rect.x0() / 8 + rect.x0();
+  size_t bx1 = bx0 + rect.xsize();
  temp_image->InitOnce();
  ImageF& laplacian_sqrsum = temp_image->laplacian_sqrsum;
  // Calculate the L2 of the 3x3 Laplacian in an integral transform
  // (for example 32x32 dct). This relates to transforms ability
  // to propagate artefacts.
-  size_t y0 = by0 == 0 ? 2 : 0;
-  size_t y1 = by1 * N + 4 <= opsin_rect.ysize() + 2
-                  ? (by1 - by0) * N + 4
-                  : opsin_rect.ysize() + 2 - by0 * N;
-  size_t x0 = bx0 == 0 ? 2 : 0;
-  size_t x1 = bx1 * N + 4 <= opsin_rect.xsize() + 2
-                  ? (bx1 - bx0) * N + 4
-                  : opsin_rect.xsize() + 2 - bx0 * N;
+  size_t y0 = by0 == 0 ? 0 : by0 * N - 2;
+  size_t y1 = by1 * N == opsin.ysize() ? by1 * N : by1 * N + 2;
+  size_t x0 = bx0 == 0 ? 0 : bx0 * N - 2;
+  size_t x1 = bx1 * N == opsin.xsize() ? bx1 * N : bx1 * N + 2;
  HWY_FULL(float) df;
  for (size_t y = y0; y < y1; y++) {
-    float* JXL_RESTRICT laplacian_sqrsum_row = laplacian_sqrsum.Row(y);
-    size_t cy = y + by0 * N - 2;
+    float* JXL_RESTRICT laplacian_sqrsum_row =
+        laplacian_sqrsum.Row(y + 2 - by0 * N);
    const float* JXL_RESTRICT in_row_t[3];
    const float* JXL_RESTRICT in_row[3];
    const float* JXL_RESTRICT in_row_b[3];
    for (size_t c = 0; c < 3; c++) {
-      in_row_t[c] = opsin_rect.ConstPlaneRow(opsin, c, cy > 0 ? cy - 1 : cy);
-      in_row[c] = opsin_rect.ConstPlaneRow(opsin, c, cy);
-      in_row_b[c] = opsin_rect.ConstPlaneRow(
-          opsin, c, cy + 1 < opsin_rect.ysize() ? cy + 1 : cy);
+      in_row_t[c] = opsin.ConstPlaneRow(c, y > 0 ? y - 1 : y);
+      in_row[c] = opsin.ConstPlaneRow(c, y);
+      in_row_b[c] = opsin.ConstPlaneRow(c, y + 1 < opsin.ysize() ? y + 1 : y);
    }
    auto compute_laplacian_scalar = [&](size_t x) {
-      size_t cx = x + bx0 * N - 2;
-      const size_t prevX = cx >= 1 ? cx - 1 : cx;
-      const size_t nextX = cx + 1 < opsin_rect.xsize() ? cx + 1 : cx;
+      const size_t prevX = x >= 1 ? x - 1 : x;
+      const size_t nextX = x + 1 < opsin.xsize() ? x + 1 : x;
      float sumsqr = 0;
      for (size_t c = 0; c < 3; c++) {
        float laplacian =
-            kChannelWeights[c] * in_row[c][cx] +
+            kChannelWeights[c] * in_row[c][x] +
            kChannelWeightsLapNeg[c] *
                (in_row[c][prevX] + in_row[c][nextX] + in_row_b[c][prevX] +
-                 in_row_b[c][cx] + in_row_b[c][nextX] + in_row_t[c][prevX] +
-                 in_row_t[c][cx] + in_row_t[c][nextX]);
+                 in_row_b[c][x] + in_row_b[c][nextX] + in_row_t[c][prevX] +
+                 in_row_t[c][x] + in_row_t[c][nextX]);
        sumsqr += laplacian * laplacian;
      }
-      laplacian_sqrsum_row[x] = sumsqr;
+      laplacian_sqrsum_row[x + 2 - bx0 * N] = sumsqr;
    };
    size_t x = x0;
-    for (; x + bx0 * N < 3; x++) {
+    for (; x < 1; x++) {
      compute_laplacian_scalar(x);
    }
    // Interior. One extra pixel of border as the last pixel is special.
-    for (; x + Lanes(df) <= x1 &&
-           x + Lanes(df) + bx0 * N - 1 <= opsin_rect.xsize();
+    for (; x + Lanes(df) <= x1 && x + Lanes(df) + 1 <= opsin.xsize();
         x += Lanes(df)) {
-      size_t cx = x + bx0 * N - 2;
      auto sumsqr = Zero(df);
      for (size_t c = 0; c < 3; c++) {
        auto laplacian =
-            Mul(LoadU(df, in_row[c] + cx), Set(df, kChannelWeights[c]));
-        auto sum_oth0 = LoadU(df, in_row[c] + cx - 1);
-        auto sum_oth1 = LoadU(df, in_row[c] + cx + 1);
-        auto sum_oth2 = LoadU(df, in_row_t[c] + cx - 1);
-        auto sum_oth3 = LoadU(df, in_row_t[c] + cx);
-        sum_oth0 = Add(sum_oth0, LoadU(df, in_row_t[c] + cx + 1));
-        sum_oth1 = Add(sum_oth1, LoadU(df, in_row_b[c] + cx - 1));
-        sum_oth2 = Add(sum_oth2, LoadU(df, in_row_b[c] + cx));
-        sum_oth3 = Add(sum_oth3, LoadU(df, in_row_b[c] + cx + 1));
+            Mul(LoadU(df, in_row[c] + x), Set(df, kChannelWeights[c]));
+        auto sum_oth0 = LoadU(df, in_row[c] + x - 1);
+        auto sum_oth1 = LoadU(df, in_row[c] + x + 1);
+        auto sum_oth2 = LoadU(df, in_row_t[c] + x - 1);
+        auto sum_oth3 = LoadU(df, in_row_t[c] + x);
+        sum_oth0 = Add(sum_oth0, LoadU(df, in_row_t[c] + x + 1));
+        sum_oth1 = Add(sum_oth1, LoadU(df, in_row_b[c] + x - 1));
+        sum_oth2 = Add(sum_oth2, LoadU(df, in_row_b[c] + x));
+        sum_oth3 = Add(sum_oth3, LoadU(df, in_row_b[c] + x + 1));
        sum_oth0 = Add(sum_oth0, sum_oth1);
        sum_oth2 = Add(sum_oth2, sum_oth3);
        sum_oth0 = Add(sum_oth0, sum_oth2);
@ -136,7 +132,7 @@ void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
            MulAdd(Set(df, kChannelWeightsLapNeg[c]), sum_oth0, laplacian);
        sumsqr = MulAdd(laplacian, laplacian, sumsqr);
      }
-      StoreU(sumsqr, df, laplacian_sqrsum_row + x);
+      StoreU(sumsqr, df, laplacian_sqrsum_row + x + 2 - bx0 * N);
    }
    for (; x < x1; x++) {
      compute_laplacian_scalar(x);
@ -150,13 +146,13 @@ void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
  ImageF& sqrsum_00 = temp_image->sqrsum_00;
  size_t sqrsum_00_stride = sqrsum_00.PixelsPerRow();
  float* JXL_RESTRICT sqrsum_00_row = sqrsum_00.Row(0);
-  for (size_t y = 0; y < (by1 - by0) * 2; y++) {
+  for (size_t y = 0; y < rect.ysize() * 2; y++) {
    const float* JXL_RESTRICT rows_in[4];
    for (size_t iy = 0; iy < 4; iy++) {
      rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy + 2);
    }
    float* JXL_RESTRICT row_out = sqrsum_00_row + y * sqrsum_00_stride;
-    for (size_t x = 0; x < (bx1 - bx0) * 2; x++) {
+    for (size_t x = 0; x < rect.xsize() * 2; x++) {
      auto sum = Zero(df4);
      for (size_t iy = 0; iy < 4; iy++) {
        for (size_t ix = 0; ix < 4; ix += Lanes(df4)) {
@ -173,7 +169,7 @@ void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
  ImageF& sqrsum_22 = temp_image->sqrsum_22;
  size_t sqrsum_22_stride = sqrsum_22.PixelsPerRow();
  float* JXL_RESTRICT sqrsum_22_row = sqrsum_22.Row(0);
-  for (size_t y = 0; y < (by1 - by0) * 2 + 1; y++) {
+  for (size_t y = 0; y < rect.ysize() * 2 + 1; y++) {
    const float* JXL_RESTRICT rows_in[4];
    for (size_t iy = 0; iy < 4; iy++) {
      rows_in[iy] = laplacian_sqrsum.ConstRow(y * 4 + iy);
@ -182,21 +178,21 @@ void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
    // ignore pixels outside the image.
    // Y coordinates are relative to by0*8+y*4.
    size_t sy = y * 4 + by0 * 8 > 0 ? 0 : 2;
-    size_t ey = y * 4 + by0 * 8 + 4 <= opsin_rect.ysize() + 2
+    size_t ey = y * 4 + by0 * 8 + 2 <= opsin.ysize()
                    ? 4
-                    : opsin_rect.ysize() - y * 4 - by0 * 8 + 2;
-    for (size_t x = 0; x < (bx1 - bx0) * 2 + 1; x++) {
+                    : opsin.ysize() - y * 4 - by0 * 8 + 2;
+    for (size_t x = 0; x < rect.xsize() * 2 + 1; x++) {
      // ignore pixels outside the image.
      // X coordinates are relative to bx0*8.
      size_t sx = x * 4 + bx0 * 8 > 0 ? x * 4 : x * 4 + 2;
-      size_t ex = x * 4 + bx0 * 8 + 4 <= opsin_rect.xsize() + 2
+      size_t ex = x * 4 + bx0 * 8 + 2 <= opsin.xsize()
                      ? x * 4 + 4
-                      : opsin_rect.xsize() - bx0 * 8 + 2;
+                      : opsin.xsize() - bx0 * 8 + 2;
      if (ex - sx == 4 && ey - sy == 4) {
        auto sum = Zero(df4);
-        for (size_t iy = 0; iy < 4; iy++) {
-          for (size_t ix = 0; ix < 4; ix += Lanes(df4)) {
-            sum = Add(sum, Load(df4, rows_in[iy] + sx + ix));
+        for (size_t iy = sy; iy < ey; iy++) {
+          for (size_t ix = sx; ix < ex; ix += Lanes(df4)) {
+            sum = Add(sum, Load(df4, rows_in[iy] + ix));
          }
        }
        row_out[x] = GetLane(Sqrt(SumOfLanes(df4, sum))) * (1.0f / 4.0f);
@ -211,11 +207,11 @@ void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
      }
    }
  }
-  for (size_t by = by0; by < by1; by++) {
+  for (size_t by = rect.y0(); by < rect.y1(); by++) {
    AcStrategyRow acs_row = ac_strategy.ConstRow(by);
    uint8_t* JXL_RESTRICT out_row = epf_sharpness->Row(by);
    const float* JXL_RESTRICT quant_row = quant_field.Row(by);
-    for (size_t bx = bx0; bx < bx1; bx++) {
+    for (size_t bx = rect.x0(); bx < rect.x1(); bx++) {
      AcStrategy acs = acs_row[bx];
      if (!acs.IsFirstBlock()) continue;
      // The errors are going to be linear to the quantization value in this
@ -223,12 +219,12 @@ void ProcessTile(const CompressParams& cparams, const FrameHeader& frame_header,
      float quant_val = 1.0f / quant_row[bx];

      const auto sq00 = [&](size_t y, size_t x) {
-        return sqrsum_00_row[((by - by0) * 2 + y) * sqrsum_00_stride +
-                             (bx - bx0) * 2 + x];
+        return sqrsum_00_row[((by - rect.y0()) * 2 + y) * sqrsum_00_stride +
+                             (bx - rect.x0()) * 2 + x];
      };
      const auto sq22 = [&](size_t y, size_t x) {
-        return sqrsum_22_row[((by - by0) * 2 + y) * sqrsum_22_stride +
-                             (bx - bx0) * 2 + x];
+        return sqrsum_22_row[((by - rect.y0()) * 2 + y) * sqrsum_22_stride +
+                             (bx - rect.x0()) * 2 + x];
      };
      float sqrsum_integral_transform = 0;
      for (size_t iy = 0; iy < acs.covered_blocks_y() * 2; iy++) {
--- a/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_ar_control_field.h
@ -10,6 +10,7 @@

 #include <vector>

+#include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
--- a/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_butteraugli_comparator.h
@ -10,6 +10,7 @@
 #include <stddef.h>

 #include <memory>
+#include <vector>

 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
--- a/third_party/jpeg-xl/lib/jxl/enc_cache.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_cache.h
@ -10,23 +10,20 @@
 #include <stddef.h>
 #include <stdint.h>

+#include <memory>
 #include <vector>

-#include "lib/jxl/ac_strategy.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/coeff_order.h"
-#include "lib/jxl/coeff_order_fwd.h"
 #include "lib/jxl/dct_util.h"
 #include "lib/jxl/enc_ans.h"
+#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_progressive_split.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/image_bundle.h"
 #include "lib/jxl/passes_state.h"
 #include "lib/jxl/quant_weights.h"
-#include "lib/jxl/quantizer.h"

 namespace jxl {

--- a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.cc
@ -192,19 +192,6 @@ void InitDCStorage(size_t num_blocks, ImageF* dc_values) {
  }
 }

-void ComputeDC(const ImageF& dc_values, bool fast, int32_t* dc_x,
-               int32_t* dc_b) {
-  constexpr float kDistanceMultiplierDC = 1e-5f;
-  const float* JXL_RESTRICT dc_values_yx = dc_values.Row(0);
-  const float* JXL_RESTRICT dc_values_x = dc_values.Row(1);
-  const float* JXL_RESTRICT dc_values_yb = dc_values.Row(2);
-  const float* JXL_RESTRICT dc_values_b = dc_values.Row(3);
-  *dc_x = FindBestMultiplier(dc_values_yx, dc_values_x, dc_values.xsize(), 0.0f,
-                             kDistanceMultiplierDC, fast);
-  *dc_b = FindBestMultiplier(dc_values_yb, dc_values_b, dc_values.xsize(),
-                             jxl::cms::kYToBRatio, kDistanceMultiplierDC, fast);
-}
-
 void ComputeTile(const Image3F& opsin, const Rect& opsin_rect,
                 const DequantMatrices& dequant,
                 const AcStrategyImage* ac_strategy,
@ -363,7 +350,6 @@ HWY_AFTER_NAMESPACE();
 namespace jxl {

 HWY_EXPORT(InitDCStorage);
-HWY_EXPORT(ComputeDC);
 HWY_EXPORT(ComputeTile);

 void CfLHeuristics::Init(const Rect& rect) {
@ -387,14 +373,6 @@ void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
   mem.get() + thread * ItemsPerThread());
 }

-void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) {
-  int32_t ytob_dc = 0;
-  int32_t ytox_dc = 0;
-  HWY_DYNAMIC_DISPATCH(ComputeDC)(dc_values, fast, &ytox_dc, &ytob_dc);
-  cmap->SetYToBDC(ytob_dc);
-  cmap->SetYToXDC(ytox_dc);
-}
-
 void ColorCorrelationMapEncodeDC(const ColorCorrelationMap& map,
                                 BitWriter* writer, size_t layer,
                                 AuxOut* aux_out) {
--- a/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_chroma_from_luma.h
@ -41,8 +41,6 @@ struct CfLHeuristics {
                   const ImageI* raw_quant_field, const Quantizer* quantizer,
                   bool fast, size_t thread, ColorCorrelationMap* cmap);

-  void ComputeDC(bool fast, ColorCorrelationMap* cmap);
-
  ImageF dc_values;
  hwy::AlignedFreeUniquePtr<float[]> mem;

--- a/third_party/jpeg-xl/lib/jxl/enc_external_image.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_external_image.cc
@ -62,7 +62,14 @@ Status ConvertFromExternalNoSizeCheck(const uint8_t* data, size_t xsize,
  size_t bytes_per_pixel = format.num_channels * bytes_per_channel;
  size_t pixel_offset = c * bytes_per_channel;
  // Only for uint8/16.
-  float scale = 1. / ((1ull << bits_per_sample) - 1);
+  float scale = 1.0f;
+  if (format.data_type == JXL_TYPE_UINT8) {
+    // We will do an integer multiplication by 257 in LoadFloatRow so that a
+    // UINT8 value and the corresponding UINT16 value convert to the same float
+    scale = 1.0f / (257 * ((1ull << bits_per_sample) - 1));
+  } else {
+    scale = 1.0f / ((1ull << bits_per_sample) - 1);
+  }

  const bool little_endian =
      format.endianness == JXL_LITTLE_ENDIAN ||
--- a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.cc
@ -3867,8 +3867,8 @@ void LLProcess(JxlFastLosslessFrameState* frame_state, bool is_last,
  bool streaming = !onegroup && output_processor;
  size_t total_groups = frame_state->num_groups_x * frame_state->num_groups_y;
  size_t max_groups = streaming ? kMaxLocalGroups : total_groups;
-  size_t start_pos = 0;
 #if !FJXL_STANDALONE
+  size_t start_pos = 0;
  if (streaming) {
    start_pos = output_processor->CurrentPosition();
    output_processor->Seek(start_pos + frame_state->ac_group_data_offset);
--- a/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_fast_lossless.h
@ -10,11 +10,7 @@
 // FJXL_STANDALONE=1 for a stand-alone jxl encoder
 // FJXL_STANDALONE=0 for use in libjxl to encode frames (but no image header)
 #ifndef FJXL_STANDALONE
-#ifdef JPEGXL_MAJOR_VERSION
 #define FJXL_STANDALONE 0
-#else
-#define FJXL_STANDALONE 1
-#endif
 #endif

 #if !FJXL_STANDALONE
--- a/third_party/jpeg-xl/lib/jxl/enc_frame.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_frame.cc
@ -145,7 +145,6 @@ Status CopyColorChannels(JxlChunkedFrameInputSource input, Rect rect,
                       " color channels, received only %u channels",
                       color_channels, format.num_channels);
  }
-  *color = Image3F(rect.xsize(), rect.ysize());
  const uint8_t* data = reinterpret_cast<const uint8_t*>(buffer.get());
  for (size_t c = 0; c < color_channels; ++c) {
    JXL_RETURN_IF_ERROR(ConvertFromExternalNoSizeCheck(
@ -221,12 +220,14 @@ void SetProgressiveMode(const CompressParams& cparams,
      {/*num_coefficients=*/8, /*shift=*/0,
       /*suitable_for_downsampling_of_at_least=*/0},
  };
+  bool progressive_mode = ApplyOverride(cparams.progressive_mode, false);
+  bool qprogressive_mode = ApplyOverride(cparams.qprogressive_mode, false);
  if (cparams.custom_progressive_mode) {
    progressive_splitter->SetProgressiveMode(*cparams.custom_progressive_mode);
-  } else if (cparams.qprogressive_mode) {
+  } else if (qprogressive_mode) {
    progressive_splitter->SetProgressiveMode(
        ProgressiveMode{progressive_passes_dc_quant_ac_full_ac});
-  } else if (cparams.progressive_mode) {
+  } else if (progressive_mode) {
    progressive_splitter->SetProgressiveMode(
        ProgressiveMode{progressive_passes_dc_vlf_lf_full_ac});
  }
@ -630,7 +631,7 @@ void ComputeChromacityAdjustments(const CompressParams& cparams,
  // look at the individual pixels and make a guess how difficult
  // the image would be based on the worst case pixel.
  PixelStatsForChromacityAdjustment pixel_stats;
-  if (cparams.speed_tier <= SpeedTier::kWombat) {
+  if (cparams.speed_tier <= SpeedTier::kSquirrel) {
    pixel_stats.Calc(&opsin, rect);
  }
  // For X take the most severe adjustment.
@ -640,8 +641,9 @@ void ComputeChromacityAdjustments(const CompressParams& cparams,
  frame_header->b_qm_scale = 2 + pixel_stats.HowMuchIsBChannelPixelized();
 }

-void ComputeNoiseParams(const CompressParams& cparams, bool color_is_jpeg,
-                        const Image3F& opsin, const FrameDimensions& frame_dim,
+void ComputeNoiseParams(const CompressParams& cparams, bool streaming_mode,
+                        bool color_is_jpeg, const Image3F& opsin,
+                        const FrameDimensions& frame_dim,
                        FrameHeader* frame_header, NoiseParams* noise_params) {
  if (cparams.photon_noise_iso > 0) {
    *noise_params = SimulatePhotonNoise(frame_dim.xsize, frame_dim.ysize,
@ -651,7 +653,8 @@ void ComputeNoiseParams(const CompressParams& cparams, bool color_is_jpeg,
      noise_params->lut[i] = cparams.manual_noise[i];
    }
  } else if (frame_header->encoding == FrameEncoding::kVarDCT &&
-             frame_header->flags & FrameHeader::kNoise && !color_is_jpeg) {
+             frame_header->flags & FrameHeader::kNoise && !color_is_jpeg &&
+             !streaming_mode) {
    // Don't start at zero amplitude since adding noise is expensive -- it
    // significantly slows down decoding, and this is unlikely to
    // completely go away even with advanced optimizations. After the
@ -1397,10 +1400,13 @@ Status ComputeEncodingData(
  Rect patch_rect = Rect(x0, y0, xsize, ysize).Extend(max_border, frame_rect);
  JXL_ASSERT(patch_rect.IsInside(frame_rect));

-  Image3F color;
+  // Allocating a large enough image avoids a copy when padding.
+  Image3F color(RoundUpToBlockDim(patch_rect.xsize()),
+                RoundUpToBlockDim(patch_rect.ysize()));
+  color.ShrinkTo(patch_rect.xsize(), patch_rect.ysize());
  std::vector<ImageF> extra_channels(num_extra_channels);
  for (auto& extra_channel : extra_channels) {
-    extra_channel = jxl::ImageF(xsize, ysize);
+    extra_channel = jxl::ImageF(patch_rect.xsize(), patch_rect.ysize());
  }
  ImageF* alpha = alpha_eci ? &extra_channels[alpha_idx] : nullptr;
  ImageF* black = black_eci ? &extra_channels[black_idx] : nullptr;
@ -1421,27 +1427,21 @@ Status ComputeEncodingData(
  Image3F linear_storage;
  Image3F* linear = nullptr;

-  Image3F opsin;
  if (!jpeg_data) {
-    // Allocating a large enough image avoids a copy when padding.
-    opsin = Image3F(RoundUpToBlockDim(color.xsize()),
-                    RoundUpToBlockDim(color.ysize()));
-    opsin.ShrinkTo(color.xsize(), color.ysize());
    if (frame_header.color_transform == ColorTransform::kXYB &&
        frame_info.ib_needs_color_transform) {
      if (frame_header.encoding == FrameEncoding::kVarDCT &&
          cparams.speed_tier <= SpeedTier::kKitten) {
-        linear_storage = Image3F(color.xsize(), color.ysize());
+        linear_storage = Image3F(patch_rect.xsize(), patch_rect.ysize());
        linear = &linear_storage;
      }
-      ToXYB(color, c_enc, metadata->m.IntensityTarget(), black, pool, &opsin,
-            cms, linear);
-    } else {  // RGB or YCbCr: don't do anything (forward YCbCr is not
-              // implemented, this is only used when the input is already in
-              // YCbCr)
-              // If encoding a special DC or reference frame, don't do anything:
-              // input is already in XYB.
-      CopyImageTo(color, &opsin);
+      ToXYB(c_enc, metadata->m.IntensityTarget(), black, pool, &color, cms,
+            linear);
+    } else {
+      // Nothing to do.
+      // RGB or YCbCr: forward YCbCr is not implemented, this is only used when
+      // the input is already in YCbCr
+      // If encoding a special DC or reference frame: input is already in XYB.
    }
    bool lossless = cparams.IsLossless();
    if (alpha && !alpha_eci->alpha_associated &&
@ -1449,32 +1449,29 @@ Status ComputeEncodingData(
        !ApplyOverride(cparams.keep_invisible, lossless) &&
        cparams.ec_resampling == cparams.resampling) {
      // simplify invisible pixels
-      SimplifyInvisible(&opsin, *alpha, lossless);
+      SimplifyInvisible(&color, *alpha, lossless);
      if (linear) {
        SimplifyInvisible(linear, *alpha, lossless);
      }
    }
-    PadImageToBlockMultipleInPlace(&opsin);
+    PadImageToBlockMultipleInPlace(&color);
  }
-  color = Image3F();

-  // Rectangle within opsin that corresponds to the currently processed group in
+  // Rectangle within color that corresponds to the currently processed group in
  // streaming mode.
-  Rect opsin_rect(x0 - patch_rect.x0(), y0 - patch_rect.y0(),
+  Rect group_rect(x0 - patch_rect.x0(), y0 - patch_rect.y0(),
                  RoundUpToBlockDim(xsize), RoundUpToBlockDim(ysize));

  if (enc_state.initialize_global_state && !jpeg_data) {
-    ComputeChromacityAdjustments(cparams, opsin, opsin_rect,
+    ComputeChromacityAdjustments(cparams, color, group_rect,
                                 &mutable_frame_header);
  }

-  if (!enc_state.streaming_mode) {
-    ComputeNoiseParams(cparams, !!jpeg_data, opsin, frame_dim,
-                       &mutable_frame_header,
-                       &shared.image_features.noise_params);
-  }
+  ComputeNoiseParams(cparams, enc_state.streaming_mode, !!jpeg_data, color,
+                     frame_dim, &mutable_frame_header,
+                     &shared.image_features.noise_params);

-  DownsampleColorChannels(cparams, frame_header, !!jpeg_data, &opsin);
+  DownsampleColorChannels(cparams, frame_header, !!jpeg_data, &color);

  if (cparams.ec_resampling != 1 && !cparams.already_downsampled) {
    for (ImageF& ec : extra_channels) {
@ -1483,7 +1480,7 @@ Status ComputeEncodingData(
  }

  if (!enc_state.streaming_mode) {
-    opsin_rect = Rect(opsin);
+    group_rect = Rect(color);
  }

  if (frame_header.encoding == FrameEncoding::kVarDCT) {
@ -1496,7 +1493,7 @@ Status ComputeEncodingData(
          *jpeg_data, frame_header, pool, &enc_modular, &enc_state));
    } else {
      JXL_RETURN_IF_ERROR(ComputeVarDCTEncodingData(
-          frame_header, linear, &opsin, opsin_rect, cms, pool, &enc_modular,
+          frame_header, linear, &color, group_rect, cms, pool, &enc_modular,
          &enc_state, aux_out));
    }
    ComputeAllCoeffOrders(enc_state, frame_dim);
@ -1508,16 +1505,15 @@ Status ComputeEncodingData(
        TokenizeAllCoefficients(frame_header, pool, &enc_state));
  }

-  JXL_RETURN_IF_ERROR(enc_modular.ComputeEncodingData(
-      frame_header, metadata->m, &opsin, extra_channels, &enc_state, cms, pool,
-      aux_out,
-      /* do_color=*/frame_header.encoding == FrameEncoding::kModular));
-  if (enc_state.initialize_global_state) {
-    JXL_RETURN_IF_ERROR(enc_modular.ComputeTree(pool));
-  }
-  JXL_RETURN_IF_ERROR(enc_modular.ComputeTokens(pool));
-
  if (!enc_state.streaming_mode) {
+    if (cparams.modular_mode || !extra_channels.empty()) {
+      JXL_RETURN_IF_ERROR(enc_modular.ComputeEncodingData(
+          frame_header, metadata->m, &color, extra_channels, &enc_state, cms,
+          pool, aux_out, /*do_color=*/cparams.modular_mode));
+    }
+    JXL_RETURN_IF_ERROR(enc_modular.ComputeTree(pool));
+    JXL_RETURN_IF_ERROR(enc_modular.ComputeTokens(pool));
+
    mutable_frame_header.UpdateFlag(shared.image_features.patches.HasAny(),
                                    FrameHeader::kPatches);
    mutable_frame_header.UpdateFlag(shared.image_features.splines.HasAny(),
@ -1683,10 +1679,10 @@ void ComputePermutationForStreaming(size_t xsize, size_t ysize,
      size_t ac_x0 = dc_x * kBlockDim;
      size_t ac_y1 = std::min<size_t>(group_ysize, ac_y0 + kBlockDim);
      size_t ac_x1 = std::min<size_t>(group_xsize, ac_x0 + kBlockDim);
-      for (size_t ac_y = ac_y0; ac_y < ac_y1; ++ac_y) {
-        for (size_t ac_x = ac_x0; ac_x < ac_x1; ++ac_x) {
-          size_t group_ix = ac_y * group_xsize + ac_x;
-          for (size_t pass = 0; pass < num_passes; ++pass) {
+      for (size_t pass = 0; pass < num_passes; ++pass) {
+        for (size_t ac_y = ac_y0; ac_y < ac_y1; ++ac_y) {
+          for (size_t ac_x = ac_x0; ac_x < ac_x1; ++ac_x) {
+            size_t group_ix = ac_y * group_xsize + ac_x;
            size_t old_ix =
                AcGroupIndex(pass, group_ix, num_groups, num_dc_groups);
            permutation[old_ix] = new_ix++;
--- a/third_party/jpeg-xl/lib/jxl/enc_frame.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_frame.h
@ -6,9 +6,13 @@
 #ifndef LIB_JXL_ENC_FRAME_H_
 #define LIB_JXL_ENC_FRAME_H_

+#include <jxl/cms_interface.h>
 #include <jxl/types.h>

+#include <cstddef>
+#include <cstdint>
 #include <string>
+#include <vector>

 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
@ -18,6 +22,7 @@
 #include "lib/jxl/encode_internal.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"

 namespace jxl {

--- a/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.cc
@ -49,9 +49,12 @@ void GaborishInverse(Image3F* in_out, const Rect& rect, float mul[3],
  // image and reuse the existing planes of the in/out image.
  ImageF temp(in_out->Plane(2).xsize(), in_out->Plane(2).ysize());
  CopyImageTo(in_out->Plane(2), &temp);
-  Symmetric5(in_out->Plane(0), rect, weights[0], pool, &in_out->Plane(2), rect);
-  Symmetric5(in_out->Plane(1), rect, weights[1], pool, &in_out->Plane(0), rect);
-  Symmetric5(temp, rect, weights[2], pool, &in_out->Plane(1), rect);
+  Rect xrect = rect.Extend(3, Rect(*in_out));
+  Symmetric5(in_out->Plane(0), xrect, weights[0], pool, &in_out->Plane(2),
+             xrect);
+  Symmetric5(in_out->Plane(1), xrect, weights[1], pool, &in_out->Plane(0),
+             xrect);
+  Symmetric5(temp, xrect, weights[2], pool, &in_out->Plane(1), xrect);
  // Now planes are 1, 2, 0.
  in_out->Plane(0).Swap(in_out->Plane(1));
  // 2 1 0
--- a/third_party/jpeg-xl/lib/jxl/enc_gaborish.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish.h
@ -8,9 +8,6 @@

 // Linear smoothing (3x3 convolution) for deblocking without too much blur.

-#include <stdint.h>
-
-#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/image.h"

--- a/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_gaborish_test.cc
@ -7,7 +7,11 @@

 #include <hwy/base.h>

+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/convolve.h"
+#include "lib/jxl/image.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/testing.h"
--- a/third_party/jpeg-xl/lib/jxl/enc_group.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_group.h
@ -7,7 +7,6 @@
 #define LIB_JXL_ENC_GROUP_H_

 #include <stddef.h>
-#include <stdint.h>

 #include "lib/jxl/base/status.h"
 #include "lib/jxl/enc_bit_writer.h"
--- a/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.cc
@ -5,13 +5,31 @@

 #include "lib/jxl/enc_heuristics.h"

+#include <jxl/cms_interface.h>
 #include <stddef.h>
 #include <stdint.h>

 #include <algorithm>
+#include <cstdlib>
+#include <limits>
+#include <memory>
 #include <numeric>
 #include <string>
+#include <utility>
+#include <vector>

+#include "lib/jxl/ac_context.h"
+#include "lib/jxl/ac_strategy.h"
+#include "lib/jxl/base/common.h"
+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/data_parallel.h"
+#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/status.h"
+#include "lib/jxl/butteraugli/butteraugli.h"
+#include "lib/jxl/chroma_from_luma.h"
+#include "lib/jxl/coeff_order.h"
+#include "lib/jxl/coeff_order_fwd.h"
+#include "lib/jxl/dec_xyb.h"
 #include "lib/jxl/enc_ac_strategy.h"
 #include "lib/jxl/enc_adaptive_quantization.h"
 #include "lib/jxl/enc_ar_control_field.h"
@ -20,11 +38,16 @@
 #include "lib/jxl/enc_gaborish.h"
 #include "lib/jxl/enc_modular.h"
 #include "lib/jxl/enc_noise.h"
+#include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_patch_dictionary.h"
-#include "lib/jxl/enc_photon_noise.h"
 #include "lib/jxl/enc_quant_weights.h"
 #include "lib/jxl/enc_splines.h"
-#include "lib/jxl/enc_xyb.h"
+#include "lib/jxl/frame_dimensions.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/image.h"
+#include "lib/jxl/image_ops.h"
+#include "lib/jxl/passes_state.h"
+#include "lib/jxl/quant_weights.h"

 namespace jxl {

@ -735,14 +758,7 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
    PatchDictionaryEncoder::SubtractFrom(image_features.patches, opsin);
  }

-  static const float kAcQuant = 0.79f;
  const float quant_dc = InitialQuantDC(cparams.butteraugli_distance);
-  // We don't know the quant field yet, but for computing the global scale
-  // assuming that it will be the same as for Falcon mode is good enough.
-  if (initialize_global_state) {
-    quantizer.ComputeGlobalScaleAndQuant(
-        quant_dc, kAcQuant / cparams.butteraugli_distance, 0);
-  }

  // TODO(veluca): we can now run all the code from here to FindBestQuantizer
  // (excluded) one rect at a time. Do that.
@ -779,9 +795,10 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
        ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
    initial_quant_masking =
        ImageF(frame_dim.xsize_blocks, frame_dim.ysize_blocks);
-    float q = kAcQuant / cparams.butteraugli_distance;
+    float q = 0.79 / cparams.butteraugli_distance;
    FillImage(q, &initial_quant_field);
    FillImage(1.0f / (q + 0.001f), &initial_quant_masking);
+    quantizer.ComputeGlobalScaleAndQuant(quant_dc, q, 0);
  } else {
    // Call this here, as it relies on pre-gaborish values.
    float butteraugli_distance_for_iqf = cparams.butteraugli_distance;
@ -791,9 +808,8 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
    initial_quant_field = InitialQuantField(
        butteraugli_distance_for_iqf, *opsin, rect, pool, 1.0f,
        &initial_quant_masking, &initial_quant_masking1x1);
-    if (initialize_global_state) {
-      quantizer.SetQuantField(quant_dc, initial_quant_field, nullptr);
-    }
+    float q = 0.39 / cparams.butteraugli_distance;
+    quantizer.ComputeGlobalScaleAndQuant(quant_dc, q, 0);
  }

  // TODO(veluca): do something about animations.
@ -875,10 +891,6 @@ Status LossyFrameHeuristics(const FrameHeader& frame_header,
      process_tile, "Enc Heuristics"));

  acs_heuristics.Finalize(frame_dim, ac_strategy, aux_out);
-  if (cparams.speed_tier <= SpeedTier::kHare && initialize_global_state) {
-    cfl_heuristics.ComputeDC(/*fast=*/cparams.speed_tier >= SpeedTier::kWombat,
-                             &cmap);
-  }

  // Refine quantization levels.
  if (!streaming_mode) {
--- a/third_party/jpeg-xl/lib/jxl/enc_heuristics.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_heuristics.h
@ -10,15 +10,11 @@

 #include <jxl/cms_interface.h>
 #include <stddef.h>
-#include <stdint.h>
-
-#include <string>

 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/modular/encoding/enc_ma.h"

 namespace jxl {

--- a/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_icc_codec.cc
@ -7,6 +7,7 @@

 #include <stdint.h>

+#include <limits>
 #include <map>
 #include <string>
 #include <vector>
@ -93,6 +94,8 @@ static inline void EncodeVarInt(uint64_t value, PaddedBytes* data) {
  data->resize(pos);
 }

+constexpr size_t kSizeLimit = std::numeric_limits<uint32_t>::max() >> 2;
+
 }  // namespace

 // Outputs a transformed form of the given icc profile. The result itself is
@ -103,6 +106,13 @@ Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
  PaddedBytes commands;
  PaddedBytes data;

+  static_assert(sizeof(size_t) >= 4, "size_t is too short");
+  // Fuzzer expects that PredictICC can accept any input,
+  // but 1GB should be enough for any purpose.
+  if (size > kSizeLimit) {
+    return JXL_FAILURE("ICC profile is too large");
+  }
+
  EncodeVarInt(size, result);

  // Header
@ -227,6 +237,11 @@ Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
  Tag tag;
  size_t tagstart = 0, tagsize = 0, clutstart = 0;

+  // Should always check tag_sane before doing math with tagsize.
+  const auto tag_sane = [&tagsize]() {
+    return (tagsize > 8) && (tagsize < kSizeLimit);
+  };
+
  size_t last0 = pos;
  // This loop appends commands to the output, processing some sub-section of a
  // current tagged element each time. We need to keep track of the tagtype of
@ -241,7 +256,8 @@ Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
    PaddedBytes data_add;

    // This means the loop brought the position beyond the tag end.
-    if (pos > tagstart + tagsize) {
+    // If tagsize is nonsensical, any pos looks "ok-ish".
+    if ((pos > tagstart + tagsize) && (tagsize < kSizeLimit)) {
      tag = {{0, 0, 0, 0}};  // nonsensical value
    }

@ -252,7 +268,7 @@ Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
      tagstart = tagstarts[index];
      tagsize = tagsizes[index];

-      if (tag == kMlucTag && pos + tagsize <= size && tagsize > 8 &&
+      if (tag == kMlucTag && tag_sane() && pos + tagsize <= size &&
          icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 &&
          icc[pos + 7] == 0) {
        size_t num = tagsize - 8;
@ -268,7 +284,7 @@ Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
        Unshuffle(data_add.data() + start, num, 2);
      }

-      if (tag == kCurvTag && pos + tagsize <= size && tagsize > 8 &&
+      if (tag == kCurvTag && tag_sane() && pos + tagsize <= size &&
          icc[pos + 4] == 0 && icc[pos + 5] == 0 && icc[pos + 6] == 0 &&
          icc[pos + 7] == 0) {
        size_t num = tagsize - 8;
@ -334,8 +350,8 @@ Status PredictICC(const uint8_t* icc, size_t size, PaddedBytes* result) {
    }

    if (commands_add.empty() && data_add.empty() && tag == kGbd_Tag &&
-        pos == tagstart + 8 && pos + tagsize - 8 <= size && pos > 16 &&
-        tagsize > 8) {
+        tag_sane() && pos == tagstart + 8 && pos + tagsize - 8 <= size &&
+        pos > 16) {
      size_t width = 4, order = 0, stride = width;
      size_t num = tagsize - 8;
      uint8_t flags = (order << 2) | (width - 1) | (stride == width ? 0 : 16);
--- a/third_party/jpeg-xl/lib/jxl/enc_modular.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_modular.cc
@ -73,115 +73,6 @@ static const float squeeze_luma_qtable[16] = {
 static const float squeeze_chroma_qtable[16] = {
    1024, 512, 256, 128, 64, 32, 16, 8, 4, 2, 1, 0.5, 0.5, 0.5, 0.5, 0.5};

-// `cutoffs` must be sorted.
-Tree MakeFixedTree(int property, const std::vector<int32_t>& cutoffs,
-                   Predictor pred, size_t num_pixels) {
-  size_t log_px = CeilLog2Nonzero(num_pixels);
-  size_t min_gap = 0;
-  // Reduce fixed tree height when encoding small images.
-  if (log_px < 14) {
-    min_gap = 8 * (14 - log_px);
-  }
-  Tree tree;
-  struct NodeInfo {
-    size_t begin, end, pos;
-  };
-  std::queue<NodeInfo> q;
-  // Leaf IDs will be set by roundtrip decoding the tree.
-  tree.push_back(PropertyDecisionNode::Leaf(pred));
-  q.push(NodeInfo{0, cutoffs.size(), 0});
-  while (!q.empty()) {
-    NodeInfo info = q.front();
-    q.pop();
-    if (info.begin + min_gap >= info.end) continue;
-    uint32_t split = (info.begin + info.end) / 2;
-    tree[info.pos] =
-        PropertyDecisionNode::Split(property, cutoffs[split], tree.size());
-    q.push(NodeInfo{split + 1, info.end, tree.size()});
-    tree.push_back(PropertyDecisionNode::Leaf(pred));
-    q.push(NodeInfo{info.begin, split, tree.size()});
-    tree.push_back(PropertyDecisionNode::Leaf(pred));
-  }
-  return tree;
-}
-
-Tree PredefinedTree(ModularOptions::TreeKind tree_kind, size_t total_pixels) {
-  if (tree_kind == ModularOptions::TreeKind::kJpegTranscodeACMeta ||
-      tree_kind == ModularOptions::TreeKind::kTrivialTreeNoPredictor) {
-    // All the data is 0, so no need for a fancy tree.
-    return {PropertyDecisionNode::Leaf(Predictor::Zero)};
-  }
-  if (tree_kind == ModularOptions::TreeKind::kFalconACMeta) {
-    // All the data is 0 except the quant field. TODO(veluca): make that 0 too.
-    return {PropertyDecisionNode::Leaf(Predictor::Left)};
-  }
-  if (tree_kind == ModularOptions::TreeKind::kACMeta) {
-    // Small image.
-    if (total_pixels < 1024) {
-      return {PropertyDecisionNode::Leaf(Predictor::Left)};
-    }
-    Tree tree;
-    // 0: c > 1
-    tree.push_back(PropertyDecisionNode::Split(0, 1, 1));
-    // 1: c > 2
-    tree.push_back(PropertyDecisionNode::Split(0, 2, 3));
-    // 2: c > 0
-    tree.push_back(PropertyDecisionNode::Split(0, 0, 5));
-    // 3: EPF control field (all 0 or 4), top > 0
-    tree.push_back(PropertyDecisionNode::Split(6, 0, 21));
-    // 4: ACS+QF, y > 0
-    tree.push_back(PropertyDecisionNode::Split(2, 0, 7));
-    // 5: CfL x
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient));
-    // 6: CfL b
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient));
-    // 7: QF: split according to the left quant value.
-    tree.push_back(PropertyDecisionNode::Split(7, 5, 9));
-    // 8: ACS: split in 4 segments (8x8 from 0 to 3, large square 4-5, large
-    // rectangular 6-11, 8x8 12+), according to previous ACS value.
-    tree.push_back(PropertyDecisionNode::Split(7, 5, 15));
-    // QF
-    tree.push_back(PropertyDecisionNode::Split(7, 11, 11));
-    tree.push_back(PropertyDecisionNode::Split(7, 3, 13));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
-    // ACS
-    tree.push_back(PropertyDecisionNode::Split(7, 11, 17));
-    tree.push_back(PropertyDecisionNode::Split(7, 3, 19));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
-    // EPF, left > 0
-    tree.push_back(PropertyDecisionNode::Split(7, 0, 23));
-    tree.push_back(PropertyDecisionNode::Split(7, 0, 25));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
-    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
-    return tree;
-  }
-  if (tree_kind == ModularOptions::TreeKind::kWPFixedDC) {
-    std::vector<int32_t> cutoffs = {
-        -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15,
-        -11,  -7,   -4,   -3,   -1,   0,   1,   3,   5,   7,   11,
-        15,   23,   31,   47,   63,   95,  127, 191, 255, 392, 500};
-    return MakeFixedTree(kWPProp, cutoffs, Predictor::Weighted, total_pixels);
-  }
-  if (tree_kind == ModularOptions::TreeKind::kGradientFixedDC) {
-    std::vector<int32_t> cutoffs = {
-        -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15,
-        -11,  -7,   -4,   -3,   -1,   0,   1,   3,   5,   7,   11,
-        15,   23,   31,   47,   63,   95,  127, 191, 255, 392, 500};
-    return MakeFixedTree(kGradientProp, cutoffs, Predictor::Gradient,
-                         total_pixels);
-  }
-  JXL_UNREACHABLE("Unreachable");
-  return {};
-}
-
 // Merges the trees in `trees` using nodes that decide on stream_id, as defined
 // by `tree_splits`.
 void MergeTrees(const std::vector<Tree>& trees,
@ -759,7 +650,6 @@ Status ModularFrameEncoder::ComputeEncodingData(
  if (cparams_.responsive && !gi.channel.empty() &&
      max_bitdepth + 2 < level_max_bitdepth) {
    Transform t(TransformId::kSqueeze);
-    t.squeezes = cparams_.squeezes;
    do_transform(gi, t, weighted::Header(), pool);
    max_bitdepth += 2;
  }
@ -1194,19 +1084,23 @@ Status ModularFrameEncoder::EncodeStream(BitWriter* writer, AuxOut* aux_out,
  if (stream_images_[stream_id].channel.empty()) {
    return true;  // Image with no channels, header never gets decoded.
  }
-  JXL_RETURN_IF_ERROR(
-      Bundle::Write(stream_headers_[stream_id], writer, layer, aux_out));
-  WriteTokens(tokens_[stream_id], code_, context_map_, 0, writer, layer,
-              aux_out);
+  if (tokens_.empty()) {
+    JXL_RETURN_IF_ERROR(ModularGenericCompress(
+        stream_images_[stream_id], stream_options_[stream_id], writer, aux_out,
+        layer, stream_id));
+  } else {
+    JXL_RETURN_IF_ERROR(
+        Bundle::Write(stream_headers_[stream_id], writer, layer, aux_out));
+    WriteTokens(tokens_[stream_id], code_, context_map_, 0, writer, layer,
+                aux_out);
+  }
  return true;
 }

 void ModularFrameEncoder::ClearStreamData(const ModularStreamId& stream) {
  size_t stream_id = stream.ID(frame_dim_);
  Image empty_image;
-  std::vector<Token> empty_tokens;
  std::swap(stream_images_[stream_id], empty_image);
-  std::swap(tokens_[stream_id], empty_tokens);
 }

 namespace {
--- a/third_party/jpeg-xl/lib/jxl/enc_params.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_params.h
@ -8,19 +8,18 @@

 // Parameters and flags that govern JXL compression.

+#include <jxl/cms_interface.h>
 #include <jxl/encode.h>
 #include <stddef.h>
-#include <stdint.h>

-#include <string>
+#include <vector>

 #include "lib/jxl/base/override.h"
-#include "lib/jxl/butteraugli/butteraugli.h"
 #include "lib/jxl/enc_progressive_split.h"
+#include "lib/jxl/frame_dimensions.h"
 #include "lib/jxl/frame_header.h"
 #include "lib/jxl/modular/encoding/dec_ma.h"
 #include "lib/jxl/modular/options.h"
-#include "lib/jxl/modular/transform/transform.h"
 #include "lib/jxl/splines.h"

 namespace jxl {
@ -92,10 +91,10 @@ struct CompressParams {
  int epf = -1;

  // Progressive mode.
-  bool progressive_mode = false;
+  Override progressive_mode = Override::kDefault;

  // Quantized-progressive mode.
-  bool qprogressive_mode = false;
+  Override qprogressive_mode = Override::kDefault;

  // Put center groups first in the bitstream.
  bool centerfirst = false;
@ -137,8 +136,6 @@ struct CompressParams {
  // modular mode options below
  ModularOptions options;
  int responsive = -1;
-  // empty for default squeeze
-  std::vector<SqueezeParams> squeezes;
  int colorspace = -1;
  // Use Global channel palette if #colors < this percentage of range
  float channel_colors_pre_transform_percent = 95.f;
@ -173,7 +170,7 @@ struct CompressParams {
  void SetLossless() {
    modular_mode = true;
    butteraugli_distance = 0.0f;
-    for (float &f : ec_distance) f = 0.0f;
+    for (float& f : ec_distance) f = 0.0f;
    color_transform = jxl::ColorTransform::kNone;
  }

@ -198,6 +195,8 @@ struct CompressParams {

  // See JXL_ENC_FRAME_SETTING_BUFFERING option value.
  int buffering = 0;
+  // See JXL_ENC_FRAME_SETTING_USE_FULL_IMAGE_HEURISTICS option value.
+  bool use_full_image_heuristics = true;

  std::vector<float> manual_noise;
  std::vector<float> manual_xyb_factors;
--- a/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_patch_dictionary.cc
@ -748,8 +748,8 @@ void RoundtripPatchFrame(Image3F* reference_frame,
  cparams.modular_mode = true;
  cparams.responsive = 0;
  cparams.progressive_dc = 0;
-  cparams.progressive_mode = false;
-  cparams.qprogressive_mode = false;
+  cparams.progressive_mode = Override::kOff;
+  cparams.qprogressive_mode = Override::kOff;
  // Use gradient predictor and not Predictor::Best.
  cparams.options.predictor = Predictor::Gradient;
  patch_frame_info.save_as_reference = idx;  // always saved.
--- a/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_transforms-inl.h
@ -402,7 +402,7 @@ void AFVTransformFromPixels(const float* JXL_RESTRICT pixels,
  HWY_ALIGN float scratch_space[4 * 8 * 5];
  size_t afv_x = afv_kind & 1;
  size_t afv_y = afv_kind / 2;
-  HWY_ALIGN float block[4 * 8];
+  HWY_ALIGN float block[4 * 8] = {};
  for (size_t iy = 0; iy < 4; iy++) {
    for (size_t ix = 0; ix < 4; ix++) {
      block[(afv_y == 1 ? 3 - iy : iy) * 4 + (afv_x == 1 ? 3 - ix : ix)] =
--- a/third_party/jpeg-xl/lib/jxl/enc_xyb.cc
+++ b/third_party/jpeg-xl/lib/jxl/enc_xyb.cc
@ -113,95 +113,82 @@ V LinearFromSRGB(V encoded) {
  return TF_SRGB().DisplayFromEncoded(encoded);
 }

-Status LinearSRGBToXYB(const Image3F& linear,
-                       const float* JXL_RESTRICT premul_absorb,
-                       ThreadPool* pool, Image3F* JXL_RESTRICT xyb) {
-  const size_t xsize = linear.xsize();
+Status LinearSRGBToXYB(const float* JXL_RESTRICT premul_absorb,
+                       ThreadPool* pool, Image3F* JXL_RESTRICT image) {
+  const size_t xsize = image->xsize();

  const HWY_FULL(float) d;
  return RunOnPool(
-      pool, 0, static_cast<uint32_t>(linear.ysize()), ThreadPool::NoInit,
+      pool, 0, static_cast<uint32_t>(image->ysize()), ThreadPool::NoInit,
      [&](const uint32_t task, size_t /*thread*/) {
        const size_t y = static_cast<size_t>(task);
-        const float* JXL_RESTRICT row_in0 = linear.ConstPlaneRow(0, y);
-        const float* JXL_RESTRICT row_in1 = linear.ConstPlaneRow(1, y);
-        const float* JXL_RESTRICT row_in2 = linear.ConstPlaneRow(2, y);
-        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
-        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
-        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
+        float* JXL_RESTRICT row0 = image->PlaneRow(0, y);
+        float* JXL_RESTRICT row1 = image->PlaneRow(1, y);
+        float* JXL_RESTRICT row2 = image->PlaneRow(2, y);

        for (size_t x = 0; x < xsize; x += Lanes(d)) {
-          const auto in_r = Load(d, row_in0 + x);
-          const auto in_g = Load(d, row_in1 + x);
-          const auto in_b = Load(d, row_in2 + x);
-          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
-                         row_xyb1 + x, row_xyb2 + x);
+          const auto in_r = Load(d, row0 + x);
+          const auto in_g = Load(d, row1 + x);
+          const auto in_b = Load(d, row2 + x);
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x,
+                         row2 + x);
        }
      },
      "LinearToXYB");
 }

-Status SRGBToXYB(const Image3F& srgb, const float* JXL_RESTRICT premul_absorb,
-                 ThreadPool* pool, Image3F* JXL_RESTRICT xyb) {
-  const size_t xsize = srgb.xsize();
+Status SRGBToXYB(const float* JXL_RESTRICT premul_absorb, ThreadPool* pool,
+                 Image3F* JXL_RESTRICT image) {
+  const size_t xsize = image->xsize();

  const HWY_FULL(float) d;
  return RunOnPool(
-      pool, 0, static_cast<uint32_t>(srgb.ysize()), ThreadPool::NoInit,
+      pool, 0, static_cast<uint32_t>(image->ysize()), ThreadPool::NoInit,
      [&](const uint32_t task, size_t /*thread*/) {
        const size_t y = static_cast<size_t>(task);
-        const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y);
-        const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y);
-        const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y);
-        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
-        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
-        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
+        float* JXL_RESTRICT row0 = image->PlaneRow(0, y);
+        float* JXL_RESTRICT row1 = image->PlaneRow(1, y);
+        float* JXL_RESTRICT row2 = image->PlaneRow(2, y);

        for (size_t x = 0; x < xsize; x += Lanes(d)) {
-          const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x));
-          const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x));
-          const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x));
-          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
-                         row_xyb1 + x, row_xyb2 + x);
+          const auto in_r = LinearFromSRGB(Load(d, row0 + x));
+          const auto in_g = LinearFromSRGB(Load(d, row1 + x));
+          const auto in_b = LinearFromSRGB(Load(d, row2 + x));
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row0 + x, row1 + x,
+                         row2 + x);
        }
      },
      "SRGBToXYB");
 }

-Status SRGBToXYBAndLinear(const Image3F& srgb,
-                          const float* JXL_RESTRICT premul_absorb,
-                          ThreadPool* pool, Image3F* JXL_RESTRICT xyb,
+Status SRGBToXYBAndLinear(const float* JXL_RESTRICT premul_absorb,
+                          ThreadPool* pool, Image3F* JXL_RESTRICT image,
                          Image3F* JXL_RESTRICT linear) {
-  const size_t xsize = srgb.xsize();
+  const size_t xsize = image->xsize();

  const HWY_FULL(float) d;
  return RunOnPool(
-      pool, 0, static_cast<uint32_t>(srgb.ysize()), ThreadPool::NoInit,
+      pool, 0, static_cast<uint32_t>(image->ysize()), ThreadPool::NoInit,
      [&](const uint32_t task, size_t /*thread*/) {
        const size_t y = static_cast<size_t>(task);
-        const float* JXL_RESTRICT row_srgb0 = srgb.ConstPlaneRow(0, y);
-        const float* JXL_RESTRICT row_srgb1 = srgb.ConstPlaneRow(1, y);
-        const float* JXL_RESTRICT row_srgb2 = srgb.ConstPlaneRow(2, y);
-
+        float* JXL_RESTRICT row_image0 = image->PlaneRow(0, y);
+        float* JXL_RESTRICT row_image1 = image->PlaneRow(1, y);
+        float* JXL_RESTRICT row_image2 = image->PlaneRow(2, y);
        float* JXL_RESTRICT row_linear0 = linear->PlaneRow(0, y);
        float* JXL_RESTRICT row_linear1 = linear->PlaneRow(1, y);
        float* JXL_RESTRICT row_linear2 = linear->PlaneRow(2, y);

-        float* JXL_RESTRICT row_xyb0 = xyb->PlaneRow(0, y);
-        float* JXL_RESTRICT row_xyb1 = xyb->PlaneRow(1, y);
-        float* JXL_RESTRICT row_xyb2 = xyb->PlaneRow(2, y);
-
        for (size_t x = 0; x < xsize; x += Lanes(d)) {
-          const auto in_r = LinearFromSRGB(Load(d, row_srgb0 + x));
-          const auto in_g = LinearFromSRGB(Load(d, row_srgb1 + x));
-          const auto in_b = LinearFromSRGB(Load(d, row_srgb2 + x));
+          const auto in_r = LinearFromSRGB(Load(d, row_image0 + x));
+          const auto in_g = LinearFromSRGB(Load(d, row_image1 + x));
+          const auto in_b = LinearFromSRGB(Load(d, row_image2 + x));

          Store(in_r, d, row_linear0 + x);
          Store(in_g, d, row_linear1 + x);
          Store(in_b, d, row_linear2 + x);

-          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_xyb0 + x,
-                         row_xyb1 + x, row_xyb2 + x);
+          LinearRGBToXYB(in_r, in_g, in_b, premul_absorb, row_image0 + x,
+                         row_image1 + x, row_image2 + x);
        }
      },
      "SRGBToXYBAndLinear");
@ -281,38 +268,13 @@ Image3F TransformToLinearRGB(const Image3F& in,
  return out;
 }

-void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
-                  float intensity_target, ThreadPool* pool,
-                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) {
-  JXL_ASSERT(SameSize(in, *xyb));
-
-  const HWY_FULL(float) d;
-  // Pre-broadcasted constants
-  HWY_ALIGN float premul_absorb[MaxLanes(d) * 12];
-  ComputePremulAbsorb(intensity_target, premul_absorb);
-
-  bool is_gray = color_encoding.IsGray();
-  const ColorEncoding& c_linear_srgb = ColorEncoding::LinearSRGB(is_gray);
-  if (c_linear_srgb.SameColorEncoding(color_encoding)) {
-    JXL_CHECK(LinearSRGBToXYB(in, premul_absorb, pool, xyb));
-  } else if (color_encoding.IsSRGB()) {
-    JXL_CHECK(SRGBToXYB(in, premul_absorb, pool, xyb));
-  } else {
-    Image3F linear =
-        TransformToLinearRGB(in, color_encoding, intensity_target, cms, pool);
-    JXL_CHECK(LinearSRGBToXYB(linear, premul_absorb, pool, xyb));
-  }
-}
-
 // This is different from Butteraugli's OpsinDynamicsImage() in the sense that
 // it does not contain a sensitivity multiplier based on the blurred image.
-void ToXYB(const Image3F& color, const ColorEncoding& c_current,
-           float intensity_target, const ImageF* black, ThreadPool* pool,
-           Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
-           Image3F* const JXL_RESTRICT linear) {
-  JXL_ASSERT(SameSize(color, *xyb));
-  if (black) JXL_ASSERT(SameSize(color, *black));
-  if (linear) JXL_ASSERT(SameSize(color, *linear));
+void ToXYB(const ColorEncoding& c_current, float intensity_target,
+           const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image,
+           const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) {
+  if (black) JXL_ASSERT(SameSize(*image, *black));
+  if (linear) JXL_ASSERT(SameSize(*image, *linear));

  const HWY_FULL(float) d;
  // Pre-broadcasted constants
@ -326,12 +288,12 @@ void ToXYB(const Image3F& color, const ColorEncoding& c_current,
  // Linear sRGB inputs are rare but can be useful for the fastest encoders, for
  // which undoing the sRGB transfer function would be a large part of the cost.
  if (c_linear_srgb.SameColorEncoding(c_current)) {
-    JXL_CHECK(LinearSRGBToXYB(color, premul_absorb, pool, xyb));
    // This only happens if kitten or slower, moving ImageBundle might be
    // possible but the encoder is much slower than this copy.
    if (want_linear) {
-      CopyImageTo(color, linear);
+      CopyImageTo(*image, linear);
    }
+    JXL_CHECK(LinearSRGBToXYB(premul_absorb, pool, image));
    return;
  }

@ -340,31 +302,20 @@ void ToXYB(const Image3F& color, const ColorEncoding& c_current,
    // Common case: can avoid allocating/copying
    if (want_linear) {
      // Slow encoder also wants linear sRGB.
-      JXL_CHECK(SRGBToXYBAndLinear(color, premul_absorb, pool, xyb, linear));
+      JXL_CHECK(SRGBToXYBAndLinear(premul_absorb, pool, image, linear));
    } else {
-      JXL_CHECK(SRGBToXYB(color, premul_absorb, pool, xyb));
+      JXL_CHECK(SRGBToXYB(premul_absorb, pool, image));
    }
    return;
  }

-  // General case: not sRGB, need color transform.
-  Image3F linear_storage;  // Local storage only used if !want_linear.
-  Image3F* linear_storage_ptr;
+  JXL_CHECK(ApplyColorTransform(c_current, intensity_target, *image, black,
+                                Rect(*image), c_linear_srgb, cms, pool,
+                                want_linear ? linear : image));
  if (want_linear) {
-    // Caller asked for linear, use that storage directly.
-    linear_storage_ptr = linear;
-  } else {
-    // Caller didn't ask for linear, create our own local storage
-    // OK to reuse metadata, it will not be changed.
-    linear_storage = Image3F(color.xsize(), color.ysize());
-    linear_storage_ptr = &linear_storage;
+    CopyImageTo(*linear, image);
  }
-
-  JXL_CHECK(ApplyColorTransform(c_current, intensity_target, color, black,
-                                Rect(color), c_linear_srgb, cms, pool,
-                                linear_storage_ptr));
-
-  JXL_CHECK(LinearSRGBToXYB(*linear_storage_ptr, premul_absorb, pool, xyb));
+  JXL_CHECK(LinearSRGBToXYB(premul_absorb, pool, image));
 }

 // Transform RGB to YCbCr.
@ -436,17 +387,18 @@ HWY_AFTER_NAMESPACE();
 #if HWY_ONCE
 namespace jxl {
 HWY_EXPORT(ToXYB);
-void ToXYB(const Image3F& color, const ColorEncoding& c_current,
-           float intensity_target, const ImageF* black, ThreadPool* pool,
-           Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
-           Image3F* const JXL_RESTRICT linear) {
+void ToXYB(const ColorEncoding& c_current, float intensity_target,
+           const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image,
+           const JxlCmsInterface& cms, Image3F* const JXL_RESTRICT linear) {
  HWY_DYNAMIC_DISPATCH(ToXYB)
-  (color, c_current, intensity_target, black, pool, xyb, cms, linear);
+  (c_current, intensity_target, black, pool, image, cms, linear);
 }

 void ToXYB(const ImageBundle& in, ThreadPool* pool, Image3F* JXL_RESTRICT xyb,
           const JxlCmsInterface& cms, Image3F* JXL_RESTRICT linear) {
-  ToXYB(in.color(), in.c_current(), in.metadata()->IntensityTarget(),
+  *xyb = Image3F(in.xsize(), in.ysize());
+  CopyImageTo(in.color(), xyb);
+  ToXYB(in.c_current(), in.metadata()->IntensityTarget(),
        in.HasBlack() ? &in.black() : nullptr, pool, xyb, cms, linear);
 }

@ -484,14 +436,6 @@ void ScaleXYB(Image3F* opsin) {
  }
 }

-HWY_EXPORT(Image3FToXYB);
-void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
-                  float intensity_target, ThreadPool* pool,
-                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms) {
-  return HWY_DYNAMIC_DISPATCH(Image3FToXYB)(in, color_encoding,
-                                            intensity_target, pool, xyb, cms);
-}
-
 HWY_EXPORT(RgbToYcbcr);
 Status RgbToYcbcr(const ImageF& r_plane, const ImageF& g_plane,
                  const ImageF& b_plane, ImageF* y_plane, ImageF* cb_plane,
--- a/third_party/jpeg-xl/lib/jxl/enc_xyb.h
+++ b/third_party/jpeg-xl/lib/jxl/enc_xyb.h
@ -10,29 +10,26 @@

 #include <jxl/cms_interface.h>

+#include <cstddef>
+
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/enc_bit_writer.h"
+#include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"

 namespace jxl {

-// Converts any color space to XYB. If `linear` is not null, fills it with a
-// linear sRGB copy of `in`.
-void ToXYB(const Image3F& color, const ColorEncoding& c_current,
-           float intensity_target, const ImageF* black, ThreadPool* pool,
-           Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms,
-           Image3F* const JXL_RESTRICT linear);
+// Converts any color space to XYB in-place. If `linear` is not null, fills it
+// with a linear sRGB copy of `image`.
+void ToXYB(const ColorEncoding& c_current, float intensity_target,
+           const ImageF* black, ThreadPool* pool, Image3F* JXL_RESTRICT image,
+           const JxlCmsInterface& cms, Image3F* JXL_RESTRICT linear);

 void ToXYB(const ImageBundle& in, ThreadPool* pool, Image3F* JXL_RESTRICT xyb,
           const JxlCmsInterface& cms, Image3F* JXL_RESTRICT linear = nullptr);

-void Image3FToXYB(const Image3F& in, const ColorEncoding& color_encoding,
-                  float intensity_target, ThreadPool* pool,
-                  Image3F* JXL_RESTRICT xyb, const JxlCmsInterface& cms);
-
 void LinearRGBRowToXYB(float* JXL_RESTRICT row0, float* JXL_RESTRICT row1,
                       float* JXL_RESTRICT row2,
                       const float* JXL_RESTRICT premul_absorb, size_t xsize);
--- a/third_party/jpeg-xl/lib/jxl/encode.cc
+++ b/third_party/jpeg-xl/lib/jxl/encode.cc
@ -8,6 +8,7 @@
 #include <jxl/codestream_header.h>
 #include <jxl/encode.h>
 #include <jxl/types.h>
+#include <jxl/version.h>

 #include <algorithm>
 #include <cstddef>
@ -1641,10 +1642,12 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
      frame_settings->values.cparams.responsive = value;
      break;
    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC:
-      frame_settings->values.cparams.progressive_mode = value;
+      frame_settings->values.cparams.progressive_mode =
+          static_cast<jxl::Override>(value);
      break;
    case JXL_ENC_FRAME_SETTING_QPROGRESSIVE_AC:
-      frame_settings->values.cparams.qprogressive_mode = value;
+      frame_settings->values.cparams.qprogressive_mode =
+          static_cast<jxl::Override>(value);
      break;
    case JXL_ENC_FRAME_SETTING_PROGRESSIVE_DC:
      if (value < -1 || value > 2) {
@ -1671,7 +1674,6 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
      // alternatively, in the cjxl binary like now)
      frame_settings->values.cparams.lossy_palette = (value == 1);
      break;
-      return JXL_ENC_SUCCESS;
    case JXL_ENC_FRAME_SETTING_COLOR_TRANSFORM:
      if (value < -1 || value > 2) {
        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_API_USAGE,
@ -1760,6 +1762,13 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetOption(
    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF:
      frame_settings->values.cparams.jpeg_keep_jumbf = value;
      break;
+    case JXL_ENC_FRAME_SETTING_USE_FULL_IMAGE_HEURISTICS:
+      if (value < 0 || value > 1) {
+        return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
+                             "Option value has to be 0 or 1");
+      }
+      frame_settings->values.cparams.use_full_image_heuristics = value;
+      break;

    default:
      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
@ -1855,6 +1864,7 @@ JxlEncoderStatus JxlEncoderFrameSettingsSetFloatOption(
    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_EXIF:
    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_XMP:
    case JXL_ENC_FRAME_SETTING_JPEG_KEEP_JUMBF:
+    case JXL_ENC_FRAME_SETTING_USE_FULL_IMAGE_HEURISTICS:
      return JXL_API_ERROR(frame_settings->enc, JXL_ENC_ERR_NOT_SUPPORTED,
                           "Int option, try setting it with "
                           "JxlEncoderFrameSettingsSetOption");
--- a/third_party/jpeg-xl/lib/jxl/encode_internal.h
+++ b/third_party/jpeg-xl/lib/jxl/encode_internal.h
@ -7,17 +7,21 @@
 #ifndef LIB_JXL_ENCODE_INTERNAL_H_
 #define LIB_JXL_ENCODE_INTERNAL_H_

+#include <jxl/cms_interface.h>
+#include <jxl/codestream_header.h>
 #include <jxl/encode.h>
 #include <jxl/memory_manager.h>
-#include <jxl/parallel_runner.h>
 #include <jxl/types.h>
-#include <sys/types.h>

+#include <algorithm>
+#include <array>
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 #include <functional>
 #include <map>
 #include <memory>
+#include <string>
 #include <utility>
 #include <vector>

@ -29,6 +33,7 @@
 #include "lib/jxl/enc_aux_out.h"
 #include "lib/jxl/enc_fast_lossless.h"
 #include "lib/jxl/enc_params.h"
+#include "lib/jxl/image_metadata.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
 #include "lib/jxl/memory_manager_internal.h"
 #include "lib/jxl/padded_bytes.h"
--- a/third_party/jpeg-xl/lib/jxl/encode_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/encode_test.cc
@ -6,14 +6,18 @@
 #include <jxl/cms.h>
 #include <jxl/cms_interface.h>
 #include <jxl/codestream_header.h>
+#include <jxl/color_encoding.h>
 #include <jxl/decode.h>
 #include <jxl/decode_cxx.h>
 #include <jxl/encode.h>
 #include <jxl/encode_cxx.h>
+#include <jxl/memory_manager.h>
+#include <jxl/types.h>

 #include <cstddef>
 #include <cstdint>
 #include <cstdio>
+#include <cstdlib>
 #include <cstring>
 #include <mutex>
 #include <ostream>
@ -23,16 +27,18 @@
 #include <utility>
 #include <vector>

-#include "jxl/types.h"
 #include "lib/extras/codec.h"
 #include "lib/extras/dec/jxl.h"
 #include "lib/extras/metrics.h"
 #include "lib/extras/packed_image.h"
+#include "lib/jxl/base/byte_order.h"
 #include "lib/jxl/base/c_callback_support.h"
+#include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
+#include "lib/jxl/enc_params.h"
 #include "lib/jxl/encode_internal.h"
-#include "lib/jxl/jpeg/dec_jpeg_data.h"
-#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#include "lib/jxl/modular/options.h"
 #include "lib/jxl/test_image.h"
 #include "lib/jxl/test_utils.h"
 #include "lib/jxl/testing.h"
@ -516,7 +522,7 @@ TEST(EncodeTest, frame_settingsTest) {
    VerifyFrameEncoding(63, 129, enc.get(), frame_settings, 2830,
                        /*lossy_use_original_profile=*/false);
    EXPECT_EQ(false, enc->last_used_cparams.responsive);
-    EXPECT_EQ(true, enc->last_used_cparams.progressive_mode);
+    EXPECT_EQ(jxl::Override::kOn, enc->last_used_cparams.progressive_mode);
    EXPECT_EQ(2, enc->last_used_cparams.progressive_dc);
  }

@ -979,7 +985,7 @@ TEST(EncodeTest, JXL_TRANSCODE_JPEG_TEST(ProgressiveJPEGReconstructionTest)) {
  JxlEncoderFrameSettings* frame_settings =
      JxlEncoderFrameSettingsCreate(enc.get(), NULL);

-  frame_settings->values.cparams.progressive_mode = true;
+  frame_settings->values.cparams.progressive_mode = jxl::Override::kOn;

  EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderStoreJPEGMetadata(enc.get(), JXL_TRUE));
  EXPECT_EQ(JXL_ENC_SUCCESS,
@ -1774,7 +1780,7 @@ class EncoderStreamingTest : public testing::TestWithParam<StreamingTestParam> {
  static void SetupEncoder(JxlEncoderFrameSettings* frame_settings,
                           const StreamingTestParam& p,
                           const JxlBasicInfo& basic_info,
-                           size_t number_extra_channels) {
+                           size_t number_extra_channels, bool streaming) {
    JxlEncoderStruct* enc = frame_settings->enc;
    EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetBasicInfo(enc, &basic_info));
    if (p.fast_lossless()) {
@ -1788,9 +1794,14 @@ class EncoderStreamingTest : public testing::TestWithParam<StreamingTestParam> {
    JxlColorEncodingSetToSRGB(&color_encoding, /*is_gray=*/false);
    EXPECT_EQ(JXL_ENC_SUCCESS,
              JxlEncoderSetColorEncoding(enc, &color_encoding));
+    EXPECT_EQ(JXL_ENC_SUCCESS,
+              JxlEncoderFrameSettingsSetOption(frame_settings,
+                                               JXL_ENC_FRAME_SETTING_BUFFERING,
+                                               streaming ? 3 : 0));
    EXPECT_EQ(JXL_ENC_SUCCESS,
              JxlEncoderFrameSettingsSetOption(
-                  frame_settings, JXL_ENC_FRAME_SETTING_BUFFERING, 3));
+                  frame_settings,
+                  JXL_ENC_FRAME_SETTING_USE_FULL_IMAGE_HEURISTICS, 0));
    if (p.use_container()) {
      EXPECT_EQ(JXL_ENC_SUCCESS, JxlEncoderSetCodestreamLevel(enc, 10));
    }
@ -1875,7 +1886,7 @@ TEST_P(EncoderStreamingTest, OutputCallback) {
    ASSERT_NE(nullptr, enc.get());
    JxlEncoderFrameSettings* frame_settings =
        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    SetupEncoder(frame_settings, p, basic_info, number_extra_channels);
+    SetupEncoder(frame_settings, p, basic_info, number_extra_channels, false);
    SetupInputNonStreaming(frame_settings, p, number_extra_channels, frame,
                           ec_frame);
    uint8_t* next_out = compressed.data();
@ -1890,7 +1901,7 @@ TEST_P(EncoderStreamingTest, OutputCallback) {
    ASSERT_NE(nullptr, enc.get());
    JxlEncoderFrameSettings* frame_settings =
        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    SetupEncoder(frame_settings, p, basic_info, number_extra_channels);
+    SetupEncoder(frame_settings, p, basic_info, number_extra_channels, true);
    SetupInputNonStreaming(frame_settings, p, number_extra_channels, frame,
                           ec_frame);
    JxlStreamingAdapter streaming_adapter(enc.get(), p.return_large_buffers(),
@ -1901,7 +1912,7 @@ TEST_P(EncoderStreamingTest, OutputCallback) {
  }

  EXPECT_TRUE(SameDecodedPixels(compressed, streaming_compressed));
-  EXPECT_LE(streaming_compressed.size(), compressed.size() + 16);
+  EXPECT_LE(streaming_compressed.size(), compressed.size() + 1024);
 }

 TEST_P(EncoderStreamingTest, ChunkedFrame) {
@ -1928,7 +1939,7 @@ TEST_P(EncoderStreamingTest, ChunkedFrame) {
    ASSERT_NE(nullptr, enc.get());
    JxlEncoderFrameSettings* frame_settings =
        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    SetupEncoder(frame_settings, p, basic_info, number_extra_channels);
+    SetupEncoder(frame_settings, p, basic_info, number_extra_channels, false);
    SetupInputNonStreaming(frame_settings, p, number_extra_channels, frame,
                           ec_frame);
    uint8_t* next_out = compressed.data();
@ -1942,7 +1953,7 @@ TEST_P(EncoderStreamingTest, ChunkedFrame) {
    ASSERT_NE(nullptr, enc.get());
    JxlEncoderFrameSettings* frame_settings =
        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    SetupEncoder(frame_settings, p, basic_info, number_extra_channels);
+    SetupEncoder(frame_settings, p, basic_info, number_extra_channels, true);
    SetupInputStreaming(frame_settings, p, number_extra_channels, frame,
                        ec_frame);
    uint8_t* next_out = streaming_compressed.data();
@ -1951,7 +1962,7 @@ TEST_P(EncoderStreamingTest, ChunkedFrame) {
  }

  EXPECT_TRUE(SameDecodedPixels(compressed, streaming_compressed));
-  EXPECT_LE(streaming_compressed.size(), compressed.size() + 16);
+  EXPECT_LE(streaming_compressed.size(), compressed.size() + 1024);
 }

 TEST_P(EncoderStreamingTest, ChunkedAndOutputCallback) {
@ -1978,7 +1989,7 @@ TEST_P(EncoderStreamingTest, ChunkedAndOutputCallback) {
    ASSERT_NE(nullptr, enc.get());
    JxlEncoderFrameSettings* frame_settings =
        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    SetupEncoder(frame_settings, p, basic_info, number_extra_channels);
+    SetupEncoder(frame_settings, p, basic_info, number_extra_channels, false);
    SetupInputNonStreaming(frame_settings, p, number_extra_channels, frame,
                           ec_frame);
    uint8_t* next_out = compressed.data();
@ -1993,7 +2004,7 @@ TEST_P(EncoderStreamingTest, ChunkedAndOutputCallback) {
    ASSERT_NE(nullptr, enc.get());
    JxlEncoderFrameSettings* frame_settings =
        JxlEncoderFrameSettingsCreate(enc.get(), NULL);
-    SetupEncoder(frame_settings, p, basic_info, number_extra_channels);
+    SetupEncoder(frame_settings, p, basic_info, number_extra_channels, true);
    JxlStreamingAdapter streaming_adapter =
        JxlStreamingAdapter(enc.get(), p.return_large_buffers(), p.can_seek());
    SetupInputStreaming(frame_settings, p, number_extra_channels, frame,
@ -2003,7 +2014,7 @@ TEST_P(EncoderStreamingTest, ChunkedAndOutputCallback) {
  }

  EXPECT_TRUE(SameDecodedPixels(compressed, streaming_compressed));
-  EXPECT_LE(streaming_compressed.size(), compressed.size() + 16);
+  EXPECT_LE(streaming_compressed.size(), compressed.size() + 1024);
 }

 JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
--- a/third_party/jpeg-xl/lib/jxl/frame_header.cc
+++ b/third_party/jpeg-xl/lib/jxl/frame_header.cc
@ -368,8 +368,7 @@ Status FrameHeader::VisitFields(Visitor* JXL_RESTRICT visitor) {
      JXL_QUIET_RETURN_IF_ERROR(visitor->VisitNested(&animation_frame));
    }
    JXL_QUIET_RETURN_IF_ERROR(visitor->Bool(true, &is_last));
-  }
-  if (frame_type != FrameType::kRegularFrame) {
+  } else {
    is_last = false;
  }

--- a/third_party/jpeg-xl/lib/jxl/gauss_blur.cc
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur.cc
@ -1,623 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/gauss_blur.h"
-
-#include <string.h>
-
-#include <algorithm>
-#include <cmath>
-
-#undef HWY_TARGET_INCLUDE
-#define HWY_TARGET_INCLUDE "lib/jxl/gauss_blur.cc"
-#include <hwy/cache_control.h>
-#include <hwy/foreach_target.h>
-#include <hwy/highway.h>
-
-#include "lib/jxl/base/common.h"
-#include "lib/jxl/base/compiler_specific.h"
-#include "lib/jxl/base/matrix_ops.h"
-#include "lib/jxl/image_ops.h"
-HWY_BEFORE_NAMESPACE();
-namespace jxl {
-namespace HWY_NAMESPACE {
-
-// These templates are not found via ADL.
-using hwy::HWY_NAMESPACE::Add;
-using hwy::HWY_NAMESPACE::Broadcast;
-using hwy::HWY_NAMESPACE::GetLane;
-using hwy::HWY_NAMESPACE::Mul;
-using hwy::HWY_NAMESPACE::MulAdd;
-using hwy::HWY_NAMESPACE::NegMulSub;
-#if HWY_TARGET != HWY_SCALAR
-using hwy::HWY_NAMESPACE::ShiftLeftLanes;
-#endif
-using hwy::HWY_NAMESPACE::Vec;
-
-void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
-                    const float* JXL_RESTRICT in, intptr_t width,
-                    float* JXL_RESTRICT out) {
-  // Although the current output depends on the previous output, we can unroll
-  // up to 4x by precomputing up to fourth powers of the constants. Beyond that,
-  // numerical precision might become a problem. Macro because this is tested
-  // in #if alongside HWY_TARGET.
-#define JXL_GAUSS_MAX_LANES 4
-  using D = HWY_CAPPED(float, JXL_GAUSS_MAX_LANES);
-  using V = Vec<D>;
-  const D d;
-  const V mul_in_1 = Load(d, rg->mul_in + 0 * 4);
-  const V mul_in_3 = Load(d, rg->mul_in + 1 * 4);
-  const V mul_in_5 = Load(d, rg->mul_in + 2 * 4);
-  const V mul_prev_1 = Load(d, rg->mul_prev + 0 * 4);
-  const V mul_prev_3 = Load(d, rg->mul_prev + 1 * 4);
-  const V mul_prev_5 = Load(d, rg->mul_prev + 2 * 4);
-  const V mul_prev2_1 = Load(d, rg->mul_prev2 + 0 * 4);
-  const V mul_prev2_3 = Load(d, rg->mul_prev2 + 1 * 4);
-  const V mul_prev2_5 = Load(d, rg->mul_prev2 + 2 * 4);
-  V prev_1 = Zero(d);
-  V prev_3 = Zero(d);
-  V prev_5 = Zero(d);
-  V prev2_1 = Zero(d);
-  V prev2_3 = Zero(d);
-  V prev2_5 = Zero(d);
-
-  const intptr_t N = rg->radius;
-
-  intptr_t n = -N + 1;
-  // Left side with bounds checks and only write output after n >= 0.
-  const intptr_t first_aligned = RoundUpTo(N + 1, Lanes(d));
-  for (; n < std::min(first_aligned, width); ++n) {
-    const intptr_t left = n - N - 1;
-    const intptr_t right = n + N - 1;
-    const float left_val = left >= 0 ? in[left] : 0.0f;
-    const float right_val = right < width ? in[right] : 0.0f;
-    const V sum = Set(d, left_val + right_val);
-
-    // (Only processing a single lane here, no need to broadcast)
-    V out_1 = Mul(sum, mul_in_1);
-    V out_3 = Mul(sum, mul_in_3);
-    V out_5 = Mul(sum, mul_in_5);
-
-    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
-    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
-    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
-    prev2_1 = prev_1;
-    prev2_3 = prev_3;
-    prev2_5 = prev_5;
-
-    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
-    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
-    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
-    prev_1 = out_1;
-    prev_3 = out_3;
-    prev_5 = out_5;
-
-    if (n >= 0) {
-      out[n] = GetLane(Add(out_1, Add(out_3, out_5)));
-    }
-  }
-
-  // The above loop is effectively scalar but it is convenient to use the same
-  // prev/prev2 variables, so broadcast to each lane before the unrolled loop.
-#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES > 1
-  prev2_1 = Broadcast<0>(prev2_1);
-  prev2_3 = Broadcast<0>(prev2_3);
-  prev2_5 = Broadcast<0>(prev2_5);
-  prev_1 = Broadcast<0>(prev_1);
-  prev_3 = Broadcast<0>(prev_3);
-  prev_5 = Broadcast<0>(prev_5);
-#endif
-
-  // Unrolled, no bounds checking needed.
-  for (; n < width - N + 1 - (JXL_GAUSS_MAX_LANES - 1); n += Lanes(d)) {
-    const V sum = Add(LoadU(d, in + n - N - 1), LoadU(d, in + n + N - 1));
-
-    // To get a vector of output(s), we multiply broadcasted vectors (of each
-    // input plus the two previous outputs) and add them all together.
-    // Incremental broadcasting and shifting is expected to be cheaper than
-    // horizontal adds or transposing 4x4 values because they run on a different
-    // port, concurrently with the FMA.
-    const V in0 = Broadcast<0>(sum);
-    V out_1 = Mul(in0, mul_in_1);
-    V out_3 = Mul(in0, mul_in_3);
-    V out_5 = Mul(in0, mul_in_5);
-
-#if HWY_TARGET != HWY_SCALAR && JXL_GAUSS_MAX_LANES >= 2
-    const V in1 = Broadcast<1>(sum);
-    out_1 = MulAdd(ShiftLeftLanes<1>(mul_in_1), in1, out_1);
-    out_3 = MulAdd(ShiftLeftLanes<1>(mul_in_3), in1, out_3);
-    out_5 = MulAdd(ShiftLeftLanes<1>(mul_in_5), in1, out_5);
-
-#if JXL_GAUSS_MAX_LANES >= 4
-    const V in2 = Broadcast<2>(sum);
-    out_1 = MulAdd(ShiftLeftLanes<2>(mul_in_1), in2, out_1);
-    out_3 = MulAdd(ShiftLeftLanes<2>(mul_in_3), in2, out_3);
-    out_5 = MulAdd(ShiftLeftLanes<2>(mul_in_5), in2, out_5);
-
-    const V in3 = Broadcast<3>(sum);
-    out_1 = MulAdd(ShiftLeftLanes<3>(mul_in_1), in3, out_1);
-    out_3 = MulAdd(ShiftLeftLanes<3>(mul_in_3), in3, out_3);
-    out_5 = MulAdd(ShiftLeftLanes<3>(mul_in_5), in3, out_5);
-#endif
-#endif
-
-    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
-    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
-    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
-
-    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
-    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
-    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
-#if HWY_TARGET == HWY_SCALAR || JXL_GAUSS_MAX_LANES == 1
-    prev2_1 = prev_1;
-    prev2_3 = prev_3;
-    prev2_5 = prev_5;
-    prev_1 = out_1;
-    prev_3 = out_3;
-    prev_5 = out_5;
-#else
-    prev2_1 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_1);
-    prev2_3 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_3);
-    prev2_5 = Broadcast<JXL_GAUSS_MAX_LANES - 2>(out_5);
-    prev_1 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_1);
-    prev_3 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_3);
-    prev_5 = Broadcast<JXL_GAUSS_MAX_LANES - 1>(out_5);
-#endif
-
-    Store(Add(out_1, Add(out_3, out_5)), d, out + n);
-  }
-
-  // Remainder handling with bounds checks
-  for (; n < width; ++n) {
-    const intptr_t left = n - N - 1;
-    const intptr_t right = n + N - 1;
-    const float left_val = left >= 0 ? in[left] : 0.0f;
-    const float right_val = right < width ? in[right] : 0.0f;
-    const V sum = Set(d, left_val + right_val);
-
-    // (Only processing a single lane here, no need to broadcast)
-    V out_1 = Mul(sum, mul_in_1);
-    V out_3 = Mul(sum, mul_in_3);
-    V out_5 = Mul(sum, mul_in_5);
-
-    out_1 = MulAdd(mul_prev2_1, prev2_1, out_1);
-    out_3 = MulAdd(mul_prev2_3, prev2_3, out_3);
-    out_5 = MulAdd(mul_prev2_5, prev2_5, out_5);
-    prev2_1 = prev_1;
-    prev2_3 = prev_3;
-    prev2_5 = prev_5;
-
-    out_1 = MulAdd(mul_prev_1, prev_1, out_1);
-    out_3 = MulAdd(mul_prev_3, prev_3, out_3);
-    out_5 = MulAdd(mul_prev_5, prev_5, out_5);
-    prev_1 = out_1;
-    prev_3 = out_3;
-    prev_5 = out_5;
-
-    out[n] = GetLane(Add(out_1, Add(out_3, out_5)));
-  }
-}
-
-// Ring buffer is for n, n-1, n-2; round up to 4 for faster modulo.
-constexpr size_t kMod = 4;
-
-// Avoids an unnecessary store during warmup.
-struct OutputNone {
-  template <class V>
-  void operator()(const V& /*unused*/, float* JXL_RESTRICT /*pos*/,
-                  ptrdiff_t /*offset*/) const {}
-};
-
-// Common case: write output vectors in all VerticalBlock except warmup.
-struct OutputStore {
-  template <class V>
-  void operator()(const V& out, float* JXL_RESTRICT pos,
-                  ptrdiff_t offset) const {
-    // Stream helps for large images but is slower for images that fit in cache.
-    const HWY_FULL(float) df;
-    Store(out, df, pos + offset);
-  }
-};
-
-// At top/bottom borders, we don't have two inputs to load, so avoid addition.
-// pos may even point to all zeros if the row is outside the input image.
-class SingleInput {
- public:
-  explicit SingleInput(const float* pos) : pos_(pos) {}
-  Vec<HWY_FULL(float)> operator()(const size_t offset) const {
-    const HWY_FULL(float) df;
-    return Load(df, pos_ + offset);
-  }
-  const float* pos_;
-};
-
-// In the middle of the image, we need to load from a row above and below, and
-// return the sum.
-class TwoInputs {
- public:
-  TwoInputs(const float* pos1, const float* pos2) : pos1_(pos1), pos2_(pos2) {}
-  Vec<HWY_FULL(float)> operator()(const size_t offset) const {
-    const HWY_FULL(float) df;
-    const auto in1 = Load(df, pos1_ + offset);
-    const auto in2 = Load(df, pos2_ + offset);
-    return Add(in1, in2);
-  }
-
- private:
-  const float* pos1_;
-  const float* pos2_;
-};
-
-// Block := kVectors consecutive full vectors (one cache line except on the
-// right boundary, where we can only rely on having one vector). Unrolling to
-// the cache line size improves cache utilization.
-template <size_t kVectors, class V, class Input, class Output>
-void VerticalBlock(const V& d1_1, const V& d1_3, const V& d1_5, const V& n2_1,
-                   const V& n2_3, const V& n2_5, const Input& input,
-                   size_t& ctr, float* ring_buffer, const Output output,
-                   float* JXL_RESTRICT out_pos) {
-  const HWY_FULL(float) d;
-  constexpr size_t kVN = MaxLanes(d);
-  // More cache-friendly to process an entirely cache line at a time
-  constexpr size_t kLanes = kVectors * kVN;
-
-  float* JXL_RESTRICT y_1 = ring_buffer + 0 * kLanes * kMod;
-  float* JXL_RESTRICT y_3 = ring_buffer + 1 * kLanes * kMod;
-  float* JXL_RESTRICT y_5 = ring_buffer + 2 * kLanes * kMod;
-
-  const size_t n_0 = (++ctr) % kMod;
-  const size_t n_1 = (ctr - 1) % kMod;
-  const size_t n_2 = (ctr - 2) % kMod;
-
-  for (size_t idx_vec = 0; idx_vec < kVectors; ++idx_vec) {
-    const V sum = input(idx_vec * kVN);
-
-    const V y_n1_1 = Load(d, y_1 + kLanes * n_1 + idx_vec * kVN);
-    const V y_n1_3 = Load(d, y_3 + kLanes * n_1 + idx_vec * kVN);
-    const V y_n1_5 = Load(d, y_5 + kLanes * n_1 + idx_vec * kVN);
-    const V y_n2_1 = Load(d, y_1 + kLanes * n_2 + idx_vec * kVN);
-    const V y_n2_3 = Load(d, y_3 + kLanes * n_2 + idx_vec * kVN);
-    const V y_n2_5 = Load(d, y_5 + kLanes * n_2 + idx_vec * kVN);
-    // (35)
-    const V y1 = MulAdd(n2_1, sum, NegMulSub(d1_1, y_n1_1, y_n2_1));
-    const V y3 = MulAdd(n2_3, sum, NegMulSub(d1_3, y_n1_3, y_n2_3));
-    const V y5 = MulAdd(n2_5, sum, NegMulSub(d1_5, y_n1_5, y_n2_5));
-    Store(y1, d, y_1 + kLanes * n_0 + idx_vec * kVN);
-    Store(y3, d, y_3 + kLanes * n_0 + idx_vec * kVN);
-    Store(y5, d, y_5 + kLanes * n_0 + idx_vec * kVN);
-    output(Add(y1, Add(y3, y5)), out_pos, idx_vec * kVN);
-  }
-  // NOTE: flushing cache line out_pos hurts performance - less so with
-  // clflushopt than clflush but still a significant slowdown.
-}
-
-// Reads/writes one block (kVectors full vectors) in each row.
-template <size_t kVectors>
-void VerticalStrip(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
-                   const ImageF& in, const size_t x, ImageF* JXL_RESTRICT out) {
-  // We're iterating vertically, so use multiple full-length vectors (each lane
-  // is one column of row n).
-  using D = HWY_FULL(float);
-  using V = Vec<D>;
-  const D d;
-  constexpr size_t kVN = MaxLanes(d);
-  // More cache-friendly to process an entirely cache line at a time
-  constexpr size_t kLanes = kVectors * kVN;
-#if HWY_TARGET == HWY_SCALAR
-  const V d1_1 = Set(d, rg->d1[0 * 4]);
-  const V d1_3 = Set(d, rg->d1[1 * 4]);
-  const V d1_5 = Set(d, rg->d1[2 * 4]);
-  const V n2_1 = Set(d, rg->n2[0 * 4]);
-  const V n2_3 = Set(d, rg->n2[1 * 4]);
-  const V n2_5 = Set(d, rg->n2[2 * 4]);
-#else
-  const V d1_1 = LoadDup128(d, rg->d1 + 0 * 4);
-  const V d1_3 = LoadDup128(d, rg->d1 + 1 * 4);
-  const V d1_5 = LoadDup128(d, rg->d1 + 2 * 4);
-  const V n2_1 = LoadDup128(d, rg->n2 + 0 * 4);
-  const V n2_3 = LoadDup128(d, rg->n2 + 1 * 4);
-  const V n2_5 = LoadDup128(d, rg->n2 + 2 * 4);
-#endif
-
-  const size_t N = rg->radius;
-  const size_t ysize = in.ysize();
-
-  size_t ctr = 0;
-  HWY_ALIGN float ring_buffer[3 * kLanes * kMod] = {0};
-  HWY_ALIGN static constexpr float zero[kLanes] = {0};
-
-  // Warmup: top is out of bounds (zero padded), bottom is usually in-bounds.
-  ssize_t n = -static_cast<ssize_t>(N) + 1;
-  for (; n < 0; ++n) {
-    // bottom is always non-negative since n is initialized in -N + 1.
-    const size_t bottom = n + N - 1;
-    VerticalBlock<kVectors>(
-        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
-        SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr,
-        ring_buffer, OutputNone(), nullptr);
-  }
-  JXL_DASSERT(n >= 0);
-
-  // Start producing output; top is still out of bounds.
-  for (; static_cast<size_t>(n) < std::min(N + 1, ysize); ++n) {
-    const size_t bottom = n + N - 1;
-    VerticalBlock<kVectors>(
-        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
-        SingleInput(bottom < ysize ? in.ConstRow(bottom) + x : zero), ctr,
-        ring_buffer, OutputStore(), out->Row(n) + x);
-  }
-
-  // Interior outputs with prefetching and without bounds checks.
-  constexpr size_t kPrefetchRows = 8;
-  for (; n < static_cast<ssize_t>(ysize - N + 1 - kPrefetchRows); ++n) {
-    const size_t top = n - N - 1;
-    const size_t bottom = n + N - 1;
-    VerticalBlock<kVectors>(
-        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
-        TwoInputs(in.ConstRow(top) + x, in.ConstRow(bottom) + x), ctr,
-        ring_buffer, OutputStore(), out->Row(n) + x);
-    hwy::Prefetch(in.ConstRow(top + kPrefetchRows) + x);
-    hwy::Prefetch(in.ConstRow(bottom + kPrefetchRows) + x);
-  }
-
-  // Bottom border without prefetching and with bounds checks.
-  for (; static_cast<size_t>(n) < ysize; ++n) {
-    const size_t top = n - N - 1;
-    const size_t bottom = n + N - 1;
-    VerticalBlock<kVectors>(
-        d1_1, d1_3, d1_5, n2_1, n2_3, n2_5,
-        TwoInputs(in.ConstRow(top) + x,
-                  bottom < ysize ? in.ConstRow(bottom) + x : zero),
-        ctr, ring_buffer, OutputStore(), out->Row(n) + x);
-  }
-}
-
-// Apply 1D vertical scan to multiple columns (one per vector lane).
-// Not yet parallelized.
-void FastGaussianVertical(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
-                          const ImageF& in, ThreadPool* /*pool*/,
-                          ImageF* JXL_RESTRICT out) {
-  JXL_CHECK(SameSize(in, *out));
-
-  const HWY_FULL(float) df;
-  constexpr size_t kCacheLineLanes = 64 / sizeof(float);
-  constexpr size_t kVN = MaxLanes(df);
-  constexpr size_t kCacheLineVectors =
-      (kVN < kCacheLineLanes) ? (kCacheLineLanes / kVN) : 4;
-  constexpr size_t kFastPace = kCacheLineVectors * kVN;
-
-  size_t x = 0;
-  for (; x + kFastPace <= in.xsize(); x += kFastPace) {
-    VerticalStrip<kCacheLineVectors>(rg, in, x, out);
-  }
-  for (; x < in.xsize(); x += kVN) {
-    VerticalStrip<1>(rg, in, x, out);
-  }
-}
-
-// TODO(veluca): consider replacing with FastGaussian.
-ImageF ConvolveXSampleAndTranspose(const ImageF& in,
-                                   const std::vector<float>& kernel,
-                                   const size_t res) {
-  JXL_ASSERT(kernel.size() % 2 == 1);
-  JXL_ASSERT(in.xsize() % res == 0);
-  const size_t offset = res / 2;
-  const size_t out_xsize = in.xsize() / res;
-  ImageF out(in.ysize(), out_xsize);
-  const int r = kernel.size() / 2;
-  HWY_FULL(float) df;
-  std::vector<float> row_tmp(in.xsize() + 2 * r + Lanes(df));
-  float* const JXL_RESTRICT rowp = &row_tmp[r];
-  std::vector<float> padded_k = kernel;
-  padded_k.resize(padded_k.size() + Lanes(df));
-  const float* const kernelp = &padded_k[r];
-  for (size_t y = 0; y < in.ysize(); ++y) {
-    ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r);
-    size_t x = offset, ox = 0;
-    for (; x < static_cast<uint32_t>(r) && x < in.xsize(); x += res, ++ox) {
-      float sum = 0.0f;
-      for (int i = -r; i <= r; ++i) {
-        sum += rowp[std::max<int>(
-                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
-               kernelp[i];
-      }
-      out.Row(ox)[y] = sum;
-    }
-    for (; x + r < in.xsize(); x += res, ++ox) {
-      auto sum = Zero(df);
-      for (int i = -r; i <= r; i += Lanes(df)) {
-        sum = MulAdd(LoadU(df, rowp + x + i), LoadU(df, kernelp + i), sum);
-      }
-      out.Row(ox)[y] = GetLane(SumOfLanes(df, sum));
-    }
-    for (; x < in.xsize(); x += res, ++ox) {
-      float sum = 0.0f;
-      for (int i = -r; i <= r; ++i) {
-        sum += rowp[std::max<int>(
-                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
-               kernelp[i];
-      }
-      out.Row(ox)[y] = sum;
-    }
-  }
-  return out;
-}
-
-// NOLINTNEXTLINE(google-readability-namespace-comments)
-}  // namespace HWY_NAMESPACE
-}  // namespace jxl
-HWY_AFTER_NAMESPACE();
-
-#if HWY_ONCE
-namespace jxl {
-
-HWY_EXPORT(FastGaussian1D);
-HWY_EXPORT(ConvolveXSampleAndTranspose);
-void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
-                    const float* JXL_RESTRICT in, intptr_t width,
-                    float* JXL_RESTRICT out) {
-  return HWY_DYNAMIC_DISPATCH(FastGaussian1D)(rg, in, width, out);
-}
-
-HWY_EXPORT(FastGaussianVertical);  // Local function.
-
-void ExtrapolateBorders(const float* const JXL_RESTRICT row_in,
-                        float* const JXL_RESTRICT row_out, const int xsize,
-                        const int radius) {
-  const int lastcol = xsize - 1;
-  for (int x = 1; x <= radius; ++x) {
-    row_out[-x] = row_in[std::min(x, xsize - 1)];
-  }
-  memcpy(row_out, row_in, xsize * sizeof(row_out[0]));
-  for (int x = 1; x <= radius; ++x) {
-    row_out[lastcol + x] = row_in[std::max(0, lastcol - x)];
-  }
-}
-
-ImageF ConvolveXSampleAndTranspose(const ImageF& in,
-                                   const std::vector<float>& kernel,
-                                   const size_t res) {
-  return HWY_DYNAMIC_DISPATCH(ConvolveXSampleAndTranspose)(in, kernel, res);
-}
-
-Image3F ConvolveXSampleAndTranspose(const Image3F& in,
-                                    const std::vector<float>& kernel,
-                                    const size_t res) {
-  return Image3F(ConvolveXSampleAndTranspose(in.Plane(0), kernel, res),
-                 ConvolveXSampleAndTranspose(in.Plane(1), kernel, res),
-                 ConvolveXSampleAndTranspose(in.Plane(2), kernel, res));
-}
-
-ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
-                         const size_t res) {
-  ImageF tmp = ConvolveXSampleAndTranspose(in, kernel, res);
-  return ConvolveXSampleAndTranspose(tmp, kernel, res);
-}
-
-// Implements "Recursive Implementation of the Gaussian Filter Using Truncated
-// Cosine Functions" by Charalampidis [2016].
-hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma) {
-  auto rg = hwy::MakeUniqueAligned<RecursiveGaussian>();
-  constexpr double kPi = 3.141592653589793238;
-
-  const double radius = roundf(3.2795 * sigma + 0.2546);  // (57), "N"
-
-  // Table I, first row
-  const double pi_div_2r = kPi / (2.0 * radius);
-  const double omega[3] = {pi_div_2r, 3.0 * pi_div_2r, 5.0 * pi_div_2r};
-
-  // (37), k={1,3,5}
-  const double p_1 = +1.0 / std::tan(0.5 * omega[0]);
-  const double p_3 = -1.0 / std::tan(0.5 * omega[1]);
-  const double p_5 = +1.0 / std::tan(0.5 * omega[2]);
-
-  // (44), k={1,3,5}
-  const double r_1 = +p_1 * p_1 / std::sin(omega[0]);
-  const double r_3 = -p_3 * p_3 / std::sin(omega[1]);
-  const double r_5 = +p_5 * p_5 / std::sin(omega[2]);
-
-  // (50), k={1,3,5}
-  const double neg_half_sigma2 = -0.5 * sigma * sigma;
-  const double recip_radius = 1.0 / radius;
-  double rho[3];
-  for (size_t i = 0; i < 3; ++i) {
-    rho[i] = std::exp(neg_half_sigma2 * omega[i] * omega[i]) * recip_radius;
-  }
-
-  // second part of (52), k1,k2 = 1,3; 3,5; 5,1
-  const double D_13 = p_1 * r_3 - r_1 * p_3;
-  const double D_35 = p_3 * r_5 - r_3 * p_5;
-  const double D_51 = p_5 * r_1 - r_5 * p_1;
-
-  // (52), k=5
-  const double recip_d13 = 1.0 / D_13;
-  const double zeta_15 = D_35 * recip_d13;
-  const double zeta_35 = D_51 * recip_d13;
-
-  double A[9] = {p_1,     p_3,     p_5,  //
-                 r_1,     r_3,     r_5,  //  (56)
-                 zeta_15, zeta_35, 1};
-  JXL_CHECK(Inv3x3Matrix(A));
-  const double gamma[3] = {1, radius * radius - sigma * sigma,  // (55)
-                           zeta_15 * rho[0] + zeta_35 * rho[1] + rho[2]};
-  double beta[3];
-  Mul3x3Vector(A, gamma, beta);  // (53)
-
-  // Sanity check: correctly solved for beta (IIR filter weights are normalized)
-  const double sum = beta[0] * p_1 + beta[1] * p_3 + beta[2] * p_5;  // (39)
-  JXL_ASSERT(std::abs(sum - 1) < 1E-12);
-  (void)sum;
-
-  rg->radius = static_cast<int>(radius);
-
-  double n2[3];
-  double d1[3];
-  for (size_t i = 0; i < 3; ++i) {
-    n2[i] = -beta[i] * std::cos(omega[i] * (radius + 1.0));  // (33)
-    d1[i] = -2.0 * std::cos(omega[i]);                       // (33)
-
-    for (size_t lane = 0; lane < 4; ++lane) {
-      rg->n2[4 * i + lane] = static_cast<float>(n2[i]);
-      rg->d1[4 * i + lane] = static_cast<float>(d1[i]);
-    }
-
-    const double d_2 = d1[i] * d1[i];
-
-    // Obtained by expanding (35) for four consecutive outputs via sympy:
-    // n, d, p, pp = symbols('n d p pp')
-    // i0, i1, i2, i3 = symbols('i0 i1 i2 i3')
-    // o0, o1, o2, o3 = symbols('o0 o1 o2 o3')
-    // o0 = n*i0 - d*p - pp
-    // o1 = n*i1 - d*o0 - p
-    // o2 = n*i2 - d*o1 - o0
-    // o3 = n*i3 - d*o2 - o1
-    // Then expand(o3) and gather terms for p(prev), pp(prev2) etc.
-    rg->mul_prev[4 * i + 0] = -d1[i];
-    rg->mul_prev[4 * i + 1] = d_2 - 1.0;
-    rg->mul_prev[4 * i + 2] = -d_2 * d1[i] + 2.0 * d1[i];
-    rg->mul_prev[4 * i + 3] = d_2 * d_2 - 3.0 * d_2 + 1.0;
-    rg->mul_prev2[4 * i + 0] = -1.0;
-    rg->mul_prev2[4 * i + 1] = d1[i];
-    rg->mul_prev2[4 * i + 2] = -d_2 + 1.0;
-    rg->mul_prev2[4 * i + 3] = d_2 * d1[i] - 2.0 * d1[i];
-    rg->mul_in[4 * i + 0] = n2[i];
-    rg->mul_in[4 * i + 1] = -d1[i] * n2[i];
-    rg->mul_in[4 * i + 2] = d_2 * n2[i] - n2[i];
-    rg->mul_in[4 * i + 3] = -d_2 * d1[i] * n2[i] + 2.0 * d1[i] * n2[i];
-  }
-  return rg;
-}
-
-namespace {
-
-// Apply 1D horizontal scan to each row.
-void FastGaussianHorizontal(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
-                            const ImageF& in, ThreadPool* pool,
-                            ImageF* JXL_RESTRICT out) {
-  JXL_CHECK(SameSize(in, *out));
-
-  const intptr_t xsize = in.xsize();
-  JXL_CHECK(RunOnPool(
-      pool, 0, in.ysize(), ThreadPool::NoInit,
-      [&](const uint32_t task, size_t /*thread*/) {
-        const size_t y = task;
-        const float* row_in = in.ConstRow(y);
-        float* JXL_RESTRICT row_out = out->Row(y);
-        FastGaussian1D(rg, row_in, xsize, row_out);
-      },
-      "FastGaussianHorizontal"));
-}
-
-}  // namespace
-
-void FastGaussian(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
-                  const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp,
-                  ImageF* JXL_RESTRICT out) {
-  FastGaussianHorizontal(rg, in, pool, temp);
-  HWY_DYNAMIC_DISPATCH(FastGaussianVertical)(rg, *temp, pool, out);
-}
-
-}  // namespace jxl
-#endif  // HWY_ONCE
--- a/third_party/jpeg-xl/lib/jxl/gauss_blur.h
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur.h
@ -1,94 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#ifndef LIB_JXL_GAUSS_BLUR_H_
-#define LIB_JXL_GAUSS_BLUR_H_
-
-#include <stddef.h>
-
-#include <cmath>
-#include <hwy/aligned_allocator.h>
-#include <vector>
-
-#include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/status.h"
-#include "lib/jxl/image.h"
-
-namespace jxl {
-
-template <typename T>
-std::vector<T> GaussianKernel(int radius, T sigma) {
-  JXL_ASSERT(sigma > 0.0);
-  std::vector<T> kernel(2 * radius + 1);
-  const T scaler = -1.0 / (2 * sigma * sigma);
-  double sum = 0.0;
-  for (int i = -radius; i <= radius; ++i) {
-    const T val = std::exp(scaler * i * i);
-    kernel[i + radius] = val;
-    sum += val;
-  }
-  for (size_t i = 0; i < kernel.size(); ++i) {
-    kernel[i] /= sum;
-  }
-  return kernel;
-}
-
-// All convolution functions below apply mirroring of the input on the borders
-// in the following way:
-//
-//     input: [a0 a1 a2 ...  aN]
-//     mirrored input: [aR ... a1 | a0 a1 a2 .... aN | aN-1 ... aN-R]
-//
-// where R is the radius of the kernel (i.e. kernel size is 2*R+1).
-
-// REQUIRES: in.xsize() and in.ysize() are integer multiples of res.
-ImageF ConvolveAndSample(const ImageF& in, const std::vector<float>& kernel,
-                         const size_t res);
-
-// Private, used by test.
-void ExtrapolateBorders(const float* const JXL_RESTRICT row_in,
-                        float* const JXL_RESTRICT row_out, const int xsize,
-                        const int radius);
-
-// Only for use by CreateRecursiveGaussian and FastGaussian*.
-#pragma pack(push, 1)
-struct RecursiveGaussian {
-  // For k={1,3,5} in that order, each broadcasted 4x for LoadDup128. Used only
-  // for vertical passes.
-  float n2[3 * 4];
-  float d1[3 * 4];
-
-  // We unroll horizontal passes 4x - one output per lane. These are each lane's
-  // multiplier for the previous output (relative to the first of the four
-  // outputs). Indexing: 4 * 0..2 (for {1,3,5}) + 0..3 for the lane index.
-  float mul_prev[3 * 4];
-  // Ditto for the second to last output.
-  float mul_prev2[3 * 4];
-
-  // We multiply a vector of inputs 0..3 by a vector shifted from this array.
-  // in=0 uses all 4 (nonzero) terms; for in=3, the lower three lanes are 0.
-  float mul_in[3 * 4];
-
-  size_t radius;
-};
-#pragma pack(pop)
-
-// Precomputation for FastGaussian*; users may use the same pointer/storage in
-// subsequent calls to FastGaussian* with the same sigma.
-hwy::AlignedUniquePtr<RecursiveGaussian> CreateRecursiveGaussian(double sigma);
-
-// 1D Gaussian with zero-pad boundary handling and runtime independent of sigma.
-void FastGaussian1D(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
-                    const float* JXL_RESTRICT in, intptr_t width,
-                    float* JXL_RESTRICT out);
-
-// 2D Gaussian with zero-pad boundary handling and runtime independent of sigma.
-void FastGaussian(const hwy::AlignedUniquePtr<RecursiveGaussian>& rg,
-                  const ImageF& in, ThreadPool* pool, ImageF* JXL_RESTRICT temp,
-                  ImageF* JXL_RESTRICT out);
-
-}  // namespace jxl
-
-#endif  // LIB_JXL_GAUSS_BLUR_H_
--- a/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur_gbench.cc
@ -1,126 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <hwy/targets.h>
-
-#include "benchmark/benchmark.h"
-#include "lib/jxl/convolve.h"
-#include "lib/jxl/gauss_blur.h"
-#include "lib/jxl/image_ops.h"
-
-namespace jxl {
-namespace {
-
-JXL_MAYBE_UNUSED ImageF Convolve(const ImageF& in,
-                                 const std::vector<float>& kernel) {
-  return ConvolveAndSample(in, kernel, 1);
-}
-
-void BM_GaussBlur1d(benchmark::State& state) {
-  // Uncomment to disable SIMD and force and scalar implementation
-  // hwy::DisableTargets(~HWY_SCALAR);
-  // Uncomment to run AVX2
-  // hwy::DisableTargets(HWY_AVX3);
-
-  const size_t length = state.range();
-  const double sigma = 7.0;  // (from Butteraugli application)
-  ImageF in(length, 1);
-  const float expected = length;
-  FillImage(expected, &in);
-
-  ImageF temp(length, 1);
-  ImageF out(length, 1);
-  const auto rg = CreateRecursiveGaussian(sigma);
-  for (auto _ : state) {
-    FastGaussian1D(rg, in.Row(0), length, out.Row(0));
-    // Prevent optimizing out
-    JXL_ASSERT(std::abs(out.ConstRow(0)[length / 2] - expected) / expected <
-               9E-5);
-  }
-  state.SetItemsProcessed(length * state.iterations());
-}
-
-void BM_GaussBlur2d(benchmark::State& state) {
-  // See GaussBlur1d for SIMD changes.
-
-  const size_t xsize = state.range();
-  const size_t ysize = xsize;
-  const double sigma = 7.0;  // (from Butteraugli application)
-  ImageF in(xsize, ysize);
-  const float expected = xsize + ysize;
-  FillImage(expected, &in);
-
-  ImageF temp(xsize, ysize);
-  ImageF out(xsize, ysize);
-  ThreadPool* null_pool = nullptr;
-  const auto rg = CreateRecursiveGaussian(sigma);
-  for (auto _ : state) {
-    FastGaussian(rg, in, null_pool, &temp, &out);
-    // Prevent optimizing out
-    JXL_ASSERT(std::abs(out.ConstRow(ysize / 2)[xsize / 2] - expected) /
-                   expected <
-               9E-5);
-  }
-  state.SetItemsProcessed(xsize * ysize * state.iterations());
-}
-
-void BM_GaussBlurFir(benchmark::State& state) {
-  // See GaussBlur1d for SIMD changes.
-
-  const size_t xsize = state.range();
-  const size_t ysize = xsize;
-  const double sigma = 7.0;  // (from Butteraugli application)
-  ImageF in(xsize, ysize);
-  const float expected = xsize + ysize;
-  FillImage(expected, &in);
-
-  ImageF temp(xsize, ysize);
-  ImageF out(xsize, ysize);
-  const std::vector<float> kernel =
-      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
-  for (auto _ : state) {
-    // Prevent optimizing out
-    JXL_ASSERT(std::abs(Convolve(in, kernel).ConstRow(ysize / 2)[xsize / 2] -
-                        expected) /
-                   expected <
-               9E-5);
-  }
-  state.SetItemsProcessed(xsize * ysize * state.iterations());
-}
-
-void BM_GaussBlurSep7(benchmark::State& state) {
-  // See GaussBlur1d for SIMD changes.
-
-  const size_t xsize = state.range();
-  const size_t ysize = xsize;
-  ImageF in(xsize, ysize);
-  const float expected = xsize + ysize;
-  FillImage(expected, &in);
-
-  ImageF temp(xsize, ysize);
-  ImageF out(xsize, ysize);
-  ThreadPool* null_pool = nullptr;
-  // Gaussian with sigma 1
-  const WeightsSeparable7 weights = {{HWY_REP4(0.383103f), HWY_REP4(0.241843f),
-                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)},
-                                     {HWY_REP4(0.383103f), HWY_REP4(0.241843f),
-                                      HWY_REP4(0.060626f), HWY_REP4(0.00598f)}};
-  for (auto _ : state) {
-    Separable7(in, Rect(in), weights, null_pool, &out);
-    // Prevent optimizing out
-    JXL_ASSERT(std::abs(out.ConstRow(ysize / 2)[xsize / 2] - expected) /
-                   expected <
-               9E-5);
-  }
-  state.SetItemsProcessed(xsize * ysize * state.iterations());
-}
-
-BENCHMARK(BM_GaussBlur1d)->Range(1 << 8, 1 << 14);
-BENCHMARK(BM_GaussBlur2d)->Range(1 << 7, 1 << 10);
-BENCHMARK(BM_GaussBlurFir)->Range(1 << 7, 1 << 10);
-BENCHMARK(BM_GaussBlurSep7)->Range(1 << 7, 1 << 10);
-
-}  // namespace
-}  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/gauss_blur_test.cc
@ -1,453 +0,0 @@
-// Copyright (c) the JPEG XL Project Authors. All rights reserved.
-//
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "lib/jxl/gauss_blur.h"
-
-#include <cmath>
-#include <hwy/targets.h>
-#include <vector>
-
-#include "lib/extras/time.h"
-#include "lib/jxl/base/printf_macros.h"
-#include "lib/jxl/convolve.h"
-#include "lib/jxl/image_ops.h"
-#include "lib/jxl/image_test_utils.h"
-#include "lib/jxl/testing.h"
-
-namespace jxl {
-
-bool NearEdge(const int64_t width, const int64_t peak) {
-  // When around 3*sigma from the edge, there is negligible truncation.
-  return peak < 10 || peak > width - 10;
-}
-
-// Follow the curve downwards by scanning right from `peak` and verifying
-// identical values at the same offset to the left.
-void VerifySymmetric(const int64_t width, const int64_t peak,
-                     const float* out) {
-  const double tolerance = NearEdge(width, peak) ? 0.015 : 6E-7;
-  for (int64_t i = 1;; ++i) {
-    // Stop if we passed either end of the array
-    if (peak - i < 0 || peak + i >= width) break;
-    EXPECT_GT(out[peak + i - 1] + tolerance, out[peak + i]);  // descending
-    EXPECT_NEAR(out[peak - i], out[peak + i], tolerance);     // symmetric
-  }
-}
-
-void TestImpulseResponse(size_t width, size_t peak) {
-  const auto rg3 = CreateRecursiveGaussian(3.0);
-  const auto rg4 = CreateRecursiveGaussian(4.0);
-  const auto rg5 = CreateRecursiveGaussian(5.0);
-
-  // Extra padding for 4x unrolling
-  auto in = hwy::AllocateAligned<float>(width + 3);
-  memset(in.get(), 0, sizeof(float) * (width + 3));
-  in[peak] = 1.0f;
-
-  auto out3 = hwy::AllocateAligned<float>(width + 3);
-  auto out4 = hwy::AllocateAligned<float>(width + 3);
-  auto out5 = hwy::AllocateAligned<float>(width + 3);
-  FastGaussian1D(rg3, in.get(), width, out3.get());
-  FastGaussian1D(rg4, out3.get(), width, out4.get());
-  FastGaussian1D(rg5, in.get(), width, out5.get());
-
-  VerifySymmetric(width, peak, out3.get());
-  VerifySymmetric(width, peak, out4.get());
-  VerifySymmetric(width, peak, out5.get());
-
-  // Wider kernel has flatter peak
-  EXPECT_LT(out5[peak] + 0.05, out3[peak]);
-
-  // Gauss3 o Gauss4 ~= Gauss5
-  const double tolerance = NearEdge(width, peak) ? 0.04 : 0.01;
-  for (size_t i = 0; i < width; ++i) {
-    EXPECT_NEAR(out4[i], out5[i], tolerance);
-  }
-}
-
-void TestImpulseResponseForWidth(size_t width) {
-  for (size_t i = 0; i < width; ++i) {
-    TestImpulseResponse(width, i);
-  }
-}
-
-TEST(GaussBlurTest, ImpulseResponse) {
-  TestImpulseResponseForWidth(10);  // tiny even
-  TestImpulseResponseForWidth(15);  // small odd
-  TestImpulseResponseForWidth(32);  // power of two
-  TestImpulseResponseForWidth(31);  // power of two - 1
-  TestImpulseResponseForWidth(33);  // power of two + 1
-}
-
-ImageF Convolve(const ImageF& in, const std::vector<float>& kernel) {
-  return ConvolveAndSample(in, kernel, 1);
-}
-
-// Higher-precision version for accuracy test.
-ImageF ConvolveAndTransposeF64(const ImageF& in,
-                               const std::vector<double>& kernel) {
-  JXL_ASSERT(kernel.size() % 2 == 1);
-  ImageF out(in.ysize(), in.xsize());
-  const int r = kernel.size() / 2;
-  std::vector<float> row_tmp(in.xsize() + 2 * r);
-  float* const JXL_RESTRICT rowp = &row_tmp[r];
-  const double* const kernelp = &kernel[r];
-  for (size_t y = 0; y < in.ysize(); ++y) {
-    ExtrapolateBorders(in.Row(y), rowp, in.xsize(), r);
-    for (size_t x = 0, ox = 0; x < in.xsize(); ++x, ++ox) {
-      double sum = 0.0;
-      for (int i = -r; i <= r; ++i) {
-        sum += rowp[std::max<int>(
-                   0, std::min<int>(static_cast<int>(x) + i, in.xsize()))] *
-               kernelp[i];
-      }
-      out.Row(ox)[y] = static_cast<float>(sum);
-    }
-  }
-  return out;
-}
-
-ImageF ConvolveF64(const ImageF& in, const std::vector<double>& kernel) {
-  ImageF tmp = ConvolveAndTransposeF64(in, kernel);
-  return ConvolveAndTransposeF64(tmp, kernel);
-}
-
-void TestDirac2D(size_t xsize, size_t ysize, double sigma) {
-  ImageF in(xsize, ysize);
-  ZeroFillImage(&in);
-  // We anyway ignore the border below, so might as well choose the middle.
-  in.Row(ysize / 2)[xsize / 2] = 1.0f;
-
-  ImageF temp(xsize, ysize);
-  ImageF out(xsize, ysize);
-  const auto rg = CreateRecursiveGaussian(sigma);
-  ThreadPool* null_pool = nullptr;
-  FastGaussian(rg, in, null_pool, &temp, &out);
-
-  const std::vector<float> kernel =
-      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
-  const ImageF expected = Convolve(in, kernel);
-
-  const double max_l1 = sigma < 1.5 ? 5E-3 : 6E-4;
-  const size_t border = 2 * sigma;
-
-  JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, 1E-8, _, border));
-}
-
-TEST(GaussBlurTest, Test2D) {
-  const std::vector<int> dimensions{6, 15, 17, 64, 50, 49};
-  for (int xsize : dimensions) {
-    for (int ysize : dimensions) {
-      for (double sigma : {1.0, 2.5, 3.6, 7.0}) {
-        TestDirac2D(static_cast<size_t>(xsize), static_cast<size_t>(ysize),
-                    sigma);
-      }
-    }
-  }
-}
-
-// Slow (44 sec). To run, remove the disabled prefix.
-TEST(GaussBlurTest, DISABLED_SlowTestDirac1D) {
-  const double sigma = 7.0;
-  const auto rg = CreateRecursiveGaussian(sigma);
-
-  // IPOL accuracy test uses 10^-15 tolerance, this is 2*10^-11.
-  const size_t radius = static_cast<size_t>(7 * sigma);
-  const std::vector<double> kernel = GaussianKernel(radius, sigma);
-
-  const size_t length = 16384;
-  ImageF inputs(length, 1);
-  ZeroFillImage(&inputs);
-
-  auto outputs = hwy::AllocateAligned<float>(length);
-
-  // One per center position
-  auto sum_abs_err = hwy::AllocateAligned<double>(length);
-  std::fill(sum_abs_err.get(), sum_abs_err.get() + length, 0.0);
-
-  for (size_t center = radius; center < length - radius; ++center) {
-    inputs.Row(0)[center - 1] = 0.0f;  // reset last peak, entire array now 0
-    inputs.Row(0)[center] = 1.0f;
-    FastGaussian1D(rg, inputs.Row(0), length, outputs.get());
-
-    const ImageF outputs_fir = ConvolveF64(inputs, kernel);
-
-    for (size_t i = 0; i < length; ++i) {
-      const float abs_err = std::abs(outputs[i] - outputs_fir.Row(0)[i]);
-      sum_abs_err[i] += static_cast<double>(abs_err);
-    }
-  }
-
-  const double max_abs_err =
-      *std::max_element(sum_abs_err.get(), sum_abs_err.get() + length);
-  printf("Max abs err: %.8e\n", max_abs_err);
-}
-
-void TestRandom(size_t xsize, size_t ysize, float min, float max, double sigma,
-                double max_l1, double max_rel) {
-  printf("%4" PRIuS " x %4" PRIuS " %4.1f %4.1f sigma %.1f\n", xsize, ysize,
-         min, max, sigma);
-  ImageF in(xsize, ysize);
-  RandomFillImage(&in, min, max, 65537 + xsize * 129 + ysize);
-  // FastGaussian/Convolve handle borders differently, so keep those pixels 0.
-  const size_t border = 4 * sigma;
-  SetBorder(border, 0.0f, &in);
-
-  ImageF temp(xsize, ysize);
-  ImageF out(xsize, ysize);
-  const auto rg = CreateRecursiveGaussian(sigma);
-  ThreadPool* null_pool = nullptr;
-  FastGaussian(rg, in, null_pool, &temp, &out);
-
-  const std::vector<float> kernel =
-      GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
-  const ImageF expected = Convolve(in, kernel);
-
-  JXL_ASSERT_OK(VerifyRelativeError(expected, out, max_l1, max_rel, _, border));
-}
-
-void TestRandomForSizes(float min, float max, double sigma) {
-  double max_l1 = 6E-3;
-  double max_rel = 3E-3;
-  TestRandom(128, 1, min, max, sigma, max_l1, max_rel);
-  TestRandom(1, 128, min, max, sigma, max_l1, max_rel);
-  TestRandom(30, 201, min, max, sigma, max_l1 * 1.6, max_rel * 1.2);
-  TestRandom(201, 30, min, max, sigma, max_l1 * 1.6, max_rel * 1.2);
-  TestRandom(201, 201, min, max, sigma, max_l1 * 2.0, max_rel * 1.2);
-}
-
-TEST(GaussBlurTest, TestRandom) {
-  // small non-negative
-  TestRandomForSizes(0.0f, 10.0f, 3.0f);
-  TestRandomForSizes(0.0f, 10.0f, 7.0f);
-
-  // small negative
-  TestRandomForSizes(-4.0f, -1.0f, 3.0f);
-  TestRandomForSizes(-4.0f, -1.0f, 7.0f);
-
-  // mixed positive/negative
-  TestRandomForSizes(-6.0f, 6.0f, 3.0f);
-  TestRandomForSizes(-6.0f, 6.0f, 7.0f);
-}
-
-TEST(GaussBlurTest, TestSign) {
-  const size_t xsize = 500;
-  const size_t ysize = 606;
-  ImageF in(xsize, ysize);
-
-  ZeroFillImage(&in);
-  const float center[33 * 33] = {
-      -0.128445f, -0.098473f, -0.121883f, -0.093601f, 0.095665f,  -0.271332f,
-      -0.705475f, -1.324005f, -2.020741f, -1.329464f, 1.834064f,  4.787300f,
-      5.834560f,  5.272720f,  3.967960f,  3.547935f,  3.432732f,  3.383015f,
-      3.239326f,  3.290806f,  3.298954f,  3.397808f,  3.359730f,  3.533844f,
-      3.511856f,  3.436787f,  3.428310f,  3.460209f,  3.550011f,  3.590942f,
-      3.593109f,  3.560005f,  3.443165f,  0.089741f,  0.179230f,  -0.032997f,
-      -0.182610f, 0.005669f,  -0.244759f, -0.395123f, -0.514961f, -1.003529f,
-      -1.798656f, -2.377975f, 0.222191f,  3.957664f,  5.946804f,  5.543129f,
-      4.290096f,  3.621010f,  3.407257f,  3.392494f,  3.345367f,  3.391903f,
-      3.441605f,  3.429260f,  3.444969f,  3.507130f,  3.518612f,  3.443111f,
-      3.475948f,  3.536148f,  3.470333f,  3.628311f,  3.600243f,  3.292892f,
-      -0.226730f, -0.573616f, -0.762165f, -0.398739f, -0.189842f, -0.275921f,
-      -0.446739f, -0.550037f, -0.461033f, -0.724792f, -1.448349f, -1.814064f,
-      -0.491032f, 2.817703f,  5.213242f,  5.675629f,  4.864548f,  3.876324f,
-      3.535587f,  3.530312f,  3.413765f,  3.386261f,  3.404854f,  3.383472f,
-      3.420830f,  3.326496f,  3.257877f,  3.362152f,  3.489609f,  3.619587f,
-      3.555805f,  3.423164f,  3.309708f,  -0.483940f, -0.502926f, -0.592983f,
-      -0.492527f, -0.413616f, -0.482555f, -0.475506f, -0.447990f, -0.338120f,
-      -0.189072f, -0.376427f, -0.910828f, -1.878044f, -1.937927f, 1.423218f,
-      4.871609f,  5.767548f,  5.103741f,  3.983868f,  3.633003f,  3.458263f,
-      3.507309f,  3.247021f,  3.220612f,  3.326061f,  3.352814f,  3.291061f,
-      3.322739f,  3.444302f,  3.506207f,  3.556839f,  3.529575f,  3.457024f,
-      -0.408161f, -0.431343f, -0.454369f, -0.356419f, -0.380924f, -0.399452f,
-      -0.439476f, -0.412189f, -0.306816f, -0.008213f, -0.325813f, -0.537842f,
-      -0.984100f, -1.805332f, -2.028198f, 0.773205f,  4.423046f,  5.604839f,
-      5.231617f,  4.080299f,  3.603008f,  3.498741f,  3.517010f,  3.333897f,
-      3.381336f,  3.342617f,  3.369686f,  3.434155f,  3.490452f,  3.607029f,
-      3.555298f,  3.702297f,  3.618679f,  -0.503609f, -0.578564f, -0.419014f,
-      -0.239883f, 0.269836f,  0.022984f,  -0.455067f, -0.621777f, -0.304176f,
-      -0.163792f, -0.490250f, -0.466637f, -0.391792f, -0.657940f, -1.498035f,
-      -1.895836f, 0.036537f,  3.462456f,  5.586445f,  5.658791f,  4.434784f,
-      3.423435f,  3.318848f,  3.202328f,  3.532764f,  3.436687f,  3.354881f,
-      3.356941f,  3.382645f,  3.503902f,  3.512867f,  3.632366f,  3.537312f,
-      -0.274734f, -0.658829f, -0.726532f, -0.281254f, 0.053196f,  -0.064991f,
-      -0.608517f, -0.720966f, -0.070602f, -0.111320f, -0.440956f, -0.492180f,
-      -0.488762f, -0.569283f, -1.012741f, -1.582779f, -2.101479f, -1.392380f,
-      2.451153f,  5.555855f,  6.096313f,  5.230045f,  4.068172f,  3.404274f,
-      3.392586f,  3.326065f,  3.156670f,  3.284828f,  3.347012f,  3.319252f,
-      3.352310f,  3.610790f,  3.499847f,  -0.150600f, -0.314445f, -0.093575f,
-      -0.057384f, 0.053688f,  -0.189255f, -0.263515f, -0.318653f, 0.053246f,
-      0.080627f,  -0.119553f, -0.152454f, -0.305420f, -0.404869f, -0.385944f,
-      -0.689949f, -1.204914f, -1.985748f, -1.711361f, 1.260658f,  4.626896f,
-      5.888351f,  5.450989f,  4.070587f,  3.539200f,  3.383492f,  3.296318f,
-      3.267334f,  3.436028f,  3.463005f,  3.502625f,  3.522282f,  3.403763f,
-      -0.348049f, -0.302303f, -0.137016f, -0.041737f, -0.164001f, -0.358849f,
-      -0.469627f, -0.428291f, -0.375797f, -0.246346f, -0.118950f, -0.084229f,
-      -0.205681f, -0.241199f, -0.391796f, -0.323151f, -0.241211f, -0.834137f,
-      -1.684219f, -1.972137f, 0.448399f,  4.019985f,  5.648144f,  5.647846f,
-      4.295094f,  3.641884f,  3.374790f,  3.197342f,  3.425545f,  3.507481f,
-      3.478065f,  3.430889f,  3.341900f,  -1.016304f, -0.959221f, -0.909466f,
-      -0.810715f, -0.590729f, -0.594467f, -0.646721f, -0.629364f, -0.528561f,
-      -0.551819f, -0.301086f, -0.149101f, -0.060146f, -0.162220f, -0.326210f,
-      -0.156548f, -0.036293f, -0.426098f, -1.145470f, -1.628998f, -2.003052f,
-      -1.142891f, 2.885162f,  5.652863f,  5.718426f,  4.911140f,  3.234222f,
-      3.473373f,  3.577183f,  3.271603f,  3.410435f,  3.505489f,  3.434032f,
-      -0.508911f, -0.438797f, -0.437450f, -0.627426f, -0.511745f, -0.304874f,
-      -0.274246f, -0.261841f, -0.228466f, -0.342491f, -0.528206f, -0.490082f,
-      -0.516350f, -0.361694f, -0.398514f, -0.276020f, -0.210369f, -0.355938f,
-      -0.402622f, -0.538864f, -1.249573f, -2.100105f, -0.996178f, 1.886410f,
-      4.929745f,  5.630871f,  5.444199f,  4.042740f,  3.739189f,  3.691399f,
-      3.391956f,  3.469696f,  3.431232f,  0.204849f,  0.205433f,  -0.131927f,
-      -0.367908f, -0.374378f, -0.126820f, -0.186951f, -0.228565f, -0.081776f,
-      -0.143143f, -0.379230f, -0.598701f, -0.458019f, -0.295586f, -0.407730f,
-      -0.245853f, -0.043140f, 0.024242f,  -0.038998f, -0.044151f, -0.425991f,
-      -1.240753f, -1.943146f, -2.174755f, 0.523415f,  4.376751f,  5.956558f,
-      5.850082f,  4.403152f,  3.517399f,  3.560753f,  3.554836f,  3.471985f,
-      -0.508503f, -0.109783f, 0.057747f,  0.190079f,  -0.257153f, -0.591980f,
-      -0.666771f, -0.525391f, -0.293060f, -0.489731f, -0.304855f, -0.259644f,
-      -0.367825f, -0.346977f, -0.292889f, -0.215652f, -0.120705f, -0.176010f,
-      -0.422905f, -0.114647f, -0.289749f, -0.374203f, -0.606754f, -1.127949f,
-      -1.994583f, -0.588058f, 3.415840f,  5.603470f,  5.811581f,  4.959423f,
-      3.721760f,  3.710499f,  3.785461f,  -0.554588f, -0.565517f, -0.434578f,
-      -0.012482f, -0.284660f, -0.699795f, -0.957535f, -0.755135f, -0.382034f,
-      -0.321552f, -0.287571f, -0.279537f, -0.314972f, -0.256287f, -0.372818f,
-      -0.316017f, -0.287975f, -0.365639f, -0.512589f, -0.420692f, -0.436485f,
-      -0.295353f, -0.451958f, -0.755459f, -1.272358f, -2.301353f, -1.776161f,
-      1.572483f,  4.826286f,  5.741898f,  5.162853f,  4.028049f,  3.686325f,
-      -0.495590f, -0.664413f, -0.760044f, -0.152634f, -0.286480f, -0.340462f,
-      0.076477f,  0.187706f,  -0.068787f, -0.293491f, -0.361145f, -0.292515f,
-      -0.140671f, -0.190723f, -0.333302f, -0.368168f, -0.192581f, -0.154499f,
-      -0.236544f, -0.124405f, -0.208321f, -0.465607f, -0.883080f, -1.104813f,
-      -1.210567f, -1.415665f, -1.924683f, -1.634758f, 0.601017f,  4.276672f,
-      5.501350f,  5.331257f,  3.809288f,  -0.727722f, -0.533619f, -0.511524f,
-      -0.470688f, -0.610710f, -0.575130f, -0.311115f, -0.090420f, -0.297676f,
-      -0.646118f, -0.742805f, -0.485050f, -0.330910f, -0.275417f, -0.357037f,
-      -0.425598f, -0.481876f, -0.488941f, -0.393551f, -0.051105f, -0.090755f,
-      -0.328674f, -0.536369f, -0.533684f, -0.336960f, -0.689194f, -1.187195f,
-      -1.860954f, -2.290253f, -0.424774f, 3.050060f,  5.083332f,  5.291920f,
-      -0.343605f, -0.190975f, -0.303692f, -0.456512f, -0.681820f, -0.690693f,
-      -0.416729f, -0.286446f, -0.442055f, -0.709148f, -0.569160f, -0.382423f,
-      -0.402321f, -0.383362f, -0.366413f, -0.290718f, -0.110069f, -0.220280f,
-      -0.279018f, -0.255424f, -0.262081f, -0.487556f, -0.444492f, -0.250500f,
-      -0.119583f, -0.291557f, -0.537781f, -1.104073f, -1.737091f, -1.697441f,
-      -0.323456f, 2.042049f,  4.605103f,  -0.310631f, -0.279568f, -0.012695f,
-      -0.160130f, -0.358746f, -0.421101f, -0.559677f, -0.474136f, -0.416565f,
-      -0.561817f, -0.534672f, -0.519157f, -0.767197f, -0.605831f, -0.186523f,
-      0.219872f,  0.264984f,  -0.193432f, -0.363182f, -0.467472f, -0.462009f,
-      -0.571053f, -0.522476f, -0.315903f, -0.237427f, -0.147320f, -0.100201f,
-      -0.237568f, -0.763435f, -1.242043f, -2.135159f, -1.409485f, 1.236370f,
-      -0.474247f, -0.517906f, -0.410217f, -0.542244f, -0.795986f, -0.590004f,
-      -0.388863f, -0.462921f, -0.810627f, -0.778637f, -0.512486f, -0.718025f,
-      -0.710854f, -0.482513f, -0.318233f, -0.194962f, -0.220116f, -0.421673f,
-      -0.534233f, -0.403339f, -0.389332f, -0.407303f, -0.437355f, -0.469730f,
-      -0.359600f, -0.352745f, -0.466755f, -0.414585f, -0.430756f, -0.656822f,
-      -1.237038f, -2.046097f, -1.574898f, -0.593815f, -0.582165f, -0.336098f,
-      -0.372612f, -0.554386f, -0.410603f, -0.428276f, -0.647644f, -0.640720f,
-      -0.582207f, -0.414112f, -0.435547f, -0.435505f, -0.332561f, -0.248116f,
-      -0.340221f, -0.277855f, -0.352699f, -0.377319f, -0.230850f, -0.313267f,
-      -0.446270f, -0.346237f, -0.420422f, -0.530781f, -0.400341f, -0.463661f,
-      -0.209091f, -0.056705f, -0.011772f, -0.169388f, -0.736275f, -1.463017f,
-      -0.752701f, -0.668865f, -0.329765f, -0.299347f, -0.245667f, -0.286999f,
-      -0.520420f, -0.675438f, -0.255753f, 0.141357f,  -0.079639f, -0.419476f,
-      -0.374069f, -0.046253f, 0.116116f,  -0.145847f, -0.380371f, -0.563412f,
-      -0.638634f, -0.310116f, -0.260914f, -0.508404f, -0.465508f, -0.527824f,
-      -0.370979f, -0.305595f, -0.244694f, -0.254490f, 0.009968f,  -0.050201f,
-      -0.331219f, -0.614960f, -0.788208f, -0.483242f, -0.367516f, -0.186951f,
-      -0.180031f, 0.129711f,  -0.127811f, -0.384750f, -0.499542f, -0.418613f,
-      -0.121635f, 0.203197f,  -0.167290f, -0.397270f, -0.355461f, -0.218746f,
-      -0.376785f, -0.521698f, -0.721581f, -0.845741f, -0.535439f, -0.220882f,
-      -0.309067f, -0.555248f, -0.690342f, -0.664948f, -0.390102f, 0.020355f,
-      -0.130447f, -0.173252f, -0.170059f, -0.633663f, -0.956001f, -0.621696f,
-      -0.388302f, -0.342262f, -0.244370f, -0.386948f, -0.401421f, -0.172979f,
-      -0.206163f, -0.450058f, -0.525789f, -0.549274f, -0.349251f, -0.474613f,
-      -0.667976f, -0.435600f, -0.175369f, -0.196877f, -0.202976f, -0.242481f,
-      -0.258369f, -0.189133f, -0.395397f, -0.765499f, -0.944016f, -0.850967f,
-      -0.631561f, -0.152493f, -0.046432f, -0.262066f, -0.195919f, 0.048218f,
-      0.084972f,  0.039902f,  0.000618f,  -0.404430f, -0.447456f, -0.418076f,
-      -0.631935f, -0.717415f, -0.502888f, -0.530514f, -0.747826f, -0.704041f,
-      -0.674969f, -0.516853f, -0.418446f, -0.327740f, -0.308815f, -0.481636f,
-      -0.440083f, -0.481720f, -0.341053f, -0.283897f, -0.324368f, -0.352829f,
-      -0.434349f, -0.545589f, -0.533104f, -0.472755f, -0.570496f, -0.557735f,
-      -0.708176f, -0.493332f, -0.194416f, -0.186249f, -0.256710f, -0.271835f,
-      -0.304752f, -0.431267f, -0.422398f, -0.646725f, -0.680801f, -0.249031f,
-      -0.058567f, -0.213890f, -0.383949f, -0.540291f, -0.549877f, -0.225567f,
-      -0.037174f, -0.499874f, -0.641010f, -0.628044f, -0.390549f, -0.311497f,
-      -0.542313f, -0.569565f, -0.473408f, -0.331245f, -0.357197f, -0.285599f,
-      -0.200157f, -0.201866f, -0.124428f, -0.346016f, -0.392311f, -0.264496f,
-      -0.285370f, -0.436974f, -0.523483f, -0.410461f, -0.267925f, -0.055016f,
-      -0.382458f, -0.319771f, -0.049927f, 0.124329f,  0.266102f,  -0.106606f,
-      -0.773647f, -0.973053f, -0.708206f, -0.486137f, -0.319923f, -0.493900f,
-      -0.490860f, -0.324986f, -0.147346f, -0.146088f, -0.161758f, -0.084396f,
-      -0.379494f, 0.041626f,  -0.113361f, -0.277767f, 0.083366f,  0.126476f,
-      0.139057f,  0.038040f,  0.038162f,  -0.242126f, -0.411736f, -0.370049f,
-      -0.455357f, -0.039257f, 0.264442f,  -0.271492f, -0.425346f, -0.514847f,
-      -0.448650f, -0.580399f, -0.652603f, -0.774803f, -0.692524f, -0.579578f,
-      -0.465206f, -0.386265f, -0.458012f, -0.446594f, -0.284893f, -0.345448f,
-      -0.350876f, -0.440350f, -0.360378f, -0.270428f, 0.237213f,  -0.063602f,
-      -0.364529f, -0.179867f, 0.078197f,  0.117947f,  -0.093410f, -0.359119f,
-      -0.480961f, -0.540638f, -0.436287f, -0.598576f, -0.253735f, -0.060093f,
-      -0.549145f, -0.808327f, -0.698593f, -0.595764f, -0.582508f, -0.497353f,
-      -0.480892f, -0.584240f, -0.665791f, -0.690903f, -0.743446f, -0.796677f,
-      -0.782391f, -0.649010f, -0.628139f, -0.880848f, -0.829361f, -0.373272f,
-      -0.223667f, 0.174572f,  -0.348743f, -0.798901f, -0.692307f, -0.607609f,
-      -0.401455f, -0.480919f, -0.450798f, -0.435413f, -0.322338f, -0.228382f,
-      -0.450466f, -0.504440f, -0.477402f, -0.662224f, -0.583397f, -0.217445f,
-      -0.157459f, -0.079584f, -0.226168f, -0.488720f, -0.669624f, -0.666878f,
-      -0.565311f, -0.549625f, -0.364601f, -0.497627f, -0.736897f, -0.763023f,
-      -0.741020f, -0.404503f, 0.184814f,  -0.075315f, -0.281513f, -0.532906f,
-      -0.405800f, -0.313438f, -0.536652f, -0.403381f, 0.011967f,  0.103310f,
-      -0.269848f, -0.508656f, -0.445923f, -0.644859f, -0.617870f, -0.500927f,
-      -0.371559f, -0.125580f, 0.028625f,  -0.154713f, -0.442024f, -0.492764f,
-      -0.199371f, 0.236305f,  0.225925f,  0.075577f,  -0.285812f, -0.437145f,
-      -0.374260f, -0.156693f, -0.129635f, -0.243206f, -0.123058f, 0.162148f,
-      -0.313152f, -0.337982f, -0.358421f, 0.040070f,  0.038925f,  -0.333313f,
-      -0.351662f, 0.023014f,  0.091362f,  -0.282890f, -0.373253f, -0.389050f,
-      -0.532707f, -0.423347f, -0.349968f, -0.287045f, -0.202442f, -0.308430f,
-      -0.222801f, -0.106323f, -0.056358f, 0.027222f,  0.390732f,  0.033558f,
-      -0.160088f, -0.382217f, -0.535282f, -0.515900f, -0.022736f, 0.165665f,
-      -0.111408f, -0.233784f, -0.312357f, -0.541885f, -0.480022f, -0.482513f,
-      -0.246254f, 0.132244f,  0.090134f,  0.234634f,  -0.089249f, -0.460854f,
-      -0.515457f, -0.450874f, -0.311031f, -0.387680f, -0.360554f, -0.179241f,
-      -0.283817f, -0.475815f, -0.246399f, -0.388958f, -0.551140f, -0.496239f,
-      -0.559879f, -0.379761f, -0.254288f, -0.395111f, -0.613018f, -0.459427f,
-      -0.263580f, -0.268929f, 0.080826f,  0.115616f,  -0.097324f, -0.325310f,
-      -0.480450f, -0.313286f, -0.310371f, -0.517361f, -0.288288f, -0.112679f,
-      -0.173241f, -0.221664f, -0.039452f, -0.107578f, -0.089630f, -0.483768f,
-      -0.571087f, -0.497108f, -0.321533f, -0.375492f, -0.540363f, -0.406815f,
-      -0.388512f, -0.514561f, -0.540192f, -0.402412f, -0.232246f, -0.304749f,
-      -0.383724f, -0.679596f, -0.685463f, -0.694538f, -0.642937f, -0.425789f,
-      0.103271f,  -0.194862f, -0.487999f, -0.717281f, -0.681850f, -0.709286f,
-      -0.615398f, -0.554245f, -0.254681f, -0.049950f, -0.002914f, -0.095383f,
-      -0.370911f, -0.564224f, -0.242714f};
-  const size_t xtest = xsize / 2;
-  const size_t ytest = ysize / 2;
-
-  for (intptr_t dy = -16; dy <= 16; ++dy) {
-    float* row = in.Row(ytest + dy);
-    for (intptr_t dx = -16; dx <= 16; ++dx)
-      row[xtest + dx] = center[(dy + 16) * 33 + (dx + 16)];
-  }
-
-  const double sigma = 7.155933;
-
-  ImageF temp(xsize, ysize);
-  ImageF out_rg(xsize, ysize);
-  const auto rg = CreateRecursiveGaussian(sigma);
-  ThreadPool* null_pool = nullptr;
-  FastGaussian(rg, in, null_pool, &temp, &out_rg);
-
-  ImageF out_old;
-  {
-    const std::vector<float> kernel =
-        GaussianKernel(static_cast<int>(4 * sigma), static_cast<float>(sigma));
-    printf("old kernel size %" PRIuS "\n", kernel.size());
-    out_old = Convolve(in, kernel);
-  }
-
-  printf("rg %.4f old %.4f\n", out_rg.Row(ytest)[xtest],
-         out_old.Row(ytest)[xtest]);
-}
-
-}  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/gradient_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/gradient_test.cc
@ -10,15 +10,16 @@

 #include <algorithm>
 #include <array>
+#include <cmath>
 #include <utility>
+#include <vector>

 #include "lib/jxl/base/common.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/span.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
-#include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
--- a/third_party/jpeg-xl/lib/jxl/image.h
+++ b/third_party/jpeg-xl/lib/jxl/image.h
@ -19,6 +19,7 @@

 #include <algorithm>
 #include <sstream>
+#include <string>
 #include <utility>  // std::move

 #include "lib/jxl/base/compiler_specific.h"
@ -328,7 +329,7 @@ class RectT {
    return CeilShiftRight(shift, shift);
  }

-  RectT<T> Extend(T border, RectT<T> parent) {
+  RectT<T> Extend(T border, RectT<T> parent) const {
    T new_x0 = x0() > parent.x0() + border ? x0() - border : parent.x0();
    T new_y0 = y0() > parent.y0() + border ? y0() - border : parent.y0();
    T new_x1 = x1() + border > parent.x1() ? parent.x1() : x1() + border;
--- a/third_party/jpeg-xl/lib/jxl/image_bundle.h
+++ b/third_party/jpeg-xl/lib/jxl/image_bundle.h
@ -12,22 +12,21 @@
 #include <stddef.h>
 #include <stdint.h>

+#include <memory>
+#include <string>
+#include <utility>
 #include <vector>

-#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/base/status.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/common.h"  // JPEGXL_ENABLE_TRANSCODE_JPEG
-#include "lib/jxl/dec_bit_reader.h"
-#include "lib/jxl/dec_xyb.h"
-#include "lib/jxl/field_encodings.h"
 #include "lib/jxl/frame_header.h"
-#include "lib/jxl/headers.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_metadata.h"
+#include "lib/jxl/image_ops.h"
 #include "lib/jxl/jpeg/jpeg_data.h"
-#include "lib/jxl/quantizer.h"

 namespace jxl {

--- a/third_party/jpeg-xl/lib/jxl/image_metadata.h
+++ b/third_party/jpeg-xl/lib/jxl/image_metadata.h
@ -16,10 +16,13 @@
 #include <string>
 #include <vector>

+#include "lib/jxl/base/compiler_specific.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_bit_reader.h"
+#include "lib/jxl/field_encodings.h"
 #include "lib/jxl/fields.h"
 #include "lib/jxl/headers.h"
-#include "lib/jxl/jpeg/jpeg_data.h"

 namespace jxl {

--- a/third_party/jpeg-xl/lib/jxl/jxl_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/jxl_test.cc
@ -6,42 +6,40 @@
 #include "lib/extras/dec/jxl.h"

 #include <jxl/cms.h>
+#include <jxl/color_encoding.h>
 #include <jxl/encode.h>
+#include <jxl/types.h>

-#include <array>
+#include <algorithm>
+#include <cstddef>
 #include <cstdint>
+#include <cstdio>
+#include <cstring>
 #include <future>
 #include <ostream>
 #include <string>
 #include <tuple>
-#include <utility>
 #include <vector>

 #include "lib/extras/codec.h"
 #include "lib/extras/dec/decode.h"
 #include "lib/extras/enc/encode.h"
+#include "lib/extras/enc/jxl.h"
 #include "lib/extras/packed_image.h"
 #include "lib/jxl/alpha.h"
-#include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/override.h"
 #include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/common.h"  // JXL_HIGH_PRECISION
 #include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/fake_parallel_runner_testonly.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
-#include "lib/jxl/image_ops.h"
-#include "lib/jxl/image_test_utils.h"
-#include "lib/jxl/jpeg/dec_jpeg_data.h"
-#include "lib/jxl/jpeg/dec_jpeg_data_writer.h"
+#include "lib/jxl/image_metadata.h"
 #include "lib/jxl/jpeg/enc_jpeg_data.h"
-#include "lib/jxl/jpeg/jpeg_data.h"
-#include "lib/jxl/modular/options.h"
 #include "lib/jxl/test_image.h"
 #include "lib/jxl/test_utils.h"
 #include "lib/jxl/testing.h"
@ -122,7 +120,7 @@ TEST(JxlTest, RoundtripSmallD1) {

  {
    PackedPixelFile ppf_out;
-    EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 1027, 40);
+    EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 916, 40);
    EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.888));
  }

@ -357,8 +355,8 @@ TEST(JxlTest, RoundtripLargeFast) {
  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 7);  // kSquirrel

  PackedPixelFile ppf_out;
-  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 505555, 5000);
-  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(75));
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 492867, 5000);
+  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(78));
 }

 TEST(JxlTest, RoundtripDotsForceEpf) {
@ -374,7 +372,7 @@ TEST(JxlTest, RoundtripDotsForceEpf) {
  cparams.AddOption(JXL_ENC_FRAME_SETTING_DOTS, 1);

  PackedPixelFile ppf_out;
-  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 40777, 300);
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 41355, 300);
  EXPECT_THAT(ComputeDistance2(t.ppf(), ppf_out), IsSlightlyBelow(18));
 }

@ -454,7 +452,7 @@ TEST(JxlTest, RoundtripSmallNL) {
  t.SetDimensions(xsize, ysize);

  PackedPixelFile ppf_out;
-  EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 1027, 45);
+  EXPECT_NEAR(Roundtrip(t.ppf(), {}, {}, pool, &ppf_out), 916, 45);
  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(0.82));
 }

@ -470,7 +468,7 @@ TEST(JxlTest, RoundtripNoGaborishNoAR) {
  cparams.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0);

  PackedPixelFile ppf_out;
-  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 41769, 400);
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 41142, 400);
  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.8));
 }

@ -488,7 +486,7 @@ TEST(JxlTest, RoundtripSmallNoGaborish) {
  cparams.AddOption(JXL_ENC_FRAME_SETTING_GABORISH, 0);

  PackedPixelFile ppf_out;
-  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 1032, 20);
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 1006, 20);
  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.1));
 }

@ -861,7 +859,7 @@ TEST(JxlTest, RoundtripAlphaResampling) {
  cparams.AddOption(JXL_ENC_FRAME_SETTING_EXTRA_CHANNEL_RESAMPLING, 2);

  PackedPixelFile ppf_out;
-  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 13655, 130);
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 13507, 130);
  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(5.2));
 }

@ -952,9 +950,11 @@ TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8)) {
  t.DecodeFromBytes(orig).ClearMetadata();

  JXLCompressParams cparams = CompressParamsForLossless();
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);

  PackedPixelFile ppf_out;
-  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 223058);
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, &pool, &ppf_out), 223058);
  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }

@ -968,9 +968,11 @@ TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8ThunderGradient)) {
  JXLCompressParams cparams = CompressParamsForLossless();
  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 2);             // kThunder
  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR, 5);  // Gradient
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);

  PackedPixelFile ppf_out;
-  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 261684);
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, &pool, &ppf_out), 261684);
  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }

@ -983,10 +985,12 @@ TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8LightningGradient)) {

  JXLCompressParams cparams = CompressParamsForLossless();
  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);  // kLightning
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);

  PackedPixelFile ppf_out;
  // Lax comparison because different SIMD will cause different compression.
-  EXPECT_THAT(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out),
+  EXPECT_THAT(Roundtrip(t.ppf(), cparams, dparams, &pool, &ppf_out),
              IsSlightlyBelow(286848u));
  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }
@ -1000,9 +1004,11 @@ TEST(JxlTest, JXL_SLOW_TEST(RoundtripLossless8Falcon)) {

  JXLCompressParams cparams = CompressParamsForLossless();
  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 3);  // kFalcon
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);

  PackedPixelFile ppf_out;
-  EXPECT_EQ(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 230766);
+  EXPECT_EQ(Roundtrip(t.ppf(), cparams, dparams, &pool, &ppf_out), 230766);
  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }

@ -1136,8 +1142,8 @@ TEST(JxlTest, RoundtripNoise) {
  cparams.AddOption(JXL_ENC_FRAME_SETTING_NOISE, 1);

  PackedPixelFile ppf_out;
-  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 42345, 750);
-  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.35));
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, pool, &ppf_out), 41009, 750);
+  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.42));
 }

 TEST(JxlTest, RoundtripLossless8Gray) {
@ -1235,7 +1241,7 @@ TEST(JxlTest, RoundtripAnimationPatches) {
  PackedPixelFile ppf_out;
  // 40k with no patches, 27k with patch frames encoded multiple times.
  EXPECT_THAT(Roundtrip(t.ppf(), cparams, dparams, pool, &ppf_out),
-              IsSlightlyBelow(19252));
+              IsSlightlyBelow(19300));
  EXPECT_EQ(ppf_out.frames.size(), t.ppf().frames.size());
  // >10 with broken patches; not all patches are detected on borders.
  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.9));
@ -1468,7 +1474,7 @@ TEST(JxlTest, RoundtripProgressive) {
  cparams.AddOption(JXL_ENC_FRAME_SETTING_RESPONSIVE, 1);

  PackedPixelFile ppf_out;
-  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 71444, 750);
+  EXPECT_NEAR(Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out), 70544, 750);
  EXPECT_THAT(ButteraugliDistance(t.ppf(), ppf_out), IsSlightlyBelow(1.4));
 }

@ -1569,9 +1575,11 @@ TEST_P(JxlTest, LosslessSmallFewColors) {
  JXLCompressParams cparams;
  cparams.distance = 0;
  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 1);
+  JXLDecompressParams dparams;
+  dparams.accepted_formats.push_back(t.ppf().frames[0].color.format);

  PackedPixelFile ppf_out;
-  Roundtrip(t.ppf(), cparams, {}, &pool, &ppf_out);
+  Roundtrip(t.ppf(), cparams, dparams, &pool, &ppf_out);
  EXPECT_EQ(ComputeDistance2(t.ppf(), ppf_out), 0.0);
 }

@ -1585,6 +1593,7 @@ struct StreamingTestParam {
  size_t ysize;
  bool is_grey;
  int effort;
+  bool progressive;

  size_t num_channels() const { return is_grey ? 1 : 3; }

@ -1594,10 +1603,11 @@ struct StreamingTestParam {
    std::vector<StreamingTestParam> params;
    for (int e : {1, 3, 4, 7}) {
      for (bool g : {false, true}) {
-        params.push_back(StreamingTestParam{357, 517, g, e});
-        params.push_back(StreamingTestParam{2247, 2357, g, e});
+        params.push_back(StreamingTestParam{357, 517, g, e, false});
+        params.push_back(StreamingTestParam{2247, 2357, g, e, false});
      }
    }
+    params.push_back(StreamingTestParam{2247, 2357, false, 1, true});
    return params;
  }
 };
@ -1606,6 +1616,9 @@ std::ostream& operator<<(std::ostream& out, StreamingTestParam p) {
  out << (p.is_grey ? "Grey" : "RGB");
  out << p.xsize << "x" << p.ysize;
  out << "e" << p.effort;
+  if (p.progressive) {
+    out << "Progressive";
+  }
  return out;
 }

@ -1624,6 +1637,9 @@ TEST_P(JxlStreamingTest, Roundtrip) {
  cparams.distance = 0.1;
  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, p.effort);
  cparams.AddOption(JXL_ENC_FRAME_SETTING_BUFFERING, 3);
+  if (p.progressive) {
+    cparams.AddOption(JXL_ENC_FRAME_SETTING_PROGRESSIVE_AC, 1);
+  }

  ThreadPoolForTests pool(8);
  PackedPixelFile ppf_out;
@ -1635,5 +1651,27 @@ JXL_GTEST_INSTANTIATE_TEST_SUITE_P(
    JxlStreamingTest, JxlStreamingTest,
    testing::ValuesIn(StreamingTestParam::All()));

+// This is broken on mingw32, so we only enable it for x86_64 now.
+TEST(JxlTest, JXL_X86_64_TEST(StreamingSamePixels)) {
+  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
+
+  jxl::test::TestImage image;
+  image.DecodeFromBytes(orig);
+  JXLCompressParams cparams;
+  cparams.distance = 1.0;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 6);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_USE_FULL_IMAGE_HEURISTICS, 0);
+
+  ThreadPoolForTests pool(8);
+  PackedPixelFile ppf_out;
+  Roundtrip(image.ppf(), cparams, {}, &pool, &ppf_out);
+
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_BUFFERING, 3);
+  PackedPixelFile ppf_out_streaming;
+  Roundtrip(image.ppf(), cparams, {}, &pool, &ppf_out_streaming);
+
+  EXPECT_TRUE(jxl::test::SamePixels(ppf_out, ppf_out_streaming));
+}
+
 }  // namespace
 }  // namespace jxl
--- a/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h
+++ b/third_party/jpeg-xl/lib/jxl/memory_manager_internal.h
@ -10,11 +10,9 @@

 #include <jxl/memory_manager.h>
 #include <stddef.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>  // memcpy

-#include <atomic>
 #include <memory>

 #include "lib/jxl/base/compiler_specific.h"
--- a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.cc
@ -67,6 +67,38 @@ inline std::array<uint8_t, 3> PredictorColor(Predictor p) {
  };
 }

+// `cutoffs` must be sorted.
+Tree MakeFixedTree(int property, const std::vector<int32_t> &cutoffs,
+                   Predictor pred, size_t num_pixels) {
+  size_t log_px = CeilLog2Nonzero(num_pixels);
+  size_t min_gap = 0;
+  // Reduce fixed tree height when encoding small images.
+  if (log_px < 14) {
+    min_gap = 8 * (14 - log_px);
+  }
+  Tree tree;
+  struct NodeInfo {
+    size_t begin, end, pos;
+  };
+  std::queue<NodeInfo> q;
+  // Leaf IDs will be set by roundtrip decoding the tree.
+  tree.push_back(PropertyDecisionNode::Leaf(pred));
+  q.push(NodeInfo{0, cutoffs.size(), 0});
+  while (!q.empty()) {
+    NodeInfo info = q.front();
+    q.pop();
+    if (info.begin + min_gap >= info.end) continue;
+    uint32_t split = (info.begin + info.end) / 2;
+    tree[info.pos] =
+        PropertyDecisionNode::Split(property, cutoffs[split], tree.size());
+    q.push(NodeInfo{split + 1, info.end, tree.size()});
+    tree.push_back(PropertyDecisionNode::Leaf(pred));
+    q.push(NodeInfo{info.begin, split, tree.size()});
+    tree.push_back(PropertyDecisionNode::Leaf(pred));
+  }
+  return tree;
+}
+
 }  // namespace

 void GatherTreeData(const Image &image, pixel_type chan, size_t group_id,
@ -168,6 +200,83 @@ void GatherTreeData(const Image &image, pixel_type chan, size_t group_id,
  }
 }

+Tree PredefinedTree(ModularOptions::TreeKind tree_kind, size_t total_pixels) {
+  if (tree_kind == ModularOptions::TreeKind::kJpegTranscodeACMeta ||
+      tree_kind == ModularOptions::TreeKind::kTrivialTreeNoPredictor) {
+    // All the data is 0, so no need for a fancy tree.
+    return {PropertyDecisionNode::Leaf(Predictor::Zero)};
+  }
+  if (tree_kind == ModularOptions::TreeKind::kFalconACMeta) {
+    // All the data is 0 except the quant field. TODO(veluca): make that 0 too.
+    return {PropertyDecisionNode::Leaf(Predictor::Left)};
+  }
+  if (tree_kind == ModularOptions::TreeKind::kACMeta) {
+    // Small image.
+    if (total_pixels < 1024) {
+      return {PropertyDecisionNode::Leaf(Predictor::Left)};
+    }
+    Tree tree;
+    // 0: c > 1
+    tree.push_back(PropertyDecisionNode::Split(0, 1, 1));
+    // 1: c > 2
+    tree.push_back(PropertyDecisionNode::Split(0, 2, 3));
+    // 2: c > 0
+    tree.push_back(PropertyDecisionNode::Split(0, 0, 5));
+    // 3: EPF control field (all 0 or 4), top > 0
+    tree.push_back(PropertyDecisionNode::Split(6, 0, 21));
+    // 4: ACS+QF, y > 0
+    tree.push_back(PropertyDecisionNode::Split(2, 0, 7));
+    // 5: CfL x
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient));
+    // 6: CfL b
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Gradient));
+    // 7: QF: split according to the left quant value.
+    tree.push_back(PropertyDecisionNode::Split(7, 5, 9));
+    // 8: ACS: split in 4 segments (8x8 from 0 to 3, large square 4-5, large
+    // rectangular 6-11, 8x8 12+), according to previous ACS value.
+    tree.push_back(PropertyDecisionNode::Split(7, 5, 15));
+    // QF
+    tree.push_back(PropertyDecisionNode::Split(7, 11, 11));
+    tree.push_back(PropertyDecisionNode::Split(7, 3, 13));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Left));
+    // ACS
+    tree.push_back(PropertyDecisionNode::Split(7, 11, 17));
+    tree.push_back(PropertyDecisionNode::Split(7, 3, 19));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    // EPF, left > 0
+    tree.push_back(PropertyDecisionNode::Split(7, 0, 23));
+    tree.push_back(PropertyDecisionNode::Split(7, 0, 25));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    tree.push_back(PropertyDecisionNode::Leaf(Predictor::Zero));
+    return tree;
+  }
+  if (tree_kind == ModularOptions::TreeKind::kWPFixedDC) {
+    std::vector<int32_t> cutoffs = {
+        -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15,
+        -11,  -7,   -4,   -3,   -1,   0,   1,   3,   5,   7,   11,
+        15,   23,   31,   47,   63,   95,  127, 191, 255, 392, 500};
+    return MakeFixedTree(kWPProp, cutoffs, Predictor::Weighted, total_pixels);
+  }
+  if (tree_kind == ModularOptions::TreeKind::kGradientFixedDC) {
+    std::vector<int32_t> cutoffs = {
+        -500, -392, -255, -191, -127, -95, -63, -47, -31, -23, -15,
+        -11,  -7,   -4,   -3,   -1,   0,   1,   3,   5,   7,   11,
+        15,   23,   31,   47,   63,   95,  127, 191, 255, 392, 500};
+    return MakeFixedTree(kGradientProp, cutoffs, Predictor::Gradient,
+                         total_pixels);
+  }
+  JXL_UNREACHABLE("Unreachable");
+  return {};
+}
+
 Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels,
               const ModularOptions &options,
               const std::vector<ModularMultiplierInfo> &multiplier_info = {},
@ -494,8 +603,11 @@ Status ModularEncode(const Image &image, const ModularOptions &options,
    std::vector<uint8_t> context_map;

    std::vector<std::vector<Token>> tree_tokens(1);
+
    tree_storage =
-        LearnTree(std::move(tree_samples_storage), *total_pixels, options);
+        options.tree_kind == ModularOptions::TreeKind::kLearn
+            ? LearnTree(std::move(tree_samples_storage), *total_pixels, options)
+            : PredefinedTree(options.tree_kind, *total_pixels);
    tree = &tree_storage;
    tokens = &tokens_storage[0];

--- a/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h
+++ b/third_party/jpeg-xl/lib/jxl/modular/encoding/enc_encoding.h
@ -9,11 +9,12 @@
 #include <cstddef>
 #include <vector>

+#include "lib/jxl/base/status.h"
 #include "lib/jxl/enc_ans.h"
 #include "lib/jxl/enc_bit_writer.h"
-#include "lib/jxl/image.h"
 #include "lib/jxl/modular/encoding/dec_ma.h"
 #include "lib/jxl/modular/encoding/enc_ma.h"
+#include "lib/jxl/modular/modular_image.h"
 #include "lib/jxl/modular/options.h"

 namespace jxl {
@ -21,6 +22,8 @@ namespace jxl {
 struct AuxOut;
 struct GroupHeader;

+Tree PredefinedTree(ModularOptions::TreeKind tree_kind, size_t total_pixels);
+
 Tree LearnTree(TreeSamples &&tree_samples, size_t total_pixels,
               const ModularOptions &options,
               const std::vector<ModularMultiplierInfo> &multiplier_info = {},
--- a/third_party/jpeg-xl/lib/jxl/modular_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/modular_test.cc
@ -4,36 +4,50 @@
 // license that can be found in the LICENSE file.

 #include <jxl/cms.h>
+#include <jxl/encode.h>
+#include <jxl/types.h>

-#include <array>
+#include <cstddef>
 #include <cstdint>
+#include <sstream>
 #include <string>
 #include <utility>
 #include <vector>

 #include "lib/extras/codec.h"
 #include "lib/extras/dec/jxl.h"
+#include "lib/extras/enc/jxl.h"
 #include "lib/extras/metrics.h"
+#include "lib/extras/packed_image.h"
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/data_parallel.h"
-#include "lib/jxl/base/override.h"
+#include "lib/jxl/base/random.h"
 #include "lib/jxl/base/span.h"
+#include "lib/jxl/base/status.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
+#include "lib/jxl/dec_bit_reader.h"
 #include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_bit_writer.h"
 #include "lib/jxl/enc_butteraugli_comparator.h"
-#include "lib/jxl/enc_cache.h"
 #include "lib/jxl/enc_fields.h"
 #include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_toc.h"
+#include "lib/jxl/fields.h"
+#include "lib/jxl/frame_header.h"
+#include "lib/jxl/headers.h"
 #include "lib/jxl/image.h"
 #include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/modular/encoding/enc_encoding.h"
 #include "lib/jxl/modular/encoding/encoding.h"
-#include "lib/jxl/modular/encoding/ma_common.h"
+#include "lib/jxl/modular/modular_image.h"
+#include "lib/jxl/modular/options.h"
+#include "lib/jxl/modular/transform/transform.h"
 #include "lib/jxl/padded_bytes.h"
+#include "lib/jxl/test_image.h"
 #include "lib/jxl/test_utils.h"
 #include "lib/jxl/testing.h"

@ -42,23 +56,25 @@ namespace {

 using test::ReadTestData;
 using test::Roundtrip;
+using test::TestImage;

 void TestLosslessGroups(size_t group_size_shift) {
  const std::vector<uint8_t> orig = ReadTestData("jxl/flower/flower.png");
-  CompressParams cparams;
-  cparams.SetLossless();
-  cparams.modular_group_size_shift = group_size_shift;
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  t.SetDimensions(t.ppf().xsize() / 4, t.ppf().ysize() / 4);

-  CodecInOut io_out;
+  extras::JXLCompressParams cparams;
+  cparams.distance = 0.0f;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_GROUP_SIZE, group_size_shift);
+  extras::JXLDecompressParams dparams;
+  dparams.accepted_formats = {{3, JXL_TYPE_UINT16, JXL_LITTLE_ENDIAN, 0}};

-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
-  io.ShrinkTo(io.xsize() / 4, io.ysize() / 4);
-
-  size_t compressed_size;
-  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  extras::PackedPixelFile ppf_out;
+  size_t compressed_size =
+      Roundtrip(t.ppf(), cparams, dparams, nullptr, &ppf_out);
  EXPECT_LE(compressed_size, 280000u);
-  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io_out.Main().color(), _));
+  EXPECT_EQ(0.0f, test::ComputeDistance2(t.ppf(), ppf_out));
 }

 TEST(ModularTest, RoundtripLosslessGroups128) { TestLosslessGroups(0); }
@ -74,24 +90,26 @@ TEST(ModularTest, JXL_TSAN_SLOW_TEST(RoundtripLosslessGroups1024)) {
 TEST(ModularTest, RoundtripLosslessCustomWP_PermuteRCT) {
  const std::vector<uint8_t> orig =
      ReadTestData("external/wesaturate/500px/u76c0g_bliznaca_srgb8.png");
-  CompressParams cparams;
-  cparams.SetLossless();
+  TestImage t;
+  t.DecodeFromBytes(orig).ClearMetadata();
+  t.SetDimensions(100, 100);
+
+  extras::JXLCompressParams cparams;
+  cparams.distance = 0.0f;
  // 9 = permute to GBR, to test the special case of permutation-only
-  cparams.colorspace = 9;
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_COLOR_SPACE, 9);
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_MODULAR_PREDICTOR,
+                    static_cast<int64_t>(Predictor::Weighted));
  // slowest speed so different WP modes are tried
-  cparams.speed_tier = SpeedTier::kTortoise;
-  cparams.options.predictor = {Predictor::Weighted};
+  cparams.AddOption(JXL_ENC_FRAME_SETTING_EFFORT, 9);
+  extras::JXLDecompressParams dparams;
+  dparams.accepted_formats = {{3, JXL_TYPE_UINT16, JXL_LITTLE_ENDIAN, 0}};

-  CodecInOut io_out;
-
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
-  io.ShrinkTo(100, 100);
-
-  size_t compressed_size;
-  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io_out, _, &compressed_size));
+  extras::PackedPixelFile ppf_out;
+  size_t compressed_size =
+      Roundtrip(t.ppf(), cparams, dparams, nullptr, &ppf_out);
  EXPECT_LE(compressed_size, 10169u);
-  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io_out.Main().color(), _));
+  EXPECT_EQ(0.0f, test::ComputeDistance2(t.ppf(), ppf_out));
 }

 TEST(ModularTest, RoundtripLossyDeltaPalette) {
@ -231,38 +249,6 @@ TEST(ModularTest, RoundtripExtraProperties) {
  }
 }

-TEST(ModularTest, RoundtripLosslessCustomSqueeze) {
-  const std::vector<uint8_t> orig =
-      ReadTestData("external/wesaturate/500px/tmshre_riaphotographs_srgb8.png");
-  CodecInOut io;
-  ASSERT_TRUE(SetFromBytes(Bytes(orig), &io));
-
-  CompressParams cparams;
-  cparams.modular_mode = true;
-  cparams.color_transform = jxl::ColorTransform::kNone;
-  cparams.butteraugli_distance = 0.f;
-  cparams.options.predictor = {Predictor::Zero};
-  cparams.speed_tier = SpeedTier::kThunder;
-  cparams.responsive = 1;
-  // Custom squeeze params, atm just for testing
-  SqueezeParams p;
-  p.horizontal = true;
-  p.in_place = false;
-  p.begin_c = 0;
-  p.num_c = 3;
-  cparams.squeezes.push_back(p);
-  p.begin_c = 1;
-  p.in_place = true;
-  p.horizontal = false;
-  cparams.squeezes.push_back(p);
-
-  CodecInOut io2;
-  size_t compressed_size;
-  JXL_EXPECT_OK(Roundtrip(&io, cparams, {}, &io2, _, &compressed_size));
-  EXPECT_LE(compressed_size, 265000u);
-  JXL_EXPECT_OK(SamePixels(*io.Main().color(), *io2.Main().color(), _));
-}
-
 struct RoundtripLosslessConfig {
  int bitdepth;
  int responsive;
--- a/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/opsin_image_test.cc
@ -5,12 +5,17 @@

 #include <jxl/cms.h>

+#include <cstddef>
+#include <utility>
+
 #include "lib/jxl/base/compiler_specific.h"
 #include "lib/jxl/base/matrix_ops.h"
 #include "lib/jxl/cms/opsin_params.h"
 #include "lib/jxl/dec_xyb.h"
 #include "lib/jxl/enc_xyb.h"
 #include "lib/jxl/image.h"
+#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_metadata.h"
 #include "lib/jxl/opsin_params.h"
 #include "lib/jxl/testing.h"

--- a/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc
+++ b/third_party/jpeg-xl/lib/jxl/opsin_inverse_test.cc
@ -5,13 +5,15 @@

 #include <jxl/cms.h>

+#include <utility>
+
 #include "lib/jxl/base/data_parallel.h"
 #include "lib/jxl/codec_in_out.h"
 #include "lib/jxl/color_encoding_internal.h"
 #include "lib/jxl/dec_xyb.h"
 #include "lib/jxl/enc_xyb.h"
 #include "lib/jxl/image.h"
-#include "lib/jxl/image_bundle.h"
+#include "lib/jxl/image_ops.h"
 #include "lib/jxl/image_test_utils.h"
 #include "lib/jxl/testing.h"

--- a/Show more
+++ b/Show more