forked from mirrors/gecko-dev
This updates all crates that can be updated with no addition of new crates, that start with letter d, e or f. Differential Revision: https://phabricator.services.mozilla.com/D133029
1351 lines
54 KiB
C++
1351 lines
54 KiB
C++
// Copyright Mozilla Foundation. See the COPYRIGHT
|
|
// file at the top-level directory of this distribution.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
|
|
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
|
|
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
|
|
// option. This file may not be copied, modified, or distributed
|
|
// except according to those terms.
|
|
|
|
#pragma once
|
|
|
|
#ifndef encoding_rs_cpp_h_
|
|
#define encoding_rs_cpp_h_
|
|
|
|
#include <memory>
|
|
#include <optional>
|
|
#include <string>
|
|
#include <string_view>
|
|
#include <tuple>
|
|
#include <vector>
|
|
#include "gsl/gsl"
|
|
|
|
namespace encoding_rs {
|
|
class Encoding;
|
|
class Decoder;
|
|
class Encoder;
|
|
}; // namespace encoding_rs
|
|
|
|
#define ENCODING_RS_ENCODING encoding_rs::Encoding
|
|
#define ENCODING_RS_NOT_NULL_CONST_ENCODING_PTR \
|
|
gsl::not_null<const encoding_rs::Encoding*>
|
|
#define ENCODING_RS_ENCODER encoding_rs::Encoder
|
|
#define ENCODING_RS_DECODER encoding_rs::Decoder
|
|
|
|
#include "encoding_rs.h"
|
|
|
|
namespace encoding_rs {
|
|
|
|
/**
|
|
* A converter that decodes a byte stream into Unicode according to a
|
|
* character encoding in a streaming (incremental) manner.
|
|
*
|
|
* The various `decode_*` methods take an input buffer (`src`) and an output
|
|
* buffer `dst` both of which are caller-allocated. There are variants for
|
|
* both UTF-8 and UTF-16 output buffers.
|
|
*
|
|
* A `decode_*` method decodes bytes from `src` into Unicode characters stored
|
|
* into `dst` until one of the following three things happens:
|
|
*
|
|
* 1. A malformed byte sequence is encountered (`*_without_replacement`
|
|
* variants only).
|
|
*
|
|
* 2. The output buffer has been filled so near capacity that the decoder
|
|
* cannot be sure that processing an additional byte of input wouldn't
|
|
* cause so much output that the output buffer would overflow.
|
|
*
|
|
* 3. All the input bytes have been processed.
|
|
*
|
|
* The `decode_*` method then returns tuple of a status indicating which one
|
|
* of the three reasons to return happened, how many input bytes were read,
|
|
* how many output code units (`uint8_t` when decoding into UTF-8 and `char16_t`
|
|
* when decoding to UTF-16) were written, and in the case of the
|
|
* variants performing replacement, a boolean indicating whether an error was
|
|
* replaced with the REPLACEMENT CHARACTER during the call.
|
|
*
|
|
* The number of bytes "written" is what's logically written. Garbage may be
|
|
* written in the output buffer beyond the point logically written to.
|
|
*
|
|
* In the case of the `*_without_replacement` variants, the status is a
|
|
* `uint32_t` whose possible values are packed info about a malformed byte
|
|
* sequence, `OUTPUT_FULL` and `INPUT_EMPTY` corresponding to the three cases
|
|
* listed above).
|
|
*
|
|
* Packed info about malformed sequences has the following format:
|
|
* The lowest 8 bits, which can have the decimal value 0, 1, 2 or 3,
|
|
* indicate the number of bytes that were consumed after the malformed
|
|
* sequence and whose next-lowest 8 bits, when shifted right by 8 indicate
|
|
* the length of the malformed byte sequence (possible decimal values 1, 2,
|
|
* 3 or 4). The maximum possible sum of the two is 6.
|
|
*
|
|
* In the case of methods whose name does not end with
|
|
* `*_without_replacement`, malformed sequences are automatically replaced
|
|
* with the REPLACEMENT CHARACTER and errors do not cause the methods to
|
|
* return early.
|
|
*
|
|
* When decoding to UTF-8, the output buffer must have at least 4 bytes of
|
|
* space. When decoding to UTF-16, the output buffer must have at least two
|
|
* UTF-16 code units (`char16_t`) of space.
|
|
*
|
|
* When decoding to UTF-8 without replacement, the methods are guaranteed
|
|
* not to return indicating that more output space is needed if the length
|
|
* of the output buffer is at least the length returned by
|
|
* `max_utf8_buffer_length_without_replacement()`. When decoding to UTF-8
|
|
* with replacement, the length of the output buffer that guarantees the
|
|
* methods not to return indicating that more output space is needed is given
|
|
* by `max_utf8_buffer_length()`. When decoding to UTF-16 with
|
|
* or without replacement, the length of the output buffer that guarantees
|
|
* the methods not to return indicating that more output space is needed is
|
|
* given by `max_utf16_buffer_length()`.
|
|
*
|
|
* The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
|
|
* and the output after each `decode_*` call is guaranteed to consist of
|
|
* complete characters. (I.e. the code unit sequence for the last character is
|
|
* guaranteed not to be split across output buffers.)
|
|
*
|
|
* The boolean argument `last` indicates that the end of the stream is reached
|
|
* when all the bytes in `src` have been consumed.
|
|
*
|
|
* A `Decoder` object can be used to incrementally decode a byte stream.
|
|
*
|
|
* During the processing of a single stream, the caller must call `decode_*`
|
|
* zero or more times with `last` set to `false` and then call `decode_*` at
|
|
* least once with `last` set to `true`. If `decode_*` returns `INPUT_EMPTY`,
|
|
* the processing of the stream has ended. Otherwise, the caller must call
|
|
* `decode_*` again with `last` set to `true` (or treat a malformed result,
|
|
* i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error).
|
|
*
|
|
* Once the stream has ended, the `Decoder` object must not be used anymore.
|
|
* That is, you need to create another one to process another stream.
|
|
*
|
|
* When the decoder returns `OUTPUT_FULL` or the decoder returns a malformed
|
|
* result and the caller does not wish to treat it as a fatal error, the input
|
|
* buffer `src` may not have been completely consumed. In that case, the caller
|
|
* must pass the unconsumed contents of `src` to `decode_*` again upon the next
|
|
* call.
|
|
*
|
|
* # Infinite loops
|
|
*
|
|
* When converting with a fixed-size output buffer whose size is too small to
|
|
* accommodate one character of output, an infinite loop ensues. When
|
|
* converting with a fixed-size output buffer, it generally makes sense to
|
|
* make the buffer fairly large (e.g. couple of kilobytes).
|
|
*/
|
|
class Decoder final {
|
|
public:
|
|
~Decoder() {}
|
|
static inline void operator delete(void* decoder) {
|
|
decoder_free(reinterpret_cast<Decoder*>(decoder));
|
|
}
|
|
|
|
/**
|
|
* The `Encoding` this `Decoder` is for.
|
|
*
|
|
* BOM sniffing can change the return value of this method during the life
|
|
* of the decoder.
|
|
*/
|
|
inline gsl::not_null<const Encoding*> encoding() const {
|
|
return gsl::not_null<const Encoding*>(decoder_encoding(this));
|
|
}
|
|
|
|
/**
|
|
* Query the worst-case UTF-8 output size _with replacement_.
|
|
*
|
|
* Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
|
|
* that will not overflow given the current state of the decoder and
|
|
* `byte_length` number of additional input bytes when decoding with
|
|
* errors handled by outputting a REPLACEMENT CHARACTER for each malformed
|
|
* sequence or `std::optional` without value if `size_t` would overflow.
|
|
*/
|
|
inline std::optional<size_t> max_utf8_buffer_length(
|
|
size_t byte_length) const {
|
|
size_t val = decoder_max_utf8_buffer_length(this, byte_length);
|
|
if (val == SIZE_MAX) {
|
|
return std::nullopt;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
/**
|
|
* Query the worst-case UTF-8 output size _without replacement_.
|
|
*
|
|
* Returns the size of the output buffer in UTF-8 code units (`uint8_t`)
|
|
* that will not overflow given the current state of the decoder and
|
|
* `byte_length` number of additional input bytes when decoding without
|
|
* replacement error handling or `std::optional` without value if `size_t`
|
|
* would overflow.
|
|
*
|
|
* Note that this value may be too small for the `_with_replacement` case.
|
|
* Use `max_utf8_buffer_length()` for that case.
|
|
*/
|
|
inline std::optional<size_t> max_utf8_buffer_length_without_replacement(
|
|
size_t byte_length) const {
|
|
size_t val =
|
|
decoder_max_utf8_buffer_length_without_replacement(this, byte_length);
|
|
if (val == SIZE_MAX) {
|
|
return std::nullopt;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
/**
|
|
* Incrementally decode a byte stream into UTF-8 with malformed sequences
|
|
* replaced with the REPLACEMENT CHARACTER.
|
|
*
|
|
* See the documentation of the class for documentation for `decode_*`
|
|
* methods collectively.
|
|
*/
|
|
inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf8(
|
|
gsl::span<const uint8_t> src, gsl::span<uint8_t> dst, bool last) {
|
|
size_t src_read = src.size();
|
|
size_t dst_written = dst.size();
|
|
bool had_replacements;
|
|
uint32_t result =
|
|
decoder_decode_to_utf8(this, null_to_bogus<const uint8_t>(src.data()),
|
|
&src_read, null_to_bogus<uint8_t>(dst.data()),
|
|
&dst_written, last, &had_replacements);
|
|
return {result, src_read, dst_written, had_replacements};
|
|
}
|
|
|
|
/**
|
|
* Incrementally decode a byte stream into UTF-8 _without replacement_.
|
|
*
|
|
* See the documentation of the class for documentation for `decode_*`
|
|
* methods collectively.
|
|
*/
|
|
inline std::tuple<uint32_t, size_t, size_t>
|
|
decode_to_utf8_without_replacement(gsl::span<const uint8_t> src,
|
|
gsl::span<uint8_t> dst, bool last) {
|
|
size_t src_read = src.size();
|
|
size_t dst_written = dst.size();
|
|
uint32_t result = decoder_decode_to_utf8_without_replacement(
|
|
this, null_to_bogus<const uint8_t>(src.data()), &src_read,
|
|
null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
|
|
return {result, src_read, dst_written};
|
|
}
|
|
|
|
/**
|
|
* Query the worst-case UTF-16 output size (with or without replacement).
|
|
*
|
|
* Returns the size of the output buffer in UTF-16 code units (`char16_t`)
|
|
* that will not overflow given the current state of the decoder and
|
|
* `byte_length` number of additional input bytes or `std::optional`
|
|
* without value if `size_t` would overflow.
|
|
*
|
|
* Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
|
|
* return value of this method applies also in the
|
|
* `_without_replacement` case.
|
|
*/
|
|
inline std::optional<size_t> max_utf16_buffer_length(
|
|
size_t byte_length) const {
|
|
size_t val = decoder_max_utf16_buffer_length(this, byte_length);
|
|
if (val == SIZE_MAX) {
|
|
return std::nullopt;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
/**
|
|
* Incrementally decode a byte stream into UTF-16 with malformed sequences
|
|
* replaced with the REPLACEMENT CHARACTER.
|
|
*
|
|
* See the documentation of the class for documentation for `decode_*`
|
|
* methods collectively.
|
|
*/
|
|
inline std::tuple<uint32_t, size_t, size_t, bool> decode_to_utf16(
|
|
gsl::span<const uint8_t> src, gsl::span<char16_t> dst, bool last) {
|
|
size_t src_read = src.size();
|
|
size_t dst_written = dst.size();
|
|
bool had_replacements;
|
|
uint32_t result =
|
|
decoder_decode_to_utf16(this, null_to_bogus<const uint8_t>(src.data()),
|
|
&src_read, null_to_bogus<char16_t>(dst.data()),
|
|
&dst_written, last, &had_replacements);
|
|
return {result, src_read, dst_written, had_replacements};
|
|
}
|
|
|
|
/**
|
|
* Incrementally decode a byte stream into UTF-16 _without replacement_.
|
|
*
|
|
* See the documentation of the class for documentation for `decode_*`
|
|
* methods collectively.
|
|
*/
|
|
inline std::tuple<uint32_t, size_t, size_t>
|
|
decode_to_utf16_without_replacement(gsl::span<const uint8_t> src,
|
|
gsl::span<char16_t> dst, bool last) {
|
|
size_t src_read = src.size();
|
|
size_t dst_written = dst.size();
|
|
uint32_t result = decoder_decode_to_utf16_without_replacement(
|
|
this, null_to_bogus<const uint8_t>(src.data()), &src_read,
|
|
null_to_bogus<char16_t>(dst.data()), &dst_written, last);
|
|
return {result, src_read, dst_written};
|
|
}
|
|
|
|
/**
|
|
* Checks for compatibility with storing Unicode scalar values as unsigned
|
|
* bytes taking into account the state of the decoder.
|
|
*
|
|
* Returns `std::nullopt` if the decoder is not in a neutral state, including
|
|
* waiting for the BOM, or if the encoding is never Latin1-byte-compatible.
|
|
*
|
|
* Otherwise returns the index of the first byte whose unsigned value doesn't
|
|
* directly correspond to the decoded Unicode scalar value, or the length
|
|
* of the input if all bytes in the input decode directly to scalar values
|
|
* corresponding to the unsigned byte values.
|
|
*
|
|
* Does not change the state of the decoder.
|
|
*
|
|
* Do not use this unless you are supporting SpiderMonkey/V8-style string
|
|
* storage optimizations.
|
|
*/
|
|
inline std::optional<size_t> latin1_byte_compatible_up_to(
|
|
gsl::span<const uint8_t> buffer) const {
|
|
size_t val = decoder_latin1_byte_compatible_up_to(
|
|
this, null_to_bogus<const uint8_t>(buffer.data()),
|
|
static_cast<size_t>(buffer.size()));
|
|
if (val == SIZE_MAX) {
|
|
return std::nullopt;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
private:
|
|
/**
|
|
* Replaces `nullptr` with a bogus pointer suitable for use as part of a
|
|
* zero-length Rust slice.
|
|
*/
|
|
template <class T>
|
|
static inline T* null_to_bogus(T* ptr) {
|
|
return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
|
|
}
|
|
|
|
Decoder() = delete;
|
|
Decoder(const Decoder&) = delete;
|
|
Decoder& operator=(const Decoder&) = delete;
|
|
};
|
|
|
|
/**
|
|
* A converter that encodes a Unicode stream into bytes according to a
|
|
* character encoding in a streaming (incremental) manner.
|
|
*
|
|
* The various `encode_*` methods take an input buffer (`src`) and an output
|
|
* buffer `dst` both of which are caller-allocated. There are variants for
|
|
* both UTF-8 and UTF-16 input buffers.
|
|
*
|
|
* An `encode_*` method encode characters from `src` into bytes characters
|
|
* stored into `dst` until one of the following three things happens:
|
|
*
|
|
* 1. An unmappable character is encountered (`*_without_replacement` variants
|
|
* only).
|
|
*
|
|
* 2. The output buffer has been filled so near capacity that the decoder
|
|
* cannot be sure that processing an additional character of input wouldn't
|
|
* cause so much output that the output buffer would overflow.
|
|
*
|
|
* 3. All the input characters have been processed.
|
|
*
|
|
* The `encode_*` method then returns tuple of a status indicating which one
|
|
* of the three reasons to return happened, how many input code units (`uint8_t`
|
|
* when encoding from UTF-8 and `char16_t` when encoding from UTF-16) were read,
|
|
* how many output bytes were written, and in the case of the variants that
|
|
* perform replacement, a boolean indicating whether an unmappable
|
|
* character was replaced with a numeric character reference during the call.
|
|
*
|
|
* The number of bytes "written" is what's logically written. Garbage may be
|
|
* written in the output buffer beyond the point logically written to.
|
|
*
|
|
* In the case of the methods whose name ends with
|
|
* `*_without_replacement`, the status is a `uint32_t` whose possible values
|
|
* are an unmappable code point, `OUTPUT_FULL` and `INPUT_EMPTY` corresponding
|
|
* to the three cases listed above).
|
|
*
|
|
* In the case of methods whose name does not end with
|
|
* `*_without_replacement`, unmappable characters are automatically replaced
|
|
* with the corresponding numeric character references and unmappable
|
|
* characters do not cause the methods to return early.
|
|
*
|
|
* When encoding from UTF-8 without replacement, the methods are guaranteed
|
|
* not to return indicating that more output space is needed if the length
|
|
* of the output buffer is at least the length returned by
|
|
* `max_buffer_length_from_utf8_without_replacement()`. When encoding from
|
|
* UTF-8 with replacement, the length of the output buffer that guarantees the
|
|
* methods not to return indicating that more output space is needed in the
|
|
* absence of unmappable characters is given by
|
|
* `max_buffer_length_from_utf8_if_no_unmappables()`. When encoding from
|
|
* UTF-16 without replacement, the methods are guaranteed not to return
|
|
* indicating that more output space is needed if the length of the output
|
|
* buffer is at least the length returned by
|
|
* `max_buffer_length_from_utf16_without_replacement()`. When encoding
|
|
* from UTF-16 with replacement, the the length of the output buffer that
|
|
* guarantees the methods not to return indicating that more output space is
|
|
* needed in the absence of unmappable characters is given by
|
|
* `max_buffer_length_from_utf16_if_no_unmappables()`.
|
|
* When encoding with replacement, applications are not expected to size the
|
|
* buffer for the worst case ahead of time but to resize the buffer if there
|
|
* are unmappable characters. This is why max length queries are only available
|
|
* for the case where there are no unmappable characters.
|
|
*
|
|
* When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. When
|
|
* encoding from UTF-16, unpaired surrogates in the input are treated as U+FFFD
|
|
* REPLACEMENT CHARACTERS. Therefore, in order for astral characters not to
|
|
* turn into a pair of REPLACEMENT CHARACTERS, the caller must ensure that
|
|
* surrogate pairs are not split across input buffer boundaries.
|
|
*
|
|
* After an `encode_*` call returns, the output produced so far, taken as a
|
|
* whole from the start of the stream, is guaranteed to consist of a valid
|
|
* byte sequence in the target encoding. (I.e. the code unit sequence for a
|
|
* character is guaranteed not to be split across output buffers. However, due
|
|
* to the stateful nature of ISO-2022-JP, the stream needs to be considered
|
|
* from the start for it to be valid. For other encodings, the validity holds
|
|
* on a per-output buffer basis.)
|
|
*
|
|
* The boolean argument `last` indicates that the end of the stream is reached
|
|
* when all the characters in `src` have been consumed. This argument is needed
|
|
* for ISO-2022-JP and is ignored for other encodings.
|
|
*
|
|
* An `Encoder` object can be used to incrementally encode a byte stream.
|
|
*
|
|
* During the processing of a single stream, the caller must call `encode_*`
|
|
* zero or more times with `last` set to `false` and then call `encode_*` at
|
|
* least once with `last` set to `true`. If `encode_*` returns `INPUT_EMPTY`,
|
|
* the processing of the stream has ended. Otherwise, the caller must call
|
|
* `encode_*` again with `last` set to `true` (or treat an unmappable result,
|
|
* i.e. neither `INPUT_EMPTY` nor `OUTPUT_FULL`, as a fatal error).
|
|
*
|
|
* Once the stream has ended, the `Encoder` object must not be used anymore.
|
|
* That is, you need to create another one to process another stream.
|
|
*
|
|
* When the encoder returns `OUTPUT_FULL` or the encoder returns an unmappable
|
|
* result and the caller does not wish to treat it as a fatal error, the input
|
|
* buffer `src` may not have been completely consumed. In that case, the caller
|
|
* must pass the unconsumed contents of `src` to `encode_*` again upon the next
|
|
* call.
|
|
*
|
|
* # Infinite loops
|
|
*
|
|
* When converting with a fixed-size output buffer whose size is too small to
|
|
* accommodate one character of output, an infinite loop ensues. When
|
|
* converting with a fixed-size output buffer, it generally makes sense to
|
|
* make the buffer fairly large (e.g. couple of kilobytes).
|
|
*/
|
|
class Encoder final {
|
|
public:
|
|
~Encoder() {}
|
|
|
|
static inline void operator delete(void* encoder) {
|
|
encoder_free(reinterpret_cast<Encoder*>(encoder));
|
|
}
|
|
|
|
/**
|
|
* The `Encoding` this `Encoder` is for.
|
|
*/
|
|
inline gsl::not_null<const Encoding*> encoding() const {
|
|
return gsl::not_null<const Encoding*>(encoder_encoding(this));
|
|
}
|
|
|
|
/**
|
|
* Returns `true` if this is an ISO-2022-JP encoder that's not in the
|
|
* ASCII state and `false` otherwise.
|
|
*/
|
|
inline bool has_pending_state() const {
|
|
return encoder_has_pending_state(this);
|
|
}
|
|
|
|
/**
|
|
* Query the worst-case output size when encoding from UTF-8 with
|
|
* replacement.
|
|
*
|
|
* Returns the size of the output buffer in bytes that will not overflow
|
|
* given the current state of the encoder and `byte_length` number of
|
|
* additional input code units if there are no unmappable characters in
|
|
* the input or `SIZE_MAX` if `size_t` would overflow.
|
|
*/
|
|
inline std::optional<size_t> max_buffer_length_from_utf8_if_no_unmappables(
|
|
size_t byte_length) const {
|
|
size_t val = encoder_max_buffer_length_from_utf8_if_no_unmappables(
|
|
this, byte_length);
|
|
if (val == SIZE_MAX) {
|
|
return std::nullopt;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
/**
|
|
* Query the worst-case output size when encoding from UTF-8 without
|
|
* replacement.
|
|
*
|
|
* Returns the size of the output buffer in bytes that will not overflow
|
|
* given the current state of the encoder and `byte_length` number of
|
|
* additional input code units or `SIZE_MAX` if `size_t` would overflow.
|
|
*/
|
|
inline std::optional<size_t> max_buffer_length_from_utf8_without_replacement(
|
|
size_t byte_length) const {
|
|
size_t val = encoder_max_buffer_length_from_utf8_without_replacement(
|
|
this, byte_length);
|
|
if (val == SIZE_MAX) {
|
|
return std::nullopt;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
/**
|
|
* Incrementally encode into byte stream from UTF-8 with unmappable
|
|
* characters replaced with HTML (decimal) numeric character references.
|
|
*
|
|
* See the documentation of the class for documentation for `encode_*`
|
|
* methods collectively.
|
|
*/
|
|
inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf8(
|
|
std::string_view src, gsl::span<uint8_t> dst, bool last) {
|
|
size_t src_read = src.size();
|
|
size_t dst_written = dst.size();
|
|
bool had_replacements;
|
|
uint32_t result = encoder_encode_from_utf8(
|
|
this,
|
|
null_to_bogus<const uint8_t>(
|
|
reinterpret_cast<const uint8_t*>(src.data())),
|
|
&src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last,
|
|
&had_replacements);
|
|
return {result, src_read, dst_written, had_replacements};
|
|
}
|
|
|
|
/**
|
|
* Incrementally encode into byte stream from UTF-8 _without replacement_.
|
|
*
|
|
* See the documentation of the class for documentation for `encode_*`
|
|
* methods collectively.
|
|
*/
|
|
inline std::tuple<uint32_t, size_t, size_t>
|
|
encode_from_utf8_without_replacement(std::string_view src,
|
|
gsl::span<uint8_t> dst, bool last) {
|
|
size_t src_read = src.size();
|
|
size_t dst_written = dst.size();
|
|
uint32_t result = encoder_encode_from_utf8_without_replacement(
|
|
this,
|
|
null_to_bogus<const uint8_t>(
|
|
reinterpret_cast<const uint8_t*>(src.data())),
|
|
&src_read, null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
|
|
return {result, src_read, dst_written};
|
|
}
|
|
|
|
/**
|
|
* Query the worst-case output size when encoding from UTF-16 with
|
|
* replacement.
|
|
*
|
|
* Returns the size of the output buffer in bytes that will not overflow
|
|
* given the current state of the encoder and `u16_length` number of
|
|
* additional input code units if there are no unmappable characters in
|
|
* the input or `SIZE_MAX` if `size_t` would overflow.
|
|
*/
|
|
inline std::optional<size_t> max_buffer_length_from_utf16_if_no_unmappables(
|
|
size_t u16_length) const {
|
|
size_t val = encoder_max_buffer_length_from_utf16_if_no_unmappables(
|
|
this, u16_length);
|
|
if (val == SIZE_MAX) {
|
|
return std::nullopt;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
/**
|
|
* Query the worst-case output size when encoding from UTF-16 without
|
|
* replacement.
|
|
*
|
|
* Returns the size of the output buffer in bytes that will not overflow
|
|
* given the current state of the encoder and `u16_length` number of
|
|
* additional input code units or `SIZE_MAX` if `size_t` would overflow.
|
|
*/
|
|
inline std::optional<size_t> max_buffer_length_from_utf16_without_replacement(
|
|
size_t u16_length) const {
|
|
size_t val = encoder_max_buffer_length_from_utf16_without_replacement(
|
|
this, u16_length);
|
|
if (val == SIZE_MAX) {
|
|
return std::nullopt;
|
|
}
|
|
return val;
|
|
}
|
|
|
|
/**
|
|
* Incrementally encode into byte stream from UTF-16 with unmappable
|
|
* characters replaced with HTML (decimal) numeric character references.
|
|
*
|
|
* See the documentation of the class for documentation for `encode_*`
|
|
* methods collectively.
|
|
*/
|
|
inline std::tuple<uint32_t, size_t, size_t, bool> encode_from_utf16(
|
|
std::u16string_view src, gsl::span<uint8_t> dst, bool last) {
|
|
size_t src_read = src.size();
|
|
size_t dst_written = dst.size();
|
|
bool had_replacements;
|
|
uint32_t result = encoder_encode_from_utf16(
|
|
this, null_to_bogus<const char16_t>(src.data()), &src_read,
|
|
null_to_bogus<uint8_t>(dst.data()), &dst_written, last,
|
|
&had_replacements);
|
|
return {result, src_read, dst_written, had_replacements};
|
|
}
|
|
|
|
/**
|
|
* Incrementally encode into byte stream from UTF-16 _without replacement_.
|
|
*
|
|
* See the documentation of the class for documentation for `encode_*`
|
|
* methods collectively.
|
|
*/
|
|
inline std::tuple<uint32_t, size_t, size_t>
|
|
encode_from_utf16_without_replacement(std::u16string_view src,
|
|
gsl::span<uint8_t> dst, bool last) {
|
|
size_t src_read = src.size();
|
|
size_t dst_written = dst.size();
|
|
uint32_t result = encoder_encode_from_utf16_without_replacement(
|
|
this, null_to_bogus<const char16_t>(src.data()), &src_read,
|
|
null_to_bogus<uint8_t>(dst.data()), &dst_written, last);
|
|
return {result, src_read, dst_written};
|
|
}
|
|
|
|
private:
|
|
/**
|
|
* Replaces `nullptr` with a bogus pointer suitable for use as part of a
|
|
* zero-length Rust slice.
|
|
*/
|
|
template <class T>
|
|
static inline T* null_to_bogus(T* ptr) {
|
|
return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
|
|
}
|
|
|
|
Encoder() = delete;
|
|
Encoder(const Encoder&) = delete;
|
|
Encoder& operator=(const Encoder&) = delete;
|
|
};
|
|
|
|
/**
|
|
* An encoding as defined in the Encoding Standard
|
|
* (https://encoding.spec.whatwg.org/).
|
|
*
|
|
* An _encoding_ defines a mapping from a byte sequence to a Unicode code point
|
|
* sequence and, in most cases, vice versa. Each encoding has a name, an output
|
|
* encoding, and one or more labels.
|
|
*
|
|
* _Labels_ are ASCII-case-insensitive strings that are used to identify an
|
|
* encoding in formats and protocols. The _name_ of the encoding is the
|
|
* preferred label in the case appropriate for returning from the
|
|
* `characterSet` property of the `Document` DOM interface, except for
|
|
* the replacement encoding whose name is not one of its labels.
|
|
*
|
|
* The _output encoding_ is the encoding used for form submission and URL
|
|
* parsing on Web pages in the encoding. This is UTF-8 for the replacement,
|
|
* UTF-16LE and UTF-16BE encodings and the encoding itself for other
|
|
* encodings.
|
|
*
|
|
* # Streaming vs. Non-Streaming
|
|
*
|
|
* When you have the entire input in a single buffer, you can use the
|
|
* methods `decode()`, `decode_with_bom_removal()`,
|
|
* `decode_without_bom_handling()`,
|
|
* `decode_without_bom_handling_and_without_replacement()` and
|
|
* `encode()`. Unlike the rest of the API, these methods perform heap
|
|
* allocations. You should the `Decoder` and `Encoder` objects when your input
|
|
* is split into multiple buffers or when you want to control the allocation of
|
|
* the output buffers.
|
|
*
|
|
* # Instances
|
|
*
|
|
* All instances of `Encoding` are statically allocated and have the process's
|
|
* lifetime. There is precisely one unique `Encoding` instance for each
|
|
* encoding defined in the Encoding Standard.
|
|
*
|
|
* To obtain a reference to a particular encoding whose identity you know at
|
|
* compile time, use a `static` that refers to encoding. There is a `static`
|
|
* for each encoding. The `static`s are named in all caps with hyphens
|
|
* replaced with underscores and with `_ENCODING` appended to the
|
|
* name. For example, if you know at compile time that you will want to
|
|
* decode using the UTF-8 encoding, use the `UTF_8_ENCODING` `static`.
|
|
*
|
|
* If you don't know what encoding you need at compile time and need to
|
|
* dynamically get an encoding by label, use `Encoding::for_label()`.
|
|
*
|
|
* Instances of `Encoding` can be compared with `==`.
|
|
*/
|
|
class Encoding final {
|
|
public:
|
|
/**
|
|
* Implements the _get an encoding_ algorithm
|
|
* (https://encoding.spec.whatwg.org/#concept-encoding-get).
|
|
*
|
|
* If, after ASCII-lowercasing and removing leading and trailing
|
|
* whitespace, the argument matches a label defined in the Encoding
|
|
* Standard, `const Encoding*` representing the corresponding
|
|
* encoding is returned. If there is no match, `nullptr` is returned.
|
|
*
|
|
* This is the right method to use if the action upon the method returning
|
|
* `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`)
|
|
* instead. When the action upon the method returning `nullptr` is not to
|
|
* proceed with a fallback but to refuse processing,
|
|
* `for_label_no_replacement()` is more appropriate.
|
|
*/
|
|
static inline const Encoding* for_label(gsl::cstring_span<> label) {
|
|
return encoding_for_label(
|
|
null_to_bogus<const uint8_t>(
|
|
reinterpret_cast<const uint8_t*>(label.data())),
|
|
label.length());
|
|
}
|
|
|
|
/**
|
|
* This method behaves the same as `for_label()`, except when `for_label()`
|
|
* would return `REPLACEMENT_ENCODING`, this method returns `nullptr` instead.
|
|
*
|
|
* This method is useful in scenarios where a fatal error is required
|
|
* upon invalid label, because in those cases the caller typically wishes
|
|
* to treat the labels that map to the replacement encoding as fatal
|
|
* errors, too.
|
|
*
|
|
* It is not OK to use this method when the action upon the method returning
|
|
* `nullptr` is to use a fallback encoding (e.g. `WINDOWS_1252_ENCODING`). In
|
|
* such a case, the `for_label()` method should be used instead in order to
|
|
* avoid
|
|
* unsafe fallback for labels that `for_label()` maps to
|
|
* `REPLACEMENT_ENCODING`.
|
|
*/
|
|
static inline const Encoding* for_label_no_replacement(
|
|
gsl::cstring_span<> label) {
|
|
return encoding_for_label_no_replacement(
|
|
null_to_bogus<const uint8_t>(
|
|
reinterpret_cast<const uint8_t*>(label.data())),
|
|
label.length());
|
|
}
|
|
|
|
/**
|
|
* Performs non-incremental BOM sniffing.
|
|
*
|
|
* The argument must either be a buffer representing the entire input
|
|
* stream (non-streaming case) or a buffer representing at least the first
|
|
* three bytes of the input stream (streaming case).
|
|
*
|
|
* Returns a std::optinal wrapping `make_tuple(UTF_8_ENCODING, 3)`,
|
|
* `make_tuple(UTF_16LE_ENCODING, 2)` or `make_tuple(UTF_16BE_ENCODING, 3)`
|
|
* if the argument starts with the UTF-8, UTF-16LE or UTF-16BE BOM or
|
|
* `std::nullopt` otherwise.
|
|
*/
|
|
static inline std::optional<
|
|
std::tuple<gsl::not_null<const Encoding*>, size_t>>
|
|
for_bom(gsl::span<const uint8_t> buffer) {
|
|
size_t len = buffer.size();
|
|
const Encoding* encoding =
|
|
encoding_for_bom(null_to_bogus(buffer.data()), &len);
|
|
if (encoding) {
|
|
return std::make_tuple(gsl::not_null<const Encoding*>(encoding), len);
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
/**
|
|
* Returns the name of this encoding.
|
|
*
|
|
* This name is appropriate to return as-is from the DOM
|
|
* `document.characterSet` property.
|
|
*/
|
|
inline std::string name() const {
|
|
std::string name(ENCODING_NAME_MAX_LENGTH, '\0');
|
|
// http://herbsutter.com/2008/04/07/cringe-not-vectors-are-guaranteed-to-be-contiguous/#comment-483
|
|
size_t length = encoding_name(this, reinterpret_cast<uint8_t*>(&name[0]));
|
|
name.resize(length);
|
|
return name;
|
|
}
|
|
|
|
/**
|
|
* Checks whether the _output encoding_ of this encoding can encode every
|
|
* Unicode code point. (Only true if the output encoding is UTF-8.)
|
|
*/
|
|
inline bool can_encode_everything() const {
|
|
return encoding_can_encode_everything(this);
|
|
}
|
|
|
|
/**
|
|
* Checks whether the bytes 0x00...0x7F map exclusively to the characters
|
|
* U+0000...U+007F and vice versa.
|
|
*/
|
|
inline bool is_ascii_compatible() const {
|
|
return encoding_is_ascii_compatible(this);
|
|
}
|
|
|
|
/**
|
|
* Checks whether this encoding maps one byte to one Basic Multilingual
|
|
* Plane code point (i.e. byte length equals decoded UTF-16 length) and
|
|
* vice versa (for mappable characters).
|
|
*
|
|
* `true` iff this encoding is on the list of Legacy single-byte
|
|
* encodings (https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
|
|
* in the spec or x-user-defined.
|
|
*/
|
|
inline bool is_single_byte() const { return encoding_is_single_byte(this); }
|
|
|
|
/**
|
|
* Returns the _output encoding_ of this encoding. This is UTF-8 for
|
|
* UTF-16BE, UTF-16LE and replacement and the encoding itself otherwise.
|
|
*/
|
|
inline gsl::not_null<const Encoding*> output_encoding() const {
|
|
return gsl::not_null<const Encoding*>(encoding_output_encoding(this));
|
|
}
|
|
|
|
/**
|
|
* Decode complete input to `std::string` _with BOM sniffing_ and with
|
|
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
|
|
* entire input is available as a single buffer (i.e. the end of the
|
|
* buffer marks the end of the stream).
|
|
*
|
|
* This method implements the (non-streaming version of) the
|
|
* _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
|
|
*
|
|
* The second item in the returned tuple is the encoding that was actually
|
|
* used (which may differ from this encoding thanks to BOM sniffing).
|
|
*
|
|
* The third item in the returned tuple indicates whether there were
|
|
* malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use `new_decoder()`
|
|
* when decoding segmented input.
|
|
*/
|
|
inline std::tuple<std::string, gsl::not_null<const Encoding*>, bool> decode(
|
|
gsl::span<const uint8_t> bytes) const {
|
|
auto opt = Encoding::for_bom(bytes);
|
|
const Encoding* encoding;
|
|
if (opt) {
|
|
size_t bom_length;
|
|
std::tie(encoding, bom_length) = *opt;
|
|
bytes = bytes.subspan(bom_length);
|
|
} else {
|
|
encoding = this;
|
|
}
|
|
auto [str, had_errors] = encoding->decode_without_bom_handling(bytes);
|
|
return {str, gsl::not_null<const Encoding*>(encoding), had_errors};
|
|
}
|
|
|
|
/**
|
|
* Decode complete input to `std::string` _with BOM removal_ and with
|
|
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
|
|
* entire input is available as a single buffer (i.e. the end of the
|
|
* buffer marks the end of the stream).
|
|
*
|
|
* When invoked on `UTF_8`, this method implements the (non-streaming
|
|
* version of) the _UTF-8 decode_
|
|
* (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
|
|
*
|
|
* The second item in the returned pair indicates whether there were
|
|
* malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use
|
|
* `new_decoder_with_bom_removal()` when decoding segmented input.
|
|
*/
|
|
inline std::tuple<std::string, bool> decode_with_bom_removal(
|
|
gsl::span<const uint8_t> bytes) const {
|
|
if (this == UTF_8_ENCODING && bytes.size() >= 3 &&
|
|
(gsl::as_bytes(bytes.first<3>()) ==
|
|
gsl::as_bytes(gsl::make_span("\xEF\xBB\xBF")))) {
|
|
bytes = bytes.subspan(3, bytes.size() - 3);
|
|
} else if (this == UTF_16LE_ENCODING && bytes.size() >= 2 &&
|
|
(gsl::as_bytes(bytes.first<2>()) ==
|
|
gsl::as_bytes(gsl::make_span("\xFF\xFE")))) {
|
|
bytes = bytes.subspan(2, bytes.size() - 2);
|
|
} else if (this == UTF_16BE_ENCODING && bytes.size() >= 2 &&
|
|
(gsl::as_bytes(bytes.first<2>()) ==
|
|
gsl::as_bytes(gsl::make_span("\xFE\xFF")))) {
|
|
bytes = bytes.subspan(2, bytes.size() - 2);
|
|
}
|
|
return decode_without_bom_handling(bytes);
|
|
}
|
|
|
|
/**
|
|
* Decode complete input to `std::string` _without BOM handling_ and
|
|
* with malformed sequences replaced with the REPLACEMENT CHARACTER when
|
|
* the entire input is available as a single buffer (i.e. the end of the
|
|
* buffer marks the end of the stream).
|
|
*
|
|
* When invoked on `UTF_8`, this method implements the (non-streaming
|
|
* version of) the _UTF-8 decode without BOM_
|
|
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
|
|
*
|
|
* The second item in the returned pair indicates whether there were
|
|
* malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use
|
|
* `new_decoder_without_bom_handling()` when decoding segmented input.
|
|
*/
|
|
inline std::tuple<std::string, bool> decode_without_bom_handling(
|
|
gsl::span<const uint8_t> bytes) const {
|
|
auto decoder = new_decoder_without_bom_handling();
|
|
auto needed = decoder->max_utf8_buffer_length(bytes.size());
|
|
if (!needed) {
|
|
throw std::overflow_error("Overflow in buffer size computation.");
|
|
}
|
|
std::string string(needed.value(), '\0');
|
|
const auto [result, read, written, had_errors] = decoder->decode_to_utf8(
|
|
bytes,
|
|
gsl::make_span(reinterpret_cast<uint8_t*>(&string[0]), string.size()),
|
|
true);
|
|
assert(read == static_cast<size_t>(bytes.size()));
|
|
assert(written <= static_cast<size_t>(string.size()));
|
|
assert(result == INPUT_EMPTY);
|
|
string.resize(written);
|
|
return {string, had_errors};
|
|
}
|
|
|
|
/**
|
|
* Decode complete input to `std::string` _without BOM handling_ and
|
|
* _with malformed sequences treated as fatal_ when the entire input is
|
|
* available as a single buffer (i.e. the end of the buffer marks the end
|
|
* of the stream).
|
|
*
|
|
* When invoked on `UTF_8`, this method implements the (non-streaming
|
|
* version of) the _UTF-8 decode without BOM or fail_
|
|
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
|
|
* spec concept.
|
|
*
|
|
* Returns `std::nullopt` if a malformed sequence was encountered and the result
|
|
* of the decode as `std::optional<std::string>` otherwise.
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use
|
|
* `new_decoder_without_bom_handling()` when decoding segmented input.
|
|
*/
|
|
inline std::optional<std::string>
|
|
decode_without_bom_handling_and_without_replacement(
|
|
gsl::span<const uint8_t> bytes) const {
|
|
auto decoder = new_decoder_without_bom_handling();
|
|
auto needed =
|
|
decoder->max_utf8_buffer_length_without_replacement(bytes.size());
|
|
if (!needed) {
|
|
throw std::overflow_error("Overflow in buffer size computation.");
|
|
}
|
|
std::string string(needed.value(), '\0');
|
|
const auto [result, read, written] =
|
|
decoder->decode_to_utf8_without_replacement(
|
|
bytes,
|
|
gsl::make_span(reinterpret_cast<uint8_t*>(&string[0]),
|
|
string.size()),
|
|
true);
|
|
assert(result != OUTPUT_FULL);
|
|
if (result == INPUT_EMPTY) {
|
|
assert(read == static_cast<size_t>(bytes.size()));
|
|
assert(written <= static_cast<size_t>(string.size()));
|
|
string.resize(written);
|
|
return string;
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
/**
|
|
* Decode complete input to `std::u16string` _with BOM sniffing_ and with
|
|
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
|
|
* entire input is available as a single buffer (i.e. the end of the
|
|
* buffer marks the end of the stream).
|
|
*
|
|
* This method implements the (non-streaming version of) the
|
|
* _decode_ (https://encoding.spec.whatwg.org/#decode) spec concept.
|
|
*
|
|
* The second item in the returned tuple is the encoding that was actually
|
|
* used (which may differ from this encoding thanks to BOM sniffing).
|
|
*
|
|
* The third item in the returned tuple indicates whether there were
|
|
* malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use `new_decoder()`
|
|
* when decoding segmented input.
|
|
*/
|
|
inline std::tuple<std::u16string, gsl::not_null<const Encoding*>, bool>
|
|
decode16(gsl::span<const uint8_t> bytes) const {
|
|
auto opt = Encoding::for_bom(bytes);
|
|
const Encoding* encoding;
|
|
if (opt) {
|
|
size_t bom_length;
|
|
std::tie(encoding, bom_length) = *opt;
|
|
bytes = bytes.subspan(bom_length);
|
|
} else {
|
|
encoding = this;
|
|
}
|
|
auto [str, had_errors] = encoding->decode16_without_bom_handling(bytes);
|
|
return {str, gsl::not_null<const Encoding*>(encoding), had_errors};
|
|
}
|
|
|
|
/**
|
|
* Decode complete input to `std::u16string` _with BOM removal_ and with
|
|
* malformed sequences replaced with the REPLACEMENT CHARACTER when the
|
|
* entire input is available as a single buffer (i.e. the end of the
|
|
* buffer marks the end of the stream).
|
|
*
|
|
* When invoked on `UTF_8`, this method implements the (non-streaming
|
|
* version of) the _UTF-8 decode_
|
|
* (https://encoding.spec.whatwg.org/#utf-8-decode) spec concept.
|
|
*
|
|
* The second item in the returned pair indicates whether there were
|
|
* malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use
|
|
* `new_decoder_with_bom_removal()` when decoding segmented input.
|
|
*/
|
|
inline std::tuple<std::u16string, bool> decode16_with_bom_removal(
|
|
gsl::span<const uint8_t> bytes) const {
|
|
if (this == UTF_8_ENCODING && bytes.size() >= 3 &&
|
|
(gsl::as_bytes(bytes.first<3>()) ==
|
|
gsl::as_bytes(gsl::make_span("\xEF\xBB\xBF")))) {
|
|
bytes = bytes.subspan(3, bytes.size() - 3);
|
|
} else if (this == UTF_16LE_ENCODING && bytes.size() >= 2 &&
|
|
(gsl::as_bytes(bytes.first<2>()) ==
|
|
gsl::as_bytes(gsl::make_span("\xFF\xFE")))) {
|
|
bytes = bytes.subspan(2, bytes.size() - 2);
|
|
} else if (this == UTF_16BE_ENCODING && bytes.size() >= 2 &&
|
|
(gsl::as_bytes(bytes.first<2>()) ==
|
|
gsl::as_bytes(gsl::make_span("\xFE\xFF")))) {
|
|
bytes = bytes.subspan(2, bytes.size() - 2);
|
|
}
|
|
return decode16_without_bom_handling(bytes);
|
|
}
|
|
|
|
/**
|
|
* Decode complete input to `std::u16string` _without BOM handling_ and
|
|
* with malformed sequences replaced with the REPLACEMENT CHARACTER when
|
|
* the entire input is available as a single buffer (i.e. the end of the
|
|
* buffer marks the end of the stream).
|
|
*
|
|
* When invoked on `UTF_8`, this method implements the (non-streaming
|
|
* version of) the _UTF-8 decode without BOM_
|
|
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom) spec concept.
|
|
*
|
|
* The second item in the returned pair indicates whether there were
|
|
* malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use
|
|
* `new_decoder_without_bom_handling()` when decoding segmented input.
|
|
*/
|
|
inline std::tuple<std::u16string, bool> decode16_without_bom_handling(
|
|
gsl::span<const uint8_t> bytes) const {
|
|
auto decoder = new_decoder_without_bom_handling();
|
|
auto needed = decoder->max_utf16_buffer_length(bytes.size());
|
|
if (!needed) {
|
|
throw std::overflow_error("Overflow in buffer size computation.");
|
|
}
|
|
std::u16string string(needed.value(), '\0');
|
|
const auto [result, read, written, had_errors] = decoder->decode_to_utf16(
|
|
bytes, gsl::make_span(&string[0], string.size()), true);
|
|
assert(read == static_cast<size_t>(bytes.size()));
|
|
assert(written <= static_cast<size_t>(string.size()));
|
|
assert(result == INPUT_EMPTY);
|
|
string.resize(written);
|
|
return {string, had_errors};
|
|
}
|
|
|
|
/**
|
|
* Decode complete input to `std::u16string` _without BOM handling_ and
|
|
* _with malformed sequences treated as fatal_ when the entire input is
|
|
* available as a single buffer (i.e. the end of the buffer marks the end
|
|
* of the stream).
|
|
*
|
|
* When invoked on `UTF_8`, this method implements the (non-streaming
|
|
* version of) the _UTF-8 decode without BOM or fail_
|
|
* (https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
|
|
* spec concept.
|
|
*
|
|
* Returns `std::nullopt` if a malformed sequence was encountered and the result
|
|
* of the decode as `std::optional<std::u16string>` otherwise.
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use
|
|
* `new_decoder_without_bom_handling()` when decoding segmented input.
|
|
*/
|
|
inline std::optional<std::u16string>
|
|
decode16_without_bom_handling_and_without_replacement(
|
|
gsl::span<const uint8_t> bytes) const {
|
|
auto decoder = new_decoder_without_bom_handling();
|
|
auto needed = decoder->max_utf16_buffer_length(bytes.size());
|
|
if (!needed) {
|
|
throw std::overflow_error("Overflow in buffer size computation.");
|
|
}
|
|
std::u16string string(needed.value(), '\0');
|
|
const auto [result, read, written] =
|
|
decoder->decode_to_utf16_without_replacement(
|
|
bytes, gsl::make_span(&string[0], string.size()), true);
|
|
assert(result != OUTPUT_FULL);
|
|
if (result == INPUT_EMPTY) {
|
|
assert(read == static_cast<size_t>(bytes.size()));
|
|
assert(written <= static_cast<size_t>(string.size()));
|
|
string.resize(written);
|
|
return string;
|
|
}
|
|
return std::nullopt;
|
|
}
|
|
|
|
/**
|
|
* Encode complete input to `std::vector<uint8_t>` with unmappable characters
|
|
* replaced with decimal numeric character references when the entire input
|
|
* is available as a single buffer (i.e. the end of the buffer marks the
|
|
* end of the stream).
|
|
*
|
|
* This method implements the (non-streaming version of) the
|
|
* _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
|
|
*
|
|
* The second item in the returned tuple is the encoding that was actually
|
|
* used (which may differ from this encoding thanks to some encodings
|
|
* having UTF-8 as their output encoding).
|
|
*
|
|
* The third item in the returned tuple indicates whether there were
|
|
* unmappable characters (that were replaced with HTML numeric character
|
|
* references).
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use `new_encoder()`
|
|
* when encoding segmented output.
|
|
*/
|
|
inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool>
|
|
encode(std::string_view string) const {
|
|
auto output_enc = output_encoding();
|
|
if (output_enc == UTF_8_ENCODING) {
|
|
std::vector<uint8_t> vec(string.size());
|
|
std::memcpy(&vec[0], string.data(), string.size());
|
|
}
|
|
auto encoder = output_enc->new_encoder();
|
|
auto needed =
|
|
encoder->max_buffer_length_from_utf8_if_no_unmappables(string.size());
|
|
if (!needed) {
|
|
throw std::overflow_error("Overflow in buffer size computation.");
|
|
}
|
|
std::vector<uint8_t> vec(needed.value());
|
|
bool total_had_errors = false;
|
|
size_t total_read = 0;
|
|
size_t total_written = 0;
|
|
for (;;) {
|
|
const auto [result, read, written, had_errors] =
|
|
encoder->encode_from_utf8(string.substr(total_read),
|
|
gsl::make_span(vec).subspan(total_written),
|
|
true);
|
|
total_read += read;
|
|
total_written += written;
|
|
total_had_errors |= had_errors;
|
|
if (result == INPUT_EMPTY) {
|
|
assert(total_read == static_cast<size_t>(string.size()));
|
|
assert(total_written <= static_cast<size_t>(vec.size()));
|
|
vec.resize(total_written);
|
|
return {vec, gsl::not_null<const Encoding*>(output_enc),
|
|
total_had_errors};
|
|
}
|
|
auto needed = encoder->max_buffer_length_from_utf8_if_no_unmappables(
|
|
string.size() - total_read);
|
|
if (!needed) {
|
|
throw std::overflow_error("Overflow in buffer size computation.");
|
|
}
|
|
vec.resize(total_written + needed.value());
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Encode complete input to `std::vector<uint8_t>` with unmappable characters
|
|
* replaced with decimal numeric character references when the entire input
|
|
* is available as a single buffer (i.e. the end of the buffer marks the
|
|
* end of the stream).
|
|
*
|
|
* This method implements the (non-streaming version of) the
|
|
* _encode_ (https://encoding.spec.whatwg.org/#encode) spec concept.
|
|
*
|
|
* The second item in the returned tuple is the encoding that was actually
|
|
* used (which may differ from this encoding thanks to some encodings
|
|
* having UTF-8 as their output encoding).
|
|
*
|
|
* The third item in the returned tuple indicates whether there were
|
|
* unmappable characters (that were replaced with HTML numeric character
|
|
* references).
|
|
*
|
|
* _Note:_ It is wrong to use this when the input buffer represents only
|
|
* a segment of the input instead of the whole input. Use `new_encoder()`
|
|
* when encoding segmented output.
|
|
*/
|
|
inline std::tuple<std::vector<uint8_t>, gsl::not_null<const Encoding*>, bool>
|
|
encode(std::u16string_view string) const {
|
|
auto output_enc = output_encoding();
|
|
auto encoder = output_enc->new_encoder();
|
|
auto needed =
|
|
encoder->max_buffer_length_from_utf16_if_no_unmappables(string.size());
|
|
if (!needed) {
|
|
throw std::overflow_error("Overflow in buffer size computation.");
|
|
}
|
|
std::vector<uint8_t> vec(needed.value());
|
|
bool total_had_errors = false;
|
|
size_t total_read = 0;
|
|
size_t total_written = 0;
|
|
for (;;) {
|
|
const auto [result, read, written, had_errors] =
|
|
encoder->encode_from_utf16(string.substr(total_read),
|
|
gsl::make_span(vec).subspan(total_written),
|
|
true);
|
|
total_read += read;
|
|
total_written += written;
|
|
total_had_errors |= had_errors;
|
|
if (result == INPUT_EMPTY) {
|
|
assert(total_read == static_cast<size_t>(string.size()));
|
|
assert(total_written <= static_cast<size_t>(vec.size()));
|
|
vec.resize(total_written);
|
|
return {vec, gsl::not_null<const Encoding*>(output_enc),
|
|
total_had_errors};
|
|
}
|
|
auto needed = encoder->max_buffer_length_from_utf16_if_no_unmappables(
|
|
string.size() - total_read);
|
|
if (!needed) {
|
|
throw std::overflow_error("Overflow in buffer size computation.");
|
|
}
|
|
vec.resize(total_written + needed.value());
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Instantiates a new decoder for this encoding with BOM sniffing enabled.
|
|
*
|
|
* BOM sniffing may cause the returned decoder to morph into a decoder
|
|
* for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
|
|
*/
|
|
inline std::unique_ptr<Decoder> new_decoder() const {
|
|
return std::unique_ptr<Decoder>(encoding_new_decoder(this));
|
|
}
|
|
|
|
/**
|
|
* Instantiates a new decoder for this encoding with BOM sniffing enabled
|
|
* into memory occupied by a previously-instantiated decoder.
|
|
*
|
|
* BOM sniffing may cause the returned decoder to morph into a decoder
|
|
* for UTF-8, UTF-16LE or UTF-16BE instead of this encoding.
|
|
*/
|
|
inline void new_decoder_into(Decoder& decoder) const {
|
|
encoding_new_decoder_into(this, &decoder);
|
|
}
|
|
|
|
/**
|
|
* Instantiates a new decoder for this encoding with BOM removal.
|
|
*
|
|
* If the input starts with bytes that are the BOM for this encoding,
|
|
* those bytes are removed. However, the decoder never morphs into a
|
|
* decoder for another encoding: A BOM for another encoding is treated as
|
|
* (potentially malformed) input to the decoding algorithm for this
|
|
* encoding.
|
|
*/
|
|
inline std::unique_ptr<Decoder> new_decoder_with_bom_removal() const {
|
|
return std::unique_ptr<Decoder>(
|
|
encoding_new_decoder_with_bom_removal(this));
|
|
}
|
|
|
|
/**
|
|
* Instantiates a new decoder for this encoding with BOM removal
|
|
* into memory occupied by a previously-instantiated decoder.
|
|
*
|
|
* If the input starts with bytes that are the BOM for this encoding,
|
|
* those bytes are removed. However, the decoder never morphs into a
|
|
* decoder for another encoding: A BOM for another encoding is treated as
|
|
* (potentially malformed) input to the decoding algorithm for this
|
|
* encoding.
|
|
*/
|
|
inline void new_decoder_with_bom_removal_into(Decoder& decoder) const {
|
|
encoding_new_decoder_with_bom_removal_into(this, &decoder);
|
|
}
|
|
|
|
/**
|
|
* Instantiates a new decoder for this encoding with BOM handling disabled.
|
|
*
|
|
* If the input starts with bytes that look like a BOM, those bytes are
|
|
* not treated as a BOM. (Hence, the decoder never morphs into a decoder
|
|
* for another encoding.)
|
|
*
|
|
* _Note:_ If the caller has performed BOM sniffing on its own but has not
|
|
* removed the BOM, the caller should use `new_decoder_with_bom_removal()`
|
|
* instead of this method to cause the BOM to be removed.
|
|
*/
|
|
inline std::unique_ptr<Decoder> new_decoder_without_bom_handling() const {
|
|
return std::unique_ptr<Decoder>(
|
|
encoding_new_decoder_without_bom_handling(this));
|
|
}
|
|
|
|
/**
|
|
* Instantiates a new decoder for this encoding with BOM handling disabled
|
|
* into memory occupied by a previously-instantiated decoder.
|
|
*
|
|
* If the input starts with bytes that look like a BOM, those bytes are
|
|
* not treated as a BOM. (Hence, the decoder never morphs into a decoder
|
|
* for another encoding.)
|
|
*
|
|
* _Note:_ If the caller has performed BOM sniffing on its own but has not
|
|
* removed the BOM, the caller should use
|
|
* `new_decoder_with_bom_removal_into()`
|
|
* instead of this method to cause the BOM to be removed.
|
|
*/
|
|
inline void new_decoder_without_bom_handling_into(Decoder& decoder) const {
|
|
encoding_new_decoder_without_bom_handling_into(this, &decoder);
|
|
}
|
|
|
|
/**
|
|
* Instantiates a new encoder for the output encoding of this encoding.
|
|
*/
|
|
inline std::unique_ptr<Encoder> new_encoder() const {
|
|
return std::unique_ptr<Encoder>(encoding_new_encoder(this));
|
|
}
|
|
|
|
/**
|
|
* Instantiates a new encoder for the output encoding of this encoding
|
|
* into memory occupied by a previously-instantiated encoder.
|
|
*/
|
|
inline void new_encoder_into(Encoder& encoder) const {
|
|
encoding_new_encoder_into(this, &encoder);
|
|
}
|
|
|
|
/**
|
|
* Validates UTF-8.
|
|
*
|
|
* Returns the index of the first byte that makes the input malformed as
|
|
* UTF-8 or the length of the input if the input is entirely valid.
|
|
*/
|
|
static inline size_t utf8_valid_up_to(gsl::span<const uint8_t> buffer) {
|
|
return encoding_utf8_valid_up_to(
|
|
null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
|
|
}
|
|
|
|
/**
|
|
* Validates ASCII.
|
|
*
|
|
* Returns the index of the first byte that makes the input malformed as
|
|
* ASCII or the length of the input if the input is entirely valid.
|
|
*/
|
|
static inline size_t ascii_valid_up_to(gsl::span<const uint8_t> buffer) {
|
|
return encoding_ascii_valid_up_to(
|
|
null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
|
|
}
|
|
|
|
/**
|
|
* Validates ISO-2022-JP ASCII-state data.
|
|
*
|
|
* Returns the index of the first byte that makes the input not
|
|
* representable in the ASCII state of ISO-2022-JP or the length of the
|
|
* input if the input is entirely representable in the ASCII state of
|
|
* ISO-2022-JP.
|
|
*/
|
|
static inline size_t iso_2022_jp_ascii_valid_up_to(
|
|
gsl::span<const uint8_t> buffer) {
|
|
return encoding_iso_2022_jp_ascii_valid_up_to(
|
|
null_to_bogus<const uint8_t>(buffer.data()), buffer.size());
|
|
}
|
|
|
|
private:
|
|
/**
|
|
* Replaces `nullptr` with a bogus pointer suitable for use as part of a
|
|
* zero-length Rust slice.
|
|
*/
|
|
template <class T>
|
|
static inline T* null_to_bogus(T* ptr) {
|
|
return ptr ? ptr : reinterpret_cast<T*>(alignof(T));
|
|
}
|
|
|
|
Encoding() = delete;
|
|
Encoding(const Encoding&) = delete;
|
|
Encoding& operator=(const Encoding&) = delete;
|
|
~Encoding() = delete;
|
|
};
|
|
|
|
}; // namespace encoding_rs
|
|
|
|
#endif // encoding_rs_cpp_h_
|