forked from mirrors/gecko-dev
		
	 c21767988b
			
		
	
	
		c21767988b
		
	
	
	
	
		
			
			This will make customizing flags on the regex type less error-prone, as passing a flag (e.g. `RustRegex::FLAG_CASEI`) will not implicitly disable the default `UNICODE` flag. Names and comments were taken from the original `RegexBuilder` type in the `regex` crate which this type is inspired by, and the type was changed to be cheaply copyable and support method chaining, so that it can be used more ergonomically. Differential Revision: https://phabricator.services.mozilla.com/D159083
		
			
				
	
	
		
			707 lines
		
	
	
	
		
			23 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			707 lines
		
	
	
	
		
			23 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 | |
| /* vim: set ts=8 sts=2 et sw=2 tw=80: */
 | |
| /* This Source Code Form is subject to the terms of the Mozilla Public
 | |
|  * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 | |
|  * You can obtain one at http://mozilla.org/MPL/2.0/. */
 | |
| 
 | |
| #ifndef mozilla_RustRegex_h
 | |
| #define mozilla_RustRegex_h
 | |
| 
 | |
| #include "nsPrintfCString.h"
 | |
| #include "nsTArray.h"
 | |
| #include "rure.h"
 | |
| #include "mozilla/Maybe.h"
 | |
| #include "mozilla/UniquePtr.h"
 | |
| 
 | |
| namespace mozilla {
 | |
| 
 | |
| // This header is a thin wrapper around the `rure.h` header file, which declares
 | |
| // the C API for interacting with the rust `regex` crate. This is intended to
 | |
| // make the type more ergonomic to use with mozilla types.
 | |
| 
 | |
| class RustRegex;
 | |
| class RustRegexSet;
 | |
| class RustRegexOptions;
 | |
| class RustRegexCaptures;
 | |
| class RustRegexIter;
 | |
| class RustRegexIterCaptureNames;
 | |
| 
 | |
| using RustRegexMatch = rure_match;
 | |
| 
 | |
| /*
 | |
|  * RustRegexCaptures represents storage for sub-capture locations of a match.
 | |
|  *
 | |
|  * Computing the capture groups of a match can carry a significant performance
 | |
|  * penalty, so their use in the API is optional.
 | |
|  *
 | |
|  * A RustRegexCaptures value may outlive its corresponding RustRegex and can be
 | |
|  * freed independently.
 | |
|  *
 | |
|  * It is not safe to use from multiple threads simultaneously.
 | |
|  */
 | |
| class RustRegexCaptures final {
 | |
|  public:
 | |
|   RustRegexCaptures() = default;
 | |
| 
 | |
|   // Check if the `RustRegexCaptures` object is valid.
 | |
|   bool IsValid() const { return mPtr != nullptr; }
 | |
|   explicit operator bool() const { return IsValid(); }
 | |
| 
 | |
|   /*
 | |
|    * CaptureAt returns Some if and only if the capturing group at the
 | |
|    * index given was part of the match. If so, the returned RustRegexMatch
 | |
|    * object contains the start and end offsets (in bytes) of the match.
 | |
|    *
 | |
|    * If no capture group with the index aIdx exists, or the group was not part
 | |
|    * of the match, then Nothing is returned.  (A capturing group exists if and
 | |
|    * only if aIdx is less than Length().)
 | |
|    *
 | |
|    * Note that index 0 corresponds to the full match.
 | |
|    */
 | |
|   Maybe<RustRegexMatch> CaptureAt(size_t aIdx) const {
 | |
|     RustRegexMatch match;
 | |
|     if (mPtr && rure_captures_at(mPtr.get(), aIdx, &match)) {
 | |
|       return Some(match);
 | |
|     }
 | |
|     return Nothing();
 | |
|   }
 | |
|   Maybe<RustRegexMatch> operator[](size_t aIdx) const {
 | |
|     return CaptureAt(aIdx);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Returns the number of capturing groups in this `RustRegexCaptures`.
 | |
|    */
 | |
|   size_t Length() const { return mPtr ? rure_captures_len(mPtr.get()) : 0; }
 | |
| 
 | |
|  private:
 | |
|   friend class RustRegex;
 | |
|   friend class RustRegexIter;
 | |
| 
 | |
|   explicit RustRegexCaptures(rure* aRe)
 | |
|       : mPtr(aRe ? rure_captures_new(aRe) : nullptr) {}
 | |
| 
 | |
|   struct Deleter {
 | |
|     void operator()(rure_captures* ptr) const { rure_captures_free(ptr); }
 | |
|   };
 | |
|   UniquePtr<rure_captures, Deleter> mPtr;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * RustRegexIterCaptureNames is an iterator over the list of capture group names
 | |
|  * in this particular RustRegex.
 | |
|  *
 | |
|  * A RustRegexIterCaptureNames value may not outlive its corresponding
 | |
|  * RustRegex, and should be destroyed before its corresponding RustRegex is
 | |
|  * destroyed.
 | |
|  *
 | |
|  * It is not safe to use from multiple threads simultaneously.
 | |
|  */
 | |
| class RustRegexIterCaptureNames {
 | |
|  public:
 | |
|   RustRegexIterCaptureNames() = delete;
 | |
| 
 | |
|   // Check if the `RustRegexIterCaptureNames` object is valid.
 | |
|   bool IsValid() const { return mPtr != nullptr; }
 | |
|   explicit operator bool() const { return IsValid(); }
 | |
| 
 | |
|   /*
 | |
|    * Advances the iterator and returns true if and only if another capture group
 | |
|    * name exists.
 | |
|    *
 | |
|    * The value of the capture group name is written to the provided pointer.
 | |
|    */
 | |
|   mozilla::Maybe<const char*> Next() {
 | |
|     char* next = nullptr;
 | |
|     if (mPtr && rure_iter_capture_names_next(mPtr.get(), &next)) {
 | |
|       return Some(next);
 | |
|     }
 | |
|     return Nothing();
 | |
|   }
 | |
| 
 | |
|  private:
 | |
|   friend class RustRegex;
 | |
| 
 | |
|   explicit RustRegexIterCaptureNames(rure* aRe)
 | |
|       : mPtr(aRe ? rure_iter_capture_names_new(aRe) : nullptr) {}
 | |
| 
 | |
|   struct Deleter {
 | |
|     void operator()(rure_iter_capture_names* ptr) const {
 | |
|       rure_iter_capture_names_free(ptr);
 | |
|     }
 | |
|   };
 | |
|   UniquePtr<rure_iter_capture_names, Deleter> mPtr;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * RustRegexIter is an iterator over successive non-overlapping matches in a
 | |
|  * particular haystack.
 | |
|  *
 | |
|  * A RustRegexIter value may not outlive its corresponding RustRegex and should
 | |
|  * be destroyed before its corresponding RustRegex is destroyed.
 | |
|  *
 | |
|  * It is not safe to use from multiple threads simultaneously.
 | |
|  */
 | |
| class RustRegexIter {
 | |
|  public:
 | |
|   RustRegexIter() = delete;
 | |
| 
 | |
|   // Check if the `RustRegexIter` object is valid.
 | |
|   bool IsValid() const { return mPtr != nullptr; }
 | |
|   explicit operator bool() const { return IsValid(); }
 | |
| 
 | |
|   /*
 | |
|    * Next() returns Some if and only if this regex matches anywhere in haystack.
 | |
|    * The returned RustRegexMatch object contains the start and end offsets (in
 | |
|    * bytes) of the match.
 | |
|    *
 | |
|    * If no match is found, then subsequent calls will return Nothing()
 | |
|    * indefinitely.
 | |
|    *
 | |
|    * Next() should be preferred to NextCaptures() since it may be faster.
 | |
|    *
 | |
|    * N.B. The performance of this search is not impacted by the presence of
 | |
|    * capturing groups in your regular expression.
 | |
|    */
 | |
|   mozilla::Maybe<RustRegexMatch> Next() {
 | |
|     RustRegexMatch match{};
 | |
|     if (mPtr &&
 | |
|         rure_iter_next(mPtr.get(), mHaystackPtr, mHaystackSize, &match)) {
 | |
|       return Some(match);
 | |
|     }
 | |
|     return Nothing();
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * NextCaptures returns a valid RustRegexCaptures if and only if this regex
 | |
|    * matches anywhere in haystack. If a match is found, then all of its capture
 | |
|    * locations are stored in the returned RustRegexCaptures object.
 | |
|    *
 | |
|    * If no match is found, then subsequent calls will return an invalid
 | |
|    * `RustRegexCaptures` indefinitely.
 | |
|    *
 | |
|    * Only use this function if you specifically need access to capture
 | |
|    * locations. It is not necessary to use this function just because your
 | |
|    * regular expression contains capturing groups.
 | |
|    *
 | |
|    * Capture locations can be accessed using the methods on RustRegexCaptures.
 | |
|    *
 | |
|    * N.B. The performance of this search can be impacted by the number of
 | |
|    * capturing groups. If you're using this function, it may be beneficial to
 | |
|    * use non-capturing groups (e.g., `(?:re)`) where possible.
 | |
|    */
 | |
|   RustRegexCaptures NextCaptures() {
 | |
|     RustRegexCaptures captures(mRe);
 | |
|     if (mPtr && rure_iter_next_captures(mPtr.get(), mHaystackPtr, mHaystackSize,
 | |
|                                         captures.mPtr.get())) {
 | |
|       return captures;
 | |
|     }
 | |
|     return {};
 | |
|   }
 | |
| 
 | |
|  private:
 | |
|   friend class RustRegex;
 | |
|   RustRegexIter(rure* aRe, const std::string_view& aHaystack)
 | |
|       : mRe(aRe),
 | |
|         mHaystackPtr(reinterpret_cast<const uint8_t*>(aHaystack.data())),
 | |
|         mHaystackSize(aHaystack.size()),
 | |
|         mPtr(aRe ? rure_iter_new(aRe) : nullptr) {}
 | |
| 
 | |
|   rure* MOZ_NON_OWNING_REF mRe;
 | |
|   const uint8_t* MOZ_NON_OWNING_REF mHaystackPtr;
 | |
|   size_t mHaystackSize;
 | |
| 
 | |
|   struct Deleter {
 | |
|     void operator()(rure_iter* ptr) const { rure_iter_free(ptr); }
 | |
|   };
 | |
|   UniquePtr<rure_iter, Deleter> mPtr;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * RustRegexOptions is the set of configuration options for compiling a regular
 | |
|  * expression.
 | |
|  *
 | |
|  * All flags on this type can be used to set default flags while compiling, and
 | |
|  * can be toggled in the expression itself using standard syntax, e.g. `(?i)`
 | |
|  * turns case-insensitive matching on, and `(?-i)` disables it.
 | |
|  *
 | |
|  * In addition, two non-flag options are available: setting the size limit of
 | |
|  * the compiled program and setting the size limit of the cache of states that
 | |
|  * the DFA uses while searching.
 | |
|  *
 | |
|  * For most uses, the default settings will work fine, and a default-constructed
 | |
|  * RustRegexOptions can be passed.
 | |
|  */
 | |
| class RustRegexOptions {
 | |
|  public:
 | |
|   RustRegexOptions() = default;
 | |
| 
 | |
|   /*
 | |
|    * Set the value for the case insensitive (i) flag.
 | |
|    *
 | |
|    * When enabled, letters in the pattern will match both upper case and lower
 | |
|    * case variants.
 | |
|    */
 | |
|   RustRegexOptions& CaseInsensitive(bool aYes) {
 | |
|     return SetFlag(aYes, RURE_FLAG_CASEI);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Set the value for the multi-line matching (m) flag.
 | |
|    *
 | |
|    * When enabled, ^ matches the beginning of lines and $ matches the end of
 | |
|    * lines.
 | |
|    *
 | |
|    * By default, they match beginning/end of the input.
 | |
|    */
 | |
|   RustRegexOptions& MultiLine(bool aYes) {
 | |
|     return SetFlag(aYes, RURE_FLAG_MULTI);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Set the value for the any character (s) flag, where in . matches anything
 | |
|    * when s is set and matches anything except for new line when it is not set
 | |
|    * (the default).
 | |
|    *
 | |
|    * N.B. “matches anything” means “any byte” when Unicode is disabled and means
 | |
|    * “any valid UTF-8 encoding of any Unicode scalar value” when Unicode is
 | |
|    * enabled.
 | |
|    */
 | |
|   RustRegexOptions& DotMatchesNewLine(bool aYes) {
 | |
|     return SetFlag(aYes, RURE_FLAG_DOTNL);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Set the value for the greedy swap (U) flag.
 | |
|    *
 | |
|    * When enabled, a pattern like a* is lazy (tries to find shortest match) and
 | |
|    * a*? is greedy (tries to find longest match).
 | |
|    *
 | |
|    * By default, a* is greedy and a*? is lazy.
 | |
|    */
 | |
|   RustRegexOptions& SwapGreed(bool aYes) {
 | |
|     return SetFlag(aYes, RURE_FLAG_SWAP_GREED);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Set the value for the ignore whitespace (x) flag.
 | |
|    *
 | |
|    * When enabled, whitespace such as new lines and spaces will be ignored
 | |
|    * between expressions of the pattern, and # can be used to start a comment
 | |
|    * until the next new line.
 | |
|    */
 | |
|   RustRegexOptions& IgnoreWhitespace(bool aYes) {
 | |
|     return SetFlag(aYes, RURE_FLAG_SPACE);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Set the value for the Unicode (u) flag.
 | |
|    *
 | |
|    * Enabled by default. When disabled, character classes such as \w only match
 | |
|    * ASCII word characters instead of all Unicode word characters.
 | |
|    */
 | |
|   RustRegexOptions& Unicode(bool aYes) {
 | |
|     return SetFlag(aYes, RURE_FLAG_UNICODE);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * SizeLimit sets the appoximate size limit of the compiled regular
 | |
|    * expression.
 | |
|    *
 | |
|    * This size limit roughly corresponds to the number of bytes occupied by
 | |
|    * a single compiled program. If the program would exceed this number,
 | |
|    * then an invalid RustRegex will be constructed.
 | |
|    */
 | |
|   RustRegexOptions& SizeLimit(size_t aLimit) {
 | |
|     mSizeLimit = Some(aLimit);
 | |
|     return *this;
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * DFASizeLimit sets the approximate size of the cache used by the DFA during
 | |
|    * search.
 | |
|    *
 | |
|    * This roughly corresponds to the number of bytes that the DFA will use while
 | |
|    * searching.
 | |
|    *
 | |
|    * Note that this is a *per thread* limit. There is no way to set a global
 | |
|    * limit. In particular, if a regular expression is used from multiple threads
 | |
|    * simultaneously, then each thread may use up to the number of bytes
 | |
|    * specified here.
 | |
|    */
 | |
|   RustRegexOptions& DFASizeLimit(size_t aLimit) {
 | |
|     mDFASizeLimit = Some(aLimit);
 | |
|     return *this;
 | |
|   }
 | |
| 
 | |
|  private:
 | |
|   friend class RustRegex;
 | |
|   friend class RustRegexSet;
 | |
| 
 | |
|   struct OptionsDeleter {
 | |
|     void operator()(rure_options* ptr) const { rure_options_free(ptr); }
 | |
|   };
 | |
| 
 | |
|   UniquePtr<rure_options, OptionsDeleter> GetOptions() const {
 | |
|     UniquePtr<rure_options, OptionsDeleter> options;
 | |
|     if (mSizeLimit || mDFASizeLimit) {
 | |
|       options.reset(rure_options_new());
 | |
|       if (mSizeLimit) {
 | |
|         rure_options_size_limit(options.get(), *mSizeLimit);
 | |
|       }
 | |
|       if (mDFASizeLimit) {
 | |
|         rure_options_dfa_size_limit(options.get(), *mDFASizeLimit);
 | |
|       }
 | |
|     }
 | |
|     return options;
 | |
|   }
 | |
| 
 | |
|   uint32_t GetFlags() const { return mFlags; }
 | |
| 
 | |
|   RustRegexOptions& SetFlag(bool aYes, uint32_t aFlag) {
 | |
|     if (aYes) {
 | |
|       mFlags |= aFlag;
 | |
|     } else {
 | |
|       mFlags &= ~aFlag;
 | |
|     }
 | |
|     return *this;
 | |
|   }
 | |
| 
 | |
|   uint32_t mFlags = RURE_DEFAULT_FLAGS;
 | |
|   Maybe<size_t> mSizeLimit;
 | |
|   Maybe<size_t> mDFASizeLimit;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * RustRegex is the type of a compiled regular expression.
 | |
|  *
 | |
|  * A RustRegex can be safely used from multiple threads simultaneously.
 | |
|  *
 | |
|  * When calling the matching methods on this type, they will generally have the
 | |
|  * following parameters:
 | |
|  *
 | |
|  * aHaystack
 | |
|  *   may contain arbitrary bytes, but ASCII compatible text is more useful.
 | |
|  *   UTF-8 is even more useful. Other text encodings aren't supported.
 | |
|  *
 | |
|  * aStart
 | |
|  *   the position in bytes at which to start searching. Note that setting the
 | |
|  *   start position is distinct from using a substring for `aHaystack`, since
 | |
|  *   the regex engine may look at bytes before the start position to determine
 | |
|  *   match information. For example, if the start position is greater than 0,
 | |
|  *   then the \A ("begin text") anchor can never match.
 | |
|  */
 | |
| class RustRegex final {
 | |
|  public:
 | |
|   // Create a new invalid RustRegex object
 | |
|   RustRegex() = default;
 | |
| 
 | |
|   /*
 | |
|    * Compiles the given pattern into a regular expression. The pattern must be
 | |
|    * valid UTF-8 and the length corresponds to the number of bytes in the
 | |
|    * pattern.
 | |
|    *
 | |
|    * If an error occurs, the constructed RustRegex will be `!IsValid()`.
 | |
|    *
 | |
|    * The compiled expression returned may be used from multiple threads
 | |
|    * simultaneously.
 | |
|    */
 | |
|   explicit RustRegex(const std::string_view& aPattern,
 | |
|                      const RustRegexOptions& aOptions = {}) {
 | |
| #ifdef DEBUG
 | |
|     rure_error* error = rure_error_new();
 | |
| #else
 | |
|     rure_error* error = nullptr;
 | |
| #endif
 | |
|     mPtr.reset(rure_compile(reinterpret_cast<const uint8_t*>(aPattern.data()),
 | |
|                             aPattern.size(), aOptions.GetFlags(),
 | |
|                             aOptions.GetOptions().get(), error));
 | |
| #ifdef DEBUG
 | |
|     if (!mPtr) {
 | |
|       NS_WARNING(nsPrintfCString("RustRegex compile failed: %s",
 | |
|                                  rure_error_message(error))
 | |
|                      .get());
 | |
|     }
 | |
|     rure_error_free(error);
 | |
| #endif
 | |
|   }
 | |
| 
 | |
|   // Check if the compiled `RustRegex` is valid.
 | |
|   bool IsValid() const { return mPtr != nullptr; }
 | |
|   explicit operator bool() const { return IsValid(); }
 | |
| 
 | |
|   /*
 | |
|    * IsMatch returns true if and only if this regex matches anywhere in
 | |
|    * aHaystack.
 | |
|    *
 | |
|    * See the type-level comment for details on aHaystack and aStart.
 | |
|    *
 | |
|    * IsMatch() should be preferred to Find() since it may be faster.
 | |
|    *
 | |
|    * N.B. The performance of this search is not impacted by the presence of
 | |
|    * capturing groups in your regular expression.
 | |
|    */
 | |
|   bool IsMatch(const std::string_view& aHaystack, size_t aStart = 0) const {
 | |
|     return mPtr &&
 | |
|            rure_is_match(mPtr.get(),
 | |
|                          reinterpret_cast<const uint8_t*>(aHaystack.data()),
 | |
|                          aHaystack.size(), aStart);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Find returns Some if and only if this regex matches anywhere in
 | |
|    * haystack. The returned RustRegexMatch object contains the start and end
 | |
|    * offsets (in bytes) of the match.
 | |
|    *
 | |
|    * See the type-level comment for details on aHaystack and aStart.
 | |
|    *
 | |
|    * Find() should be preferred to FindCaptures() since it may be faster.
 | |
|    *
 | |
|    * N.B. The performance of this search is not impacted by the presence of
 | |
|    * capturing groups in your regular expression.
 | |
|    */
 | |
|   Maybe<RustRegexMatch> Find(const std::string_view& aHaystack,
 | |
|                              size_t aStart = 0) const {
 | |
|     RustRegexMatch match{};
 | |
|     if (mPtr && rure_find(mPtr.get(),
 | |
|                           reinterpret_cast<const uint8_t*>(aHaystack.data()),
 | |
|                           aHaystack.size(), aStart, &match)) {
 | |
|       return Some(match);
 | |
|     }
 | |
|     return Nothing();
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * FindCaptures() returns a valid RustRegexCaptures if and only if this
 | |
|    * regex matches anywhere in haystack. If a match is found, then all of its
 | |
|    * capture locations are stored in the returned RustRegexCaptures object.
 | |
|    *
 | |
|    * See the type-level comment for details on aHaystack and aStart.
 | |
|    *
 | |
|    * Only use this function if you specifically need access to capture
 | |
|    * locations. It is not necessary to use this function just because your
 | |
|    * regular expression contains capturing groups.
 | |
|    *
 | |
|    * Capture locations can be accessed using the methods on RustRegexCaptures.
 | |
|    *
 | |
|    * N.B. The performance of this search can be impacted by the number of
 | |
|    * capturing groups. If you're using this function, it may be beneficial to
 | |
|    * use non-capturing groups (e.g., `(?:re)`) where possible.
 | |
|    */
 | |
|   RustRegexCaptures FindCaptures(const std::string_view& aHaystack,
 | |
|                                  size_t aStart = 0) const {
 | |
|     RustRegexCaptures captures(mPtr.get());
 | |
|     if (mPtr &&
 | |
|         rure_find_captures(mPtr.get(),
 | |
|                            reinterpret_cast<const uint8_t*>(aHaystack.data()),
 | |
|                            aHaystack.size(), aStart, captures.mPtr.get())) {
 | |
|       return captures;
 | |
|     }
 | |
|     return {};
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * ShortestMatch() returns Some if and only if this regex matches anywhere
 | |
|    * in haystack. If a match is found, then its end location is stored in the
 | |
|    * pointer given. The end location is the place at which the regex engine
 | |
|    * determined that a match exists, but may occur before the end of the
 | |
|    * proper leftmost-first match.
 | |
|    *
 | |
|    * See the type-level comment for details on aHaystack and aStart.
 | |
|    *
 | |
|    * ShortestMatch should be preferred to Find since it may be faster.
 | |
|    *
 | |
|    * N.B. The performance of this search is not impacted by the presence of
 | |
|    * capturing groups in your regular expression.
 | |
|    */
 | |
|   Maybe<size_t> ShortestMatch(const std::string_view& aHaystack,
 | |
|                               size_t aStart = 0) const {
 | |
|     size_t end = 0;
 | |
|     if (mPtr &&
 | |
|         rure_shortest_match(mPtr.get(),
 | |
|                             reinterpret_cast<const uint8_t*>(aHaystack.data()),
 | |
|                             aHaystack.size(), aStart, &end)) {
 | |
|       return Some(end);
 | |
|     }
 | |
|     return Nothing();
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Create an iterator over all successive non-overlapping matches of this
 | |
|    * regex in aHaystack.
 | |
|    *
 | |
|    * See the type-level comment for details on aHaystack.
 | |
|    *
 | |
|    * Both aHaystack and this regex must remain valid until the returned
 | |
|    * `RustRegexIter` is destroyed.
 | |
|    */
 | |
|   RustRegexIter IterMatches(const std::string_view& aHaystack) const {
 | |
|     return RustRegexIter(mPtr.get(), aHaystack);
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Returns the capture index for the name given. If no such named capturing
 | |
|    * group exists in this regex, then -1 is returned.
 | |
|    *
 | |
|    * The capture index may be used with RustRegexCaptures::CaptureAt.
 | |
|    *
 | |
|    * This function never returns 0 since the first capture group always
 | |
|    * corresponds to the entire match and is always unnamed.
 | |
|    */
 | |
|   int32_t CaptureNameIndex(const char* aName) const {
 | |
|     return mPtr ? rure_capture_name_index(mPtr.get(), aName) : -1;
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Create an iterator over the list of capture group names in this particular
 | |
|    * regex.
 | |
|    *
 | |
|    * This regex must remain valid until the returned `RustRegexIterCaptureNames`
 | |
|    * is destroyed.
 | |
|    */
 | |
|   RustRegexIterCaptureNames IterCaptureNames() const {
 | |
|     return RustRegexIterCaptureNames(mPtr.get());
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Count the number of successive non-overlapping matches of this regex in
 | |
|    * aHaystack.
 | |
|    *
 | |
|    * See the type-level comment for details on aHaystack.
 | |
|    */
 | |
|   size_t CountMatches(const std::string_view& aHaystack) const {
 | |
|     size_t count = 0;
 | |
|     auto iter = IterMatches(aHaystack);
 | |
|     while (iter.Next()) {
 | |
|       count++;
 | |
|     }
 | |
|     return count;
 | |
|   }
 | |
| 
 | |
|  private:
 | |
|   struct Deleter {
 | |
|     void operator()(rure* ptr) const { rure_free(ptr); }
 | |
|   };
 | |
|   UniquePtr<rure, Deleter> mPtr;
 | |
| };
 | |
| 
 | |
| /*
 | |
|  * RustRegexSet is the type of a set of compiled regular expression.
 | |
|  *
 | |
|  * A RustRegexSet can be safely used from multiple threads simultaneously.
 | |
|  *
 | |
|  * When calling the matching methods on this type, they will generally have the
 | |
|  * following parameters:
 | |
|  *
 | |
|  * aHaystack
 | |
|  *   may contain arbitrary bytes, but ASCII compatible text is more useful.
 | |
|  *   UTF-8 is even more useful. Other text encodings aren't supported.
 | |
|  *
 | |
|  * aStart
 | |
|  *   the position in bytes at which to start searching. Note that setting the
 | |
|  *   start position is distinct from using a substring for `aHaystack`, since
 | |
|  *   the regex engine may look at bytes before the start position to determine
 | |
|  *   match information. For example, if the start position is greater than 0,
 | |
|  *   then the \A ("begin text") anchor can never match.
 | |
|  */
 | |
| class RustRegexSet final {
 | |
|  public:
 | |
|   /*
 | |
|    * Compiles the given range of patterns into a single regular expression which
 | |
|    * can be matched in a linear-scan. Each pattern in aPatterns must be valid
 | |
|    * UTF-8, and implicitly coerce to `std::string_view`.
 | |
|    *
 | |
|    * If an error occurs, the constructed RustRegexSet will be `!IsValid()`.
 | |
|    *
 | |
|    * The compiled expression returned may be used from multiple threads
 | |
|    * simultaneously.
 | |
|    */
 | |
|   template <typename Patterns>
 | |
|   explicit RustRegexSet(Patterns&& aPatterns,
 | |
|                         const RustRegexOptions& aOptions = {}) {
 | |
| #ifdef DEBUG
 | |
|     rure_error* error = rure_error_new();
 | |
| #else
 | |
|     rure_error* error = nullptr;
 | |
| #endif
 | |
|     AutoTArray<const uint8_t*, 4> patternPtrs;
 | |
|     AutoTArray<size_t, 4> patternSizes;
 | |
|     for (auto&& pattern : std::forward<Patterns>(aPatterns)) {
 | |
|       std::string_view view = pattern;
 | |
|       patternPtrs.AppendElement(reinterpret_cast<const uint8_t*>(view.data()));
 | |
|       patternSizes.AppendElement(view.size());
 | |
|     }
 | |
|     mPtr.reset(rure_compile_set(patternPtrs.Elements(), patternSizes.Elements(),
 | |
|                                 patternPtrs.Length(), aOptions.GetFlags(),
 | |
|                                 aOptions.GetOptions().get(), error));
 | |
| #ifdef DEBUG
 | |
|     if (!mPtr) {
 | |
|       NS_WARNING(nsPrintfCString("RustRegexSet compile failed: %s",
 | |
|                                  rure_error_message(error))
 | |
|                      .get());
 | |
|     }
 | |
|     rure_error_free(error);
 | |
| #endif
 | |
|   }
 | |
| 
 | |
|   // Check if the `RustRegexSet` object is valid.
 | |
|   bool IsValid() const { return mPtr != nullptr; }
 | |
|   explicit operator bool() const { return IsValid(); }
 | |
| 
 | |
|   /*
 | |
|    * IsMatch returns true if and only if any regexes within the set
 | |
|    * match anywhere in the haystack. Once a match has been located, the
 | |
|    * matching engine will quit immediately.
 | |
|    *
 | |
|    * See the type-level comment for details on aHaystack and aStart.
 | |
|    */
 | |
|   bool IsMatch(const std::string_view& aHaystack, size_t aStart = 0) const {
 | |
|     return mPtr &&
 | |
|            rure_set_is_match(mPtr.get(),
 | |
|                              reinterpret_cast<const uint8_t*>(aHaystack.data()),
 | |
|                              aHaystack.size(), aStart);
 | |
|   }
 | |
| 
 | |
|   struct SetMatches {
 | |
|     bool matchedAny = false;
 | |
|     nsTArray<bool> matches;
 | |
|   };
 | |
| 
 | |
|   /*
 | |
|    * Matches() compares each regex in the set against the haystack and
 | |
|    * returns a list with the match result of each pattern. Match results are
 | |
|    * ordered in the same way as the regex set was compiled. For example, index 0
 | |
|    * of matches corresponds to the first pattern passed to the constructor.
 | |
|    *
 | |
|    * See the type-level comment for details on aHaystack and aStart.
 | |
|    *
 | |
|    * Only use this function if you specifically need to know which regexes
 | |
|    * matched within the set. To determine if any of the regexes matched without
 | |
|    * caring which, use IsMatch.
 | |
|    */
 | |
|   SetMatches Matches(const std::string_view& aHaystack,
 | |
|                      size_t aStart = 0) const {
 | |
|     nsTArray<bool> matches;
 | |
|     matches.SetLength(Length());
 | |
|     bool any = mPtr && rure_set_matches(
 | |
|                            mPtr.get(),
 | |
|                            reinterpret_cast<const uint8_t*>(aHaystack.data()),
 | |
|                            aHaystack.size(), aStart, matches.Elements());
 | |
|     return SetMatches{any, std::move(matches)};
 | |
|   }
 | |
| 
 | |
|   /*
 | |
|    * Returns the number of patterns the regex set was compiled with.
 | |
|    */
 | |
|   size_t Length() const { return mPtr ? rure_set_len(mPtr.get()) : 0; }
 | |
| 
 | |
|  private:
 | |
|   struct Deleter {
 | |
|     void operator()(rure_set* ptr) const { rure_set_free(ptr); }
 | |
|   };
 | |
|   UniquePtr<rure_set, Deleter> mPtr;
 | |
| };
 | |
| 
 | |
| }  // namespace mozilla
 | |
| 
 | |
| #endif  // mozilla_RustRegex_h
 |