Backed out 3 changesets (bug 1743322) for causing multiple failures CLOSED TREE

Backed out changeset fa443c853d2d (bug 1743322)
Backed out changeset d299fa6358f7 (bug 1743322)
Backed out changeset 25647d613534 (bug 1743322)
Norisz Fay 2022-12-06 11:48:14 +02:00
parent 568b6a87f9
commit 70bc66e002

@@ -492,7 +492,7 @@ static size_t gPageSize;
# define END_GLOBALS
# define DEFINE_GLOBAL(type) static const type
# define GLOBAL_LOG2 LOG2
# define GLOBAL_ASSERT_HELPER1(x) static_assert(x, #x)
# define GLOBAL_ASSERT_HELPER1(x) static_assert(x, # x)
# define GLOBAL_ASSERT_HELPER2(x, y) static_assert(x, y)
# define GLOBAL_ASSERT(...) \
MACRO_CALL( \
@@ -772,75 +772,6 @@ class SizeClass {
size_t mSize;
};
// Fast division
//
// During deallocation we want to divide by the size class. This class
// provides a routine and sets up a constant as follows.
//
// To divide by a number D that is not a power of two we multiply by (2^17 /
// D) and then right shift by 17 positions.
//
// X / D
//
// becomes
//
// (X * Inv) >> SIZE_INV_SHIFT
//
// Where Inv is calculated during the FastDivisor constructor as:
//
// Inv = 2^SIZE_INV_SHIFT / D
//
template <typename T>
class FastDivisor {
private:
// The shift amount is chosen to minimise the size of inv while
// working for divisors up to 65536 in steps of 16. I arrived at 17
// experimentally. I wanted a low number to minimise the range of inv
// so it can fit in a uint16_t, 16 didn't work but 17 worked perfectly.
//
// We'd need to increase this if we allocated memory on smaller boundaries
// than 16.
static const unsigned divide_inv_shift = 17;
// We can fit the inverted divisor in 16 bits.
T inv;
public:
// Needed so mBins can be constructed.
FastDivisor() : inv(0) {}
FastDivisor(unsigned div, unsigned max) {
MOZ_ASSERT(div <= max);
// divide_inv_shift is large enough.
MOZ_ASSERT((1U << divide_inv_shift) >= div);
unsigned inv_ = ((1U << divide_inv_shift) / div) + 1;
// Make sure that max * inv does not overflow.
MOZ_DIAGNOSTIC_ASSERT(max < UINT_MAX / inv_);
MOZ_ASSERT(inv_ <= std::numeric_limits<T>::max());
inv = static_cast<T>(inv_);
// Initialisation made inv non-zero.
MOZ_ASSERT(inv);
}
// Note that this always occurs in unsigned regardless of inv's type. That
// is, inv is zero-extended before the operation.
inline unsigned divide(unsigned num) const {
// Check that inv was initialised.
MOZ_ASSERT(inv);
return (num * inv) >> divide_inv_shift;
}
};
template <typename T>
unsigned inline operator/(unsigned num, FastDivisor<T> divisor) {
return divisor.divide(num);
}
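A minimal standalone sketch of the multiply-and-shift division described above (the divisor, run size and names here are illustrative, not values from the patch): it checks that (x * inv) >> 17 reproduces x / d for every region start inside a run, which matches the diagnostic assertion diff == regind * size in arena_run_reg_dalloc below.

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned shift = 17;   // divide_inv_shift above
  const unsigned d = 48;       // an illustrative non-power-of-two size class
  const unsigned max = 16384;  // an illustrative run size (a few pages)
  const uint16_t inv = static_cast<uint16_t>(((1U << shift) / d) + 1);

  // Region starts are multiples of d; each must map back to its exact index.
  for (unsigned x = 0; x <= max; x += d) {
    assert(((x * inv) >> shift) == x / d);
  }
  printf("inv = %u, all region offsets divide exactly\n", inv);
  return 0;
}
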
// ***************************************************************************
// Radix tree data structures.
//
@@ -993,6 +924,9 @@ struct arena_bin_t {
// Bin's size class.
size_t mSizeClass;
// Total size of a run for this bin's size class.
size_t mRunSize;
// Total number of regions in a run for this bin's size class.
uint32_t mRunNumRegions;
@@ -1003,14 +937,7 @@ struct arena_bin_t {
uint32_t mRunFirstRegionOffset;
// Current number of runs in this bin, full or otherwise.
uint32_t mNumRuns;
// A constant for fast division by size class. This value is 16 bits wide so
// it is placed last.
FastDivisor<uint16_t> mSizeDivisor;
// Total number of pages in a run for this bin's size class.
uint8_t mRunSizePages;
unsigned long mNumRuns;
// Amount of overhead runs are allowed to have.
static constexpr double kRunOverhead = 1.6_percent;
@@ -1035,17 +962,6 @@ struct arena_bin_t {
inline void Init(SizeClass aSizeClass);
};
// We try to keep the above structure aligned with common cache lines sizes,
// often that's 64 bytes on x86 and ARM, we don't make assumptions for other
// architectures.
#if defined(__x86_64__) || defined(__aarch64__)
// On 64bit platforms this structure is often 48 bytes
// long, which means every other array element will be properly aligned.
static_assert(sizeof(arena_bin_t) == 48);
#elif defined(__x86__) || defined(__arm__)
static_assert(sizeof(arena_bin_t) == 32);
#endif
struct arena_t {
#if defined(MOZ_DIAGNOSTIC_ASSERT_ENABLED)
uint32_t mMagic;
@@ -2432,6 +2348,68 @@ inline void* arena_t::ArenaRunRegAlloc(arena_run_t* aRun, arena_bin_t* aBin) {
return nullptr;
}
// To divide by a number D that is not a power of two we multiply by (2^21 /
// D) and then right shift by 21 positions.
//
// X / D
//
// becomes
//
// (X * size_invs[D - 3]) >> SIZE_INV_SHIFT
//
// Where D is d/Q and Q is a constant factor.
template <unsigned Q, unsigned Max>
struct FastDivide {
static_assert(IsPowerOfTwo(Q), "q must be a power-of-two");
// We don't need FastDivide when dividing by a power-of-two. So when we set
// the range (min_divisor - max_divisor inclusive) we can avoid powers-of-two.
// Because Q is a power of two Q*3 is the first not-power-of-two.
static const unsigned min_divisor = Q * 3;
static const unsigned max_divisor =
mozilla::IsPowerOfTwo(Max) ? Max - Q : Max;
// +1 because this range is inclusive.
static const unsigned num_divisors = (max_divisor - min_divisor) / Q + 1;
static const unsigned inv_shift = 21;
static constexpr unsigned inv(unsigned s) {
return ((1U << inv_shift) / (s * Q)) + 1;
}
static unsigned divide(size_t num, unsigned div) {
// clang-format off
static const unsigned size_invs[] = {
inv(3),
inv(4), inv(5), inv(6), inv(7),
inv(8), inv(9), inv(10), inv(11),
inv(12), inv(13), inv(14), inv(15),
inv(16), inv(17), inv(18), inv(19),
inv(20), inv(21), inv(22), inv(23),
inv(24), inv(25), inv(26), inv(27),
inv(28), inv(29), inv(30), inv(31)
};
// clang-format on
// If the divisor is valid (min is below max) then the size_invs array must
// be large enough.
static_assert(!(min_divisor < max_divisor) ||
num_divisors <= sizeof(size_invs) / sizeof(unsigned),
"num_divisors does not match array size");
MOZ_ASSERT(div >= min_divisor);
MOZ_ASSERT(div <= max_divisor);
MOZ_ASSERT(div % Q == 0);
// If Q isn't a power of two this optimisation would be pointless, we expect
// /Q to be reduced to a shift, but we asserted this above.
const unsigned idx = div / Q - 3;
MOZ_ASSERT(idx < sizeof(size_invs) / sizeof(unsigned));
return (num * size_invs[idx]) >> inv_shift;
}
};
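The reciprocal-table trick restored in the FastDivide struct above can be checked in isolation with a hedged sketch (Q and the ranges here are assumptions standing in for kQuantum/kMaxQuantumClass, not the real constants):

#include <cassert>
#include <cstddef>

constexpr unsigned Q = 16;          // illustrative quantum spacing
constexpr unsigned inv_shift = 21;  // matches the shift in FastDivide above

constexpr unsigned inv(unsigned s) { return ((1U << inv_shift) / (s * Q)) + 1; }

int main() {
  // For each quantum-spaced divisor d = 3*Q .. 31*Q (the table above covers
  // inv(3) .. inv(31)), multiples of d inside a run-sized range divide
  // exactly via multiply-and-shift.
  for (unsigned s = 3; s <= 31; s++) {
    const unsigned d = s * Q;
    for (size_t x = 0; x <= 65536; x += d) {
      assert(((x * inv(s)) >> inv_shift) == x / d);
    }
  }
  return 0;
}
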
static inline void arena_run_reg_dalloc(arena_run_t* run, arena_bin_t* bin,
void* ptr, size_t size) {
unsigned diff, regind, elm, bit;
@@ -2442,11 +2420,22 @@ static inline void arena_run_reg_dalloc(arena_run_t* run, arena_bin_t* bin,
// actual division here can reduce allocator throughput by over 20%!
diff =
(unsigned)((uintptr_t)ptr - (uintptr_t)run - bin->mRunFirstRegionOffset);
MOZ_ASSERT(diff <=
(static_cast<unsigned>(bin->mRunSizePages) << gPageSize2Pow));
regind = diff / bin->mSizeDivisor;
if (mozilla::IsPowerOfTwo(size)) {
regind = diff >> FloorLog2(size);
} else {
SizeClass sc(size);
switch (sc.Type()) {
case SizeClass::Quantum:
regind = FastDivide<kQuantum, kMaxQuantumClass>::divide(diff, size);
break;
case SizeClass::QuantumWide:
regind =
FastDivide<kQuantumWide, kMaxQuantumWideClass>::divide(diff, size);
break;
default:
regind = diff / size;
}
}
MOZ_DIAGNOSTIC_ASSERT(diff == regind * size);
MOZ_DIAGNOSTIC_ASSERT(regind < bin->mRunNumRegions);
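As a hedged illustration of what this dispatch computes (the run base, header offset and the FloorLog2 stand-in below are made up for the example): the region index is the pointer's offset into the run's region area divided by the region size, and for power-of-two sizes the shift is exactly that division.

#include <cassert>
#include <cstdint>

// Stand-in for mozilla::FloorLog2, restricted to power-of-two inputs.
static unsigned FloorLog2Pow2(size_t v) {
  unsigned log = 0;
  while (v >>= 1) {
    log++;
  }
  return log;
}

int main() {
  const size_t size = 64;                   // an illustrative power-of-two class
  const uintptr_t run = 0x10000;            // pretend run base address
  const uintptr_t firstRegionOffset = 192;  // pretend mRunFirstRegionOffset

  for (unsigned regind = 0; regind < 100; regind++) {
    const uintptr_t ptr = run + firstRegionOffset + regind * size;
    const unsigned diff = (unsigned)(ptr - run - firstRegionOffset);
    // Power-of-two path: the shift reproduces the division exactly.
    assert((diff >> FloorLog2Pow2(size)) == diff / size);
    // Mirrors the diagnostic assertions above.
    assert(diff == (diff / size) * size);
    assert(diff / size == regind);
  }
  return 0;
}
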
@@ -2804,11 +2793,10 @@ void arena_t::DallocRun(arena_run_t* aRun, bool aDirty) {
MOZ_RELEASE_ASSERT(run_ind < gChunkNumPages - 1);
if ((chunk->map[run_ind].bits & CHUNK_MAP_LARGE) != 0) {
size = chunk->map[run_ind].bits & ~gPageSizeMask;
run_pages = (size >> gPageSize2Pow);
} else {
run_pages = aRun->mBin->mRunSizePages;
size = run_pages << gPageSize2Pow;
size = aRun->mBin->mRunSize;
}
run_pages = (size >> gPageSize2Pow);
// Mark pages as unallocated in the chunk map.
if (aDirty) {
@@ -2942,8 +2930,7 @@ arena_run_t* arena_t::GetNonFullBinRun(arena_bin_t* aBin) {
// No existing runs have any space available.
// Allocate a new run.
run = AllocRun(static_cast<size_t>(aBin->mRunSizePages) << gPageSize2Pow,
false, false);
run = AllocRun(aBin->mRunSize, false, false);
if (!run) {
return nullptr;
}
@@ -2994,7 +2981,7 @@ void arena_bin_t::Init(SizeClass aSizeClass) {
mSizeClass = aSizeClass.Size();
mNumRuns = 0;
// Run size expansion loop.
// mRunSize expansion loop.
while (true) {
try_nregs = ((try_run_size - kFixedHeaderSize) / mSizeClass) +
1; // Counter-act try_nregs-- in loop.
@@ -3055,12 +3042,10 @@ void arena_bin_t::Init(SizeClass aSizeClass) {
MOZ_ASSERT((try_mask_nelms << (LOG2(sizeof(int)) + 3)) >= try_nregs);
// Copy final settings.
MOZ_ASSERT((try_run_size >> gPageSize2Pow) <= UINT8_MAX);
mRunSizePages = static_cast<uint8_t>(try_run_size >> gPageSize2Pow);
mRunSize = try_run_size;
mRunNumRegions = try_nregs;
mRunNumRegionsMask = try_mask_nelms;
mRunFirstRegionOffset = try_reg0_offset;
mSizeDivisor = FastDivisor<uint16_t>(aSizeClass.Size(), try_run_size);
}
void* arena_t::MallocSmall(size_t aSize, bool aZero) {
@@ -4597,11 +4582,9 @@ inline void MozJemalloc::jemalloc_stats_internal(
aBinStats[j].num_non_full_runs += num_non_full_runs;
aBinStats[j].num_runs += bin->mNumRuns;
aBinStats[j].bytes_unused += bin_unused;
size_t bytes_per_run = static_cast<size_t>(bin->mRunSizePages)
<< gPageSize2Pow;
aBinStats[j].bytes_total +=
bin->mNumRuns * (bytes_per_run - bin->mRunFirstRegionOffset);
aBinStats[j].bytes_per_run = bytes_per_run;
bin->mNumRuns * (bin->mRunSize - bin->mRunFirstRegionOffset);
aBinStats[j].bytes_per_run = bin->mRunSize;
}
}
}