mirror of
				https://github.com/mozilla/gecko-dev.git
				synced 2025-11-04 02:09:05 +02:00 
			
		
		
		
	Done with: ./mach static-analysis check --checks="-*, modernize-concat-nested-namespaces" --fix . and then clang-format on the files Differential Revision: https://phabricator.services.mozilla.com/D58217 --HG-- extra : moz-landing-system : lando
		
			
				
	
	
		
			333 lines
		
	
	
	
		
			13 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			333 lines
		
	
	
	
		
			13 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 | 
						|
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
 | 
						|
/* This Source Code Form is subject to the terms of the Mozilla Public
 | 
						|
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 | 
						|
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 | 
						|
 | 
						|
#include "ImageScaling.h"
 | 
						|
#include "mozilla/Attributes.h"
 | 
						|
 | 
						|
#include "SSEHelpers.h"
 | 
						|
 | 
						|
/* The functions below use the following system for averaging 4 pixels:
 | 
						|
 *
 | 
						|
 * The first observation is that a half-adder is implemented as follows:
 | 
						|
 * R = S + 2C or in the case of a and b (a ^ b) + ((a & b) << 1);
 | 
						|
 *
 | 
						|
 * This can be trivially extended to three pixels by observing that when
 | 
						|
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 | 
						|
 * carries of the individual numbers, since the sum of 3 bits can only ever
 | 
						|
 * have a carry of one.
 | 
						|
 *
 | 
						|
 * We then observe that the average is then ((carry << 1) + sum) >> 1, or,
 | 
						|
 * assuming we eliminate overflows and underflows, carry + (sum >> 1).
 | 
						|
 *
 | 
						|
 * We now average our existing sum with the fourth number, so we get:
 | 
						|
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 | 
						|
 *
 | 
						|
 * We now observe that our sum has been moved into place relative to the
 | 
						|
 * carry, so we can now average with the carry to get the final 4 input
 | 
						|
 * average: avg = (sum2 + carry) >> 1;
 | 
						|
 *
 | 
						|
 * Or to reverse the proof:
 | 
						|
 * avg = ((sum >> 1) + carry + (d >> 1)) >> 1
 | 
						|
 * avg = (((a + b + c) >> 1) + (d >> 1)) >> 1
 | 
						|
 * avg = ((a + b + c + d) >> 2)
 | 
						|
 *
 | 
						|
 * An additional fact used in the SSE versions is the concept that we can
 | 
						|
 * trivially convert a rounded average to a truncated average:
 | 
						|
 *
 | 
						|
 * We have:
 | 
						|
 * f(a, b) = (a + b + 1) >> 1
 | 
						|
 *
 | 
						|
 * And want:
 | 
						|
 * g(a, b) = (a + b) >> 1
 | 
						|
 *
 | 
						|
 * Observe:
 | 
						|
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 | 
						|
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 | 
						|
 *            == ~((-a - 1 + -b) >> 1)
 | 
						|
 *            == ~((-(a + b) - 1) >> 1)
 | 
						|
 *            == ~((~(a + b)) >> 1)
 | 
						|
 *            == (a + b) >> 1
 | 
						|
 *            == g(a, b)
 | 
						|
 */
 | 
						|
 | 
						|
/* Returns the bitwise complement of all 128 bits of |arg|. SSE2 has no
 * dedicated NOT instruction, so the value is XORed with an all-ones vector.
 */
MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg) {
  // -1 has every bit set in each 32-bit lane.
  return _mm_xor_si128(arg, _mm_set1_epi32(-1));
}
 | 
						|
 | 
						|
/* We have to pass pointers here, MSVC does not allow passing more than 3
 | 
						|
 * __m128i arguments on the stack. And it does not allow 16-byte aligned
 | 
						|
 * stack variables. This inlines properly on MSVC 2010. It does -not- inline
 | 
						|
 * with just the inline directive.
 | 
						|
 */
 | 
						|
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i* a, __m128i* b, __m128i* c,
 | 
						|
                                       __m128i* d) {
 | 
						|
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
 | 
						|
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)
 | 
						|
 | 
						|
// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
 | 
						|
// needs to be a compile time constant.
 | 
						|
#define shuffle_si128(arga, argb, imm)                      \
 | 
						|
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), \
 | 
						|
                                  _mm_castsi128_ps((argb)), (imm)));
 | 
						|
 | 
						|
  __m128i t = shuffle_si128(*a, *b, shuf1);
 | 
						|
  *b = shuffle_si128(*a, *b, shuf2);
 | 
						|
  *a = t;
 | 
						|
  t = shuffle_si128(*c, *d, shuf1);
 | 
						|
  *d = shuffle_si128(*c, *d, shuf2);
 | 
						|
  *c = t;
 | 
						|
 | 
						|
#undef shuf1
 | 
						|
#undef shuf2
 | 
						|
#undef shuffle_si128
 | 
						|
 | 
						|
  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));
 | 
						|
 | 
						|
  __m128i carry =
 | 
						|
      _mm_or_si128(_mm_and_si128(*a, *b),
 | 
						|
                   _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));
 | 
						|
 | 
						|
  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));
 | 
						|
 | 
						|
  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
 | 
						|
}
 | 
						|
 | 
						|
/* Truncating per-byte average of two vectors of 4 pixels each (a 4x2 block
 * reduced to 4x1). _mm_avg_epu8 rounds up, so the average is taken on the
 * complemented inputs and the result is complemented back.
 */
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b) {
  const __m128i notA = _mm_not_si128(a);
  const __m128i notB = _mm_not_si128(b);
  const __m128i roundedAvgOfComplements = _mm_avg_epu8(notA, notB);
  return _mm_not_si128(roundedAvgOfComplements);
}
 | 
						|
 | 
						|
/* Averages 8 horizontally consecutive pixels (|a| followed by |b|) down to 4
 * by taking the truncating per-byte average of each adjacent pixel pair.
 */
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b) {
  const __m128 lowFour = _mm_castsi128_ps(a);
  const __m128 highFour = _mm_castsi128_ps(b);

  // Split the 8 pixels into the odd-indexed and the even-indexed ones, so that
  // each pair to be averaged ends up in the same lane of the two vectors.
  const __m128i oddPixels = _mm_castps_si128(
      _mm_shuffle_ps(lowFour, highFour, _MM_SHUFFLE(3, 1, 3, 1)));
  const __m128i evenPixels = _mm_castps_si128(
      _mm_shuffle_ps(lowFour, highFour, _MM_SHUFFLE(2, 0, 2, 0)));

  // Complement, rounded average, complement: yields the truncating average.
  return _mm_not_si128(
      _mm_avg_epu8(_mm_not_si128(oddPixels), _mm_not_si128(evenPixels)));
}
 | 
						|
 | 
						|
/* Scalar counterpart of avg_sse2_8x2: returns the truncating per-byte average
 * of four 32-bit pixels, treating each of the four bytes independently.
 */
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d) {
  // Clears the lowest bit of every byte. Without a byte-wise average
  // instruction, this keeps a bit from leaking into the neighbouring byte
  // when the whole word is shifted right.
  const uint32_t kDropLowBitOfEachByte = 0xfefefefe;

  // Bitwise full adder: a + b + c == parity + 2 * majority.
  const uint32_t parity = a ^ b ^ c;
  const uint32_t majority = (a & b) | (c & (a | b));

  // Truncating average of |parity| and |d|.
  const uint32_t halfSum =
      (((parity ^ d) & kDropLowBitOfEachByte) >> 1) + (parity & d);

  // Truncating average of that with the carry gives (a + b + c + d) >> 2.
  return (((halfSum ^ majority) & kDropLowBitOfEachByte) >> 1) +
         (halfSum & majority);
}
 | 
						|
 | 
						|
// Simple 2 pixel average version of the function above.
 | 
						|
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b) {
 | 
						|
  uint32_t sum = a ^ b;
 | 
						|
  uint32_t carry = (a & b);
 | 
						|
 | 
						|
  uint32_t mask = 0xfefefefe;
 | 
						|
 | 
						|
  return ((sum & mask) >> 1) + carry;
 | 
						|
}
 | 
						|
 | 
						|
namespace mozilla::gfx {
 | 
						|
 | 
						|
void ImageHalfScaler::HalfImage2D_SSE2(uint8_t* aSource, int32_t aSourceStride,
 | 
						|
                                       const IntSize& aSourceSize,
 | 
						|
                                       uint8_t* aDest, uint32_t aDestStride) {
 | 
						|
  const int Bpp = 4;
 | 
						|
 | 
						|
  for (int y = 0; y < aSourceSize.height; y += 2) {
 | 
						|
    __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);
 | 
						|
    int x = 0;
 | 
						|
    // Run a loop depending on alignment.
 | 
						|
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
 | 
						|
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
 | 
						|
      for (; x < (aSourceSize.width - 7); x += 8) {
 | 
						|
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
 | 
						|
        __m128i* lowerRow =
 | 
						|
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
 | 
						|
 | 
						|
        __m128i a = _mm_load_si128(upperRow);
 | 
						|
        __m128i b = _mm_load_si128(upperRow + 1);
 | 
						|
        __m128i c = _mm_load_si128(lowerRow);
 | 
						|
        __m128i d = _mm_load_si128(lowerRow + 1);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
 | 
						|
      }
 | 
						|
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
 | 
						|
      for (; x < (aSourceSize.width - 7); x += 8) {
 | 
						|
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
 | 
						|
        __m128i* lowerRow =
 | 
						|
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
 | 
						|
 | 
						|
        __m128i a = _mm_load_si128(upperRow);
 | 
						|
        __m128i b = _mm_load_si128(upperRow + 1);
 | 
						|
        __m128i c = loadUnaligned128(lowerRow);
 | 
						|
        __m128i d = loadUnaligned128(lowerRow + 1);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
 | 
						|
      }
 | 
						|
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
 | 
						|
      for (; x < (aSourceSize.width - 7); x += 8) {
 | 
						|
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
 | 
						|
        __m128i* lowerRow =
 | 
						|
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
 | 
						|
 | 
						|
        __m128i a = loadUnaligned128((__m128i*)upperRow);
 | 
						|
        __m128i b = loadUnaligned128((__m128i*)upperRow + 1);
 | 
						|
        __m128i c = _mm_load_si128((__m128i*)lowerRow);
 | 
						|
        __m128i d = _mm_load_si128((__m128i*)lowerRow + 1);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
 | 
						|
      }
 | 
						|
    } else {
 | 
						|
      for (; x < (aSourceSize.width - 7); x += 8) {
 | 
						|
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
 | 
						|
        __m128i* lowerRow =
 | 
						|
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
 | 
						|
 | 
						|
        __m128i a = loadUnaligned128(upperRow);
 | 
						|
        __m128i b = loadUnaligned128(upperRow + 1);
 | 
						|
        __m128i c = loadUnaligned128(lowerRow);
 | 
						|
        __m128i d = loadUnaligned128(lowerRow + 1);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    uint32_t* unalignedStorage = (uint32_t*)storage;
 | 
						|
    // Take care of the final pixels, we know there's an even number of pixels
 | 
						|
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
 | 
						|
    //
 | 
						|
    // Potentially we only have to do this in the last row since overflowing
 | 
						|
    // 8 pixels in an earlier row would appear to be harmless as it doesn't
 | 
						|
    // touch invalid memory. Even when reading and writing to the same surface.
 | 
						|
    // in practice we only do this when doing an additional downscale pass, and
 | 
						|
    // in this situation we have unused stride to write into harmlessly.
 | 
						|
    // I do not believe the additional code complexity would be worth it though.
 | 
						|
    for (; x < aSourceSize.width; x += 2) {
 | 
						|
      uint8_t* upperRow = aSource + (y * aSourceStride + x * Bpp);
 | 
						|
      uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);
 | 
						|
 | 
						|
      *unalignedStorage++ =
 | 
						|
          Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
 | 
						|
                 *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
void ImageHalfScaler::HalfImageVertical_SSE2(uint8_t* aSource,
 | 
						|
                                             int32_t aSourceStride,
 | 
						|
                                             const IntSize& aSourceSize,
 | 
						|
                                             uint8_t* aDest,
 | 
						|
                                             uint32_t aDestStride) {
 | 
						|
  for (int y = 0; y < aSourceSize.height; y += 2) {
 | 
						|
    __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);
 | 
						|
    int x = 0;
 | 
						|
    // Run a loop depending on alignment.
 | 
						|
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
 | 
						|
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
 | 
						|
      for (; x < (aSourceSize.width - 3); x += 4) {
 | 
						|
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 | 
						|
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 | 
						|
 | 
						|
        __m128i a = _mm_load_si128((__m128i*)upperRow);
 | 
						|
        __m128i b = _mm_load_si128((__m128i*)lowerRow);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_4x2_4x1(a, b);
 | 
						|
      }
 | 
						|
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
 | 
						|
      // This line doesn't align well.
 | 
						|
      for (; x < (aSourceSize.width - 3); x += 4) {
 | 
						|
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 | 
						|
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 | 
						|
 | 
						|
        __m128i a = _mm_load_si128((__m128i*)upperRow);
 | 
						|
        __m128i b = loadUnaligned128((__m128i*)lowerRow);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_4x2_4x1(a, b);
 | 
						|
      }
 | 
						|
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
 | 
						|
      for (; x < (aSourceSize.width - 3); x += 4) {
 | 
						|
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 | 
						|
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 | 
						|
 | 
						|
        __m128i a = loadUnaligned128((__m128i*)upperRow);
 | 
						|
        __m128i b = _mm_load_si128((__m128i*)lowerRow);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_4x2_4x1(a, b);
 | 
						|
      }
 | 
						|
    } else {
 | 
						|
      for (; x < (aSourceSize.width - 3); x += 4) {
 | 
						|
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 | 
						|
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 | 
						|
 | 
						|
        __m128i a = loadUnaligned128((__m128i*)upperRow);
 | 
						|
        __m128i b = loadUnaligned128((__m128i*)lowerRow);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_4x2_4x1(a, b);
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    uint32_t* unalignedStorage = (uint32_t*)storage;
 | 
						|
    // Take care of the final pixels, we know there's an even number of pixels
 | 
						|
    // in the source rectangle.
 | 
						|
    //
 | 
						|
    // Similar overflow considerations are valid as in the previous function.
 | 
						|
    for (; x < aSourceSize.width; x++) {
 | 
						|
      uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
 | 
						|
      uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
 | 
						|
 | 
						|
      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
void ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t* aSource,
 | 
						|
                                               int32_t aSourceStride,
 | 
						|
                                               const IntSize& aSourceSize,
 | 
						|
                                               uint8_t* aDest,
 | 
						|
                                               uint32_t aDestStride) {
 | 
						|
  for (int y = 0; y < aSourceSize.height; y++) {
 | 
						|
    __m128i* storage = (__m128i*)(aDest + (y * aDestStride));
 | 
						|
    int x = 0;
 | 
						|
    // Run a loop depending on alignment.
 | 
						|
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
 | 
						|
      for (; x < (aSourceSize.width - 7); x += 8) {
 | 
						|
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
 | 
						|
 | 
						|
        __m128i a = _mm_load_si128(pixels);
 | 
						|
        __m128i b = _mm_load_si128(pixels + 1);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_8x1_4x1(a, b);
 | 
						|
      }
 | 
						|
    } else {
 | 
						|
      for (; x < (aSourceSize.width - 7); x += 8) {
 | 
						|
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
 | 
						|
 | 
						|
        __m128i a = loadUnaligned128(pixels);
 | 
						|
        __m128i b = loadUnaligned128(pixels + 1);
 | 
						|
 | 
						|
        *storage++ = avg_sse2_8x1_4x1(a, b);
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
    uint32_t* unalignedStorage = (uint32_t*)storage;
 | 
						|
    // Take care of the final pixels, we know there's an even number of pixels
 | 
						|
    // in the source rectangle.
 | 
						|
    //
 | 
						|
    // Similar overflow considerations are valid as in the previous function.
 | 
						|
    for (; x < aSourceSize.width; x += 2) {
 | 
						|
      uint32_t* pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));
 | 
						|
 | 
						|
      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
 | 
						|
    }
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
}  // namespace mozilla::gfx
 |