Created attachment 176859 [details] setRGBA benchmark with different variants of this function. Floats is slow and shouldn't be used in this case.
Created attachment 176860 [details] Benchmark run results on Intel Core I7 and ARM CPUs
Comment on attachment 176859 [details] setRGBA benchmark with different variants of this function. #include <memory.h> #include <stdio.h> #include <stdlib.h> #include <time.h> typedef unsigned PixelData; bool m_premultiplyAlpha = true; inline void setRGBA_orig(PixelData* dest, unsigned r, unsigned g, unsigned b, unsigned a) { if (m_premultiplyAlpha && a < 255) { if (!a) { *dest = 0; return; } float alphaPercent = a / 255.0f; r = static_cast<unsigned>(r * alphaPercent); g = static_cast<unsigned>(g * alphaPercent); b = static_cast<unsigned>(b * alphaPercent); } *dest = (a << 24 | r << 16 | g << 8 | b); } const int fixPointShift = 24; const unsigned fixPointMult = unsigned(1.0 / 255.0 * (1 << fixPointShift)) + 1; inline void setRGBA_fixPointMult(PixelData* dest, unsigned r, unsigned g, unsigned b, unsigned a) { if (m_premultiplyAlpha && a < 255) { if (!a) { *dest = 0; return; } unsigned alphaMult = a * fixPointMult; r = (r * alphaMult) >> fixPointShift; g = (g * alphaMult) >> fixPointShift; b = (b * alphaMult) >> fixPointShift; } *dest = (a << 24 | r << 16 | g << 8 | b); } inline void setRGBA_integerDiv(PixelData* dest, unsigned r, unsigned g, unsigned b, unsigned a) { if (m_premultiplyAlpha && a < 255) { if (!a) { *dest = 0; return; } r = r * a / 255; g = g * a / 255; b = b * a / 255; } *dest = (a << 24 | r << 16 | g << 8 | b); } inline unsigned fastDiv255(unsigned a) { return (a + (a >> 8) + 1) >> 8; } inline void setRGBA_fastDiv255(PixelData* dest, unsigned r, unsigned g, unsigned b, unsigned a) { if (m_premultiplyAlpha && a < 255) { if (!a) { *dest = 0; return; } r = fastDiv255(r * a); g = fastDiv255(g * a); b = fastDiv255(b * a); } *dest = (a << 24 | r << 16 | g << 8 | b); } const unsigned bufferSize = 256 * 1024 * 1024; // Something bigger than all cpu caches PixelData source[bufferSize]; PixelData dest[bufferSize]; inline unsigned char* castUChar(void* p) { return static_cast<unsigned char *>(p); } void runTests() { unsigned i, tmp; unsigned char *bytesEnd = castUChar(&(source[bufferSize])); PixelData* destPosition; unsigned char *bytes; clock_t start; clock_t end; start = clock(); for(bytes = castUChar(&(source[0])), destPosition = &dest[0]; bytes < bytesEnd; bytes += 4, destPosition++) { setRGBA_integerDiv(destPosition, bytes[0], bytes[1], bytes[2], bytes[3]); } end = clock(); printf("1: %lu\t", end - start); start = clock(); for(bytes = castUChar(&(source[0])), destPosition = &dest[0]; bytes < bytesEnd; bytes += 4, destPosition++) { setRGBA_fastDiv255(destPosition, bytes[0], bytes[1], bytes[2], bytes[3]); } end = clock(); printf("2: %lu\t", end - start); start = clock(); for(bytes = castUChar(&(source[0])), destPosition = &dest[0]; bytes < bytesEnd; bytes += 4, destPosition++) { setRGBA_fixPointMult(destPosition, bytes[0], bytes[1], bytes[2], bytes[3]); } end = clock(); printf("3: %lu\t", end - start); start = clock(); for(bytes = castUChar(&(source[0])), destPosition = &dest[0]; bytes < bytesEnd; bytes += 4, destPosition++) { setRGBA_orig(destPosition, bytes[0], bytes[1], bytes[2], bytes[3]); } end = clock(); printf("4: %lu\n", end - start); } int main(int argc, char *argv[]) { unsigned i, j; unsigned diffValues = 0; // Fastdiv and fixedpoint mult precision check for(i = 0; i < 256; i++) { for(j = 0; j < 256; j++) { unsigned a = fastDiv255(i * j); unsigned alphaMult = i * fixPointMult; unsigned b = (j * alphaMult) >> fixPointShift; unsigned c = j * i / 255; float alphaPercent = i / 255.0f; unsigned d = static_cast<unsigned>(j * alphaPercent); if (a != d || b != d || c != d) { printf("Alpha; %u,\t color: %u,\t a: %u,\t b: %u,\t c: %u,\t d: %u\n", i, j, a, b, c, d); diffValues++; } } } printf("Different %u values\n", diffValues); memset(&dest[0], 0, sizeof(dest)); do { // Run benchmarks with both values of m_premultiplyAlpha to make sure it is not optimized out printf("Premultiply Alpha: %d\n",m_premultiplyAlpha); printf("All 0 buffer:\n"); memset(&source[0], 0, sizeof(source)); runTests(); printf("All 255 buffer:\n"); memset(&source[0], 255, sizeof(source)); runTests(); printf("Pseudo random buffer:\n"); // Fill buffer with 25% 0 alpha, 25% 255 alpha and other random values for(i = 0; i < bufferSize; i += 4) { source[i] = PixelData(rand()) & 0xFFFFFF; source[i] = PixelData(rand()); source[i] = PixelData(rand()) | 0xFF000000; source[i] = PixelData(rand()); } runTests(); m_premultiplyAlpha = 1 - m_premultiplyAlpha; } while(m_premultiplyAlpha == false); }
Benchmark was compiled like this: gcc -O3 setRGBPerf.cpp -o setRGBPerf Close to real life and slowest case of benchmark when tests run on pseudo random source buffer and m_premultiplyAlpha is true. Pseudo random buffer is generated so, that approx 25% of pixels have 0 alpha, 25% have 255 alpha, other values are random. Macbook Pro Ivy Bridge I7 2.7GHz, 256M elements Pseudo random buffer: 1: 580000 2: 560000 3: 500000 4: 3290000 Samsung Galaxy Note II, 32M elements Pseudo random buffer: 1: 380000 2: 360000 3: 360000 4: 1480000 1. Float conversion/division replace with simple integer division. 2. "Fast division" used instead of x/255 like x/256 + x/256^2 = (x + x / 256) / 256 . 1 is added to compensate for rounding error. 3. Fixed point pre-calculated constant is used as 1/255 multiplier. 4. Original function body. There is simple test at the beginning of main function to make sure that all 4 methods produce identical results. Obviously, that original function is significantly slower and fixed point multipler is the fastest, but ordinary integer division also works well.
Created attachment 178738 [details] Patch
Run of ImageDecoder benchmark (attached to bug 88424) on Intel i7 930 : Without patch: time ./ImageDecoderPerf img_with_alpha.png Done real 1m11.751s user 1m10.628s sys 0m0.160s With patch: time ./ImageDecoderPerf img_with_alpha.png Done real 1m9.862s user 1m8.464s sys 0m0.200s
(In reply to comment #5) > Run of ImageDecoder benchmark (attached to bug 88424) on Intel i7 930 : > > Without patch: > user 1m10.628s > > With patch: > user 1m8.464s This looks like a nearly 3% improvement in speed. Is this significant enough that the (perhaps small) complexity of the fixed-point math implementation is worth it? Will this work as written under 64-bit architectures?
Comment on attachment 178738 [details] Patch View in context: https://bugs.webkit.org/attachment.cgi?id=178738&action=review > Source/WebCore/ChangeLog:8 > + Replace floating point operations used for alpha premultiply with fixed point arifmetics *arithmetic* > Source/WebCore/platform/image-decoders/ImageDecoder.h:173 > + static const unsigned fixPointMult = unsigned(1.0 / 255.0 * (1 << fixPointShift)) + 1; This should be a c++ style cast. static_cast<unsigned>(1.0 / 255.0 * ...) > Source/WebCore/platform/image-decoders/ImageDecoder.h:177 > + b = (b * alphaMult) >> fixPointShift; What if we made the fixedpoint operation an template or inline function: (Pseudocode): static unsigned alphaMult<fixPointShift>(unsigned component, unsigned alphaMult) { return (component * alphaMult) >> fixPointShift; } r = alphaMult (r, alphaMult); g = alphaMult (g, alphaMult); ... etc ... In fact, you might be able to use SSE here since we are doing the same operation on all three values. I wonder what kind of improvement that would be...
Comment on attachment 178738 [details] Patch View in context: https://bugs.webkit.org/attachment.cgi?id=178738&action=review >> Source/WebCore/platform/image-decoders/ImageDecoder.h:177 >> + b = (b * alphaMult) >> fixPointShift; > > What if we made the fixedpoint operation an template or inline function: > > (Pseudocode): > static unsigned alphaMult<fixPointShift>(unsigned component, unsigned alphaMult) { > return (component * alphaMult) >> fixPointShift; > } > > r = alphaMult (r, alphaMult); > g = alphaMult (g, alphaMult); > ... etc ... > > In fact, you might be able to use SSE here since we are doing the same operation on all three values. I wonder what kind of improvement that would be... Actually, compiler is doing sse instructions for floats here, but in this case conversions to/from int make it slow. If the operation would be more complex, it might be faster. But in this case simple integer math works better.
Created attachment 178946 [details] Patch
Comment on attachment 178946 [details] Patch Looks great! Thanks. r = me.
(In reply to comment #6) > (In reply to comment #5) > > Run of ImageDecoder benchmark (attached to bug 88424) on Intel i7 930 : > > > > Without patch: > > user 1m10.628s > > > > With patch: > > user 1m8.464s > > This looks like a nearly 3% improvement in speed. Is this significant enough that the (perhaps small) complexity of the fixed-point math implementation is worth it? The PNG decoder is pretty slow itself, but there is no single point where it can give bigger improvement. Another option would be just to use integer division. It is about function itself would become about 15% slower than with use of fixed point precalculated multiplier, but it is still would be 5-6 times faster than original code with float/int conversions. > Will this work as written under 64-bit architectures? Why not? It uses only lower 32 bits and I don't see any reason why it would fail. BTW, it is actually tested on 64-bit ubuntu, but unsigned is 32 bit on 64 bit linuxes. May be it is really better just to use integer division to avoid all confusions.
Comment on attachment 178946 [details] Patch Clearing flags on attachment: 178946 Committed r137413: <http://trac.webkit.org/changeset/137413>
All reviewed patches have been landed. Closing bug.