103693 – Remove conversion to/from float and float division from ImageFrame::setRGBA

Bug 103693 - Remove conversion to/from float and float division from ImageFrame::setRGBA

Summary: Remove conversion to/from float and float division from ImageFrame::setRGBA

Status:	RESOLVED FIXED

Alias:	None

Product:	WebKit
Classification:	Unclassified
Component:	Images (show other bugs)
Version:	528+ (Nightly build)
Hardware:	Unspecified Unspecified

Importance:	P2 Normal
Assignee:	Viatcheslav Ostapenko

URL:
Keywords:	Performance

Depends on:
Blocks:

Reported:	2012-11-29 17:32 PST by Viatcheslav Ostapenko
Modified:	2012-12-11 21:36 PST (History)
CC List:	2 users (show)

See Also:

Attachments
setRGBA benchmark with different variants of this function. (875 bytes, text/x-c++src) 2012-11-29 17:32 PST, Viatcheslav Ostapenko	no flags	Details
Benchmark run results on Intel Core I7 and ARM CPUs (903 bytes, text/plain) 2012-11-29 17:35 PST, Viatcheslav Ostapenko	no flags	Details
Patch (2.30 KB, patch) 2012-12-11 00:26 PST, Viatcheslav Ostapenko	no flags	Details \| Formatted Diff \| Diff
Patch (2.82 KB, patch) 2012-12-11 18:59 PST, Viatcheslav Ostapenko	no flags	Details \| Formatted Diff \| Diff
Show Obsolete (1) View All Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Viatcheslav Ostapenko 2012-11-29 17:32:25 PST

Created attachment 176859 [details]
setRGBA benchmark with different variants of this function.

Floats is slow and shouldn't be used in this case.

Comment 1 Viatcheslav Ostapenko 2012-11-29 17:35:11 PST

Created attachment 176860 [details]
Benchmark run results on Intel Core I7 and ARM CPUs

Comment 2 Viatcheslav Ostapenko 2012-11-29 17:46:14 PST

Comment on attachment 176859 [details]
setRGBA benchmark with different variants of this function.

#include <memory.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef unsigned PixelData;

bool m_premultiplyAlpha = true;

inline void setRGBA_orig(PixelData* dest, unsigned r, unsigned g, unsigned b, unsigned a)
{
    if (m_premultiplyAlpha && a < 255) {
        if (!a) {
            *dest = 0;
            return;
        }

        float alphaPercent = a / 255.0f;
        r = static_cast<unsigned>(r * alphaPercent);
        g = static_cast<unsigned>(g * alphaPercent);
        b = static_cast<unsigned>(b * alphaPercent);
    }

    *dest = (a << 24 | r << 16 | g << 8 | b);
}

const int fixPointShift = 24;
const unsigned fixPointMult = unsigned(1.0 / 255.0 * (1 << fixPointShift)) + 1;
inline void setRGBA_fixPointMult(PixelData* dest, unsigned r, unsigned g, unsigned b, unsigned a)
{
    if (m_premultiplyAlpha && a < 255) {
        if (!a) {
            *dest = 0;
            return;
        }

        unsigned alphaMult = a * fixPointMult;
        r = (r * alphaMult) >> fixPointShift;
        g = (g * alphaMult) >> fixPointShift;
        b = (b * alphaMult) >> fixPointShift;
    }

    *dest = (a << 24 | r << 16 | g << 8 | b);
}

inline void setRGBA_integerDiv(PixelData* dest, unsigned r, unsigned g, unsigned b, unsigned a)
{
    if (m_premultiplyAlpha && a < 255) {
        if (!a) {
            *dest = 0;
            return;
        }

        r = r * a / 255;
        g = g * a / 255;
        b = b * a / 255;
    }

    *dest = (a << 24 | r << 16 | g << 8 | b);
}

inline unsigned fastDiv255(unsigned a)
{
    return (a + (a >> 8) + 1) >> 8;
}

inline void setRGBA_fastDiv255(PixelData* dest, unsigned r, unsigned g, unsigned b, unsigned a)
{
    if (m_premultiplyAlpha && a < 255) {
        if (!a) {
            *dest = 0;
            return;
        }

        r = fastDiv255(r * a);
        g = fastDiv255(g * a);
        b = fastDiv255(b * a);
    }

    *dest = (a << 24 | r << 16 | g << 8 | b);
}

const unsigned bufferSize = 256 * 1024 * 1024; // Something bigger than all cpu caches
PixelData source[bufferSize];
PixelData dest[bufferSize];

inline unsigned char* castUChar(void* p)
{
    return static_cast<unsigned char *>(p);
}

void runTests()
{
    unsigned i, tmp;
    unsigned char *bytesEnd = castUChar(&(source[bufferSize]));
    PixelData* destPosition;
    unsigned char *bytes;

    clock_t start;
    clock_t end;

    start = clock();
    for(bytes = castUChar(&(source[0])), destPosition = &dest[0]; bytes < bytesEnd; bytes += 4, destPosition++) {
        setRGBA_integerDiv(destPosition, bytes[0], bytes[1], bytes[2], bytes[3]);
    }
    end = clock();
    printf("1: %lu\t", end - start);

    start = clock();
    for(bytes = castUChar(&(source[0])), destPosition = &dest[0]; bytes < bytesEnd; bytes += 4, destPosition++) {
        setRGBA_fastDiv255(destPosition, bytes[0], bytes[1], bytes[2], bytes[3]);
    }
    end = clock();
    printf("2: %lu\t", end - start);

    start = clock();
    for(bytes = castUChar(&(source[0])), destPosition = &dest[0]; bytes < bytesEnd; bytes += 4, destPosition++) {
        setRGBA_fixPointMult(destPosition, bytes[0], bytes[1], bytes[2], bytes[3]);
    }
    end = clock();
    printf("3: %lu\t", end - start);

    start = clock();
    for(bytes = castUChar(&(source[0])), destPosition = &dest[0]; bytes < bytesEnd; bytes += 4, destPosition++) {
        setRGBA_orig(destPosition, bytes[0], bytes[1], bytes[2], bytes[3]);
    }
    end = clock();
    printf("4: %lu\n", end - start);
}

int main(int argc, char *argv[])
{
    unsigned i, j;

    unsigned diffValues = 0;
    // Fastdiv and fixedpoint mult precision check
    for(i = 0; i < 256; i++) {
        for(j = 0; j < 256; j++) {

            unsigned a = fastDiv255(i * j);

            unsigned alphaMult = i * fixPointMult;
            unsigned b = (j * alphaMult) >> fixPointShift;

            unsigned c = j * i / 255;

            float alphaPercent = i / 255.0f;
            unsigned d = static_cast<unsigned>(j * alphaPercent);
            if (a != d || b != d || c != d) {
                printf("Alpha; %u,\t color: %u,\t a: %u,\t b: %u,\t c: %u,\t d: %u\n", i, j, a, b, c, d);
                diffValues++;
            }
        }
    }
    printf("Different %u values\n", diffValues);

    memset(&dest[0], 0, sizeof(dest));
    do { // Run benchmarks with both values of m_premultiplyAlpha to make sure it is not optimized out
        printf("Premultiply Alpha: %d\n",m_premultiplyAlpha);

        printf("All 0 buffer:\n");
        memset(&source[0], 0, sizeof(source));
        runTests();

        printf("All 255 buffer:\n");
        memset(&source[0], 255, sizeof(source));
        runTests();

        printf("Pseudo random buffer:\n");
        // Fill buffer with 25% 0 alpha, 25% 255 alpha and other random values
        for(i = 0; i < bufferSize; i += 4) {
            source[i] = PixelData(rand()) & 0xFFFFFF;
            source[i] = PixelData(rand());
            source[i] = PixelData(rand()) | 0xFF000000;
            source[i] = PixelData(rand());
        }
        runTests();

        m_premultiplyAlpha = 1 - m_premultiplyAlpha;
    } while(m_premultiplyAlpha == false);
}

Comment 3 Viatcheslav Ostapenko 2012-11-29 17:58:17 PST

Benchmark was compiled like this:

gcc -O3 setRGBPerf.cpp -o setRGBPerf


Close to real life and slowest case of benchmark when tests run on pseudo random source buffer and m_premultiplyAlpha is true.
Pseudo random buffer is generated so, that approx 25% of pixels have 0 alpha, 25% have 255 alpha, other values are random.

Macbook Pro Ivy Bridge I7 2.7GHz, 256M elements
Pseudo random buffer:
1: 580000	2: 560000	3: 500000	4: 3290000

Samsung Galaxy Note II, 32M elements
Pseudo random buffer:
1: 380000	2: 360000	3: 360000	4: 1480000

1. Float conversion/division replace with simple integer division.
2. "Fast division" used instead of x/255 like x/256 + x/256^2 = (x + x / 256) / 256 . 1 is added to compensate for rounding error.
3. Fixed point pre-calculated constant is used as 1/255 multiplier.
4. Original function body.

There is simple test at the beginning of main function to make sure that all 4 methods produce identical results.

Obviously, that original function is significantly slower and fixed point multipler is the fastest, but ordinary integer division also works well.

Comment 4 Viatcheslav Ostapenko 2012-12-11 00:26:16 PST

Created attachment 178738 [details]
Patch

Comment 5 Viatcheslav Ostapenko 2012-12-11 00:30:07 PST

Run of ImageDecoder benchmark (attached to bug 88424) on Intel i7 930 :

Without patch:
time ./ImageDecoderPerf img_with_alpha.png 
Done

real	1m11.751s
user	1m10.628s
sys	0m0.160s

With patch:
time ./ImageDecoderPerf img_with_alpha.png 
Done

real	1m9.862s
user	1m8.464s
sys	0m0.200s

Comment 6 Brent Fulgham 2012-12-11 14:26:26 PST

(In reply to comment #5)
> Run of ImageDecoder benchmark (attached to bug 88424) on Intel i7 930 :
> 
> Without patch:
> user    1m10.628s
> 
> With patch:
> user    1m8.464s

This looks like a nearly 3% improvement in speed.  Is this significant enough that the (perhaps small) complexity of the fixed-point math implementation is worth it?

Will this work as written under 64-bit architectures?

Comment 7 Brent Fulgham 2012-12-11 14:40:24 PST

Comment on attachment 178738 [details]
Patch

View in context: https://bugs.webkit.org/attachment.cgi?id=178738&action=review

> Source/WebCore/ChangeLog:8
> +        Replace floating point operations used for alpha premultiply with fixed point arifmetics

*arithmetic*

> Source/WebCore/platform/image-decoders/ImageDecoder.h:173
> +                static const unsigned fixPointMult = unsigned(1.0 / 255.0 * (1 << fixPointShift)) + 1;

This should be a c++ style cast.
static_cast<unsigned>(1.0 / 255.0 * ...)

> Source/WebCore/platform/image-decoders/ImageDecoder.h:177
> +                b = (b * alphaMult) >> fixPointShift;

What if we made the fixedpoint operation an template or inline function:

(Pseudocode):
static unsigned alphaMult<fixPointShift>(unsigned component, unsigned alphaMult) {
    return (component * alphaMult) >> fixPointShift;
}

r = alphaMult (r, alphaMult);
g = alphaMult (g, alphaMult);
... etc ...

In fact, you might be able to use SSE here since we are doing the same operation on all three values.  I wonder what kind of improvement that would be...

Comment 8 Viatcheslav Ostapenko 2012-12-11 18:18:17 PST

Comment on attachment 178738 [details]
Patch

View in context: https://bugs.webkit.org/attachment.cgi?id=178738&action=review

>> Source/WebCore/platform/image-decoders/ImageDecoder.h:177
>> +                b = (b * alphaMult) >> fixPointShift;
> 
> What if we made the fixedpoint operation an template or inline function:
> 
> (Pseudocode):
> static unsigned alphaMult<fixPointShift>(unsigned component, unsigned alphaMult) {
>     return (component * alphaMult) >> fixPointShift;
> }
> 
> r = alphaMult (r, alphaMult);
> g = alphaMult (g, alphaMult);
> ... etc ...
> 
> In fact, you might be able to use SSE here since we are doing the same operation on all three values.  I wonder what kind of improvement that would be...

Actually, compiler is doing sse instructions for floats here, but in this case conversions to/from int make it slow. If the operation would be more complex, it might be faster. But in this case simple integer math works better.

Comment 9 Viatcheslav Ostapenko 2012-12-11 18:59:45 PST

Created attachment 178946 [details]
Patch

Comment 10 Brent Fulgham 2012-12-11 20:18:34 PST

Comment on attachment 178946 [details]
Patch

Looks great!  Thanks. r = me.

Comment 11 Viatcheslav Ostapenko 2012-12-11 20:32:31 PST

(In reply to comment #6)
> (In reply to comment #5)
> > Run of ImageDecoder benchmark (attached to bug 88424) on Intel i7 930 :
> > 
> > Without patch:
> > user    1m10.628s
> > 
> > With patch:
> > user    1m8.464s
> 
> This looks like a nearly 3% improvement in speed.  Is this significant enough that the (perhaps small) complexity of the fixed-point math implementation is worth it?

The PNG decoder is pretty slow itself, but there is no single point where it can give bigger improvement.
Another option would be just to use integer division. It is about function itself would become about 15% slower than with use of fixed point precalculated multiplier, but it is still would be 5-6 times faster than original code with float/int conversions.

> Will this work as written under 64-bit architectures?

Why not? It uses only lower 32 bits and I don't see any reason why it would fail.
BTW, it is actually tested on 64-bit ubuntu, but unsigned is 32 bit on 64 bit linuxes.

May be it is really better just to use integer division to avoid all confusions.

Comment 12 WebKit Review Bot 2012-12-11 21:36:02 PST

Comment on attachment 178946 [details]
Patch

Clearing flags on attachment: 178946

Committed r137413: <http://trac.webkit.org/changeset/137413>

Comment 13 WebKit Review Bot 2012-12-11 21:36:06 PST

All reviewed patches have been landed.  Closing bug.