From 661854bb89c1ca7fee26eb63999e0fc92b538bf6 Mon Sep 17 00:00:00 2001
From: Thiago Macieira <thiago.macieira@intel.com>
Date: Mon, 29 Sep 2014 19:28:24 -0700
Subject: [PATCH] Add a quick optimization for QVersionNumber's constructors

If the data fits inline, let's store it using the dummy member. GCC,
Clang and ICC optimize the code on all architectures I tested to one
single store.

Previously, the function for "return QVersionNumber(5,4,0);" was:
 x86-64:
        movb    $7, (%rdi)
        movb    $5, 1(%rdi)
        movb    $4, 2(%rdi)
        movb    $0, 3(%rdi)
 x86:
        movb    $7, (%eax)
        movb    $5, 1(%eax)
        movb    $4, 2(%eax)
        movb    $0, 3(%eax)
 ia64:
        addl r17 = 7, r0
        adds r16 = 1, in0
        adds r15 = 2, in0
        adds r14 = 3, in0
        st1 [in0] = r17
        addl r17 = 5, r0
        ;;
        st1 [r16] = r17
        addl r16 = 4, r0
        ;;
        st1 [r15] = r16
        st1 [r14] = r0
 armv7a:
        mov     r1, #7
        mov     r2, #5
        strb    r1, [r0]
        mov     r1, #4
        strb    r2, [r0, #1]
        mov     r2, #0
        strb    r1, [r0, #2]
        strb    r2, [r0, #3]
 mips32:
        li      $3,7                    # 0x7
        sb      $3,3($4)
        li      $3,5                    # 0x5
        sb      $3,0($4)
        li      $3,4                    # 0x4
        sb      $3,1($4)
        sb      $0,2($4)
 mips64:
        li      $3,7                    # 0x7
        sb      $3,7($4)
        li      $3,5                    # 0x5
        sb      $3,0($4)
        li      $3,4                    # 0x4
        sb      $3,1($4)
        sb      $0,2($4)
 ppc32:
        li 10,7
        stb 10,3(3)
        li 10,5
        stb 10,0(3)
        li 10,4
        stb 10,1(3)
        li 10,0
        stb 10,2(3)
 ppc64:
        li 10,7
        stb 10,7(3)
        li 10,5
        stb 10,0(3)
        li 10,4
        stb 10,1(3)
        li 10,0
        stb 10,2(3)

Now it is:
 x86-64:
        movq    $263431, (%rdi)
 x86:
        movl    $263431, (%eax)
 ia64:
        addl r14 = 263431, r0
        ;;
        st8 [in0] = r14
 armv7a:
        movw    r3, #1287
        movt    r3, 4
        str     r3, [r0]
 mips32:
        li      $3,84148224                     # 0x5040000
        addiu   $3,$3,7
        sw      $3,0($4)
 mips64:
        li      $3,321                  # 0x141
        dsll    $3,$3,50
        daddiu  $3,$3,7
        sd      $3,0($4)
 ppc64:
        lis 9,0x504
        sldi 9,9,32
        ori 9,9,7
        std 9,0(3)
 ppc32:
        lis 9,0x504
        ori 9,9,7
        stw 9,0(3)

All assembly listings from GCC 4.8.1, but the Clang and ICC outputs are
identical or at least very similar (I tested Clang for ARM, MIPS and
PowerPC). Both MIPS and PowerPC were compiled in big-endian mode and this
listing shows that the 64-bit implementation is correct.

Additionally, the output is also the same for GCC when using brace
initialization (that is, return QVersionNumber{5,4,0}). Clang and ICC
couldn't optimize that.

Change-Id: I9a4a4c9fc83f1182401f63fd2da829c935a8c9da
Reviewed-by: Keith Gardner <kreios4004@gmail.com>
Reviewed-by: Marc Mutz <marc.mutz@kdab.com>
---
 src/corelib/tools/qversionnumber.h | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/corelib/tools/qversionnumber.h b/src/corelib/tools/qversionnumber.h
index 0ea73cc8428..ebf1844f389 100644
--- a/src/corelib/tools/qversionnumber.h
+++ b/src/corelib/tools/qversionnumber.h
@@ -192,9 +192,19 @@ class QVersionNumber
         }
         void setInlineData(const int *data, int len)
         {
+            dummy = 1 + len * 2;
+#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+            for (int i = 0; i < len; ++i)
+                dummy |= quintptr(data[i] & 0xFF) << (8 * (i + 1));
+#elif Q_BYTE_ORDER == Q_BIG_ENDIAN
+            for (int i = 0; i < len; ++i)
+                dummy |= quintptr(data[i] & 0xFF) << (8 * (sizeof(void *) - i - 1));
+#else
+            // the code above is equivalent to:
             setInlineSize(len);
             for (int i = 0; i < len; ++i)
-                inline_segments[InlineSegmentStartIdx + i] = qint8(data[i]);
+                inline_segments[InlineSegmentStartIdx + i] = data[i] & 0xFF;
+#endif
         }
 
         Q_CORE_EXPORT void setVector(int len, int maj, int min, int mic);
-- 
GitLab