diff --git a/crengine/include/lvdrawbuf.h b/crengine/include/lvdrawbuf.h
index ee0d2dd31..5de108322 100644
--- a/crengine/include/lvdrawbuf.h
+++ b/crengine/include/lvdrawbuf.h
@@ -99,6 +99,10 @@ class LVDrawBuf : public CacheableObject
     virtual void setHidePartialGlyphs( bool hide ) = 0;
     /// set to true to invert images only (so they get inverted back to normal by nightmode)
     virtual void setInvertImages( bool invert ) = 0;
+    /// set to true to enforce dithering (only relevant for 8bpp Gray drawBuf)
+    virtual void setDitherImages( bool dither ) = 0;
+    /// set to true to switch to a more costly smooth scaler instead of nearest neighbor
+    virtual void setSmoothScalingImages( bool smooth ) = 0;
     /// invert image
     virtual void  Invert() = 0;
     /// get buffer width, pixels
@@ -231,6 +235,8 @@ class LVBaseDrawBuf : public LVDrawBuf
     lUInt32 _textColor;
     bool _hidePartialGlyphs;
     bool _invertImages;
+    bool _ditherImages;
+    bool _smoothImages;
     int _drawnImagesCount;
     int _drawnImagesSurface;
 public:
@@ -238,6 +244,10 @@ class LVBaseDrawBuf : public LVDrawBuf
     virtual void setHidePartialGlyphs( bool hide ) { _hidePartialGlyphs = hide; }
     /// set to true to invert images only (so they get inverted back to normal by nightmode)
     virtual void setInvertImages( bool invert ) { _invertImages = invert; }
+    /// set to true to enforce dithering (only relevant for 8bpp Gray drawBuf)
+    virtual void setDitherImages( bool dither ) { _ditherImages = dither; }
+    /// set to true to switch to a more costly smooth scaler instead of nearest neighbor
+    virtual void setSmoothScalingImages( bool smooth ) { _smoothImages = smooth; }
     /// returns current background color
     virtual lUInt32 GetBackgroundColor() { return _backgroundColor; }
     /// sets current background color
@@ -277,7 +287,8 @@ class LVBaseDrawBuf : public LVDrawBuf
     int getDrawnImagesSurface() { return _drawnImagesSurface; }
 
     LVBaseDrawBuf() : _dx(0), _dy(0), _rowsize(0), _data(NULL), _hidePartialGlyphs(true),
-                        _invertImages(false), _drawnImagesCount(0), _drawnImagesSurface(0) { }
+                        _invertImages(false), _ditherImages(false), _smoothImages(false),
+                        _drawnImagesCount(0), _drawnImagesSurface(0) { }
     virtual ~LVBaseDrawBuf() { }
 };
 
@@ -399,11 +410,12 @@ class LVGrayDrawBuf : public LVBaseDrawBuf
 //       c.f., https://github.com/koreader/koreader-base/pull/878#issuecomment-476723747
 #ifdef CR_RENDER_32BPP_RGB_PXFMT
 inline lUInt32 RevRGB( lUInt32 cl ) {
-    return ((cl>>16)&0x0000FF) | ((cl<<16)&0xFF0000) | (cl&0x00FF00);
+    return ((cl<<16)&0xFF0000) | ((cl>>16)&0x0000FF) | (cl&0x00FF00);
 }
 
 inline lUInt32 RevRGBA( lUInt32 cl ) {
-    return (cl&0xFF000000) | ((cl>>16)&0x0000FF) | ((cl<<16)&0xFF0000) | (cl&0x00FF00);
+    // Swap B <-> R, keep G & A
+    return ((cl<<16)&0x00FF0000) | ((cl>>16)&0x000000FF) | (cl&0xFF00FF00);
 }
 #else
 inline lUInt32 RevRGB( lUInt32 cl ) {
@@ -423,6 +435,65 @@ inline lUInt16 rgb888to565( lUInt32 cl ) {
     return (lUInt16)(((cl>>8)& 0xF800) | ((cl>>5 )& 0x07E0) | ((cl>>3 )& 0x001F));
 }
 
+#define DIV255(V) /* V / 255 rounded to nearest, using only shifts & adds; V is evaluated once */       \
+({ /* NOTE: GNU statement expression, the last expression below is the macro's value */                 \
+	auto _v = (V) + 128;                                                                             \
+	(((_v >> 8U) + _v) >> 8U);                                                                       \
+})
+
+// Quantize an 8-bit color value down to a palette of 16 evenly spaced colors, using an ordered 8x8 dithering pattern.
+// With a grayscale input, this happens to match the eInk palette perfectly ;).
+// If the input is not grayscale, and the output fb is not grayscale either,
+// this usually still happens to match the eInk palette after the EPDC's own quantization pass.
+// c.f., https://en.wikipedia.org/wiki/Ordered_dithering
+// & https://github.com/ImageMagick/ImageMagick/blob/ecfeac404e75f304004f0566557848c53030bad6/MagickCore/threshold.c#L1627
+// NOTE: As the references imply, this is straight from ImageMagick, simplified to enforce Q8 & avoid fp maths.
+// x, y: pixel coordinates (only their low 3 bits pick the threshold); v: 8-bit input; returns a multiple of 17.
+static inline lUInt8 dither_o8x8(int x, int y, lUInt8 v)
+{
+	// c.f., https://github.com/ImageMagick/ImageMagick/blob/ecfeac404e75f304004f0566557848c53030bad6/config/thresholds.xml#L107
+	static const lUInt8 threshold_map_o8x8[] = { 1,  49, 13, 61, 4,  52, 16, 64, 33, 17, 45, 29, 36, 20, 48, 32,
+						      9,  57, 5,  53, 12, 60, 8,  56, 41, 25, 37, 21, 44, 28, 40, 24,
+						      3,  51, 15, 63, 2,  50, 14, 62, 35, 19, 47, 31, 34, 18, 46, 30,
+						      11, 59, 7,  55, 10, 58, 6,  54, 43, 27, 39, 23, 42, 26, 38, 22 };
+
+	// Constants:
+	// Quantum = 8; Levels = 16; map Divisor = 65
+	// QuantumRange = 0xFF
+	// QuantumScale = 1.0 / QuantumRange
+	//
+	// threshold = QuantumScale * v * ((L-1) * (D-1) + 1)
+	// NOTE: The initial computation of t (specifically, what we pass to DIV255) would overflow a uint8_t.
+	//       So jump to shorts, and do it signed to be extra careful, although I don't *think* we can ever underflow here.
+	lInt16 t = (lInt16) DIV255(v * ((15U << 6) + 1U)); // (15 << 6) + 1 == 961, so t is in [0, 961]
+	// level = t / (D-1);
+	lInt16 l = (t >> 6); // l is in [0, 15]
+	// t -= l * (D-1);
+	t = (lInt16)(t - (l << 6));
+
+	// map width & height = 8
+	// c = ClampToQuantum((l+(t >= map[(x % mw) + mw * (y % mh)])) * QuantumRange / (L-1));
+	lInt16 q = (lInt16)((l + (t >= threshold_map_o8x8[(x & 7U) + 8U * (y & 7U)])) * 17); // 17 == 0xFF / 15
+	// NOTE: For some arcane reason, on ARM (at least), this is noticeably faster than Pillow's CLIP8 macro.
+	//       Following this logic with ternary operators yields similar results,
+	//       so I'm guessing it's the < 256 part of Pillow's macro that doesn't agree with GCC/ARM...
+	lUInt8 c;
+	if (q > 0xFF) {
+		c = 0xFF;
+	} else if (q < 0) {
+		c = 0U;
+	} else {
+		c = (lUInt8) q;
+	}
+
+	return c;
+}
+
+// Declare our bit of scaler ripped from Qt5 (presumably scales a sw x sh src to dw x dh; confirm against its definition).
+namespace CRe {
+lUInt8* qSmoothScaleImage(const lUInt8* src, int sw, int sh, bool ignore_alpha, int dw, int dh);
+} // namespace CRe
+
 /// 32-bit RGB buffer
 class LVColorDrawBuf : public LVBaseDrawBuf
 {
diff --git a/crengine/qimagescale/QIMAGETRANSFORM_LICENSE.txt b/crengine/qimagescale/QIMAGETRANSFORM_LICENSE.txt
new file mode 100644
index 000000000..67c910826
--- /dev/null
+++ b/crengine/qimagescale/QIMAGETRANSFORM_LICENSE.txt
@@ -0,0 +1,60 @@
+qimagetransform.cpp was contributed by Daniel M. Duley based on code from Imlib2.
+
+Copyright (C) 2004, 2005 Daniel M. Duley
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+Imlib2 License
+
+Copyright (C) 2000 Carsten Haitzler and various contributors (see
+AUTHORS)
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies of the Software and its Copyright notices. In addition
+publicly documented acknowledgment must be given that this software has
+been used if no source code of this software is made available publicly.
+This includes acknowledgments in either Copyright notices, Manuals,
+Publicity and Marketing documents or any documentation provided with any
+product containing this software. This License does not apply to any
+software that links to the libraries provided by this software
+(statically or dynamically), but only to the software provided.
+
+Please see the COPYING.PLAIN for a plain-english explanation of this
+notice and it's intent.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/crengine/qimagescale/qdrawhelper_p.h b/crengine/qimagescale/qdrawhelper_p.h
new file mode 100644
index 000000000..205ee3a1f
--- /dev/null
+++ b/crengine/qimagescale/qdrawhelper_p.h
@@ -0,0 +1,182 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QDRAWHELPER_P_H
+#define QDRAWHELPER_P_H
+
+#include "qglobal.h"
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+#if defined(__SSE2__)
+#include <immintrin.h>
+#include <x86intrin.h>
+#endif
+
+namespace CRe {
+
+#if defined(__GNUC__)
+#  if (defined(__i386) || defined(__i386__) || defined(_M_IX86)) && defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+#    define Q_DECL_VECTORCALL __attribute__((sseregparm,regparm(3)))
+#  else
+#    define Q_DECL_VECTORCALL
+#  endif
+#elif defined(_MSC_VER)
+#  define Q_DECL_VECTORCALL __vectorcall
+#else
+#  define Q_DECL_VECTORCALL
+#endif
+
+#if __SIZEOF_POINTER__ == 8 // 64-bit versions
+
+static inline __attribute__((always_inline)) uint INTERPOLATE_PIXEL_256(uint x, uint a, uint y, uint b) { // per byte: (x * a + y * b) >> 8
+    quint64 t = (((quint64(x)) | ((quint64(x)) << 24)) & 0x00ff00ff00ff00ff) * a; // spread x's 4 bytes over 4 16-bit lanes, scale by a
+    t += (((quint64(y)) | ((quint64(y)) << 24)) & 0x00ff00ff00ff00ff) * b; // same for y, scaled by b
+    t >>= 8; // drop the 8 fractional bits
+    t &= 0x00ff00ff00ff00ff; // keep one byte per lane
+    return (uint(t)) | (uint(t >> 24)); // fold the 4 lanes back into one 32-bit word
+}
+
+#else // 32-bit versions
+
+static inline __attribute__((always_inline)) uint INTERPOLATE_PIXEL_256(uint x, uint a, uint y, uint b) { // per byte: (x * a + y * b) >> 8
+    uint t = (x & 0xff00ff) * a + (y & 0xff00ff) * b; // bytes 0 and 2 of both pixels
+    t >>= 8;
+    t &= 0xff00ff;
+
+    x = ((x >> 8) & 0xff00ff) * a + ((y >> 8) & 0xff00ff) * b; // bytes 1 and 3, pre-shifted down by 8
+    x &= 0xff00ff00; // the products already sit 8 bits up, so only a mask is needed
+    x |= t;
+    return x;
+}
+
+#endif
+
+// NOTE: Unlike the SIMD qimagescale_* routines, these ones seem to offer a very small performance gain.
+#if defined(__SSE2__)
+static inline __attribute__((always_inline)) uint interpolate_4_pixels_sse2(__m128i vt, __m128i vb, uint distx, uint disty)
+{ // vt / vb: the low 64 bits hold the two top / bottom pixels; distx / disty are blend weights out of 256
+    // First interpolate top and bottom pixels in parallel.
+    vt = _mm_unpacklo_epi8(vt, _mm_setzero_si128()); // widen each 8-bit channel to 16 bits
+    vb = _mm_unpacklo_epi8(vb, _mm_setzero_si128());
+    vt = _mm_mullo_epi16(vt, _mm_set1_epi16(256 - disty));
+    vb = _mm_mullo_epi16(vb, _mm_set1_epi16(disty));
+    __m128i vlr = _mm_add_epi16(vt, vb);
+    vlr = _mm_srli_epi16(vlr, 8);
+    // vlr now contains the result of the first two interpolate calls vlr = unpacked((xright << 64) | xleft)
+
+    // Now the last interpolate between left and right..
+    const __m128i vidistx = _mm_shufflelo_epi16(_mm_cvtsi32_si128(256 - distx), _MM_SHUFFLE(0, 0, 0, 0));
+    const __m128i vdistx = _mm_shufflelo_epi16(_mm_cvtsi32_si128(distx), _MM_SHUFFLE(0, 0, 0, 0));
+    const __m128i vmulx = _mm_unpacklo_epi16(vidistx, vdistx);
+    vlr = _mm_unpacklo_epi16(vlr, _mm_srli_si128(vlr, 8));
+    // vlr now contains the colors of left and right interleaved { la, ra, lr, rr, lg, rg, lb, rb }
+    vlr = _mm_madd_epi16(vlr, vmulx); // Multiply and horizontal add.
+    vlr = _mm_srli_epi32(vlr, 8);
+    vlr = _mm_packs_epi32(vlr, vlr); // narrow 32 -> 16 bits
+    vlr = _mm_packus_epi16(vlr, vlr); // narrow 16 -> 8 bits
+    return _mm_cvtsi128_si32(vlr); // the low 32 bits are the blended pixel
+}
+
+static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
+{ // SSE2 flavour taking the four corner pixels by value
+    __m128i vt = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tl), _mm_cvtsi32_si128(tr)); // vt = { tl, tr, 0, 0 }
+    __m128i vb = _mm_unpacklo_epi32(_mm_cvtsi32_si128(bl), _mm_cvtsi32_si128(br)); // vb = { bl, br, 0, 0 }
+    return interpolate_4_pixels_sse2(vt, vb, distx, disty);
+}
+
+static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint distx, uint disty)
+{ // SSE2 flavour reading two adjacent pixels from each of the top (t) and bottom (b) rows
+    __m128i vt = _mm_loadl_epi64((const __m128i*)t); // loads t[0] and t[1]
+    __m128i vb = _mm_loadl_epi64((const __m128i*)b); // loads b[0] and b[1]
+    return interpolate_4_pixels_sse2(vt, vb, distx, disty);
+}
+
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__) // same test as the <arm_neon.h> include at the top of this header
+static inline __attribute__((always_inline)) uint interpolate_4_pixels_neon(uint32x2_t vt32, uint32x2_t vb32, uint distx, uint disty)
+{ // vt32 / vb32 hold the two top / bottom pixels; distx / disty are blend weights out of 256
+    uint16x8_t vt16 = vmovl_u8(vreinterpret_u8_u32(vt32)); // widen each 8-bit channel to 16 bits
+    uint16x8_t vb16 = vmovl_u8(vreinterpret_u8_u32(vb32));
+    vt16 = vmulq_n_u16(vt16, 256 - disty); // vertical blend: top * (256 - disty) + bottom * disty
+    vt16 = vmlaq_n_u16(vt16, vb16, disty);
+    vt16 = vshrq_n_u16(vt16, 8);
+    uint16x4_t vl16 = vget_low_u16(vt16); // left pixel
+    uint16x4_t vr16 = vget_high_u16(vt16); // right pixel
+    vl16 = vmul_n_u16(vl16, 256 - distx); // horizontal blend: left * (256 - distx) + right * distx
+    vl16 = vmla_n_u16(vl16, vr16, distx);
+    vl16 = vshr_n_u16(vl16, 8);
+    uint8x8_t vr = vmovn_u16(vcombine_u16(vl16, vl16)); // narrow back to 8 bits per channel
+    return vget_lane_u32(vreinterpret_u32_u8(vr), 0); // lane 0 is the blended pixel
+}
+
+static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
+{ // NEON flavour taking the four corner pixels by value
+    uint32x2_t vt32 = vmov_n_u32(tl); // both lanes = tl
+    uint32x2_t vb32 = vmov_n_u32(bl); // both lanes = bl
+    vt32 = vset_lane_u32(tr, vt32, 1); // vt32 = { tl, tr }
+    vb32 = vset_lane_u32(br, vb32, 1); // vb32 = { bl, br }
+    return interpolate_4_pixels_neon(vt32, vb32, distx, disty);
+}
+
+static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint distx, uint disty)
+{ // NEON flavour reading two adjacent pixels from each of the top (t) and bottom (b) rows
+    uint32x2_t vt32 = vld1_u32(t); // loads t[0] and t[1]
+    uint32x2_t vb32 = vld1_u32(b); // loads b[0] and b[1]
+    return interpolate_4_pixels_neon(vt32, vb32, distx, disty);
+}
+
+#else
+static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
+{ // Portable flavour: three INTERPOLATE_PIXEL_256 blends, weights are out of 256
+    uint idistx = 256 - distx;
+    uint idisty = 256 - disty;
+    uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); // blend the top pair horizontally
+    uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); // blend the bottom pair horizontally
+    return INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); // then blend both results vertically
+}
+
+static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint distx, uint disty)
+{ // Portable flavour reading t[0], t[1] (top row) and b[0], b[1] (bottom row)
+    return interpolate_4_pixels(t[0], t[1], b[0], b[1], distx, disty);
+}
+#endif
+
+}
+
+#endif // QDRAWHELPER_P_H
diff --git a/crengine/qimagescale/qglobal.h b/crengine/qimagescale/qglobal.h
new file mode 100644
index 000000000..bce56d481
--- /dev/null
+++ b/crengine/qimagescale/qglobal.h
@@ -0,0 +1,81 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Copyright (C) 2016 Intel Corporation.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtCore module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QGLOBAL_H
+#define QGLOBAL_H
+
+namespace CRe {
+
+#if defined(_WIN32) && !defined(__GNUC__)
+typedef __int64 qint64;            /* 64 bit signed */
+typedef unsigned __int64 quint64;  /* 64 bit unsigned */
+#else
+typedef long long qint64;           /* 64 bit signed */
+typedef unsigned long long quint64; /* 64 bit unsigned */
+#endif
+
+}
+
+/*
+   Useful type definitions for Qt
+*/
+
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+
+namespace CRe {
+
+/*
+   Utility macros and inline functions
+*/
+
+template <typename T>
+constexpr inline T qAbs(const T &t) { return t >= 0 ? t : -t; } // NOTE: -t overflows for the most negative value of a signed T
+
+template <typename T>
+constexpr inline const T &qMin(const T &a, const T &b) { return (a < b) ? a : b; } // returns a reference to an argument; b on ties
+template <typename T>
+constexpr inline const T &qMax(const T &a, const T &b) { return (a < b) ? b : a; } // returns a reference to an argument; a on ties
+
+}
+
+#endif /* QGLOBAL_H */
diff --git a/crengine/qimagescale/qimagescale.cpp b/crengine/qimagescale/qimagescale.cpp
new file mode 100644
index 000000000..46da8f888
--- /dev/null
+++ b/crengine/qimagescale/qimagescale.cpp
@@ -0,0 +1,789 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#include "qglobal.h"
+#include "qrgb.h"
+#include "qimagescale_p.h"
+#include "qdrawhelper_p.h"
+
+#include <stdlib.h>
+#include <iostream>
+
+#ifndef FBINK_QIS_NO_SIMD
+#if defined(__ARM_NEON__)
+#include "qimagescale_neon.cpp"
+#endif
+#if defined(__SSE4_1__)
+#include "qimagescale_sse4.cpp"
+#endif
+#endif
+
+namespace CRe {
+
+/*
+ * Copyright (C) 2004, 2005 Daniel M. Duley
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/* OTHER CREDITS:
+ *
+ * This is the normal smoothscale method, based on Imlib2's smoothscale.
+ *
+ * Originally I took the algorithm used in NetPBM and Qt and added MMX/3dnow
+ * optimizations. It ran in about 1/2 the time as Qt. Then I ported Imlib's
+ * C algorithm and it ran at about the same speed as my MMX optimized one...
+ * Finally I ported Imlib's MMX version and it ran in less than half the
+ * time as my MMX algorithm, (taking only a quarter of the time Qt does).
+ * After further optimization it seems to run at around 1/6th.
+ *
+ * Changes include formatting, namespaces and other C++'ings, removal of old
+ * #ifdef'ed code, and removal of unneeded border calculation code.
+ * Later the code has been refactored, an SSE4.1 optimized path has been
+ * added instead of the removed MMX assembler, scaling of clipped area
+ * removed, and an RGBA64 version written.
+ *
+ * Imlib2 is (C) Carsten Haitzler and various contributors. The MMX code
+ * is by Willem Monsuwe <willem@stack.nl>. All other modifications are
+ * (C) Daniel M. Duley.
+ */
+
+
+namespace QImageScale {
+    static const unsigned int** qimageCalcYPoints(const unsigned int *src, int sw, int sh, int dh);
+    static int* qimageCalcXPoints(int sw, int dw);
+    static int* qimageCalcApoints(int s, int d, int up);
+    static QImageScaleInfo* qimageFreeScaleInfo(QImageScaleInfo *isi);
+    static QImageScaleInfo *qimageCalcScaleInfo(const unsigned char* img, int sw, int sh, int dw, int dh, char aa);
+}
+
+using namespace QImageScale;
+
+//
+// Code ported from Imlib...
+//
+
+// Build the table of source row pointers, one per destination row (16.16 fixed-point stepping).
+// A negative dh yields a vertically mirrored table. The result is released with delete[].
+static const unsigned int** QImageScale::qimageCalcYPoints(const unsigned int *src,
+                                                           int sw, int sh, int dh)
+{
+    const unsigned int **p;
+    int j = 0, rv = 0;
+    if (dh < 0) {
+        dh = -dh;
+        rv = 1;
+    }
+    p = new const unsigned int* [dh+1]; // dh + 1 slots are allocated, only the first dh are filled
+    int up = qAbs(dh) >= sh;
+    // NOTE: Widen before multiplying: in plain int, 0x8000 * sh overflows once sh reaches 65536.
+    qint64 val = up ? (qint64)0x8000 * sh / dh - 0x8000 : 0;
+    qint64 inc = (((qint64)sh) << 16) / dh;
+    for (int i = 0; i < dh; i++) {
+        p[j++] = src + qMax(0LL, val >> 16) * sw; // clamp to the first row, rows are sw pixels apart
+        val += inc;
+    }
+    if (rv) {
+        for (int i = dh / 2; --i >= 0; ) {
+            const unsigned int *tmp = p[i];
+            p[i] = p[dh - i - 1];
+            p[dh - i - 1] = tmp;
+        }
+    }
+    return(p);
+}
+
+// Build the table of source column indices, one per destination column (16.16 fixed-point stepping).
+// A negative dw yields a horizontally mirrored table. The result is released with delete[].
+static int* QImageScale::qimageCalcXPoints(int sw, int dw)
+{
+    int *p, j = 0, rv = 0;
+    qint64 val, inc;
+    if (dw < 0) {
+        dw = -dw;
+        rv = 1;
+    }
+    p = new int[dw+1]; // dw + 1 slots are allocated, only the first dw are filled
+    int up = qAbs(dw) >= sw;
+    // NOTE: Widen before multiplying: in plain int, 0x8000 * sw overflows once sw reaches 65536.
+    val = up ? (qint64)0x8000 * sw / dw - 0x8000 : 0;
+    inc = (((qint64)sw) << 16) / dw;
+    for (int i = 0; i < dw; i++) {
+        p[j++] = qMax(0LL, val >> 16); // clamp to the first column
+        val += inc;
+    }
+    if (rv) {
+        for (int i = dw / 2; --i >= 0; ) {
+            int tmp = p[i];
+            p[i] = p[dw - i - 1];
+            p[dw - i - 1] = tmp;
+        }
+    }
+    return p;
+}
+
+// Build the per-destination-pixel blending weights for one axis (s source pixels, d destination pixels).
+// Scaling up: each entry is an 8-bit fraction between two neighbouring source pixels (0 at the edges).
+// Scaling down: each entry packs a weight derived from the fractional position (low 16 bits) with Cp (high 16 bits).
+// A negative d yields a mirrored table. The result is released with delete[].
+static int* QImageScale::qimageCalcApoints(int s, int d, int up)
+{
+    int *p, j = 0, rv = 0;
+    if (d < 0) {
+        rv = 1;
+        d = -d;
+    }
+    p = new int[d];
+    if (up) {
+        /* scaling up */
+        // NOTE: Widen before multiplying: in plain int, 0x8000 * s overflows once s reaches 65536.
+        qint64 val = (qint64)0x8000 * s / d - 0x8000;
+        qint64 inc = (((qint64)s) << 16) / d;
+        for (int i = 0; i < d; i++) {
+            int pos = val >> 16;
+            if (pos < 0 || pos >= (s - 1))
+                p[j++] = 0;
+            else
+                p[j++] = (val >> 8) - ((val >> 8) & 0xffffff00);
+            val += inc;
+        }
+    } else {
+        /* scaling down */
+        qint64 val = 0;
+        qint64 inc = (((qint64)s) << 16) / d;
+        // NOTE: Same concern here: in plain int, d << 14 overflows for d close to 131072 and above.
+        int Cp = (int)((((qint64)d << 14) + s - 1) / s);
+        for (int i = 0; i < d; i++) {
+            int ap = ((0x10000 - (val & 0xffff)) * Cp) >> 16;
+            p[j++] = ap | (Cp << 16);
+            val += inc;
+        }
+    }
+    if (rv) {
+        for (int i = d / 2; --i >= 0; ) {
+            int tmp = p[i];
+            p[i] = p[d - i - 1];
+            p[d - i - 1] = tmp;
+        }
+    }
+    return p;
+}
+
+static QImageScaleInfo* QImageScale::qimageFreeScaleInfo(QImageScaleInfo *isi)
+{ // Releases the four lookup tables and isi itself
+    if (isi) { // a null isi is tolerated
+        delete[] isi->xpoints;
+        delete[] isi->ypoints;
+        delete[] isi->xapoints;
+        delete[] isi->yapoints;
+        delete isi;
+    }
+    return 0; // always null, which lets qimageCalcScaleInfo bail out with "return qimageFreeScaleInfo(isi);"
+}
+
+static QImageScaleInfo* QImageScale::qimageCalcScaleInfo(const unsigned char* img,
+                                                         int sw, int sh,
+                                                         int dw, int dh, char aa)
+{ // Allocates a QImageScaleInfo and fills its lookup tables for scaling img (sw x sh) to dw x dh
+    QImageScaleInfo *isi;
+    int scw, sch;
+
+    scw = dw;
+    sch = dh;
+
+    isi = new QImageScaleInfo;
+    if (!isi) // NOTE(review): plain new reports failure by throwing, so this and the checks below look unreachable -- confirm
+        return 0;
+
+    isi->xup_yup = (qAbs(dw) >= sw) + ((qAbs(dh) >= sh) << 1); // bit 0: scaling up in x, bit 1: scaling up in y
+
+    isi->xpoints = qimageCalcXPoints(sw, scw);
+    if (!isi->xpoints)
+        return qimageFreeScaleInfo(isi);
+    // NOTE: We use sw directly as a simplification. Technically, it's img bytes-per-line / bytes-per-pixel
+    //       (i.e., img's width * number of color components / sizeof(uint32_t) for unpadded packed pixels).
+    //       As we enforce 32bpp input, n is 4, as is sizeof(uint32_t), hence using width directly ;).
+    // NOTE: Qt's Rgba64 codepath *still* divides by 4, so, err, double-check that?
+    isi->ypoints = qimageCalcYPoints((const unsigned int *)img,
+                                     sw, sh, sch);
+    if (!isi->ypoints)
+        return qimageFreeScaleInfo(isi);
+    if (aa) { // the blending weight tables are only built when aa is set
+        isi->xapoints = qimageCalcApoints(sw, scw, isi->xup_yup & 1);
+        if (!isi->xapoints)
+            return qimageFreeScaleInfo(isi);
+        isi->yapoints = qimageCalcApoints(sh, sch, isi->xup_yup & 2);
+        if (!isi->yapoints)
+            return qimageFreeScaleInfo(isi);
+    }
+    return isi;
+}
+
+
+static void qt_qimageScaleAARGBA_up_x_down_y(QImageScaleInfo *isi, unsigned int *dest,
+                                             int dw, int dh, int dow, int sow);
+
+static void qt_qimageScaleAARGBA_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest,
+                                             int dw, int dh, int dow, int sow);
+
+static void qt_qimageScaleAARGBA_down_xy(QImageScaleInfo *isi, unsigned int *dest,
+                                         int dw, int dh, int dow, int sow);
+
+#ifndef FBINK_QIS_NO_SIMD
+#if defined(__SSE4_1__)
+template<bool RGB>
+void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi, unsigned int *dest,
+                                           int dw, int dh, int dow, int sow);
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo *isi, unsigned int *dest,
+                                           int dw, int dh, int dow, int sow);
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest,
+                                       int dw, int dh, int dow, int sow);
+#endif
+
+#if defined(__ARM_NEON__)
+template<bool RGB>
+void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *dest,
+                                           int dw, int dh, int dow, int sow);
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_x_up_y_neon(QImageScaleInfo *isi, unsigned int *dest,
+                                           int dw, int dh, int dow, int sow);
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_xy_neon(QImageScaleInfo *isi, unsigned int *dest,
+                                       int dw, int dh, int dow, int sow);
+#endif
+#endif
+
+static void qt_qimageScaleAARGBA_up_xy(QImageScaleInfo *isi, unsigned int *dest,
+                                       int dw, int dh, int dow, int sow)
+{ // Fills dw x dh pixels of dest (rows dow pixels apart) from the tables in isi; source rows are sow pixels apart
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    /* go through every scanline in the output buffer */
+    for (int y = 0; y < dh; y++) {
+        /* calculate the source line we'll scan from */
+        const unsigned int *sptr = ypoints[y];
+        unsigned int *dptr = dest + (y * dow);
+        const int yap = yapoints[y]; // vertical blend weight for this row
+        if (yap > 0) { // this row also blends with the next source row (pix + sow)
+            for (int x = 0; x < dw; x++) {
+                const unsigned int *pix = sptr + xpoints[x];
+                const int xap = xapoints[x]; // horizontal blend weight for this column
+                if (xap > 0)
+                    *dptr = interpolate_4_pixels(pix, pix + sow, xap, yap); // blend a 2x2 neighbourhood
+                else
+                    *dptr = INTERPOLATE_PIXEL_256(pix[0], 256 - yap, pix[sow], yap); // vertical blend only
+                dptr++;
+            }
+        } else { // no vertical blend for this row
+            for (int x = 0; x < dw; x++) {
+                const unsigned int *pix = sptr + xpoints[x];
+                const int xap = xapoints[x];
+                if (xap > 0)
+                    *dptr = INTERPOLATE_PIXEL_256(pix[0], 256 - xap, pix[1], xap); // horizontal blend only
+                else
+                    *dptr = pix[0]; // plain copy
+                dptr++;
+            }
+        }
+    }
+}
+
+/*
+ * scale by area sampling - with alpha
+ *
+ * Hands the work to the kernel matching isi->xup_yup. The three downscaling cases use the
+ * SSE4.1 or NEON template (instantiated with RGB == false) when the compiler targets one of
+ * those and FBINK_QIS_NO_SIMD is not defined; otherwise they use the scalar kernels of this
+ * file.
+ */
+static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
+                                 int dw, int dh, int dow, int sow)
+{
+    switch (isi->xup_yup) {
+    case 3:
+        /* scaling up both ways */
+        qt_qimageScaleAARGBA_up_xy(isi, dest, dw, dh, dow, sow);
+        break;
+
+    case 1:
+        /* scaling down vertically */
+#if !defined(FBINK_QIS_NO_SIMD) && defined(__SSE4_1__)
+        qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(isi, dest, dw, dh, dow, sow);
+#elif !defined(FBINK_QIS_NO_SIMD) && defined(__ARM_NEON__)
+        qt_qimageScaleAARGBA_up_x_down_y_neon<false>(isi, dest, dw, dh, dow, sow);
+#else
+        qt_qimageScaleAARGBA_up_x_down_y(isi, dest, dw, dh, dow, sow);
+#endif
+        break;
+
+    case 2:
+        /* scaling down horizontally */
+#if !defined(FBINK_QIS_NO_SIMD) && defined(__SSE4_1__)
+        qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(isi, dest, dw, dh, dow, sow);
+#elif !defined(FBINK_QIS_NO_SIMD) && defined(__ARM_NEON__)
+        qt_qimageScaleAARGBA_down_x_up_y_neon<false>(isi, dest, dw, dh, dow, sow);
+#else
+        qt_qimageScaleAARGBA_down_x_up_y(isi, dest, dw, dh, dow, sow);
+#endif
+        break;
+
+    default:
+        /* scaling down horizontally & vertically */
+#if !defined(FBINK_QIS_NO_SIMD) && defined(__SSE4_1__)
+        qt_qimageScaleAARGBA_down_xy_sse4<false>(isi, dest, dw, dh, dow, sow);
+#elif !defined(FBINK_QIS_NO_SIMD) && defined(__ARM_NEON__)
+        qt_qimageScaleAARGBA_down_xy_neon<false>(isi, dest, dw, dh, dow, sow);
+#else
+        qt_qimageScaleAARGBA_down_xy(isi, dest, dw, dh, dow, sow);
+#endif
+        break;
+    }
+}
+
+/* Weighted per-channel sum over pixels spaced by step; the weights add up to 1 << 14. */
+inline static void qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy, int step, int &r, int &g, int &b, int &a)
+{
+    const auto accumulate = [&](int weight) {
+        r += qRed(*pix)   * weight;
+        g += qGreen(*pix) * weight;
+        b += qBlue(*pix)  * weight;
+        a += qAlpha(*pix) * weight;
+    };
+
+    r = g = b = a = 0;
+    accumulate(xyap);                 /* first pixel */
+    int left = (1 << 14) - xyap;
+    for (; left > Cxy; left -= Cxy) {
+        pix += step;
+        accumulate(Cxy);              /* pixels weighted by a full Cxy */
+    }
+    pix += step;
+    accumulate(left);                 /* last pixel takes what is left */
+}
+
+/* Scaling up horizontally, down vertically: the one or two source columns involved are each */
+/* summed over successive source lines, then blended together by xap / 256. */
+static void qt_qimageScaleAARGBA_up_x_down_y(QImageScaleInfo *isi, unsigned int *dest,
+                                             int dw, int dh, int dow, int sow)
+{
+    const unsigned int **rowStarts = isi->ypoints;
+    const int *colOffsets = isi->xpoints;
+    const int *colWeights = isi->xapoints;
+    const int *rowWeights = isi->yapoints;
+
+    for (int row = 0; row < dh; ++row) {
+        /* vertical weights are packed: Cy in the upper 16 bits, yap in the lower 16 */
+        const int packed = rowWeights[row];
+        const int Cy = packed >> 16;
+        const int yap = packed & 0xffff;
+        unsigned int *out = dest + (row * dow);
+
+        for (int col = 0; col < dw; ++col) {
+            const unsigned int *src = rowStarts[row] + colOffsets[col];
+            const int xap = colWeights[col];
+
+            int r, g, b, a;
+            qt_qimageScaleAARGBA_helper(src, yap, Cy, sow, r, g, b, a);
+            if (xap > 0) {
+                int r2, g2, b2, a2;
+                qt_qimageScaleAARGBA_helper(src + 1, yap, Cy, sow, r2, g2, b2, a2);
+                r = (r * (256 - xap) + r2 * xap) >> 8;
+                g = (g * (256 - xap) + g2 * xap) >> 8;
+                b = (b * (256 - xap) + b2 * xap) >> 8;
+                a = (a * (256 - xap) + a2 * xap) >> 8;
+            }
+
+            /* the sums carry 14 fractional bits */
+            *out++ = qRgba(r >> 14, g >> 14, b >> 14, a >> 14);
+        }
+    }
+}
+
+/* Scaling down horizontally, up vertically: the one or two source lines involved are each */
+/* summed along x, then blended together by yap / 256. */
+static void qt_qimageScaleAARGBA_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest,
+                                             int dw, int dh, int dow, int sow)
+{
+    const unsigned int **rowStarts = isi->ypoints;
+    const int *colOffsets = isi->xpoints;
+    const int *colWeights = isi->xapoints;
+    const int *rowWeights = isi->yapoints;
+
+    for (int row = 0; row < dh; ++row) {
+        const int yap = rowWeights[row];
+        unsigned int *out = dest + (row * dow);
+
+        for (int col = 0; col < dw; ++col) {
+            /* horizontal weights are packed: Cx in the upper 16 bits, xap in the lower 16 */
+            const int packed = colWeights[col];
+            const int Cx = packed >> 16;
+            const int xap = packed & 0xffff;
+            const unsigned int *src = rowStarts[row] + colOffsets[col];
+
+            int r, g, b, a;
+            qt_qimageScaleAARGBA_helper(src, xap, Cx, 1, r, g, b, a);
+            if (yap > 0) {
+                /* same span, one source line further */
+                int r2, g2, b2, a2;
+                qt_qimageScaleAARGBA_helper(src + sow, xap, Cx, 1, r2, g2, b2, a2);
+                r = (r * (256 - yap) + r2 * yap) >> 8;
+                g = (g * (256 - yap) + g2 * yap) >> 8;
+                b = (b * (256 - yap) + b2 * yap) >> 8;
+                a = (a * (256 - yap) + a2 * yap) >> 8;
+            }
+
+            /* the sums carry 14 fractional bits */
+            *out++ = qRgba(r >> 14, g >> 14, b >> 14, a >> 14);
+        }
+    }
+}
+
+/* Scaling down both ways: each covered source line is summed along x, then those sums along y. */
+static void qt_qimageScaleAARGBA_down_xy(QImageScaleInfo *isi, unsigned int *dest,
+                                         int dw, int dh, int dow, int sow)
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    for (int y = 0; y < dh; y++) {
+        const int Cy = yapoints[y] >> 16;
+        const int yap = yapoints[y] & 0xffff;
+        unsigned int *dptr = dest + (y * dow);
+
+        for (int x = 0; x < dw; x++) {
+            const int Cx = xapoints[x] >> 16;
+            const int xap = xapoints[x] & 0xffff;
+            const unsigned int *sptr = ypoints[y] + xpoints[x];
+            int rx, gx, bx, ax;
+            qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, rx, gx, bx, ax);
+
+            /* Unsigned on purpose: the weights add up to 1 << 14 in each direction, so even after */
+            /* the >> 4 a channel total can reach 255 << 24, which would overflow a signed int. */
+            unsigned int r = static_cast<unsigned int>(rx >> 4) * yap;
+            unsigned int g = static_cast<unsigned int>(gx >> 4) * yap;
+            unsigned int b = static_cast<unsigned int>(bx >> 4) * yap;
+            unsigned int a = static_cast<unsigned int>(ax >> 4) * yap;
+
+            int j;
+            for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
+                sptr += sow;
+                qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, rx, gx, bx, ax);
+                r += static_cast<unsigned int>(rx >> 4) * Cy;
+                g += static_cast<unsigned int>(gx >> 4) * Cy;
+                b += static_cast<unsigned int>(bx >> 4) * Cy;
+                a += static_cast<unsigned int>(ax >> 4) * Cy;
+            }
+            sptr += sow;
+            qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, rx, gx, bx, ax);
+            r += static_cast<unsigned int>(rx >> 4) * j;
+            g += static_cast<unsigned int>(gx >> 4) * j;
+            b += static_cast<unsigned int>(bx >> 4) * j;
+            a += static_cast<unsigned int>(ax >> 4) * j;
+
+            *dptr++ = qRgba(r >> 24, g >> 24, b >> 24, a >> 24);
+        }
+    }
+}
+
+static void qt_qimageScaleAARGB_up_x_down_y(QImageScaleInfo *isi, unsigned int *dest,
+                                            int dw, int dh, int dow, int sow);  // scalar kernel for xup_yup == 1, defined further down
+
+static void qt_qimageScaleAARGB_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest,
+                                            int dw, int dh, int dow, int sow);  // scalar kernel for xup_yup == 2, defined further down
+
+static void qt_qimageScaleAARGB_down_xy(QImageScaleInfo *isi, unsigned int *dest,
+                                        int dw, int dh, int dow, int sow);  // scalar kernel for scaling down both ways, defined further down
+
+/*
+ * scale by area sampling - IGNORE the ALPHA byte
+ *
+ * Hands the work to the kernel matching isi->xup_yup. Scaling up both ways reuses the RGBA
+ * kernel. The three downscaling cases use the SSE4.1 or NEON template (instantiated with
+ * RGB == true) when the compiler targets one of those and FBINK_QIS_NO_SIMD is not defined;
+ * otherwise they use the scalar RGB kernels of this file.
+ */
+static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
+                                int dw, int dh, int dow, int sow)
+{
+    switch (isi->xup_yup) {
+    case 3:
+        /* scaling up both ways */
+        qt_qimageScaleAARGBA_up_xy(isi, dest, dw, dh, dow, sow);
+        break;
+
+    case 1:
+        /* scaling down vertically */
+#if !defined(FBINK_QIS_NO_SIMD) && defined(__SSE4_1__)
+        qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(isi, dest, dw, dh, dow, sow);
+#elif !defined(FBINK_QIS_NO_SIMD) && defined(__ARM_NEON__)
+        qt_qimageScaleAARGBA_up_x_down_y_neon<true>(isi, dest, dw, dh, dow, sow);
+#else
+        qt_qimageScaleAARGB_up_x_down_y(isi, dest, dw, dh, dow, sow);
+#endif
+        break;
+
+    case 2:
+        /* scaling down horizontally */
+#if !defined(FBINK_QIS_NO_SIMD) && defined(__SSE4_1__)
+        qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(isi, dest, dw, dh, dow, sow);
+#elif !defined(FBINK_QIS_NO_SIMD) && defined(__ARM_NEON__)
+        qt_qimageScaleAARGBA_down_x_up_y_neon<true>(isi, dest, dw, dh, dow, sow);
+#else
+        qt_qimageScaleAARGB_down_x_up_y(isi, dest, dw, dh, dow, sow);
+#endif
+        break;
+
+    default:
+        /* scaling down horizontally & vertically */
+#if !defined(FBINK_QIS_NO_SIMD) && defined(__SSE4_1__)
+        qt_qimageScaleAARGBA_down_xy_sse4<true>(isi, dest, dw, dh, dow, sow);
+#elif !defined(FBINK_QIS_NO_SIMD) && defined(__ARM_NEON__)
+        qt_qimageScaleAARGBA_down_xy_neon<true>(isi, dest, dw, dh, dow, sow);
+#else
+        qt_qimageScaleAARGB_down_xy(isi, dest, dw, dh, dow, sow);
+#endif
+        break;
+    }
+}
+
+
+inline static void qt_qimageScaleAARGB_helper(const unsigned int *pix, int xyap, int Cxy, int step, int &r, int &g, int &b)
+{
+    const auto accumulate = [&](int weight) {  /* adds the pixel under pix, scaled by weight */
+        r += qRed(*pix)   * weight;
+        g += qGreen(*pix) * weight;
+        b += qBlue(*pix)  * weight;
+    };
+    r = g = b = 0;
+    accumulate(xyap);                    /* first pixel */
+    int left = (1 << 14) - xyap;         /* the weights add up to 1 << 14 */
+    for (; left > Cxy; left -= Cxy) {
+        pix += step;
+        accumulate(Cxy);
+    }
+    pix += step;
+    accumulate(left);                    /* last pixel takes what is left */
+}
+
+/* Scaling up horizontally, down vertically, alpha ignored: the one or two source columns */
+/* involved are each summed over successive source lines, then blended by xap / 256. */
+static void qt_qimageScaleAARGB_up_x_down_y(QImageScaleInfo *isi, unsigned int *dest,
+                                            int dw, int dh, int dow, int sow)
+{
+    const unsigned int **rowStarts = isi->ypoints;
+    const int *colOffsets = isi->xpoints;
+    const int *colWeights = isi->xapoints;
+    const int *rowWeights = isi->yapoints;
+
+    for (int row = 0; row < dh; ++row) {
+        /* vertical weights are packed: Cy in the upper 16 bits, yap in the lower 16 */
+        const int packed = rowWeights[row];
+        const int Cy = packed >> 16;
+        const int yap = packed & 0xffff;
+        unsigned int *out = dest + (row * dow);
+
+        for (int col = 0; col < dw; ++col) {
+            const unsigned int *src = rowStarts[row] + colOffsets[col];
+            const int xap = colWeights[col];
+
+            int r, g, b;
+            qt_qimageScaleAARGB_helper(src, yap, Cy, sow, r, g, b);
+            if (xap > 0) {
+                int r2, g2, b2;
+                qt_qimageScaleAARGB_helper(src + 1, yap, Cy, sow, r2, g2, b2);
+                r = (r * (256 - xap) + r2 * xap) >> 8;
+                g = (g * (256 - xap) + g2 * xap) >> 8;
+                b = (b * (256 - xap) + b2 * xap) >> 8;
+            }
+            /* the sums carry 14 fractional bits */
+            *out++ = qRgb(r >> 14, g >> 14, b >> 14);
+        }
+    }
+}
+
+/* Scaling down horizontally, up vertically, alpha ignored: the one or two source lines */
+/* involved are each summed along x, then blended by yap / 256. */
+static void qt_qimageScaleAARGB_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest,
+                                            int dw, int dh, int dow, int sow)
+{
+    const unsigned int **rowStarts = isi->ypoints;
+    const int *colOffsets = isi->xpoints;
+    const int *colWeights = isi->xapoints;
+    const int *rowWeights = isi->yapoints;
+
+    for (int row = 0; row < dh; ++row) {
+        const int yap = rowWeights[row];
+        unsigned int *out = dest + (row * dow);
+
+        for (int col = 0; col < dw; ++col) {
+            /* horizontal weights are packed: Cx in the upper 16 bits, xap in the lower 16 */
+            const int packed = colWeights[col];
+            const int Cx = packed >> 16;
+            const int xap = packed & 0xffff;
+            const unsigned int *src = rowStarts[row] + colOffsets[col];
+
+            int r, g, b;
+            qt_qimageScaleAARGB_helper(src, xap, Cx, 1, r, g, b);
+            if (yap > 0) {
+                int r2, g2, b2;
+                qt_qimageScaleAARGB_helper(src + sow, xap, Cx, 1, r2, g2, b2);
+                r = (r * (256 - yap) + r2 * yap) >> 8;
+                g = (g * (256 - yap) + g2 * yap) >> 8;
+                b = (b * (256 - yap) + b2 * yap) >> 8;
+            }
+            /* the sums carry 14 fractional bits */
+            *out++ = qRgb(r >> 14, g >> 14, b >> 14);
+        }
+    }
+}
+
+/* Scaling down both ways, alpha ignored: each covered line is summed along x, then along y. */
+static void qt_qimageScaleAARGB_down_xy(QImageScaleInfo *isi, unsigned int *dest,
+                                        int dw, int dh, int dow, int sow)
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    for (int y = 0; y < dh; y++) {
+        const int Cy = yapoints[y] >> 16;
+        const int yap = yapoints[y] & 0xffff;
+        unsigned int *dptr = dest + (y * dow);
+
+        for (int x = 0; x < dw; x++) {
+            const int Cx = xapoints[x] >> 16;
+            const int xap = xapoints[x] & 0xffff;
+            const unsigned int *sptr = ypoints[y] + xpoints[x];
+
+            int rx, gx, bx;
+            qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, rx, gx, bx);
+
+            /* Unsigned on purpose: the weights add up to 1 << 14 in each direction, so even after */
+            /* the >> 4 a channel total can reach 255 << 24, which would overflow a signed int. */
+            unsigned int r = static_cast<unsigned int>(rx >> 4) * yap;
+            unsigned int g = static_cast<unsigned int>(gx >> 4) * yap;
+            unsigned int b = static_cast<unsigned int>(bx >> 4) * yap;
+
+            int j;
+            for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
+                sptr += sow;
+                qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, rx, gx, bx);
+                r += static_cast<unsigned int>(rx >> 4) * Cy;
+                g += static_cast<unsigned int>(gx >> 4) * Cy;
+                b += static_cast<unsigned int>(bx >> 4) * Cy;
+            }
+            sptr += sow;
+            qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, rx, gx, bx);
+            r += static_cast<unsigned int>(rx >> 4) * j;
+            g += static_cast<unsigned int>(gx >> 4) * j;
+            b += static_cast<unsigned int>(bx >> 4) * j;
+
+            *dptr++ = qRgb(r >> 24, g >> 24, b >> 24);
+        }
+    }
+}
+
+/// Smooth-scale a 32bpp sw x sh image into a new dw x dh RGBA buffer; nullptr on bad input or failure.
+unsigned char* qSmoothScaleImage(const unsigned char* src, int sw, int sh, bool ignore_alpha, int dw, int dh)
+{
+    if (src == nullptr || sw <= 0 || sh <= 0 || dw <= 0 || dh <= 0)
+        return nullptr;
+
+    // NOTE: Output format is always RGBA! So make enough room for 4 bytes per pixel ;).
+    //       Sized in size_t and checked for wrap-around, as int math (dw * dh * 4) can overflow.
+    const size_t width = static_cast<size_t>(dw), height = static_cast<size_t>(dh);
+    if (height > (static_cast<size_t>(-1) / 4u) / width)
+        return nullptr;
+
+    // NOTE: We enforce 32bpp input buffers, because that's what Qt uses, even for RGB with no alpha.
+    //       (the pixelformat constant is helpfully named RGB32 to remind you of that ;)).
+    QImageScaleInfo *scaleinfo = qimageCalcScaleInfo(src, sw, sh, dw, dh, true);
+    if (!scaleinfo)
+        return nullptr;
+
+    // SSE/NEON friendly alignment, just in case...
+    void *ptr = nullptr;
+    if (posix_memalign(&ptr, 16, width * height * 4u) != 0) {
+        std::cerr << "qSmoothScaleImage: out of memory, returning null!" << std::endl;
+        qimageFreeScaleInfo(scaleinfo);
+        return nullptr;
+    }
+    unsigned int *pixels = static_cast<unsigned int *>(ptr);
+
+    // NOTE: See comment in qimageCalcScaleInfo regarding our simplification of using sw directly.
+    //       Here, the Rgba64 codepath *does* divide by 8, because it casts buffer to QRgba64 *,
+    //       which I imagine is an uint64_t ;).
+    // NOTE: Input buffer is still 32bpp either way, ignore_alpha just skips *processing* of alpha.
+    if (!ignore_alpha)
+        qt_qimageScaleAARGBA(scaleinfo, pixels, dw, dh, dw, sw);
+    else
+        qt_qimageScaleAARGB(scaleinfo, pixels, dw, dh, dw, sw);
+
+    qimageFreeScaleInfo(scaleinfo);
+    return static_cast<unsigned char *>(ptr);
+}
+
+}
diff --git a/crengine/qimagescale/qimagescale_neon.cpp b/crengine/qimagescale/qimagescale_neon.cpp
new file mode 100644
index 000000000..bfde5e858
--- /dev/null
+++ b/crengine/qimagescale/qimagescale_neon.cpp
@@ -0,0 +1,212 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "qimagescale_p.h"
+
+#if defined(__ARM_NEON__)
+#include <arm_neon.h>
+
+namespace CRe {
+using namespace QImageScale;
+
+inline static uint32x4_t qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy, int step)
+{   // Weighted sum of pixels spaced by step, one 32-bit lane per channel byte; the weights add up to 1 << 14.
+    uint32x2_t vpix32 = vmov_n_u32(*pix);  // the pixel, duplicated into both 32-bit lanes
+    uint16x4_t vpix16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vpix32)));  // its four bytes, widened to 16 bits each
+    uint32x4_t vx = vmull_n_u16(vpix16, xyap);  // first pixel is weighted by xyap (widening multiply to 32 bits)
+    int i;  // weight still to hand out
+    for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) {
+        pix += step;
+        vpix32 = vmov_n_u32(*pix);
+        vpix16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vpix32)));
+        vx = vaddq_u32(vx, vmull_n_u16(vpix16, Cxy));  // each pixel visited in the loop is weighted by Cxy
+    }
+    pix += step;
+    vpix32 = vmov_n_u32(*pix);
+    vpix16 = vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vpix32)));
+    vx = vaddq_u32(vx, vmull_n_u16(vpix16, i));  // last pixel takes whatever weight is left
+    return vx;
+}
+
+template<bool RGB>
+void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *dest,
+                                           int dw, int dh, int dow, int sow)
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    /* NEON kernel for scaling up horizontally and down vertically: go through every scanline in the output buffer */
+    for (int y = 0; y < dh; y++) {
+        int Cy = yapoints[y] >> 16;  // upper 16 bits, handed to the helper as Cxy
+        int yap = yapoints[y] & 0xffff;  // lower 16 bits, weight of the first source line
+
+        unsigned int *dptr = dest + (y * dow);
+        for (int x = 0; x < dw; x++) {
+            const unsigned int *sptr = ypoints[y] + xpoints[x];
+            uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow);  // column sum, stepping sow pixels (one source line) at a time
+
+            int xap = xapoints[x];
+            if (xap > 0) {
+                uint32x4_t vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow);  // same sum for the next column
+
+                vx = vmulq_n_u32(vx, 256 - xap);
+                vr = vmulq_n_u32(vr, xap);
+                vx = vaddq_u32(vx, vr);
+                vx = vshrq_n_u32(vx, 8);  // blend of the two columns, xap out of 256
+            }
+            vx = vshrq_n_u32(vx, 14);  // drop the 14 fractional bits of the weighted sum
+            const uint16x4_t vx16 = vmovn_u32(vx);  // narrow each lane from 32 to 16 bits
+            const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16));  // then from 16 to 8 bits
+            *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0);  // lane 0 holds the four channel bytes
+            if (RGB)
+                *dptr |= 0xff000000;  // RGB variant: force the top byte (presumably alpha) to 0xff
+            dptr++;
+        }
+    }
+}
+
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_x_up_y_neon(QImageScaleInfo *isi, unsigned int *dest,
+                                           int dw, int dh, int dow, int sow)
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    /* NEON kernel for scaling down horizontally and up vertically: go through every scanline in the output buffer */
+    for (int y = 0; y < dh; y++) {
+        unsigned int *dptr = dest + (y * dow);
+        for (int x = 0; x < dw; x++) {
+            int Cx = xapoints[x] >> 16;  // upper 16 bits, handed to the helper as Cxy
+            int xap = xapoints[x] & 0xffff;  // lower 16 bits, weight of the first source pixel
+
+            const unsigned int *sptr = ypoints[y] + xpoints[x];
+            uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1);  // sum along the source line
+
+            int yap = yapoints[y];
+            if (yap > 0) {
+                uint32x4_t vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1);  // same sum, sow pixels (one source line) further
+
+                vx = vmulq_n_u32(vx, 256 - yap);
+                vr = vmulq_n_u32(vr, yap);
+                vx = vaddq_u32(vx, vr);
+                vx = vshrq_n_u32(vx, 8);  // blend of the two lines, yap out of 256
+            }
+            vx = vshrq_n_u32(vx, 14);  // drop the 14 fractional bits of the weighted sum
+            const uint16x4_t vx16 = vmovn_u32(vx);  // narrow each lane from 32 to 16 bits
+            const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16));  // then from 16 to 8 bits
+            *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0);  // lane 0 holds the four channel bytes
+            if (RGB)
+                *dptr |= 0xff000000;  // RGB variant: force the top byte (presumably alpha) to 0xff
+            dptr++;
+        }
+    }
+}
+
+template<bool RGB>
+void qt_qimageScaleAARGBA_down_xy_neon(QImageScaleInfo *isi, unsigned int *dest,
+                                       int dw, int dh, int dow, int sow)
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    for (int y = 0; y < dh; y++) {  // NEON kernel for scaling down both ways, one output scanline per iteration
+        int Cy = yapoints[y] >> 16;  // upper 16 bits, weight of each source line visited in the j loop
+        int yap = yapoints[y] & 0xffff;  // lower 16 bits, weight of the first source line
+
+        unsigned int *dptr = dest + (y * dow);
+        for (int x = 0; x < dw; x++) {
+            const int Cx = xapoints[x] >> 16;
+            const int xap = xapoints[x] & 0xffff;
+
+            const unsigned int *sptr = ypoints[y] + xpoints[x];
+            uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1);  // sum along the first source line
+            vx = vshrq_n_u32(vx, 4);  // shed 4 bits before applying the vertical weight
+            uint32x4_t vr = vmulq_n_u32(vx, yap);  // vr accumulates the vertically weighted line sums
+
+            int j;  // vertical weight still to hand out, out of 1 << 14
+            for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
+                sptr += sow;
+                vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1);
+                vx = vshrq_n_u32(vx, 4);
+                vx = vmulq_n_u32(vx, Cy);
+                vr = vaddq_u32(vr, vx);
+            }
+            sptr += sow;
+            vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1);
+            vx = vshrq_n_u32(vx, 4);
+            vx = vmulq_n_u32(vx, j);  // last source line takes whatever weight is left
+            vr = vaddq_u32(vr, vx);
+
+            vx = vshrq_n_u32(vr, 24);  // 14 + 14 fractional bits minus the 4 already shed
+            const uint16x4_t vx16 = vmovn_u32(vx);  // narrow each lane from 32 to 16 bits
+            const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16));  // then from 16 to 8 bits
+            *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0);  // lane 0 holds the four channel bytes
+            if (RGB)
+                *dptr |= 0xff000000;  // RGB variant: force the top byte (presumably alpha) to 0xff
+            dptr++;
+        }
+    }
+}
+
+template void qt_qimageScaleAARGBA_up_x_down_y_neon<false>(QImageScaleInfo *isi, unsigned int *dest,
+                                                           int dw, int dh, int dow, int sow);  // explicit instantiations: the template bodies are defined in this file
+
+template void qt_qimageScaleAARGBA_up_x_down_y_neon<true>(QImageScaleInfo *isi, unsigned int *dest,
+                                                          int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_down_x_up_y_neon<false>(QImageScaleInfo *isi, unsigned int *dest,
+                                                           int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_down_x_up_y_neon<true>(QImageScaleInfo *isi, unsigned int *dest,
+                                                          int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_down_xy_neon<false>(QImageScaleInfo *isi, unsigned int *dest,
+                                                       int dw, int dh, int dow, int sow);
+
+template void qt_qimageScaleAARGBA_down_xy_neon<true>(QImageScaleInfo *isi, unsigned int *dest,
+                                                      int dw, int dh, int dow, int sow);
+}
+}
+
+#endif
diff --git a/crengine/qimagescale/qimagescale_p.h b/crengine/qimagescale/qimagescale_p.h
new file mode 100644
index 000000000..aa8d8fbc7
--- /dev/null
+++ b/crengine/qimagescale/qimagescale_p.h
@@ -0,0 +1,69 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+#ifndef QIMAGESCALE_P_H
+#define QIMAGESCALE_P_H
+
+//
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the Qt API.  It exists purely as an
+// implementation detail.  This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+namespace CRe {
+/// Scales a 32bpp sw x sh image to dw x dh; returns a posix_memalign()ed RGBA buffer, or nullptr.
+unsigned char* qSmoothScaleImage(const unsigned char* src, int sw, int sh, bool ignore_alpha, int dw, int dh);
+
+namespace QImageScale {
+    /// Lookup tables driving the scaling kernels, indexed by destination column (x*) or row (y*).
+    struct QImageScaleInfo {
+        int *xpoints = nullptr;                  ///< offset of the source pixel within its line
+        const unsigned int **ypoints = nullptr;  ///< start of the source line
+        int *xapoints = nullptr;                 ///< horizontal weights (Cx << 16 | xap when scaling down)
+        int *yapoints = nullptr;                 ///< vertical weights (Cy << 16 | yap when scaling down)
+        int xup_yup = 0;                         ///< 3: up both ways, 1: down vertically, 2: down horizontally
+    };
+}
+}
+
+#endif
diff --git a/crengine/qimagescale/qimagescale_sse4.cpp b/crengine/qimagescale/qimagescale_sse4.cpp
new file mode 100644
index 000000000..e61c969e7
--- /dev/null
+++ b/crengine/qimagescale/qimagescale_sse4.cpp
@@ -0,0 +1,222 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#include "qimagescale_p.h"
+
+#if defined(__SSE4_1__)
+#include <smmintrin.h>
+namespace CRe {
+
+using namespace QImageScale;
+
+// Weighted sum of a run of pixels `step` apart: first * vxyap, each following full pixel * vCxy, last * remainder of (1 << 14).
+inline static __m128i
+qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy, int step, const __m128i vxyap, const __m128i vCxy)
+{
+    // Each pixel's four 8-bit channels are widened to 32-bit lanes; vxyap/vCxy are xyap/Cxy broadcast by the caller.
+    __m128i vx = _mm_mullo_epi32(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)), vxyap);
+    int i;
+    for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) {
+        pix += step;
+        vx = _mm_add_epi32(vx, _mm_mullo_epi32(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)), vCxy));
+    }
+    // i now holds what is left of the (1 << 14) weight budget; it goes to the final pixel. The result is NOT shifted back.
+    pix += step;
+    vx = _mm_add_epi32(vx, _mm_mullo_epi32(_mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)), _mm_set1_epi32(i)));
+    return vx;
+}
+
+template<bool RGB> // X is upscaled (linear blend of two source columns), Y is downscaled (weighted sum down a run of source rows)
+void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi, unsigned int *dest,
+                                           int dw, int dh, int dow, int sow) // dw x dh: output size; dow / sow: destination / source row pitch, in pixels
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    const __m128i v256 = _mm_set1_epi32(256);
+
+    /* go through every scanline in the output buffer */
+    for (int y = 0; y < dh; y++) {
+        int Cy = yapoints[y] >> 16; // high half: weight given to each full source row of the run
+        int yap = yapoints[y] & 0xffff; // low half: weight given to the first source row of the run
+        const __m128i vCy = _mm_set1_epi32(Cy);
+        const __m128i vyap = _mm_set1_epi32(yap);
+
+        unsigned int *dptr = dest + (y * dow);
+        for (int x = 0; x < dw; x++) {
+            const unsigned int *sptr = ypoints[y] + xpoints[x];
+            __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy); // vertical sum for the left column
+
+            int xap = xapoints[x];
+            if (xap > 0) { // a non-zero weight means the column to the right contributes too
+                const __m128i vxap = _mm_set1_epi32(xap);
+                const __m128i vinvxap = _mm_sub_epi32(v256, vxap);
+                __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy); // vertical sum for the right column
+
+                vx = _mm_mullo_epi32(vx, vinvxap);
+                vr = _mm_mullo_epi32(vr, vxap);
+                vx = _mm_add_epi32(vx, vr);
+                vx = _mm_srli_epi32(vx, 8); // the two blend weights add up to 256
+            }
+            vx = _mm_srli_epi32(vx, 14); // the helper's weights add up to 1 << 14
+            vx = _mm_packus_epi32(vx, _mm_setzero_si128()); // narrow the 32-bit lanes to 16 bits...
+            vx = _mm_packus_epi16(vx, _mm_setzero_si128()); // ...then to 8 bits, so the low 32 bits hold one packed pixel
+            *dptr = _mm_cvtsi128_si32(vx);
+            if (RGB) // RGB variant: force the alpha byte to fully opaque
+                *dptr |= 0xff000000;
+            dptr++;
+        }
+    }
+}
+
+template<bool RGB> // X is downscaled (weighted sum along a run of source pixels), Y is upscaled (linear blend of two source rows)
+void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo *isi, unsigned int *dest,
+                                           int dw, int dh, int dow, int sow) // dw x dh: output size; dow / sow: destination / source row pitch, in pixels
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    const __m128i v256 = _mm_set1_epi32(256);
+
+    /* go through every scanline in the output buffer */
+    for (int y = 0; y < dh; y++) {
+        unsigned int *dptr = dest + (y * dow);
+        for (int x = 0; x < dw; x++) {
+            int Cx = xapoints[x] >> 16; // high half: weight given to each full source pixel of the run
+            int xap = xapoints[x] & 0xffff; // low half: weight given to the first source pixel of the run
+            const __m128i vCx = _mm_set1_epi32(Cx);
+            const __m128i vxap = _mm_set1_epi32(xap);
+
+            const unsigned int *sptr = ypoints[y] + xpoints[x];
+            __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); // horizontal sum on the upper row
+
+            int yap = yapoints[y];
+            if (yap > 0) { // a non-zero weight means the row below contributes too
+                const __m128i vyap = _mm_set1_epi32(yap);
+                const __m128i vinvyap = _mm_sub_epi32(v256, vyap);
+                __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx); // horizontal sum on the row below
+
+                vx = _mm_mullo_epi32(vx, vinvyap);
+                vr = _mm_mullo_epi32(vr, vyap);
+                vx = _mm_add_epi32(vx, vr);
+                vx = _mm_srli_epi32(vx, 8); // the two blend weights add up to 256
+            }
+            vx = _mm_srli_epi32(vx, 14); // the helper's weights add up to 1 << 14
+            vx = _mm_packus_epi32(vx, _mm_setzero_si128()); // narrow the 32-bit lanes to 16 bits...
+            vx = _mm_packus_epi16(vx, _mm_setzero_si128()); // ...then to 8 bits, so the low 32 bits hold one packed pixel
+            *dptr = _mm_cvtsi128_si32(vx);
+            if (RGB) // RGB variant: force the alpha byte to fully opaque
+                *dptr |= 0xff000000;
+            dptr++;
+        }
+    }
+}
+
+template<bool RGB> // both axes are downscaled: each output pixel is a weighted sum over a rectangle of source pixels
+void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest,
+                                       int dw, int dh, int dow, int sow) // dw x dh: output size; dow / sow: destination / source row pitch, in pixels
+{
+    const unsigned int **ypoints = isi->ypoints;
+    int *xpoints = isi->xpoints;
+    int *xapoints = isi->xapoints;
+    int *yapoints = isi->yapoints;
+
+    for (int y = 0; y < dh; y++) {
+        int Cy = yapoints[y] >> 16; // high half: weight given to each full source row of the run
+        int yap = yapoints[y] & 0xffff; // low half: weight given to the first source row of the run
+        const __m128i vCy = _mm_set1_epi32(Cy);
+        const __m128i vyap = _mm_set1_epi32(yap);
+
+        unsigned int *dptr = dest + (y * dow);
+        for (int x = 0; x < dw; x++) {
+            const int Cx = xapoints[x] >> 16; // same packing as yapoints, for the horizontal run
+            const int xap = xapoints[x] & 0xffff;
+            const __m128i vCx = _mm_set1_epi32(Cx);
+            const __m128i vxap = _mm_set1_epi32(xap);
+
+            const unsigned int *sptr = ypoints[y] + xpoints[x];
+            __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); // horizontal sum on the first row
+            __m128i vr = _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vyap); // 4 bits are dropped before the vertical weighting is applied
+
+            int j; // remaining vertical weight, out of 1 << 14 (mirrors the loop inside the helper)
+            for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
+                sptr += sow;
+                vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
+                vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vCy));
+            }
+            sptr += sow;
+            vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
+            vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), _mm_set1_epi32(j))); // last row gets the leftover weight
+
+            vr = _mm_srli_epi32(vr, 24); // 14 (horizontal weights) + 14 (vertical weights) - 4 (bits dropped above)
+            vr = _mm_packus_epi32(vr, _mm_setzero_si128()); // narrow the 32-bit lanes to 16 bits...
+            vr = _mm_packus_epi16(vr, _mm_setzero_si128()); // ...then to 8 bits, so the low 32 bits hold one packed pixel
+            *dptr = _mm_cvtsi128_si32(vr);
+            if (RGB) // RGB variant: force the alpha byte to fully opaque
+                *dptr |= 0xff000000;
+            dptr++;
+        }
+    }
+}
+
+// Explicit instantiations.
+// The kernels above are templates defined in this .cpp file, so a specialization is only
+// available to other translation units if it is instantiated here:
+//   <false> keeps the alpha channel computed by the kernel,
+//   <true>  forces the alpha byte of every output pixel to 0xff.
+
+// Upscale X, downscale Y.
+template void qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(QImageScaleInfo *, unsigned int *, int, int, int, int);
+template void qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(QImageScaleInfo *, unsigned int *, int, int, int, int);
+
+// Downscale X, upscale Y.
+template void qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(QImageScaleInfo *, unsigned int *, int, int, int, int);
+template void qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(QImageScaleInfo *, unsigned int *, int, int, int, int);
+
+// Downscale both axes.
+template void qt_qimageScaleAARGBA_down_xy_sse4<false>(QImageScaleInfo *, unsigned int *, int, int, int, int);
+template void qt_qimageScaleAARGBA_down_xy_sse4<true>(QImageScaleInfo *, unsigned int *, int, int, int, int);
+
+}
+
+#endif
diff --git a/crengine/qimagescale/qrgb.h b/crengine/qimagescale/qrgb.h
new file mode 100644
index 000000000..fc90d3527
--- /dev/null
+++ b/crengine/qimagescale/qrgb.h
@@ -0,0 +1,67 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QRGB_H
+#define QRGB_H
+
+namespace CRe {
+
+using QRgb = unsigned int; // one pixel, packed as 0xAARRGGBB
+
+// Red channel: bits 16-23.
+constexpr int qRed(QRgb rgb) { return static_cast<int>((rgb >> 16) & 0xffu); }
+
+// Green channel: bits 8-15.
+constexpr int qGreen(QRgb rgb) { return static_cast<int>((rgb >> 8) & 0xffu); }
+
+// Blue channel: bits 0-7.
+constexpr int qBlue(QRgb rgb) { return static_cast<int>(rgb & 0xffu); }
+
+// Alpha channel: everything above bit 23.
+constexpr int qAlpha(QRgb rgb) { return static_cast<int>(rgb >> 24); }
+
+// Packs an opaque pixel (alpha = 0xff); only the low 8 bits of each component are used.
+constexpr QRgb qRgb(int r, int g, int b) { return 0xff000000u | ((r & 0xffu) << 16) | ((g & 0xffu) << 8) | (b & 0xffu); }
+
+// Packs a pixel with an explicit alpha; only the low 8 bits of each component are used.
+constexpr QRgb qRgba(int r, int g, int b, int a) { return ((a & 0xffu) << 24) | ((r & 0xffu) << 16) | ((g & 0xffu) << 8) | (b & 0xffu); }
+
+}
+
+#endif // QRGB_H
diff --git a/crengine/src/lvdrawbuf.cpp b/crengine/src/lvdrawbuf.cpp
index 6417b1724..1f2177df5 100644
--- a/crengine/src/lvdrawbuf.cpp
+++ b/crengine/src/lvdrawbuf.cpp
@@ -33,6 +33,9 @@ void LVDrawBuf::RoundRect( int x0, int y0, int x1, int y1, int borderWidth, int
     // TODO: draw rounded corners
 }
 
+// NOTE: For more accurate (but slightly more costly) conversions, see:
+//       stb does (lUInt8) (((r*77) + (g*150) + (b*29)) >> 8) (That's roughly the Rec601Luma algo)
+//       Qt5 does (lUInt8) (((r*11) + (g*16) + (b*5)) >> 5) (That's closer to Rec601Luminance or Rec709Luminance IIRC)
 static lUInt32 rgbToGray( lUInt32 color )
 {
     lUInt32 r = (0xFF0000 & color) >> 16;
@@ -400,6 +403,9 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
     int * xmap;
     int * ymap;
     bool dither;
+    bool invert;
+    bool smoothscale;
+    lUInt8 * decoded;
     bool isNinePatch;
 public:
     static int * GenMap( int src_len, int dst_len )
@@ -446,8 +452,8 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
         }
         return map;
     }
-    LVImageScaledDrawCallback(LVBaseDrawBuf * dstbuf, LVImageSourceRef img, int x, int y, int width, int height, bool dith )
-    : src(img), dst(dstbuf), dst_x(x), dst_y(y), dst_dx(width), dst_dy(height), xmap(0), ymap(0), dither(dith)
+    LVImageScaledDrawCallback(LVBaseDrawBuf * dstbuf, LVImageSourceRef img, int x, int y, int width, int height, bool dith, bool inv, bool smooth )
+    : src(img), dst(dstbuf), dst_x(x), dst_y(y), dst_dx(width), dst_dy(height), xmap(0), ymap(0), dither(dith), invert(inv), smoothscale(smooth), decoded(0)
     {
         src_dx = img->GetWidth();
         src_dy = img->GetHeight();
@@ -458,18 +464,28 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
             isNinePatch = true;
             ninePatch = np->frame;
         }
+        // If smoothscaling was requested, but no scaling was needed, disable the post-processing pass
+        if (smoothscale && src_dx == dst_dx && src_dy == dst_dy) {
+            smoothscale = false;
+            //fprintf( stderr, "Disabling smoothscale because no scaling was needed (%dx%d -> %dx%d)\n", src_dx, src_dy, dst_dx, dst_dy );
+        }
         if ( src_dx != dst_dx || isNinePatch) {
             if (isNinePatch)
                 xmap = GenNinePatchMap(src_dx, dst_dx, ninePatch.left, ninePatch.right);
-            else
+            else if (!smoothscale)
                 xmap = GenMap( src_dx, dst_dx );
         }
         if ( src_dy != dst_dy || isNinePatch) {
             if (isNinePatch)
                 ymap = GenNinePatchMap(src_dy, dst_dy, ninePatch.top, ninePatch.bottom);
-            else
+            else if (!smoothscale)
                 ymap = GenMap( src_dy, dst_dy );
         }
+        // If we have a smoothscale post-processing pass, we'll need to build a buffer of the *full* decoded image.
+        if (smoothscale) {
+            // Byte-sized buffer, we're 32bpp, so, 4 bytes per pixel.
+            decoded = new lUInt8[src_dy * (src_dx * 4)];
+        }
     }
     virtual ~LVImageScaledDrawCallback()
     {
@@ -477,6 +493,8 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
             delete[] xmap;
         if (ymap)
             delete[] ymap;
+        if (decoded)
+            delete[] decoded;
     }
     virtual void OnStartDecode( LVImageSource * )
     {
@@ -488,8 +506,15 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
             if (y == 0 || y == src_dy-1) // ignore first and last lines
                 return true;
         }
+        // Defer everything to the post-process pass for smooth scaling, we just have to store the line in our decoded buffer
+        if (smoothscale) {
+            //fprintf( stderr, "Smoothscale l_%d pass\n", y );
+            memcpy(decoded + (y * (src_dx * 4)), data, (src_dx * 4));
+            return true;
+        }
         int yy = -1;
         int yy2 = -1;
+        const lUInt32 rgba_invert = invert ? 0x00FFFFFF : 0;
         if (ymap) {
             for (int i = 0; i < dst_dy; i++) {
                 if (ymap[i] == y) {
@@ -530,7 +555,7 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
                 row += dst_x;
                 for (int x=0; x<dst_dx; x++)
                 {
-                    lUInt32 cl = data[xmap ? xmap[x] : x];
+                    lUInt32 cl = data[xmap ? xmap[x] : x] ^ rgba_invert;
                     int xx = x + dst_x;
                     lUInt32 alpha = (cl >> 24)&0xFF;
                     if ( xx<clip.left || xx>=clip.right || alpha==0xFF )
@@ -551,7 +576,7 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
                 row += dst_x;
                 for (int x=0; x<dst_dx; x++)
                 {
-                    lUInt32 cl = data[xmap ? xmap[x] : x];
+                    lUInt32 cl = data[xmap ? xmap[x] : x] ^ rgba_invert;
                     int xx = x + dst_x;
                     lUInt32 alpha = (cl >> 24)&0xFF;
                     if ( xx<clip.left || xx>=clip.right || alpha==0xFF )
@@ -572,7 +597,7 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
                 for (int x=0; x<dst_dx; x++)
                 {
                     int srcx = xmap ? xmap[x] : x;
-                    lUInt32 cl = data[srcx];
+                    lUInt32 cl = data[srcx] ^ rgba_invert;
                     int xx = x + dst_x;
                     lUInt32 alpha = (cl >> 24)&0xFF;
                     if ( xx<clip.left || xx>=clip.right || alpha==0xFF )
@@ -592,12 +617,15 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
                     }
 
                     lUInt8 dcl;
-                    if ( dither && bpp < 8) {
+                    if ( dither && bpp < 8 ) {
 #if (GRAY_INVERSE==1)
                         dcl = (lUInt8)DitherNBitColor( cl^0xFFFFFF, x, yy, bpp );
 #else
                         dcl = (lUInt8)DitherNBitColor( cl, x, yy, bpp );
 #endif
+                    } else if ( dither && bpp == 8 ) {
+                        dcl = rgbToGray( cl );
+                        dcl = dither_o8x8( x, yy, dcl );
                     } else {
                         dcl = rgbToGray( cl, bpp );
                     }
@@ -612,7 +640,7 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
                 //row += dst_x;
                 for (int x=0; x<dst_dx; x++)
                 {
-                    lUInt32 cl = data[xmap ? xmap[x] : x];
+                    lUInt32 cl = data[xmap ? xmap[x] : x] ^ rgba_invert;
                     int xx = x + dst_x;
                     lUInt32 alpha = (cl >> 24)&0xFF;
                     if ( xx<clip.left || xx>=clip.right || alpha==0xFF )
@@ -652,7 +680,7 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
                 //row += dst_x;
                 for (int x=0; x<dst_dx; x++)
                 {
-                    lUInt32 cl = data[xmap ? xmap[x] : x];
+                    lUInt32 cl = data[xmap ? xmap[x] : x] ^ rgba_invert;
                     int xx = x + dst_x;
                     lUInt32 alpha = (cl >> 24)&0xFF;
                     if ( xx<clip.left || xx>=clip.right || (alpha&0x80) )
@@ -681,8 +709,41 @@ class LVImageScaledDrawCallback : public LVImageDecoderCallback
         }
         return true;
     }
-    virtual void OnEndDecode( LVImageSource *, bool )
+    virtual void OnEndDecode( LVImageSource * obj, bool )
     {
+        // Smooth-scaling post-process pass: OnLineDecoded only stashed the source lines in `decoded`,
+        // so the scaling and the actual drawing both happen here. Nothing to do otherwise.
+        if (!smoothscale)
+            return;
+
+        //fprintf( stderr, "Requesting smooth scaling (%dx%d -> %dx%d)\n", src_dx, src_dy, dst_dx, dst_dy );
+        lUInt8 * const scaled = CRe::qSmoothScaleImage(decoded, src_dx, src_dy, false, dst_dx, dst_dy);
+        if (!scaled) {
+            // Scaling failed: bail out *without* drawing anything.
+            // The nearest-neighbor maps were never generated, so there is no easy fallback.
+            //fprintf( stderr, "Smooth scaling failed :(\n" );
+            return;
+        }
+
+        // Replay the scaled lines through the regular per-line path. The flag has to be cleared first,
+        // otherwise OnLineDecoded would just copy them back into `decoded`.
+        // NOTE(review): nine-patch sources still get their maps / edge-line skipping applied to these scaled lines -- confirm they never reach this path.
+        smoothscale = false;
+        const int stride = dst_dx * 4; // bytes per scaled line (4 bytes per pixel)
+        lUInt8 * line = scaled;
+        for (int y = 0; y < dst_dy; ++y, line += stride)
+            this->OnLineDecoded( obj, y, (lUInt32 *) line );
+
+        // This prints the unscaled decoded buffer, for debugging purposes ;).
+        /*
+        for (int y=0; y < src_dy; y++) {
+            lUInt8 * row = decoded + (y * (src_dx * 4));
+            this->OnLineDecoded( obj, y, (lUInt32 *) row );
+        }
+        */
+
+        // The scaled buffer was allocated by CRe::qSmoothScaleImage; release it now that it's been rendered.
+        free(scaled);
     }
 };
 
@@ -707,10 +768,8 @@ void LVGrayDrawBuf::Draw( LVImageSourceRef img, int x, int y, int width, int hei
     //fprintf( stderr, "LVGrayDrawBuf::Draw( img(%d, %d), %d, %d, %d, %d\n", img->GetWidth(), img->GetHeight(), x, y, width, height );
     if ( width<=0 || height<=0 )
         return;
-    LVImageScaledDrawCallback drawcb( this, img, x, y, width, height, dither );
+    LVImageScaledDrawCallback drawcb( this, img, x, y, width, height, _ditherImages, _invertImages, _smoothImages );
     img->Decode( &drawcb );
-    if ( _invertImages )
-        InvertRect(x, y, x+width, y+height);
 
     _drawnImagesCount++;
     _drawnImagesSurface += width*height;
@@ -929,7 +988,7 @@ void LVGrayDrawBuf::InvertRect(int x0, int y0, int x1, int y1)
             lUInt8 * line = GetScanLine(y0);
             for (int y=y0; y<y1; y++) {
                 for (int x=x0; x<x1; x++)
-                    line[x] = ~line[x];
+                    line[x] ^= 0xFF;
                 line += _rowsize;
             }
         }
@@ -1302,10 +1361,8 @@ int  LVColorDrawBuf::GetBitsPerPixel()
 void LVColorDrawBuf::Draw( LVImageSourceRef img, int x, int y, int width, int height, bool dither )
 {
     //fprintf( stderr, "LVColorDrawBuf::Draw( img(%d, %d), %d, %d, %d, %d\n", img->GetWidth(), img->GetHeight(), x, y, width, height );
-    LVImageScaledDrawCallback drawcb( this, img, x, y, width, height, dither );
+    LVImageScaledDrawCallback drawcb( this, img, x, y, width, height, dither, _invertImages, _smoothImages ); // image inversion is applied per pixel by the callback (no InvertRect pass)
     img->Decode( &drawcb );
-    if ( _invertImages )
-        InvertRect(x, y, x+width, y+height);
     _drawnImagesCount++;
     _drawnImagesSurface += width*height;
 }
diff --git a/crengine/src/lvimg.cpp b/crengine/src/lvimg.cpp
index 3f0f58525..cbc3d27a3 100644
--- a/crengine/src/lvimg.cpp
+++ b/crengine/src/lvimg.cpp
@@ -810,7 +810,7 @@ class LVJpegImageSource : public LVNodeImageSource
                     }
                     callback->OnLineDecoded( this, y, row );
                 }
-                callback->OnEndDecode(this, true);
+                callback->OnEndDecode(this, false);
             }
 
         if ( buffer )
@@ -939,11 +939,11 @@ bool LVPngImageSource::Decode( LVImageDecoderCallback * callback )
         {
             callback->OnLineDecoded( this, y,  (lUInt32*) image[y] );
         }
-
         png_read_end(png_ptr, info_ptr);
 
         callback->OnEndDecode(this, false);
-        for (lUInt32 i=0; i<height; i++) delete [] image[i];
+        for (lUInt32 i=0; i<height; i++)
+            delete [] image[i];
         delete [] image;
     }
     png_destroy_read_struct(&png_ptr, &info_ptr, NULL);