diff --git a/cocos/network/CCDownloader-curl.cpp b/cocos/network/CCDownloader-curl.cpp
index 579fb350beb9..49377e41b91f 100644
--- a/cocos/network/CCDownloader-curl.cpp
+++ b/cocos/network/CCDownloader-curl.cpp
@@ -283,10 +283,10 @@ class DownloadTaskCURL : public IDownloadTask {
     string _tempFileName;
     std::string _checksumFileName;
     vector<unsigned char> _buf;
-    FileStream* _fs;
+    FileStream* _fs = nullptr;
 
     // calculate md5 in downloading time support
-    FileStream* _fsMd5; // store md5 state realtime
+    FileStream* _fsMd5 = nullptr; // store md5 state realtime
     md5_state_s _md5State;
 
 
@@ -812,8 +812,10 @@ void DownloaderCURL::_onDownloadFinished(TaskWrapper&& wrapper, int checkState)
     if (coTask._fs) {
         do {
             auto pFileUtils = FileUtils::getInstance();
-            coTask._fs->close();
-            coTask._fsMd5->close();
+            delete coTask._fs;
+            coTask._fs = nullptr;
+            delete coTask._fsMd5;
+            coTask._fsMd5 = nullptr;
 
             if (checkState & kCheckSumStateSucceed) // No need download
             {
diff --git a/extensions/scripting/lua-bindings/script/cocostudio/CocoStudio.lua b/extensions/scripting/lua-bindings/script/cocostudio/CocoStudio.lua
index 60fd56a67a41..ee7254a9587c 100644
--- a/extensions/scripting/lua-bindings/script/cocostudio/CocoStudio.lua
+++ b/extensions/scripting/lua-bindings/script/cocostudio/CocoStudio.lua
@@ -2,9 +2,7 @@ if nil == ccs then
     return
 end
 
-if not json then
-    json = cjson
-end
+local json = require 'cjson'
 
 require "cocos.cocostudio.StudioConstants"
 
diff --git a/extensions/scripting/lua-bindings/script/init.lua b/extensions/scripting/lua-bindings/script/init.lua
index 670609c77eb5..ceeb043ec3b5 100644
--- a/extensions/scripting/lua-bindings/script/init.lua
+++ b/extensions/scripting/lua-bindings/script/init.lua
@@ -22,6 +22,7 @@ THE SOFTWARE.
 
 ]]
 
+-- lua-5.4 compatibility: provide math.pow when the runtime does not define it
 if (math.pow == nil) then
     math.pow = function (x,y)
         return x ^ y
diff --git a/external/README.md b/external/README.md
index 595be79903ff..21d8a4c589dd 100644
--- a/external/README.md
+++ b/external/README.md
@@ -86,7 +86,7 @@
 
 ## jpeg (libjpeg-turbo)
 - Upstream: https://github.com/libjpeg-turbo/libjpeg-turbo
-- Version: 2.0.6 with CMakeLists.txt modified for sutiable as a thirdparty of CMake build system
+- Version: 2.1.0 with CMakeLists.txt modified to be suitable as a third-party component of the CMake build system
 - License: BSD-style (IJG,BSD-3-Clause,zlib)
 
 ## kcp
diff --git a/external/jpeg/.gitattributes b/external/jpeg/.gitattributes
new file mode 100644
index 000000000000..6c9660afc063
--- /dev/null
+++ b/external/jpeg/.gitattributes
@@ -0,0 +1,4 @@
+/appveyor.yml export-ignore
+/.gitattributes export-ignore
+/.github export-ignore
+*.ppm binary
diff --git a/external/jpeg/.gitignore b/external/jpeg/.gitignore
deleted file mode 100644
index 3cd9f3b7d6b6..000000000000
--- a/external/jpeg/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-# Ignore config files by cmake
-jconfig.h
-jconfigint.h
-libjpeg.map
\ No newline at end of file
diff --git a/external/jpeg/BUILDING.md b/external/jpeg/BUILDING.md
index 2aef181fefbe..f91abcd44bc6 100644
--- a/external/jpeg/BUILDING.md
+++ b/external/jpeg/BUILDING.md
@@ -12,10 +12,7 @@ Build Requirements
 
 - [NASM](http://www.nasm.us) or [YASM](http://yasm.tortall.net)
   (if building x86 or x86-64 SIMD extensions)
-  * If using NASM, 2.10 or later is required.
-  * If using NASM, 2.10 or later (except 2.11.08) is required for an x86-64 Mac
-    build (2.11.08 does not work properly with libjpeg-turbo's x86-64 SIMD code
-    when building macho64 objects.)
+  * If using NASM, 2.13 or later is required.
   * If using YASM, 1.2.0 or later is required.
   * If building on macOS, NASM or YASM can be obtained from
     [MacPorts](http://www.macports.org/) or [Homebrew](http://brew.sh/).
@@ -49,10 +46,8 @@ Build Requirements
 
 - If building the TurboJPEG Java wrapper, JDK or OpenJDK 1.5 or later is
   required.  Most modern Linux distributions, as well as Solaris 10 and later,
-  include JDK or OpenJDK.  On OS X 10.5 and 10.6, it will be necessary to
-  install the Java Developer Package, which can be downloaded from
-  <http://developer.apple.com/downloads> (Apple ID required.)  For other
-  systems, you can obtain the Oracle Java Development Kit from
+  include JDK or OpenJDK.  For other systems, you can obtain the Oracle Java
+  Development Kit from
   <http://www.oracle.com/technetwork/java/javase/downloads>.
 
   * If using JDK 11 or later, CMake 3.10.x or later must also be used.
@@ -62,22 +57,22 @@ Build Requirements
 - Microsoft Visual C++ 2005 or later
 
   If you don't already have Visual C++, then the easiest way to get it is by
-  installing the
-  [Windows SDK](http://msdn.microsoft.com/en-us/windows/bb980924.aspx).
-  The Windows SDK includes both 32-bit and 64-bit Visual C++ compilers and
-  everything necessary to build libjpeg-turbo.
-
-  * You can also use Microsoft Visual Studio Express/Community Edition, which
-    is a free download.  (NOTE: versions prior to 2012 can only be used to
-    build 32-bit code.)
+  installing
+  [Visual Studio Community Edition](https://visualstudio.microsoft.com),
+  which includes everything necessary to build libjpeg-turbo.
+
+  * You can also download and install the standalone Windows SDK (for Windows 7
+    or later), which includes command-line versions of the 32-bit and 64-bit
+    Visual C++ compilers.
   * If you intend to build libjpeg-turbo from the command line, then add the
     appropriate compiler and SDK directories to the `INCLUDE`, `LIB`, and
     `PATH` environment variables.  This is generally accomplished by
-    executing `vcvars32.bat` or `vcvars64.bat` and `SetEnv.cmd`.
-    `vcvars32.bat` and `vcvars64.bat` are part of Visual C++ and are located in
-    the same directory as the compiler.  `SetEnv.cmd` is part of the Windows
-    SDK.  You can pass optional arguments to `SetEnv.cmd` to specify a 32-bit
-    or 64-bit build environment.
+    executing `vcvars32.bat` or `vcvars64.bat`, which are located in the same
+    directory as the compiler.
+  * If built with Visual C++ 2015 or later, the libjpeg-turbo static libraries
+    cannot be used with earlier versions of Visual C++, and vice versa.
+  * The libjpeg API DLL (**jpeg{version}.dll**) will depend on the C run-time
+    DLLs corresponding to the version of Visual C++ that was used to build it.
 
    ... OR ...
 
@@ -108,6 +103,13 @@ directory, whereas *{source_directory}* refers to the libjpeg-turbo source
 directory.  For in-tree builds, these directories are the same.
 
 
+Ninja
+-----
+
+In all of the procedures and recipes below, replace `make` with `ninja` and
+`Unix Makefiles` with `Ninja` if using Ninja.
+
+
 Build Procedure
 ---------------
 
@@ -333,7 +335,7 @@ Build Recipes
 -------------
 
 
-### 32-bit Build on 64-bit Linux/Unix/Mac
+### 32-bit Build on 64-bit Linux/Unix
 
 Use export/setenv to set the following environment variables before running
 CMake:
@@ -405,103 +407,9 @@ compression/decompression.  This section describes how to build libjpeg-turbo
 for these platforms.
 
 
-### Additional build requirements
-
-- For configurations that require [gas-preprocessor.pl]
-  (https://raw.githubusercontent.com/libjpeg-turbo/gas-preprocessor/master/gas-preprocessor.pl),
-  it should be installed in your `PATH`.
-
-
-### Armv7 (32-bit)
-
-**gas-preprocessor.pl required**
-
-The following scripts demonstrate how to build libjpeg-turbo to run on the
-iPhone 3GS-4S/iPad 1st-3rd Generation and newer:
-
-#### Xcode 4.2 and earlier (LLVM-GCC)
-
-    IOS_PLATFORMDIR=/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-mfloat-abi=softfp -march=armv7 -mcpu=cortex-a8 -mtune=cortex-a8 -mfpu=neon -miphoneos-version-min=3.0"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER ${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-#### Xcode 4.3-4.6 (LLVM-GCC)
-
-Same as above, but replace the first line with:
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-
-#### Xcode 5 and later (Clang)
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-mfloat-abi=softfp -arch armv7 -miphoneos-version-min=3.0"
-    export ASMFLAGS="-no-integrated-as"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-
-### Armv7s (32-bit)
-
-**gas-preprocessor.pl required**
-
-The following scripts demonstrate how to build libjpeg-turbo to run on the
-iPhone 5/iPad 4th Generation and newer:
-
-#### Xcode 4.5-4.6 (LLVM-GCC)
-
-    IOS_PLATFORMDIR=/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform
-    IOS_SYSROOT=($IOS_PLATFORMDIR/Developer/SDKs/iPhoneOS*.sdk)
-    export CFLAGS="-Wall -mfloat-abi=softfp -march=armv7s -mcpu=swift -mtune=swift -mfpu=neon -miphoneos-version-min=6.0"
-
-    cd {build_directory}
-
-    cat <<EOF >toolchain.cmake
-    set(CMAKE_SYSTEM_NAME Darwin)
-    set(CMAKE_SYSTEM_PROCESSOR arm)
-    set(CMAKE_C_COMPILER ${IOS_PLATFORMDIR}/Developer/usr/bin/arm-apple-darwin10-llvm-gcc-4.2)
-    EOF
-
-    cmake -G"Unix Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain.cmake \
-      -DCMAKE_OSX_SYSROOT=${IOS_SYSROOT[0]} \
-      [additional CMake flags] {source_directory}
-    make
-
-#### Xcode 5 and later (Clang)
-
-Same as the Armv7 build procedure for Xcode 5 and later, except replace the
-compiler flags as follows:
-
-    export CFLAGS="-Wall -mfloat-abi=softfp -arch armv7s -miphoneos-version-min=6.0"
-
-
 ### Armv8 (64-bit)
 
-**gas-preprocessor.pl required if using Xcode < 6**
+**Xcode 5 or later required, Xcode 6.3.x or later recommended**
 
 The following script demonstrates how to build libjpeg-turbo to run on the
 iPhone 5S/iPad Mini 2/iPad Air and newer.
@@ -523,9 +431,6 @@ iPhone 5S/iPad Mini 2/iPad Air and newer.
       [additional CMake flags] {source_directory}
     make
 
-Once built, lipo can be used to combine the Armv7, v7s, and/or v8 variants into
-a universal library.
-
 
 Building libjpeg-turbo for Android
 ----------------------------------
@@ -536,6 +441,8 @@ Building libjpeg-turbo for Android platforms requires v13b or later of the
 
 ### Armv7 (32-bit)
 
+**NDK r19 or later with Clang recommended**
+
 The following is a general recipe script that can be modified for your specific
 needs.
 
@@ -561,6 +468,8 @@ needs.
 
 ### Armv8 (64-bit)
 
+**Clang recommended**
+
 The following is a general recipe script that can be modified for your specific
 needs.
 
@@ -735,44 +644,23 @@ Mac
     make dmg
 
 Create Mac package/disk image.  This requires pkgbuild and productbuild, which
-are installed by default on OS X 10.7 and later and which can be obtained by
-installing Xcode 3.2.6 (with the "Unix Development" option) on OS X 10.6.
-Packages built in this manner can be installed on OS X 10.5 and later, but they
-must be built on OS X 10.6 or later.
-
-    make udmg
-
-This creates a Mac package/disk image that contains universal x86-64/i386/Arm
-binaries.  The following CMake variables control which architectures are
-included in the universal binaries.  Setting any of these variables to an empty
-string excludes that architecture from the package.
-
-* `OSX_32BIT_BUILD`: Directory containing an i386 (32-bit) Mac build of
-  libjpeg-turbo (default: *{source_directory}*/osxx86)
-* `IOS_ARMV7_BUILD`: Directory containing an Armv7 (32-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv7)
-* `IOS_ARMV7S_BUILD`: Directory containing an Armv7s (32-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv7s)
-* `IOS_ARMV8_BUILD`: Directory containing an Armv8 (64-bit) iOS build of
-  libjpeg-turbo (default: *{source_directory}*/iosarmv8)
-
-You should first use CMake to configure i386, Armv7, Armv7s, and/or Armv8
-sub-builds of libjpeg-turbo (see "Build Recipes" and "Building libjpeg-turbo
-for iOS" above) in build directories that match those specified in the
-aforementioned CMake variables.  Next, configure the primary build of
-libjpeg-turbo as an out-of-tree build, and build it.  Once the primary build
-has been built, run `make udmg` from the build directory.  The packaging system
-will build the sub-builds, use lipo to combine them into a single set of
-universal binaries, then package the universal binaries in the same manner as
-`make dmg`.
-
-
-Cygwin
-------
-
-    make cygwinpkg
-
-Build a Cygwin binary package.
+are installed by default on OS X/macOS 10.7 and later.
+
+In order to create a Mac package/disk image that contains universal
+x86-64/Arm binaries, set the following CMake variable:
+
+* `ARMV8_BUILD`: Directory containing an Armv8 (64-bit) iOS or macOS build of
+  libjpeg-turbo to include in the universal binaries
+
+You should first use CMake to configure an Armv8 sub-build of libjpeg-turbo
+(see "Building libjpeg-turbo for iOS" above, if applicable) in a build
+directory that matches the one specified in the aforementioned CMake variable.
+Next, configure the primary (x86-64) build of libjpeg-turbo as an out-of-tree
+build, specifying the aforementioned CMake variable, and build it.  Once the
+primary build has been built, run `make dmg` from the build directory.  The
+packaging system will build the sub-build, use lipo to combine it with the
+primary build into a single set of universal binaries, then package the
+universal binaries.
 
 
 Windows
diff --git a/external/jpeg/CMakeLists.txt b/external/jpeg/CMakeLists.txt
index 718dcf26435d..1c58ee66e36a 100644
--- a/external/jpeg/CMakeLists.txt
+++ b/external/jpeg/CMakeLists.txt
@@ -5,7 +5,7 @@ if(CMAKE_EXECUTABLE_SUFFIX)
 endif()
 
 project(libjpeg-turbo C)
-set(VERSION 2.0.6)
+set(VERSION 2.1.0)
 string(REPLACE "." ";" VERSION_TRIPLET ${VERSION})
 list(GET VERSION_TRIPLET 0 VERSION_MAJOR)
 list(GET VERSION_TRIPLET 1 VERSION_MINOR)
@@ -41,12 +41,19 @@ message(STATUS "VERSION = ${VERSION}, BUILD = ${BUILD}")
 # Detect CPU type and whether we're building 64-bit or 32-bit code
 math(EXPR BITS "${CMAKE_SIZEOF_VOID_P} * 8")
 string(TOLOWER ${CMAKE_SYSTEM_PROCESSOR} CMAKE_SYSTEM_PROCESSOR_LC)
+set(COUNT 1)
+foreach(ARCH ${CMAKE_OSX_ARCHITECTURES})
+  if(COUNT GREATER 1)
+    message(FATAL_ERROR "The libjpeg-turbo build system does not support multiple values in CMAKE_OSX_ARCHITECTURES.")
+  endif()
+  math(EXPR COUNT "${COUNT}+1")
+endforeach()
 if(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "x86_64" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "amd64" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "i[0-9]86" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "x86" OR
   CMAKE_SYSTEM_PROCESSOR_LC MATCHES "ia32")
-  if(BITS EQUAL 64)
+  if(BITS EQUAL 64 OR CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
     set(CPU_TYPE x86_64)
   else()
     set(CPU_TYPE i386)
@@ -55,18 +62,30 @@ if(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "x86_64" OR
     set(CMAKE_SYSTEM_PROCESSOR ${CPU_TYPE})
   endif()
 elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "aarch64" OR
-  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "arm*")
+  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^arm")
   if(BITS EQUAL 64)
     set(CPU_TYPE arm64)
   else()
     set(CPU_TYPE arm)
   endif()
-elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "ppc*" OR
-  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "powerpc*")
+elseif(CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^ppc" OR
+  CMAKE_SYSTEM_PROCESSOR_LC MATCHES "^powerpc")
   set(CPU_TYPE powerpc)
 else()
   set(CPU_TYPE ${CMAKE_SYSTEM_PROCESSOR_LC})
 endif()
+if(CMAKE_OSX_ARCHITECTURES MATCHES "x86_64" OR
+  CMAKE_OSX_ARCHITECTURES MATCHES "arm64" OR
+  CMAKE_OSX_ARCHITECTURES MATCHES "i386")
+  set(CPU_TYPE ${CMAKE_OSX_ARCHITECTURES})
+endif()
+if(CMAKE_OSX_ARCHITECTURES MATCHES "ppc")
+  set(CPU_TYPE powerpc)
+endif()
+if(MSVC_IDE AND CMAKE_GENERATOR_PLATFORM MATCHES "arm64")
+  set(CPU_TYPE arm64)
+endif()
+
 message(STATUS "${BITS}-bit build (${CPU_TYPE})")
 
 
@@ -84,7 +103,9 @@ if(WIN32)
     set(CMAKE_INSTALL_DEFAULT_PREFIX "${CMAKE_INSTALL_DEFAULT_PREFIX}64")
   endif()
 else()
-  set(CMAKE_INSTALL_DEFAULT_PREFIX /opt/${CMAKE_PROJECT_NAME})
+  if(NOT CMAKE_INSTALL_DEFAULT_PREFIX)
+    set(CMAKE_INSTALL_DEFAULT_PREFIX /opt/${CMAKE_PROJECT_NAME})
+  endif()
 endif()
 if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
   set(CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_DEFAULT_PREFIX}" CACHE PATH
@@ -103,13 +124,15 @@ if(CMAKE_INSTALL_PREFIX STREQUAL "${CMAKE_INSTALL_DEFAULT_PREFIX}")
   if(UNIX AND NOT APPLE)
     if(BITS EQUAL 64)
       set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64")
+    elseif(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+      set(CMAKE_INSTALL_DEFAULT_LIBDIR "libx32")
     else()
       set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib32")
     endif()
   endif()
 endif()
 
-# include("cmakescripts/GNUInstallDirs.cmake")
+# include(cmakescripts/GNUInstallDirs.cmake)
 
 macro(report_directory var)
   if(CMAKE_INSTALL_${var} STREQUAL CMAKE_INSTALL_FULL_${var})
@@ -135,9 +158,9 @@ endforeach()
 
 macro(boolean_number var)
   if(${var})
-    set(${var} 1)
+    set(${var} 1 ${ARGN})
   else()
-    set(${var} 0)
+    set(${var} 0 ${ARGN})
   endif()
 endmacro()
 
@@ -153,8 +176,12 @@ option(WITH_ARITH_DEC "Include arithmetic decoding support when emulating the li
 boolean_number(WITH_ARITH_DEC)
 option(WITH_ARITH_ENC "Include arithmetic encoding support when emulating the libjpeg v6b API/ABI" TRUE)
 boolean_number(WITH_ARITH_ENC)
-option(WITH_JAVA "Build Java wrapper for the TurboJPEG API library (implies ENABLE_SHARED=1)" FALSE)
-boolean_number(WITH_JAVA)
+if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+  set(WITH_JAVA 0)
+else()
+  option(WITH_JAVA "Build Java wrapper for the TurboJPEG API library (implies ENABLE_SHARED=1)" FALSE)
+  boolean_number(WITH_JAVA)
+endif()
 option(WITH_JPEG7 "Emulate libjpeg v7 API/ABI (this makes ${CMAKE_PROJECT_NAME} backward-incompatible with libjpeg v6b)" FALSE)
 boolean_number(WITH_JPEG7)
 option(WITH_JPEG8 "Emulate libjpeg v8 API/ABI (this makes ${CMAKE_PROJECT_NAME} backward-incompatible with libjpeg v6b)" FALSE)
@@ -165,14 +192,7 @@ option(WITH_SIMD "Include SIMD extensions, if available for this platform" TRUE)
 boolean_number(WITH_SIMD)
 option(WITH_TURBOJPEG "Include the TurboJPEG API library and associated test programs" TRUE)
 boolean_number(WITH_TURBOJPEG)
-
-# Add by halx99
-option(JTURBO_BUILD_BINARIES "Build TurboJPEG binaries" TRUE)
-boolean_number(JTURBO_BUILD_BINARIES)
-option(JTURBO_BUILD_TESTS "Build TurboJPEG tests and examples" TRUE)
-boolean_number(JTURBO_BUILD_TESTS)
-option(JTURBO_ENABLE_INSTALL "Enable TurboJPEG install" TRUE)
-boolean_number(JTURBO_ENABLE_INSTALL)
+option(WITH_FUZZ "Build fuzz targets" FALSE)
 
 macro(report_option var desc)
   if(${var})
@@ -424,13 +444,6 @@ if(UNIX)
         exit(is_shifting_signed(-0x7F7E80B1L));
       }" RIGHT_SHIFT_IS_UNSIGNED)
   endif()
-
-  if(CMAKE_CROSSCOMPILING)
-    set(__CHAR_UNSIGNED__ 0)
-  else()
-    check_c_source_runs("int main(void) { return ((char) -1 < 0); }"
-      __CHAR_UNSIGNED__)
-  endif()
 endif()
 
 if(MSVC)
@@ -558,6 +571,9 @@ endif()
 
 if(WITH_SIMD)
   add_subdirectory(simd)
+  if(NEON_INTRINSICS)
+    add_definitions(-DNEON_INTRINSICS)
+  endif()
 elseif(NOT WITH_12BIT)
   message(STATUS "SIMD extensions: None (WITH_SIMD = ${WITH_SIMD})")
 endif()
@@ -578,13 +594,11 @@ if(ENABLE_SHARED)
   add_subdirectory(sharedlib)
 endif()
 
-set(libname jpeg)
-
 if(ENABLE_STATIC)
-  add_library(${libname} STATIC ${JPEG_SOURCES} $<TARGET_OBJECTS:simd>
+  add_library(jpeg STATIC ${JPEG_SOURCES} $<TARGET_OBJECTS:simd>
     ${SIMD_OBJS})
   if(NOT MSVC)
-    set_target_properties(${libname} PROPERTIES OUTPUT_NAME jpeg)
+    set_target_properties(jpeg PROPERTIES OUTPUT_NAME jpeg)
   endif()
 endif()
 
@@ -622,19 +636,17 @@ if(WITH_TURBOJPEG)
         LINK_FLAGS "${TJMAPFLAG}${TJMAPFILE}")
     endif()
 
-    if(JTURBO_BUILD_TESTS)
-      add_executable(tjunittest tjunittest.c tjutil.c md5/md5.c md5/md5hl.c)
-      target_link_libraries(tjunittest turbojpeg)
-
-      add_executable(tjbench tjbench.c tjutil.c)
-      target_link_libraries(tjbench turbojpeg)
-      if(UNIX)
-        target_link_libraries(tjbench m)
-      endif()
+    add_executable(tjunittest tjunittest.c tjutil.c md5/md5.c md5/md5hl.c)
+    target_link_libraries(tjunittest turbojpeg)
 
-      add_executable(tjexample tjexample.c)
-      target_link_libraries(tjexample turbojpeg)
+    add_executable(tjbench tjbench.c tjutil.c)
+    target_link_libraries(tjbench turbojpeg)
+    if(UNIX)
+      target_link_libraries(tjbench m)
     endif()
+
+    add_executable(tjexample tjexample.c)
+    target_link_libraries(tjexample turbojpeg)
   endif()
 
   if(ENABLE_STATIC)
@@ -647,20 +659,19 @@ if(WITH_TURBOJPEG)
       set_target_properties(turbojpeg-static PROPERTIES OUTPUT_NAME turbojpeg)
     endif()
 
-    if(JTURBO_BUILD_TESTS)
-      add_executable(tjunittest-static tjunittest.c tjutil.c md5/md5.c
-        md5/md5hl.c)
-      target_link_libraries(tjunittest-static turbojpeg-static)
+    add_executable(tjunittest-static tjunittest.c tjutil.c md5/md5.c
+      md5/md5hl.c)
+    target_link_libraries(tjunittest-static turbojpeg-static)
 
-      add_executable(tjbench-static tjbench.c tjutil.c)
-      target_link_libraries(tjbench-static turbojpeg-static)
-      if(UNIX)
-        target_link_libraries(tjbench-static m)
-      endif()
+    add_executable(tjbench-static tjbench.c tjutil.c)
+    target_link_libraries(tjbench-static turbojpeg-static)
+    if(UNIX)
+      target_link_libraries(tjbench-static m)
     endif()
   endif()
 endif()
 
+if(WITH_TOOLS)
 if(WIN32)
   set(USE_SETMODE "-DUSE_SETMODE")
 endif()
@@ -672,847 +683,846 @@ else()
   set(DJPEG_BMP_SOURCES wrbmp.c wrtarga.c)
 endif()
 
-if(JTURBO_BUILD_BINARIES)
-  if(ENABLE_STATIC)
-    add_executable(cjpeg-static cjpeg.c cdjpeg.c rdgif.c rdppm.c rdswitch.c
-      ${CJPEG_BMP_SOURCES})
-    set_property(TARGET cjpeg-static PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
-    target_link_libraries(cjpeg-static ${libname})
-
-    add_executable(djpeg-static djpeg.c cdjpeg.c rdcolmap.c rdswitch.c wrgif.c
-      wrppm.c ${DJPEG_BMP_SOURCES})
-    set_property(TARGET djpeg-static PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
-    target_link_libraries(djpeg-static ${libname})
-
-    add_executable(jpegtran-static jpegtran.c cdjpeg.c rdswitch.c transupp.c)
-    target_link_libraries(jpegtran-static ${libname})
-    set_property(TARGET jpegtran-static PROPERTY COMPILE_FLAGS "${USE_SETMODE}")
-  endif()
+if(ENABLE_STATIC)  # command-line tools built against the static libjpeg target
+  add_executable(cjpeg-static cjpeg.c cdjpeg.c rdgif.c rdppm.c rdswitch.c
+    ${CJPEG_BMP_SOURCES})
+  set_property(TARGET cjpeg-static PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
+  target_link_libraries(cjpeg-static jpeg)  # static library target is "jpeg" (see add_library above)
+
+  add_executable(djpeg-static djpeg.c cdjpeg.c rdcolmap.c rdswitch.c wrgif.c
+    wrppm.c ${DJPEG_BMP_SOURCES})
+  set_property(TARGET djpeg-static PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
+  target_link_libraries(djpeg-static jpeg)  # link the "jpeg" target, not a nonexistent "jpeg-static"
+
+  add_executable(jpegtran-static jpegtran.c cdjpeg.c rdswitch.c transupp.c)
+  target_link_libraries(jpegtran-static jpeg)  # same static "jpeg" target as cjpeg/djpeg
+  set_property(TARGET jpegtran-static PROPERTY COMPILE_FLAGS "${USE_SETMODE}")
+endif()
 
-  add_executable(rdjpgcom rdjpgcom.c)
+add_executable(rdjpgcom rdjpgcom.c)
 
-  add_executable(wrjpgcom wrjpgcom.c)
+add_executable(wrjpgcom wrjpgcom.c)
 endif()
 
 ###############################################################################
 # TESTS
 ###############################################################################
 if(JTURBO_BUILD_TESTS)
-  add_subdirectory(md5)
+if(WITH_FUZZ)
+  add_subdirectory(fuzz)
+endif()
 
-  if(MSVC_IDE OR XCODE)
-    set(OBJDIR "\${CTEST_CONFIGURATION_TYPE}/")
-  else()
-    set(OBJDIR "")
-  endif()
+add_subdirectory(md5)
 
-  enable_testing()
-
-  if(WITH_12BIT)
-    set(TESTORIG testorig12.jpg)
-    set(MD5_JPEG_RGB_ISLOW 9d7369207c520d37f2c1cbfcb82b2964)
-    set(MD5_JPEG_RGB_ISLOW2 a00bd20d8ae49684640ef7177d2e0b64)
-    set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
-    set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
-    set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
-    set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
-    set(MD5_JPEG_420_IFAST_Q100_PROG 008ab68d6ddbba04a8f01deee4e0f9f8)
-    set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
-    set(MD5_PPM_420M_Q100_IFAST 980a1a3c5bf9510022869d30b7d26566)
-    set(MD5_JPEG_GRAY_ISLOW 235c90707b16e2e069f37c888b2636d9)
-    set(MD5_PPM_GRAY_ISLOW 7213c10af507ad467da5578ca5ee1fca)
-    set(MD5_PPM_GRAY_ISLOW_RGB e96ee81c30a6ed422d466338bd3de65d)
-    set(MD5_JPEG_420S_IFAST_OPT 7af8e60be4d9c227ec63ac9b6630855e)
-
-    set(MD5_JPEG_3x2_FLOAT_PROG_SSE a8c17daf77b457725ec929e215b603f8)
-    set(MD5_PPM_3x2_FLOAT_SSE 42876ab9e5c2f76a87d08db5fbd57956)
-    set(MD5_JPEG_3x2_FLOAT_PROG_NO_FP_CONTRACT a8c17daf77b457725ec929e215b603f8)
-    set(MD5_PPM_3x2_FLOAT_NO_FP_CONTRACT ${MD5_PPM_3x2_FLOAT_SSE})
-    set(MD5_JPEG_3x2_FLOAT_PROG_FP_CONTRACT
-      ${MD5_JPEG_3x2_FLOAT_PROG_NO_FP_CONTRACT})
-    set(MD5_PPM_3x2_FLOAT_FP_CONTRACT ${MD5_PPM_3x2_FLOAT_SSE})
-    set(MD5_JPEG_3x2_FLOAT_PROG_387 bc6dbbefac2872f6b9d6c4a0ae60c3c0)
-    set(MD5_PPM_3x2_FLOAT_387 bcc5723c61560463ac60f772e742d092)
-    set(MD5_JPEG_3x2_FLOAT_PROG_MSVC e27840755870fa849872e58aa0cd1400)
-    set(MD5_PPM_3x2_FLOAT_MSVC 6c2880b83bb1aa41dfe330e7a9768690)
-
-    set(MD5_JPEG_3x2_IFAST_PROG 1396cc2b7185cfe943d408c9d305339e)
-    set(MD5_PPM_3x2_IFAST 3975985ef6eeb0a2cdc58daa651ccc00)
-    set(MD5_PPM_420M_ISLOW_2_1 4ca6be2a6f326ff9eaab63e70a8259c0)
-    set(MD5_PPM_420M_ISLOW_15_8 12aa9f9534c1b3d7ba047322226365eb)
-    set(MD5_PPM_420M_ISLOW_13_8 f7e22817c7b25e1393e4ec101e9d4e96)
-    set(MD5_PPM_420M_ISLOW_11_8 800a16f9f4dc9b293197bfe11be10a82)
-    set(MD5_PPM_420M_ISLOW_9_8 06b7a92a9bc69f4dc36ec40f1937d55c)
-    set(MD5_PPM_420M_ISLOW_7_8 3ec444a14a4ab4eab88ffc49c48eca43)
-    set(MD5_PPM_420M_ISLOW_3_4 3e726b7ea872445b19437d1c1d4f0d93)
-    set(MD5_PPM_420M_ISLOW_5_8 a8a771abdc94301d20ffac119b2caccd)
-    set(MD5_PPM_420M_ISLOW_1_2 b419124dd5568b085787234866102866)
-    set(MD5_PPM_420M_ISLOW_3_8 343d19015531b7bbe746124127244fa8)
-    set(MD5_PPM_420M_ISLOW_1_4 35fd59d866e44659edfa3c18db2a3edb)
-    set(MD5_PPM_420M_ISLOW_1_8 ccaed48ac0aedefda5d4abe4013f4ad7)
-    set(MD5_PPM_420_ISLOW_SKIP15_31 86664cd9dc956536409e44e244d20a97)
-    set(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 452a21656115a163029cfba5c04fa76a)
-    set(MD5_PPM_444_ISLOW_SKIP1_6 ef63901f71ef7a75cd78253fc0914f84)
-    set(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 15b173fb5872d9575572fbcc1b05956f)
-    set(MD5_JPEG_CROP cdb35ff4b4519392690ea040c56ea99c)
-  else()
-    set(TESTORIG testorig.jpg)
-    set(MD5_JPEG_RGB_ISLOW 1d44a406f61da743b5fd31c0a9abdca3)
-    set(MD5_JPEG_RGB_ISLOW2 31d121e57b6c2934c890a7fc7763bcd4)
-    set(MD5_PPM_RGB_ISLOW 00a257f5393fef8821f2b88ac7421291)
-    set(MD5_BMP_RGB_ISLOW_565 f07d2e75073e4bb10f6c6f4d36e2e3be)
-    set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
-    set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8)
-    set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa)
-    set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
-    set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
-    set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
-    set(MD5_JPEG_420_IFAST_Q100_PROG e59bb462016a8d9a748c330a3474bb55)
-    set(MD5_PPM_420_Q100_IFAST 5a732542015c278ff43635e473a8a294)
-    set(MD5_PPM_420M_Q100_IFAST ff692ee9323a3b424894862557c092f1)
-    set(MD5_JPEG_GRAY_ISLOW 72b51f894b8f4a10b3ee3066770aa38d)
-    set(MD5_PPM_GRAY_ISLOW 8d3596c56eace32f205deccc229aa5ed)
-    set(MD5_PPM_GRAY_ISLOW_RGB 116424ac07b79e5e801f00508eab48ec)
-    set(MD5_BMP_GRAY_ISLOW_565 12f78118e56a2f48b966f792fedf23cc)
-    set(MD5_BMP_GRAY_ISLOW_565D bdbbd616441a24354c98553df5dc82db)
-    set(MD5_JPEG_420S_IFAST_OPT 388708217ac46273ca33086b22827ed8)
-
-    set(MD5_JPEG_3x2_FLOAT_PROG_SSE 343e3f8caf8af5986ebaf0bdc13b5c71)
-    set(MD5_PPM_3x2_FLOAT_SSE 1a75f36e5904d6fc3a85a43da9ad89bb)
-    set(MD5_JPEG_3x2_FLOAT_PROG_NO_FP_CONTRACT 9bca803d2042bd1eb03819e2bf92b3e5)
-    set(MD5_PPM_3x2_FLOAT_NO_FP_CONTRACT f6bfab038438ed8f5522fbd33595dcdc)
-    set(MD5_JPEG_3x2_FLOAT_PROG_FP_CONTRACT 
-      ${MD5_JPEG_3x2_FLOAT_PROG_NO_FP_CONTRACT})
-    set(MD5_PPM_3x2_FLOAT_FP_CONTRACT 0e917a34193ef976b679a6b069b1be26)
-    set(MD5_JPEG_3x2_FLOAT_PROG_387 1657664a410e0822c924b54f6f65e6e9)
-    set(MD5_PPM_3x2_FLOAT_387 cb0a1f027f3d2917c902b5640214e025)
-    set(MD5_JPEG_3x2_FLOAT_PROG_MSVC 7999ce9cd0ee9b6c7043b7351ab7639d)
-    set(MD5_PPM_3x2_FLOAT_MSVC 28cdc448a6b75e97892f0e0f8d4b21f3)
-
-    set(MD5_JPEG_3x2_IFAST_PROG 1ee5d2c1a77f2da495f993c8c7cceca5)
-    set(MD5_PPM_3x2_IFAST fd283664b3b49127984af0a7f118fccd)
-    set(MD5_JPEG_420_ISLOW_ARI e986fb0a637a8d833d96e8a6d6d84ea1)
-    set(MD5_JPEG_444_ISLOW_PROGARI 0a8f1c8f66e113c3cf635df0a475a617)
-    # Since v1.5.1, libjpeg-turbo uses the separate non-fancy upsampling and
-    # YCbCr -> RGB color conversion routines rather than merged upsampling/color
-    # conversion when fancy upsampling is disabled on platforms that have a SIMD
-    # implementation of YCbCr -> RGB color conversion but no SIMD implementation
-    # of merged upsampling/color conversion.  This was intended to improve the
-    # performance of the Arm Neon SIMD extensions, the only SIMD extensions for
-    # which those circumstances currently apply.  The separate non-fancy
-    # upsampling and color conversion routines usually produce bitwise-identical
-    # output to the merged upsampling/color conversion routines, but that is not
-    # the case when skipping scanlines starting at an odd-numbered scanline.  In
-    # libjpeg-turbo 2.0.5 and prior, doing that while using merged h2v2
-    # upsampling caused a segfault, so this test validates the fix for that
-    # segfault.  Unfortunately, however, the test also produces different bitwise
-    # output when using the Neon SIMD extensions, because of the aforementioned
-    # optimization.  The easiest workaround is to use the old test from
-    # libjpeg-turbo 2.0.5 and prior when using the Neon SIMD extensions.  The
-    # aforementioned segfault never would have occurred with the Neon SIMD
-    # extensions anyhow, since merged upsampling is disabled when using them.
-    if((CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") AND WITH_SIMD)
-      set(MD5_PPM_420M_IFAST_ARI 72b59a99bcf1de24c5b27d151bde2437)
-    else()
-      set(MD5_PPM_420M_IFAST_ARI 57251da28a35b46eecb7177d82d10e0e)
-    endif()
-    set(MD5_JPEG_420_ISLOW 9a68f56bc76e466aa7e52f415d0f4a5f)
-    set(MD5_PPM_420M_ISLOW_2_1 9f9de8c0612f8d06869b960b05abf9c9)
-    set(MD5_PPM_420M_ISLOW_15_8 b6875bc070720b899566cc06459b63b7)
-    set(MD5_PPM_420M_ISLOW_13_8 bc3452573c8152f6ae552939ee19f82f)
-    set(MD5_PPM_420M_ISLOW_11_8 d8cc73c0aaacd4556569b59437ba00a5)
-    set(MD5_PPM_420M_ISLOW_9_8 d25e61bc7eac0002f5b393aa223747b6)
-    set(MD5_PPM_420M_ISLOW_7_8 ddb564b7c74a09494016d6cd7502a946)
-    set(MD5_PPM_420M_ISLOW_3_4 8ed8e68808c3fbc4ea764fc9d2968646)
-    set(MD5_PPM_420M_ISLOW_5_8 a3363274999da2366a024efae6d16c9b)
-    set(MD5_PPM_420M_ISLOW_1_2 e692a315cea26b988c8e8b29a5dbcd81)
-    set(MD5_PPM_420M_ISLOW_3_8 79eca9175652ced755155c90e785a996)
-    set(MD5_PPM_420M_ISLOW_1_4 79cd778f8bf1a117690052cacdd54eca)
-    set(MD5_PPM_420M_ISLOW_1_8 391b3d4aca640c8567d6f8745eb2142f)
-    set(MD5_BMP_420_ISLOW_256 4980185e3776e89bd931736e1cddeee6)
-    set(MD5_BMP_420_ISLOW_565 bf9d13e16c4923b92e1faa604d7922cb)
-    set(MD5_BMP_420_ISLOW_565D 6bde71526acc44bcff76f696df8638d2)
-    set(MD5_BMP_420M_ISLOW_565 8dc0185245353cfa32ad97027342216f)
-    set(MD5_BMP_420M_ISLOW_565D ce034037d212bc403330df6f915c161b)
-    set(MD5_PPM_420_ISLOW_SKIP15_31 c4c65c1e43d7275cd50328a61e6534f0)
-    set(MD5_PPM_420_ISLOW_ARI_SKIP16_139 087c6b123db16ac00cb88c5b590bb74a)
-    set(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 26eb36ccc7d1f0cb80cdabb0ac8b5d99)
-    set(MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4 886c6775af22370257122f8b16207e6d)
-    set(MD5_PPM_444_ISLOW_SKIP1_6 5606f86874cf26b8fcee1117a0a436a6)
-    set(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 db87dc7ce26bcdc7a6b56239ce2b9d6c)
-    set(MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0 cb57b32bd6d03e35432362f7bf184b6d)
-    set(MD5_JPEG_CROP b4197f377e621c4e9b1d20471432610d)
-  endif()
+if(MSVC_IDE OR XCODE)
+  set(OBJDIR "\${CTEST_CONFIGURATION_TYPE}/")
+else()
+  set(OBJDIR "")
+endif()
 
-  if(WITH_JAVA)
-    add_test(TJUnitTest
-      ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
-        -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
-        TJUnitTest)
-    add_test(TJUnitTest-yuv
-      ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
-        -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
-        TJUnitTest -yuv)
-    add_test(TJUnitTest-yuv-nopad
-      ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
-        -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
-        TJUnitTest -yuv -noyuvpad)
-    add_test(TJUnitTest-bi
-      ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
-        -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
-        TJUnitTest -bi)
-    add_test(TJUnitTest-bi-yuv
-      ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
-        -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
-        TJUnitTest -bi -yuv)
-    add_test(TJUnitTest-bi-yuv-nopad
-      ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
-        -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
-        TJUnitTest -bi -yuv -noyuvpad)
-  endif()
+enable_testing()
 
-  set(TEST_LIBTYPES "")
-  if(ENABLE_SHARED)
-    set(TEST_LIBTYPES ${TEST_LIBTYPES} shared)
-  endif()
-  if(ENABLE_STATIC)
-    set(TEST_LIBTYPES ${TEST_LIBTYPES} static)
-  endif()
+if(WITH_12BIT)
+  set(TESTORIG testorig12.jpg)
+  set(MD5_JPEG_RGB_ISLOW 9d7369207c520d37f2c1cbfcb82b2964)
+  set(MD5_JPEG_RGB_ISLOW2 a00bd20d8ae49684640ef7177d2e0b64)
+  set(MD5_PPM_RGB_ISLOW f3301d2219783b8b3d942b7239fa50c0)
+  set(MD5_JPEG_422_IFAST_OPT 7322e3bd2f127f7de4b40d4480ce60e4)
+  set(MD5_PPM_422_IFAST 79807fa552899e66a04708f533e16950)
+  set(MD5_JPEG_440_ISLOW e25c1912e38367be505a89c410c1c2d2)
+  set(MD5_PPM_440_ISLOW e7d2e26288870cfcb30f3114ad01e380)
+  set(MD5_PPM_422M_IFAST 07737bfe8a7c1c87aaa393a0098d16b0)
+  set(MD5_JPEG_420_IFAST_Q100_PROG 9447cef4803d9b0f74bcf333cc710a29)
+  set(MD5_PPM_420_Q100_IFAST 1b3730122709f53d007255e8dfd3305e)
+  set(MD5_PPM_420M_Q100_IFAST 980a1a3c5bf9510022869d30b7d26566)
+  set(MD5_JPEG_GRAY_ISLOW 235c90707b16e2e069f37c888b2636d9)
+  set(MD5_PPM_GRAY_ISLOW 7213c10af507ad467da5578ca5ee1fca)
+  set(MD5_PPM_GRAY_ISLOW_RGB e96ee81c30a6ed422d466338bd3de65d)
+  set(MD5_JPEG_420S_IFAST_OPT 7af8e60be4d9c227ec63ac9b6630855e)
+
+  set(MD5_JPEG_3x2_FLOAT_PROG_SSE a8c17daf77b457725ec929e215b603f8)
+  set(MD5_PPM_3x2_FLOAT_SSE 42876ab9e5c2f76a87d08db5fbd57956)
+  set(MD5_JPEG_3x2_FLOAT_PROG_NO_FP_CONTRACT a8c17daf77b457725ec929e215b603f8)
+  set(MD5_PPM_3x2_FLOAT_NO_FP_CONTRACT ${MD5_PPM_3x2_FLOAT_SSE})
+  set(MD5_JPEG_3x2_FLOAT_PROG_FP_CONTRACT
+    ${MD5_JPEG_3x2_FLOAT_PROG_NO_FP_CONTRACT})
+  set(MD5_PPM_3x2_FLOAT_FP_CONTRACT ${MD5_PPM_3x2_FLOAT_SSE})
+  set(MD5_JPEG_3x2_FLOAT_PROG_387 bc6dbbefac2872f6b9d6c4a0ae60c3c0)
+  set(MD5_PPM_3x2_FLOAT_387 bcc5723c61560463ac60f772e742d092)
+  set(MD5_JPEG_3x2_FLOAT_PROG_MSVC e27840755870fa849872e58aa0cd1400)
+  set(MD5_PPM_3x2_FLOAT_MSVC 6c2880b83bb1aa41dfe330e7a9768690)
+
+  set(MD5_JPEG_3x2_IFAST_PROG 1396cc2b7185cfe943d408c9d305339e)
+  set(MD5_PPM_3x2_IFAST 3975985ef6eeb0a2cdc58daa651ccc00)
+  set(MD5_PPM_420M_ISLOW_2_1 4ca6be2a6f326ff9eaab63e70a8259c0)
+  set(MD5_PPM_420M_ISLOW_15_8 12aa9f9534c1b3d7ba047322226365eb)
+  set(MD5_PPM_420M_ISLOW_13_8 f7e22817c7b25e1393e4ec101e9d4e96)
+  set(MD5_PPM_420M_ISLOW_11_8 800a16f9f4dc9b293197bfe11be10a82)
+  set(MD5_PPM_420M_ISLOW_9_8 06b7a92a9bc69f4dc36ec40f1937d55c)
+  set(MD5_PPM_420M_ISLOW_7_8 3ec444a14a4ab4eab88ffc49c48eca43)
+  set(MD5_PPM_420M_ISLOW_3_4 3e726b7ea872445b19437d1c1d4f0d93)
+  set(MD5_PPM_420M_ISLOW_5_8 a8a771abdc94301d20ffac119b2caccd)
+  set(MD5_PPM_420M_ISLOW_1_2 b419124dd5568b085787234866102866)
+  set(MD5_PPM_420M_ISLOW_3_8 343d19015531b7bbe746124127244fa8)
+  set(MD5_PPM_420M_ISLOW_1_4 35fd59d866e44659edfa3c18db2a3edb)
+  set(MD5_PPM_420M_ISLOW_1_8 ccaed48ac0aedefda5d4abe4013f4ad7)
+  set(MD5_PPM_420_ISLOW_SKIP15_31 86664cd9dc956536409e44e244d20a97)
+  set(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 452a21656115a163029cfba5c04fa76a)
+  set(MD5_PPM_444_ISLOW_SKIP1_6 ef63901f71ef7a75cd78253fc0914f84)
+  set(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 15b173fb5872d9575572fbcc1b05956f)
+  set(MD5_JPEG_CROP cdb35ff4b4519392690ea040c56ea99c)
+else()
+  set(TESTORIG testorig.jpg)
+  set(MD5_JPEG_RGB_ISLOW 1d44a406f61da743b5fd31c0a9abdca3)
+  set(MD5_JPEG_RGB_ISLOW2 31d121e57b6c2934c890a7fc7763bcd4)
+  set(MD5_PPM_RGB_ISLOW 00a257f5393fef8821f2b88ac7421291)
+  set(MD5_BMP_RGB_ISLOW_565 f07d2e75073e4bb10f6c6f4d36e2e3be)
+  set(MD5_BMP_RGB_ISLOW_565D 4cfa0928ef3e6bb626d7728c924cfda4)
+  set(MD5_JPEG_422_IFAST_OPT 2540287b79d913f91665e660303ab2c8)
+  set(MD5_PPM_422_IFAST 35bd6b3f833bad23de82acea847129fa)
+  set(MD5_JPEG_440_ISLOW 538bc02bd4b4658fd85de6ece6cbeda6)
+  set(MD5_PPM_440_ISLOW 11e7eab7ef7ef3276934bb7e7b6bb377)
+  set(MD5_PPM_422M_IFAST 8dbc65323d62cca7c91ba02dd1cfa81d)
+  set(MD5_BMP_422M_IFAST_565 3294bd4d9a1f2b3d08ea6020d0db7065)
+  set(MD5_BMP_422M_IFAST_565D da98c9c7b6039511be4a79a878a9abc1)
+  set(MD5_JPEG_420_IFAST_Q100_PROG 0ba15f9dab81a703505f835f9dbbac6d)
+  set(MD5_PPM_420_Q100_IFAST 5a732542015c278ff43635e473a8a294)
+  set(MD5_PPM_420M_Q100_IFAST ff692ee9323a3b424894862557c092f1)
+  set(MD5_JPEG_GRAY_ISLOW 72b51f894b8f4a10b3ee3066770aa38d)
+  set(MD5_PPM_GRAY_ISLOW 8d3596c56eace32f205deccc229aa5ed)
+  set(MD5_PPM_GRAY_ISLOW_RGB 116424ac07b79e5e801f00508eab48ec)
+  set(MD5_BMP_GRAY_ISLOW_565 12f78118e56a2f48b966f792fedf23cc)
+  set(MD5_BMP_GRAY_ISLOW_565D bdbbd616441a24354c98553df5dc82db)
+  set(MD5_JPEG_420S_IFAST_OPT 388708217ac46273ca33086b22827ed8)
+
+  set(MD5_JPEG_3x2_FLOAT_PROG_SSE 343e3f8caf8af5986ebaf0bdc13b5c71)
+  set(MD5_PPM_3x2_FLOAT_SSE 1a75f36e5904d6fc3a85a43da9ad89bb)
+  set(MD5_JPEG_3x2_FLOAT_PROG_NO_FP_CONTRACT 9bca803d2042bd1eb03819e2bf92b3e5)
+  set(MD5_PPM_3x2_FLOAT_NO_FP_CONTRACT f6bfab038438ed8f5522fbd33595dcdc)
+  set(MD5_JPEG_3x2_FLOAT_PROG_FP_CONTRACT
+    ${MD5_JPEG_3x2_FLOAT_PROG_NO_FP_CONTRACT})
+  set(MD5_PPM_3x2_FLOAT_FP_CONTRACT 0e917a34193ef976b679a6b069b1be26)
+  set(MD5_JPEG_3x2_FLOAT_PROG_387 1657664a410e0822c924b54f6f65e6e9)
+  set(MD5_PPM_3x2_FLOAT_387 cb0a1f027f3d2917c902b5640214e025)
+  set(MD5_JPEG_3x2_FLOAT_PROG_MSVC 7999ce9cd0ee9b6c7043b7351ab7639d)
+  set(MD5_PPM_3x2_FLOAT_MSVC 28cdc448a6b75e97892f0e0f8d4b21f3)
+
+  set(MD5_JPEG_3x2_IFAST_PROG 1ee5d2c1a77f2da495f993c8c7cceca5)
+  set(MD5_PPM_3x2_IFAST fd283664b3b49127984af0a7f118fccd)
+  set(MD5_JPEG_420_ISLOW_ARI e986fb0a637a8d833d96e8a6d6d84ea1)
+  set(MD5_JPEG_444_ISLOW_PROGARI 0a8f1c8f66e113c3cf635df0a475a617)
+  set(MD5_PPM_420M_IFAST_ARI 57251da28a35b46eecb7177d82d10e0e)
+  set(MD5_JPEG_420_ISLOW 9a68f56bc76e466aa7e52f415d0f4a5f)
+  set(MD5_PPM_420M_ISLOW_2_1 9f9de8c0612f8d06869b960b05abf9c9)
+  set(MD5_PPM_420M_ISLOW_15_8 b6875bc070720b899566cc06459b63b7)
+  set(MD5_PPM_420M_ISLOW_13_8 bc3452573c8152f6ae552939ee19f82f)
+  set(MD5_PPM_420M_ISLOW_11_8 d8cc73c0aaacd4556569b59437ba00a5)
+  set(MD5_PPM_420M_ISLOW_9_8 d25e61bc7eac0002f5b393aa223747b6)
+  set(MD5_PPM_420M_ISLOW_7_8 ddb564b7c74a09494016d6cd7502a946)
+  set(MD5_PPM_420M_ISLOW_3_4 8ed8e68808c3fbc4ea764fc9d2968646)
+  set(MD5_PPM_420M_ISLOW_5_8 a3363274999da2366a024efae6d16c9b)
+  set(MD5_PPM_420M_ISLOW_1_2 e692a315cea26b988c8e8b29a5dbcd81)
+  set(MD5_PPM_420M_ISLOW_3_8 79eca9175652ced755155c90e785a996)
+  set(MD5_PPM_420M_ISLOW_1_4 79cd778f8bf1a117690052cacdd54eca)
+  set(MD5_PPM_420M_ISLOW_1_8 391b3d4aca640c8567d6f8745eb2142f)
+  set(MD5_BMP_420_ISLOW_256 4980185e3776e89bd931736e1cddeee6)
+  set(MD5_BMP_420_ISLOW_565 bf9d13e16c4923b92e1faa604d7922cb)
+  set(MD5_BMP_420_ISLOW_565D 6bde71526acc44bcff76f696df8638d2)
+  set(MD5_BMP_420M_ISLOW_565 8dc0185245353cfa32ad97027342216f)
+  set(MD5_BMP_420M_ISLOW_565D ce034037d212bc403330df6f915c161b)
+  set(MD5_PPM_420_ISLOW_SKIP15_31 c4c65c1e43d7275cd50328a61e6534f0)
+  set(MD5_PPM_420_ISLOW_ARI_SKIP16_139 087c6b123db16ac00cb88c5b590bb74a)
+  set(MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71 26eb36ccc7d1f0cb80cdabb0ac8b5d99)
+  set(MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4 886c6775af22370257122f8b16207e6d)
+  set(MD5_PPM_444_ISLOW_SKIP1_6 5606f86874cf26b8fcee1117a0a436a6)
+  set(MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13 db87dc7ce26bcdc7a6b56239ce2b9d6c)
+  set(MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0 cb57b32bd6d03e35432362f7bf184b6d)
+  set(MD5_JPEG_CROP b4197f377e621c4e9b1d20471432610d)
+endif()
 
-  set(TESTIMAGES ${CMAKE_CURRENT_SOURCE_DIR}/testimages)
-  set(MD5CMP ${CMAKE_CURRENT_BINARY_DIR}/md5/md5cmp)
-  if(CMAKE_CROSSCOMPILING)
-    file(RELATIVE_PATH TESTIMAGES ${CMAKE_CURRENT_BINARY_DIR} ${TESTIMAGES})
-    file(RELATIVE_PATH MD5CMP ${CMAKE_CURRENT_BINARY_DIR} ${MD5CMP})
-  endif()
+if(WITH_JAVA)
+  add_test(TJUnitTest
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest)
+  add_test(TJUnitTest-yuv
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -yuv)
+  add_test(TJUnitTest-yuv-nopad
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -yuv -noyuvpad)
+  add_test(TJUnitTest-bi
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -bi)
+  add_test(TJUnitTest-bi-yuv
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -bi -yuv)
+  add_test(TJUnitTest-bi-yuv-nopad
+    ${Java_JAVA_EXECUTABLE} ${JAVAARGS} -cp java/turbojpeg.jar
+      -Djava.library.path=${CMAKE_CURRENT_BINARY_DIR}/${OBJDIR}
+      TJUnitTest -bi -yuv -noyuvpad)
+endif()
 
-  # The output of the floating point DCT/IDCT algorithms differs depending on the
-  # type of floating point math used, so the FLOATTEST CMake variable must be
-  # set in order to tell the testing system which floating point results it
-  # should expect:
-  #
-  # sse = validate against the expected results from the libjpeg-turbo SSE SIMD
-  #       extensions
-  # no-fp-contract = validate against the expected results from the C code when
-  #                  floating point expression contraction is disabled (the
-  #                  default with Clang, with GCC when building for platforms
-  #                  that lack fused multiply-add [FMA] instructions, or when
-  #                  passing -ffp-contract=off to the compiler)
-  # fp-contract = validate against the expected results from the C code when
-  #               floating point expression contraction is enabled (the default
-  #               with GCC when building for platforms that have fused multiply-
-  #               add [FMA] instructions or when passing -ffp-contract=fast to
-  #               the compiler)
-  # 387 = validate against the expected results from the C code when the 387 FPU
-  #       is being used for floating point math (which is generally the default
-  #       with x86 compilers)
-  # msvc = validate against the expected results from the C code when compiled
-  #        with a 32-bit version of Visual C++
-  
-  if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
-    if(WITH_SIMD)
-      set(DEFAULT_FLOATTEST sse)
-    elseif(CPU_TYPE STREQUAL "x86_64")
-      set(DEFAULT_FLOATTEST no-fp-contract)
-    elseif(CPU_TYPE STREQUAL "i386" AND MSVC)
-      set(DEFAULT_FLOATTEST msvc)
-    # else we can't really set an intelligent default for i386.  The appropriate
-    # value could be 387, no-fp-contract, or fp-contract, depending on the
-    # compiler and compiler options.  We leave it to the user to set FLOATTEST
-    # manually.
-    endif()
-  else()
-    if((CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") AND
-      NOT CMAKE_C_COMPILER_ID STREQUAL "Clang")
-      set(DEFAULT_FLOATTEST fp-contract)
-    else()
-      set(DEFAULT_FLOATTEST no-fp-contract)
-    endif()
+set(TEST_LIBTYPES "")
+if(ENABLE_SHARED)
+  set(TEST_LIBTYPES ${TEST_LIBTYPES} shared)
+endif()
+if(ENABLE_STATIC)
+  set(TEST_LIBTYPES ${TEST_LIBTYPES} static)
+endif()
+
+set(TESTIMAGES ${CMAKE_CURRENT_SOURCE_DIR}/testimages)
+set(MD5CMP ${CMAKE_CURRENT_BINARY_DIR}/md5/md5cmp)
+if(CMAKE_CROSSCOMPILING)
+  file(RELATIVE_PATH TESTIMAGES ${CMAKE_CURRENT_BINARY_DIR} ${TESTIMAGES})
+  file(RELATIVE_PATH MD5CMP ${CMAKE_CURRENT_BINARY_DIR} ${MD5CMP})
+endif()
+
+# The output of the floating point DCT/IDCT algorithms differs depending on the
+# type of floating point math used, so the FLOATTEST CMake variable must be
+# set in order to tell the testing system which floating point results it
+# should expect:
+#
+# sse = validate against the expected results from the libjpeg-turbo SSE SIMD
+#       extensions
+# no-fp-contract = validate against the expected results from the C code when
+#                  floating point expression contraction is disabled (the
+#                  default with Clang, with GCC when building for platforms
+#                  that lack fused multiply-add [FMA] instructions, or when
+#                  passing -ffp-contract=off to the compiler)
+# fp-contract = validate against the expected results from the C code when
+#               floating point expression contraction is enabled (the default
+#               with GCC when building for platforms that have fused multiply-
+#               add [FMA] instructions or when passing -ffp-contract=fast to
+#               the compiler)
+# 387 = validate against the expected results from the C code when the 387 FPU
+#       is being used for floating point math (which is generally the default
+#       with x86 compilers)
+# msvc = validate against the expected results from the C code when compiled
+#        with a 32-bit version of Visual C++
+
+if(CPU_TYPE STREQUAL "x86_64" OR CPU_TYPE STREQUAL "i386")
+  if(WITH_SIMD)
+    set(DEFAULT_FLOATTEST sse)
+  elseif(CPU_TYPE STREQUAL "x86_64")
+    set(DEFAULT_FLOATTEST no-fp-contract)
+  elseif(CPU_TYPE STREQUAL "i386" AND MSVC)
+    set(DEFAULT_FLOATTEST msvc)
+  # else we can't really set an intelligent default for i386.  The appropriate
+  # value could be 387, no-fp-contract, or fp-contract, depending on the
+  # compiler and compiler options.  We leave it to the user to set FLOATTEST
+  # manually.
   endif()
-  
-  # This causes FLOATTEST to reset to the default value if WITH_SIMD has
-  # changed.
-  if(DEFINED WITH_SIMD_INT AND NOT WITH_SIMD EQUAL WITH_SIMD_INT)
-    set(FORCE_FLOATTEST "FORCE")
+else()
+  if((CPU_TYPE STREQUAL "powerpc" OR CPU_TYPE STREQUAL "arm64") AND
+    NOT CMAKE_C_COMPILER_ID STREQUAL "Clang" AND NOT MSVC)
+    set(DEFAULT_FLOATTEST fp-contract)
+  else()
+    set(DEFAULT_FLOATTEST no-fp-contract)
   endif()
-  set(WITH_SIMD_INT ${WITH_SIMD} CACHE INTERNAL "")
-  set(FLOATTEST ${DEFAULT_FLOATTEST} CACHE STRING
-    "The type of floating point math used by the floating point DCT/IDCT algorithms.  This tells the testing system which numerical results it should expect from those tests.  [sse = libjpeg-turbo x86/x86-64 SIMD extensions, no-fp-contract = generic FPU with floating point expression contraction disabled, fp-contract = generic FPU with floating point expression contraction enabled, 387 = 387 FPU, msvc = 32-bit Visual Studio] (default = ${DEFAULT_FLOATTEST})"
-    ${FORCE_FLOATTEST})
-  message(STATUS "FLOATTEST = ${FLOATTEST}")
-  
-  if(FLOATTEST)
-    string(TOUPPER ${FLOATTEST} FLOATTEST_UC)
-    string(REGEX REPLACE "-" "_" FLOATTEST_UC ${FLOATTEST_UC})
-    string(TOLOWER ${FLOATTEST} FLOATTEST)
-    if(NOT FLOATTEST STREQUAL "sse" AND
-      NOT FLOATTEST STREQUAL "no-fp-contract" AND
-      NOT FLOATTEST STREQUAL "fp-contract" AND NOT FLOATTEST STREQUAL "387" AND
-      NOT FLOATTEST STREQUAL "msvc")
-      message(FATAL_ERROR "\"${FLOATTEST}\" is not a valid value for FLOATTEST.")
-    endif()
+endif()
+
+# This causes FLOATTEST to reset to the default value if WITH_SIMD has
+# changed.
+if(DEFINED WITH_SIMD_INT AND NOT WITH_SIMD EQUAL WITH_SIMD_INT)
+  set(FORCE_FLOATTEST "FORCE")
+endif()
+set(WITH_SIMD_INT ${WITH_SIMD} CACHE INTERNAL "")
+set(FLOATTEST ${DEFAULT_FLOATTEST} CACHE STRING
+  "The type of floating point math used by the floating point DCT/IDCT algorithms.  This tells the testing system which numerical results it should expect from those tests.  [sse = libjpeg-turbo x86/x86-64 SIMD extensions, no-fp-contract = generic FPU with floating point expression contraction disabled, fp-contract = generic FPU with floating point expression contraction enabled, 387 = 387 FPU, msvc = 32-bit Visual Studio] (default = ${DEFAULT_FLOATTEST})"
+  ${FORCE_FLOATTEST})
+message(STATUS "FLOATTEST = ${FLOATTEST}")
+
+if(FLOATTEST)
+  string(TOUPPER ${FLOATTEST} FLOATTEST_UC)
+  string(REGEX REPLACE "-" "_" FLOATTEST_UC ${FLOATTEST_UC})
+  string(TOLOWER ${FLOATTEST} FLOATTEST)
+  if(NOT FLOATTEST STREQUAL "sse" AND
+    NOT FLOATTEST STREQUAL "no-fp-contract" AND
+    NOT FLOATTEST STREQUAL "fp-contract" AND NOT FLOATTEST STREQUAL "387" AND
+    NOT FLOATTEST STREQUAL "msvc")
+    message(FATAL_ERROR "\"${FLOATTEST}\" is not a valid value for FLOATTEST.")
   endif()
+endif()
 
-  foreach(libtype ${TEST_LIBTYPES})
-    if(libtype STREQUAL "static")
-      set(suffix -static)
-    endif()
-    if(WITH_TURBOJPEG)
-      add_test(tjunittest-${libtype}
-        ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix})
-      add_test(tjunittest-${libtype}-alloc
-        ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -alloc)
-      add_test(tjunittest-${libtype}-yuv
-        ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -yuv)
-      add_test(tjunittest-${libtype}-yuv-alloc
-        ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -yuv -alloc)
-      add_test(tjunittest-${libtype}-yuv-nopad
-        ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -yuv -noyuvpad)
-      add_test(tjunittest-${libtype}-bmp
-        ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -bmp)
-
-      set(MD5_PPM_GRAY_TILE 89d3ca21213d9d864b50b4e4e7de4ca6)
-      set(MD5_PPM_420_8x8_TILE 847fceab15c5b7b911cb986cf0f71de3)
-      set(MD5_PPM_420_16x16_TILE ca45552a93687e078f7137cc4126a7b0)
-      set(MD5_PPM_420_32x32_TILE d8676f1d6b68df358353bba9844f4a00)
-      set(MD5_PPM_420_64x64_TILE 4e4c1a3d7ea4bace4f868bcbe83b7050)
-      set(MD5_PPM_420_128x128_TILE f24c3429c52265832beab9df72a0ceae)
-      set(MD5_PPM_420M_8x8_TILE bc25320e1f4c31ce2e610e43e9fd173c)
-      set(MD5_PPM_420M_TILE 75ffdf14602258c5c189522af57fa605)
-      set(MD5_PPM_422_8x8_TILE d83dacd9fc73b0a6f10c09acad64eb1e)
-      set(MD5_PPM_422_16x16_TILE 35077fb610d72dd743b1eb0cbcfe10fb)
-      set(MD5_PPM_422_32x32_TILE e6902ed8a449ecc0f0d6f2bf945f65f7)
-      set(MD5_PPM_422_64x64_TILE 2b4502a8f316cedbde1da7bce3d2231e)
-      set(MD5_PPM_422_128x128_TILE f0b5617d578f5e13c8eee215d64d4877)
-      set(MD5_PPM_422M_8x8_TILE 828941d7f41cd6283abd6beffb7fd51d)
-      set(MD5_PPM_422M_TILE e877ae1324c4a280b95376f7f018172f)
-      set(MD5_PPM_444_TILE 7964e41e67cfb8d0a587c0aa4798f9c3)
-
-      # Test compressing from/decompressing to an arbitrary subregion of a larger
-      # image buffer
-      add_test(tjbench-${libtype}-tile-cp
-        ${CMAKE_COMMAND} -E copy_if_different ${TESTIMAGES}/testorig.ppm
-          testout_tile.ppm)
-      add_test(tjbench-${libtype}-tile
-        ${CMAKE_CROSSCOMPILING_EMULATOR} tjbench${suffix} testout_tile.ppm 95
-          -rgb -quiet -tile -benchtime 0.01 -warmup 0)
-      set_tests_properties(tjbench-${libtype}-tile
-        PROPERTIES DEPENDS tjbench-${libtype}-tile-cp)
-
-      foreach(tile 8 16 32 64 128)
-        add_test(tjbench-${libtype}-tile-gray-${tile}x${tile}-cmp
-          ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5_PPM_GRAY_TILE}
-            testout_tile_GRAY_Q95_${tile}x${tile}.ppm)
-        foreach(subsamp 420 422)
-          add_test(tjbench-${libtype}-tile-${subsamp}-${tile}x${tile}-cmp
-            ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP}
-              ${MD5_PPM_${subsamp}_${tile}x${tile}_TILE}
-              testout_tile_${subsamp}_Q95_${tile}x${tile}.ppm)
-        endforeach()
-        add_test(tjbench-${libtype}-tile-444-${tile}x${tile}-cmp
-          ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5_PPM_444_TILE}
-            testout_tile_444_Q95_${tile}x${tile}.ppm)
-        foreach(subsamp gray 420 422 444)
-          set_tests_properties(tjbench-${libtype}-tile-${subsamp}-${tile}x${tile}-cmp
-            PROPERTIES DEPENDS tjbench-${libtype}-tile)
-        endforeach()
+foreach(libtype ${TEST_LIBTYPES})
+  if(libtype STREQUAL "static")
+    set(suffix -static)
+  endif()
+  if(WITH_TURBOJPEG)
+    add_test(tjunittest-${libtype}
+      ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix})
+    add_test(tjunittest-${libtype}-alloc
+      ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -alloc)
+    add_test(tjunittest-${libtype}-yuv
+      ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -yuv)
+    add_test(tjunittest-${libtype}-yuv-alloc
+      ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -yuv -alloc)
+    add_test(tjunittest-${libtype}-yuv-nopad
+      ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -yuv -noyuvpad)
+    add_test(tjunittest-${libtype}-bmp
+      ${CMAKE_CROSSCOMPILING_EMULATOR} tjunittest${suffix} -bmp)
+
+    set(MD5_PPM_GRAY_TILE 89d3ca21213d9d864b50b4e4e7de4ca6)
+    set(MD5_PPM_420_8x8_TILE 847fceab15c5b7b911cb986cf0f71de3)
+    set(MD5_PPM_420_16x16_TILE ca45552a93687e078f7137cc4126a7b0)
+    set(MD5_PPM_420_32x32_TILE d8676f1d6b68df358353bba9844f4a00)
+    set(MD5_PPM_420_64x64_TILE 4e4c1a3d7ea4bace4f868bcbe83b7050)
+    set(MD5_PPM_420_128x128_TILE f24c3429c52265832beab9df72a0ceae)
+    set(MD5_PPM_420M_8x8_TILE bc25320e1f4c31ce2e610e43e9fd173c)
+    set(MD5_PPM_420M_TILE 75ffdf14602258c5c189522af57fa605)
+    set(MD5_PPM_422_8x8_TILE d83dacd9fc73b0a6f10c09acad64eb1e)
+    set(MD5_PPM_422_16x16_TILE 35077fb610d72dd743b1eb0cbcfe10fb)
+    set(MD5_PPM_422_32x32_TILE e6902ed8a449ecc0f0d6f2bf945f65f7)
+    set(MD5_PPM_422_64x64_TILE 2b4502a8f316cedbde1da7bce3d2231e)
+    set(MD5_PPM_422_128x128_TILE f0b5617d578f5e13c8eee215d64d4877)
+    set(MD5_PPM_422M_8x8_TILE 828941d7f41cd6283abd6beffb7fd51d)
+    set(MD5_PPM_422M_TILE e877ae1324c4a280b95376f7f018172f)
+    set(MD5_PPM_444_TILE 7964e41e67cfb8d0a587c0aa4798f9c3)
+
+    # Test compressing from/decompressing to an arbitrary subregion of a larger
+    # image buffer
+    add_test(tjbench-${libtype}-tile-cp
+      ${CMAKE_COMMAND} -E copy_if_different ${TESTIMAGES}/testorig.ppm
+        testout_tile.ppm)
+    add_test(tjbench-${libtype}-tile
+      ${CMAKE_CROSSCOMPILING_EMULATOR} tjbench${suffix} testout_tile.ppm 95
+        -rgb -quiet -tile -benchtime 0.01 -warmup 0)
+    set_tests_properties(tjbench-${libtype}-tile
+      PROPERTIES DEPENDS tjbench-${libtype}-tile-cp)
+
+    foreach(tile 8 16 32 64 128)
+      add_test(tjbench-${libtype}-tile-gray-${tile}x${tile}-cmp
+        ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5_PPM_GRAY_TILE}
+          testout_tile_GRAY_Q95_${tile}x${tile}.ppm)
+      foreach(subsamp 420 422)
+        add_test(tjbench-${libtype}-tile-${subsamp}-${tile}x${tile}-cmp
+          ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP}
+            ${MD5_PPM_${subsamp}_${tile}x${tile}_TILE}
+            testout_tile_${subsamp}_Q95_${tile}x${tile}.ppm)
       endforeach()
+      add_test(tjbench-${libtype}-tile-444-${tile}x${tile}-cmp
+        ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5_PPM_444_TILE}
+          testout_tile_444_Q95_${tile}x${tile}.ppm)
+      foreach(subsamp gray 420 422 444)
+        set_tests_properties(tjbench-${libtype}-tile-${subsamp}-${tile}x${tile}-cmp
+          PROPERTIES DEPENDS tjbench-${libtype}-tile)
+      endforeach()
+    endforeach()
 
-      add_test(tjbench-${libtype}-tilem-cp
-        ${CMAKE_COMMAND} -E copy_if_different ${TESTIMAGES}/testorig.ppm
-          testout_tilem.ppm)
-      add_test(tjbench-${libtype}-tilem
-        ${CMAKE_CROSSCOMPILING_EMULATOR} tjbench${suffix} testout_tilem.ppm 95
-          -rgb -fastupsample -quiet -tile -benchtime 0.01 -warmup 0)
-      set_tests_properties(tjbench-${libtype}-tilem
-        PROPERTIES DEPENDS tjbench-${libtype}-tilem-cp)
-
-      add_test(tjbench-${libtype}-tile-420m-8x8-cmp
-        ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5_PPM_420M_8x8_TILE}
-          testout_tilem_420_Q95_8x8.ppm)
-      add_test(tjbench-${libtype}-tile-422m-8x8-cmp
-        ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5_PPM_422M_8x8_TILE}
-          testout_tilem_422_Q95_8x8.ppm)
-      foreach(tile 16 32 64 128)
-        foreach(subsamp 420 422)
-          add_test(tjbench-${libtype}-tile-${subsamp}m-${tile}x${tile}-cmp
-            ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP}
-              ${MD5_PPM_${subsamp}M_TILE}
-              testout_tilem_${subsamp}_Q95_${tile}x${tile}.ppm)
-        endforeach()
+    add_test(tjbench-${libtype}-tilem-cp
+      ${CMAKE_COMMAND} -E copy_if_different ${TESTIMAGES}/testorig.ppm
+        testout_tilem.ppm)
+    add_test(tjbench-${libtype}-tilem
+      ${CMAKE_CROSSCOMPILING_EMULATOR} tjbench${suffix} testout_tilem.ppm 95
+        -rgb -fastupsample -quiet -tile -benchtime 0.01 -warmup 0)
+    set_tests_properties(tjbench-${libtype}-tilem
+      PROPERTIES DEPENDS tjbench-${libtype}-tilem-cp)
+
+    add_test(tjbench-${libtype}-tile-420m-8x8-cmp
+      ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5_PPM_420M_8x8_TILE}
+        testout_tilem_420_Q95_8x8.ppm)
+    add_test(tjbench-${libtype}-tile-422m-8x8-cmp
+      ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5_PPM_422M_8x8_TILE}
+        testout_tilem_422_Q95_8x8.ppm)
+    foreach(tile 16 32 64 128)
+      foreach(subsamp 420 422)
+        add_test(tjbench-${libtype}-tile-${subsamp}m-${tile}x${tile}-cmp
+          ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP}
+            ${MD5_PPM_${subsamp}M_TILE}
+            testout_tilem_${subsamp}_Q95_${tile}x${tile}.ppm)
       endforeach()
-      foreach(tile 8 16 32 64 128)
-        foreach(subsamp 420 422)
-          set_tests_properties(tjbench-${libtype}-tile-${subsamp}m-${tile}x${tile}-cmp
-            PROPERTIES DEPENDS tjbench-${libtype}-tilem)
-        endforeach()
+    endforeach()
+    foreach(tile 8 16 32 64 128)
+      foreach(subsamp 420 422)
+        set_tests_properties(tjbench-${libtype}-tile-${subsamp}m-${tile}x${tile}-cmp
+          PROPERTIES DEPENDS tjbench-${libtype}-tilem)
       endforeach()
-    endif()
+    endforeach()
+  endif()
 
-    # These tests are carefully crafted to provide full coverage of as many of
-    # the underlying algorithms as possible (including all of the
-    # SIMD-accelerated ones.)
-
-    macro(add_bittest PROG NAME ARGS OUTFILE INFILE MD5SUM)
-      add_test(${PROG}-${libtype}-${NAME}
-        ${CMAKE_CROSSCOMPILING_EMULATOR} ${PROG}${suffix} ${ARGS}
-          -outfile ${OUTFILE} ${INFILE})
-      add_test(${PROG}-${libtype}-${NAME}-cmp
-        ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5SUM} ${OUTFILE})
-      set_tests_properties(${PROG}-${libtype}-${NAME}-cmp PROPERTIES
-        DEPENDS ${PROG}-${libtype}-${NAME})
-      if(${ARGC} GREATER 6)
-        set(DEPENDS ${ARGN})
-        set_tests_properties(${PROG}-${libtype}-${NAME} PROPERTIES
-          DEPENDS ${DEPENDS})
-      endif()
-    endmacro()
-
-    # CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
-    add_bittest(cjpeg rgb-islow "-rgb;-dct;int;-icc;${TESTIMAGES}/test1.icc"
-      testout_rgb_islow.jpg ${TESTIMAGES}/testorig.ppm
-      ${MD5_JPEG_RGB_ISLOW})
-
-    # CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
-    add_bittest(djpeg rgb-islow "-dct;int;-ppm;-icc;testout_rgb_islow.icc"
-      testout_rgb_islow.ppm testout_rgb_islow.jpg
-      ${MD5_PPM_RGB_ISLOW} cjpeg-${libtype}-rgb-islow)
-
-    add_test(djpeg-${libtype}-rgb-islow-icc-cmp
-      ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP}
-        b06a39d730129122e85c1363ed1bbc9e testout_rgb_islow.icc)
-    set_tests_properties(djpeg-${libtype}-rgb-islow-icc-cmp PROPERTIES
-      DEPENDS djpeg-${libtype}-rgb-islow)
-
-    add_bittest(jpegtran icc "-copy;all;-icc;${TESTIMAGES}/test2.icc"
-      testout_rgb_islow2.jpg testout_rgb_islow.jpg
-      ${MD5_JPEG_RGB_ISLOW2} cjpeg-${libtype}-rgb-islow)
-
-    if(NOT WITH_12BIT)
-      # CC: RGB->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
-      add_bittest(djpeg rgb-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
-        testout_rgb_islow_565.bmp testout_rgb_islow.jpg
-        ${MD5_BMP_RGB_ISLOW_565} cjpeg-${libtype}-rgb-islow)
-
-      # CC: RGB->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
-      add_bittest(djpeg rgb-islow-565D "-dct;int;-rgb565;-bmp"
-        testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
-        ${MD5_BMP_RGB_ISLOW_565D} cjpeg-${libtype}-rgb-islow)
+  # These tests are carefully crafted to provide full coverage of as many of
+  # the underlying algorithms as possible (including all of the
+  # SIMD-accelerated ones).
+
+  macro(add_bittest PROG NAME ARGS OUTFILE INFILE MD5SUM)
+    add_test(${PROG}-${libtype}-${NAME}
+      ${CMAKE_CROSSCOMPILING_EMULATOR} ${PROG}${suffix} ${ARGS}
+        -outfile ${OUTFILE} ${INFILE})
+    add_test(${PROG}-${libtype}-${NAME}-cmp
+      ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP} ${MD5SUM} ${OUTFILE})
+    set_tests_properties(${PROG}-${libtype}-${NAME}-cmp PROPERTIES
+      DEPENDS ${PROG}-${libtype}-${NAME})
+    if(${ARGC} GREATER 6)
+      set(DEPENDS ${ARGN})
+      set_tests_properties(${PROG}-${libtype}-${NAME} PROPERTIES
+        DEPENDS ${DEPENDS})
     endif()
+  endmacro()
+
+  # CC: null  SAMP: fullsize  FDCT: islow  ENT: huff
+  add_bittest(cjpeg rgb-islow "-rgb;-dct;int;-icc;${TESTIMAGES}/test1.icc"
+    testout_rgb_islow.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_RGB_ISLOW})
+
+  # CC: null  SAMP: fullsize  IDCT: islow  ENT: huff
+  add_bittest(djpeg rgb-islow "-dct;int;-ppm;-icc;testout_rgb_islow.icc"
+    testout_rgb_islow.ppm testout_rgb_islow.jpg
+    ${MD5_PPM_RGB_ISLOW} cjpeg-${libtype}-rgb-islow)
+
+  add_test(djpeg-${libtype}-rgb-islow-icc-cmp
+    ${CMAKE_CROSSCOMPILING_EMULATOR} ${MD5CMP}
+      b06a39d730129122e85c1363ed1bbc9e testout_rgb_islow.icc)
+  set_tests_properties(djpeg-${libtype}-rgb-islow-icc-cmp PROPERTIES
+    DEPENDS djpeg-${libtype}-rgb-islow)
+
+  add_bittest(jpegtran icc "-copy;all;-icc;${TESTIMAGES}/test2.icc"
+    testout_rgb_islow2.jpg testout_rgb_islow.jpg
+    ${MD5_JPEG_RGB_ISLOW2} cjpeg-${libtype}-rgb-islow)
+
+  if(NOT WITH_12BIT)
+    # CC: RGB->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
+    add_bittest(djpeg rgb-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
+      testout_rgb_islow_565.bmp testout_rgb_islow.jpg
+      ${MD5_BMP_RGB_ISLOW_565} cjpeg-${libtype}-rgb-islow)
+
+    # CC: RGB->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
+    add_bittest(djpeg rgb-islow-565D "-dct;int;-rgb565;-bmp"
+      testout_rgb_islow_565D.bmp testout_rgb_islow.jpg
+      ${MD5_BMP_RGB_ISLOW_565D} cjpeg-${libtype}-rgb-islow)
+  endif()
 
-    # CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
-    add_bittest(cjpeg 422-ifast-opt "-sample;2x1;-dct;fast;-opt"
-      testout_422_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm
-      ${MD5_JPEG_422_IFAST_OPT})
-
-    # CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
-    add_bittest(djpeg 422-ifast "-dct;fast"
-      testout_422_ifast.ppm testout_422_ifast_opt.jpg
-      ${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt)
-
-    # CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-    add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth"
-      testout_422m_ifast.ppm testout_422_ifast_opt.jpg
-      ${MD5_PPM_422M_IFAST} cjpeg-${libtype}-422-ifast-opt)
-
-    if(NOT WITH_12BIT)
-      # CC: YCC->RGB565  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-      add_bittest(djpeg 422m-ifast-565
-        "-dct;int;-nosmooth;-rgb565;-dither;none;-bmp"
-        testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg
-        ${MD5_BMP_422M_IFAST_565} cjpeg-${libtype}-422-ifast-opt)
-
-      # CC: YCC->RGB565 (dithered)  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
-      add_bittest(djpeg 422m-ifast-565D "-dct;int;-nosmooth;-rgb565;-bmp"
-        testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
-        ${MD5_BMP_422M_IFAST_565D} cjpeg-${libtype}-422-ifast-opt)
-    endif()
+  # CC: RGB->YCC  SAMP: fullsize/h2v1  FDCT: ifast  ENT: 2-pass huff
+  add_bittest(cjpeg 422-ifast-opt "-sample;2x1;-dct;fast;-opt"
+    testout_422_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_422_IFAST_OPT})
+
+  # CC: YCC->RGB  SAMP: fullsize/h2v1 fancy  IDCT: ifast  ENT: huff
+  add_bittest(djpeg 422-ifast "-dct;fast"
+    testout_422_ifast.ppm testout_422_ifast_opt.jpg
+    ${MD5_PPM_422_IFAST} cjpeg-${libtype}-422-ifast-opt)
+
+  # CC: RGB->YCC  SAMP: fullsize/h1v2  FDCT: islow  ENT: huff
+  add_bittest(cjpeg 440-islow "-sample;1x2;-dct;int"
+    testout_440_islow.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_440_ISLOW})
+
+  # CC: YCC->RGB  SAMP: fullsize/h1v2 fancy  IDCT: islow  ENT: huff
+  add_bittest(djpeg 440-islow "-dct;int"
+    testout_440_islow.ppm testout_440_islow.jpg
+    ${MD5_PPM_440_ISLOW} cjpeg-${libtype}-440-islow)
+
+  # CC: YCC->RGB  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
+  add_bittest(djpeg 422m-ifast "-dct;fast;-nosmooth"
+    testout_422m_ifast.ppm testout_422_ifast_opt.jpg
+    ${MD5_PPM_422M_IFAST} cjpeg-${libtype}-422-ifast-opt)
+
+  if(NOT WITH_12BIT)
+    # CC: YCC->RGB565  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
+    add_bittest(djpeg 422m-ifast-565
+      "-dct;int;-nosmooth;-rgb565;-dither;none;-bmp"
+      testout_422m_ifast_565.bmp testout_422_ifast_opt.jpg
+      ${MD5_BMP_422M_IFAST_565} cjpeg-${libtype}-422-ifast-opt)
+
+    # CC: YCC->RGB565 (dithered)  SAMP: h2v1 merged  IDCT: ifast  ENT: huff
+    add_bittest(djpeg 422m-ifast-565D "-dct;int;-nosmooth;-rgb565;-bmp"
+      testout_422m_ifast_565D.bmp testout_422_ifast_opt.jpg
+      ${MD5_BMP_422M_IFAST_565D} cjpeg-${libtype}-422-ifast-opt)
+  endif()
 
-    # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
-    add_bittest(cjpeg 420-q100-ifast-prog
-      "-sample;2x2;-quality;100;-dct;fast;-scans;${TESTIMAGES}/test.scan"
-      testout_420_q100_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm
-      ${MD5_JPEG_420_IFAST_Q100_PROG})
-
-    # CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
-    add_bittest(djpeg 420-q100-ifast-prog "-dct;fast"
-      testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
-      ${MD5_PPM_420_Q100_IFAST} cjpeg-${libtype}-420-q100-ifast-prog)
-
-    # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: prog huff
-    add_bittest(djpeg 420m-q100-ifast-prog "-dct;fast;-nosmooth"
-      testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
-      ${MD5_PPM_420M_Q100_IFAST} cjpeg-${libtype}-420-q100-ifast-prog)
-
-    # CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
-    add_bittest(cjpeg gray-islow "-gray;-dct;int"
-      testout_gray_islow.jpg ${TESTIMAGES}/testorig.ppm
-      ${MD5_JPEG_GRAY_ISLOW})
-
-    # CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
-    add_bittest(djpeg gray-islow "-dct;int"
-      testout_gray_islow.ppm testout_gray_islow.jpg
-      ${MD5_PPM_GRAY_ISLOW} cjpeg-${libtype}-gray-islow)
-
-    # CC: Gray->RGB  SAMP: fullsize  IDCT: islow  ENT: huff
-    add_bittest(djpeg gray-islow-rgb "-dct;int;-rgb"
-      testout_gray_islow_rgb.ppm testout_gray_islow.jpg
-      ${MD5_PPM_GRAY_ISLOW_RGB} cjpeg-${libtype}-gray-islow)
-
-    if(NOT WITH_12BIT)
-      # CC: Gray->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
-      add_bittest(djpeg gray-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
-        testout_gray_islow_565.bmp testout_gray_islow.jpg
-        ${MD5_BMP_GRAY_ISLOW_565} cjpeg-${libtype}-gray-islow)
-
-      # CC: Gray->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
-      add_bittest(djpeg gray-islow-565D "-dct;int;-rgb565;-bmp"
-        testout_gray_islow_565D.bmp testout_gray_islow.jpg
-        ${MD5_BMP_GRAY_ISLOW_565D} cjpeg-${libtype}-gray-islow)
-    endif()
+  # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: ifast  ENT: prog huff
+  add_bittest(cjpeg 420-q100-ifast-prog
+    "-sample;2x2;-quality;100;-dct;fast;-scans;${TESTIMAGES}/test.scan"
+    testout_420_q100_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_420_IFAST_Q100_PROG})
+
+  # CC: YCC->RGB  SAMP: fullsize/h2v2 fancy  IDCT: ifast  ENT: prog huff
+  add_bittest(djpeg 420-q100-ifast-prog "-dct;fast"
+    testout_420_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
+    ${MD5_PPM_420_Q100_IFAST} cjpeg-${libtype}-420-q100-ifast-prog)
+
+  # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: prog huff
+  add_bittest(djpeg 420m-q100-ifast-prog "-dct;fast;-nosmooth"
+    testout_420m_q100_ifast.ppm testout_420_q100_ifast_prog.jpg
+    ${MD5_PPM_420M_Q100_IFAST} cjpeg-${libtype}-420-q100-ifast-prog)
+
+  # CC: RGB->Gray  SAMP: fullsize  FDCT: islow  ENT: huff
+  add_bittest(cjpeg gray-islow "-gray;-dct;int"
+    testout_gray_islow.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_GRAY_ISLOW})
+
+  # CC: Gray->Gray  SAMP: fullsize  IDCT: islow  ENT: huff
+  add_bittest(djpeg gray-islow "-dct;int"
+    testout_gray_islow.ppm testout_gray_islow.jpg
+    ${MD5_PPM_GRAY_ISLOW} cjpeg-${libtype}-gray-islow)
+
+  # CC: Gray->RGB  SAMP: fullsize  IDCT: islow  ENT: huff
+  add_bittest(djpeg gray-islow-rgb "-dct;int;-rgb"
+    testout_gray_islow_rgb.ppm testout_gray_islow.jpg
+    ${MD5_PPM_GRAY_ISLOW_RGB} cjpeg-${libtype}-gray-islow)
+
+  if(NOT WITH_12BIT)
+    # CC: Gray->RGB565  SAMP: fullsize  IDCT: islow  ENT: huff
+    add_bittest(djpeg gray-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
+      testout_gray_islow_565.bmp testout_gray_islow.jpg
+      ${MD5_BMP_GRAY_ISLOW_565} cjpeg-${libtype}-gray-islow)
+
+    # CC: Gray->RGB565 (dithered)  SAMP: fullsize  IDCT: islow  ENT: huff
+    add_bittest(djpeg gray-islow-565D "-dct;int;-rgb565;-bmp"
+      testout_gray_islow_565D.bmp testout_gray_islow.jpg
+      ${MD5_BMP_GRAY_ISLOW_565D} cjpeg-${libtype}-gray-islow)
+  endif()
 
-    # CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
-    # ENT: 2-pass huff
-    add_bittest(cjpeg 420s-ifast-opt "-sample;2x2;-smooth;1;-dct;int;-opt"
-      testout_420s_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm
-      ${MD5_JPEG_420S_IFAST_OPT})
-
-    if(FLOATTEST)
-      # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
-      add_bittest(cjpeg 3x2-float-prog "-sample;3x2;-dct;float;-prog"
-        testout_3x2_float_prog.jpg ${TESTIMAGES}/testorig.ppm
-        ${MD5_JPEG_3x2_FLOAT_PROG_${FLOATTEST_UC}})
-
-      # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
-      add_bittest(djpeg 3x2-float-prog "-dct;float"
-        testout_3x2_float.ppm testout_3x2_float_prog.jpg
-        ${MD5_PPM_3x2_FLOAT_${FLOATTEST_UC}} cjpeg-${libtype}-3x2-float-prog)
-    endif()
+  # CC: RGB->YCC  SAMP: fullsize smooth/h2v2 smooth  FDCT: islow
+  # ENT: 2-pass huff
+  add_bittest(cjpeg 420s-ifast-opt "-sample;2x2;-smooth;1;-dct;int;-opt"
+    testout_420s_ifast_opt.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_420S_IFAST_OPT})
 
-      # CC: RGB->YCC  SAMP: fullsize/int  FDCT: ifast  ENT: prog huff
-    add_bittest(cjpeg 3x2-ifast-prog "-sample;3x2;-dct;fast;-prog"
-      testout_3x2_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm
-      ${MD5_JPEG_3x2_IFAST_PROG})
-
-    # CC: YCC->RGB  SAMP: fullsize/int  IDCT: ifast  ENT: prog huff
-    add_bittest(djpeg 3x2-ifast-prog "-dct;fast"
-      testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
-      ${MD5_PPM_3x2_IFAST} cjpeg-${libtype}-3x2-ifast-prog)
-
-    if(WITH_ARITH_ENC)
-      # CC: YCC->RGB  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
-      add_bittest(cjpeg 420-islow-ari "-dct;int;-arithmetic"
-        testout_420_islow_ari.jpg ${TESTIMAGES}/testorig.ppm
-        ${MD5_JPEG_420_ISLOW_ARI})
-
-      add_bittest(jpegtran 420-islow-ari "-arithmetic"
-        testout_420_islow_ari2.jpg ${TESTIMAGES}/testimgint.jpg
-        ${MD5_JPEG_420_ISLOW_ARI})
-
-      # CC: YCC->RGB  SAMP: fullsize  FDCT: islow  ENT: prog arith
-      add_bittest(cjpeg 444-islow-progari
-        "-sample;1x1;-dct;int;-prog;-arithmetic"
-        testout_444_islow_progari.jpg ${TESTIMAGES}/testorig.ppm
-        ${MD5_JPEG_444_ISLOW_PROGARI})
-    endif()
+  if(FLOATTEST)
+    # CC: RGB->YCC  SAMP: fullsize/int  FDCT: float  ENT: prog huff
+    add_bittest(cjpeg 3x2-float-prog "-sample;3x2;-dct;float;-prog"
+      testout_3x2_float_prog.jpg ${TESTIMAGES}/testorig.ppm
+      ${MD5_JPEG_3x2_FLOAT_PROG_${FLOATTEST_UC}})
+
+    # CC: YCC->RGB  SAMP: fullsize/int  IDCT: float  ENT: prog huff
+    add_bittest(djpeg 3x2-float-prog "-dct;float"
+      testout_3x2_float.ppm testout_3x2_float_prog.jpg
+      ${MD5_PPM_3x2_FLOAT_${FLOATTEST_UC}} cjpeg-${libtype}-3x2-float-prog)
+  endif()
 
-    if(WITH_ARITH_DEC)
-      # CC: RGB->YCC  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
-      if((CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm") AND WITH_SIMD)
-        # Refer to the comment above the definition of MD5_PPM_420M_IFAST_ARI for
-        # an explanation of why this is necessary.
-        add_bittest(djpeg 420m-ifast-ari "-fast;-ppm"
-          testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
-          ${MD5_PPM_420M_IFAST_ARI})
-      else()
-        add_bittest(djpeg 420m-ifast-ari "-fast;-skip;1,20;-ppm"
-          testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
-          ${MD5_PPM_420M_IFAST_ARI})
-      endif()
+  # CC: RGB->YCC  SAMP: fullsize/int  FDCT: ifast  ENT: prog huff
+  add_bittest(cjpeg 3x2-ifast-prog "-sample;3x2;-dct;fast;-prog"
+    testout_3x2_ifast_prog.jpg ${TESTIMAGES}/testorig.ppm
+    ${MD5_JPEG_3x2_IFAST_PROG})
+
+  # CC: YCC->RGB  SAMP: fullsize/int  IDCT: ifast  ENT: prog huff
+  add_bittest(djpeg 3x2-ifast-prog "-dct;fast"
+    testout_3x2_ifast.ppm testout_3x2_ifast_prog.jpg
+    ${MD5_PPM_3x2_IFAST} cjpeg-${libtype}-3x2-ifast-prog)
+
+  if(WITH_ARITH_ENC)
+    # CC: RGB->YCC  SAMP: fullsize/h2v2  FDCT: islow  ENT: arith
+    add_bittest(cjpeg 420-islow-ari "-dct;int;-arithmetic"
+      testout_420_islow_ari.jpg ${TESTIMAGES}/testorig.ppm
+      ${MD5_JPEG_420_ISLOW_ARI})
+
+    add_bittest(jpegtran 420-islow-ari "-arithmetic"
+      testout_420_islow_ari2.jpg ${TESTIMAGES}/testimgint.jpg
+      ${MD5_JPEG_420_ISLOW_ARI})
+
+    # CC: RGB->YCC  SAMP: fullsize  FDCT: islow  ENT: prog arith
+    add_bittest(cjpeg 444-islow-progari
+      "-sample;1x1;-dct;int;-prog;-arithmetic"
+      testout_444_islow_progari.jpg ${TESTIMAGES}/testorig.ppm
+      ${MD5_JPEG_444_ISLOW_PROGARI})
+  endif()
 
-      add_bittest(jpegtran 420-islow ""
-        testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg
-        ${MD5_JPEG_420_ISLOW})
-    endif()
+  if(WITH_ARITH_DEC)
+    # CC: YCC->RGB  SAMP: h2v2 merged  IDCT: ifast  ENT: arith
+    add_bittest(djpeg 420m-ifast-ari "-fast;-skip;1,20;-ppm"
+      testout_420m_ifast_ari.ppm ${TESTIMAGES}/testimgari.jpg
+      ${MD5_PPM_420M_IFAST_ARI})
 
-    # 2/1--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 16x16 islow  ENT: huff
-    # 15/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 15x15 islow  ENT: huff
-    # 13/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 13x13 islow  ENT: huff
-    # 11/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 11x11 islow  ENT: huff
-    # 9/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 9x9 islow  ENT: huff
-    # 7/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 7x7 islow/14x14 islow
-    #         ENT: huff
-    # 3/4--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 6x6 islow/12x12 islow
-    #         ENT: huff
-    # 5/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 5x5 islow/10x10 islow
-    #         ENT: huff
-    # 1/2--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 4x4 islow/8x8 islow
-    #         ENT: huff
-    # 3/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 3x3 islow/6x6 islow
-    #         ENT: huff
-    # 1/4--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 2x2 islow/4x4 islow
-    #         ENT: huff
-    # 1/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 1x1 islow/2x2 islow
-    #         ENT: huff
-    foreach(scale 2_1 15_8 13_8 11_8 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8)
-      string(REGEX REPLACE "_" "/" scalearg ${scale})
-      add_bittest(djpeg 420m-islow-${scale}
-        "-dct;int;-scale;${scalearg};-nosmooth;-ppm"
-        testout_420m_islow_${scale}.ppm ${TESTIMAGES}/${TESTORIG}
-        ${MD5_PPM_420M_ISLOW_${scale}})
-    endforeach()
+    add_bittest(jpegtran 420-islow ""
+      testout_420_islow.jpg ${TESTIMAGES}/testimgari.jpg
+      ${MD5_JPEG_420_ISLOW})
+  endif()
 
-    if(NOT WITH_12BIT)
-      # CC: YCC->RGB (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-      add_bittest(djpeg 420-islow-256 "-dct;int;-colors;256;-bmp"
-        testout_420_islow_256.bmp ${TESTIMAGES}/${TESTORIG}
-        ${MD5_BMP_420_ISLOW_256})
-
-      # CC: YCC->RGB565  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-      add_bittest(djpeg 420-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
-        testout_420_islow_565.bmp ${TESTIMAGES}/${TESTORIG}
-        ${MD5_BMP_420_ISLOW_565})
-
-      # CC: YCC->RGB565 (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
-      add_bittest(djpeg 420-islow-565D "-dct;int;-rgb565;-bmp"
-        testout_420_islow_565D.bmp ${TESTIMAGES}/${TESTORIG}
-        ${MD5_BMP_420_ISLOW_565D})
-
-      # CC: YCC->RGB565  SAMP: h2v2 merged  IDCT: islow  ENT: huff
-      add_bittest(djpeg 420m-islow-565
-        "-dct;int;-nosmooth;-rgb565;-dither;none;-bmp"
-        testout_420m_islow_565.bmp ${TESTIMAGES}/${TESTORIG}
-        ${MD5_BMP_420M_ISLOW_565})
-
-      # CC: YCC->RGB565 (dithered)  SAMP: h2v2 merged  IDCT: islow  ENT: huff
-      add_bittest(djpeg 420m-islow-565D "-dct;int;-nosmooth;-rgb565;-bmp"
-        testout_420m_islow_565D.bmp ${TESTIMAGES}/${TESTORIG}
-        ${MD5_BMP_420M_ISLOW_565D})
-    endif()
+  # 2/1--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 16x16 islow  ENT: huff
+  # 15/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 15x15 islow  ENT: huff
+  # 13/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 13x13 islow  ENT: huff
+  # 11/8--  CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 11x11 islow  ENT: huff
+  # 9/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 9x9 islow  ENT: huff
+  # 7/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 7x7 islow/14x14 islow
+  #         ENT: huff
+  # 3/4--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 6x6 islow/12x12 islow
+  #         ENT: huff
+  # 5/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 5x5 islow/10x10 islow
+  #         ENT: huff
+  # 1/2--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 4x4 islow/8x8 islow
+  #         ENT: huff
+  # 3/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 3x3 islow/6x6 islow
+  #         ENT: huff
+  # 1/4--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 2x2 islow/4x4 islow
+  #         ENT: huff
+  # 1/8--   CC: YCC->RGB  SAMP: h2v2 merged  IDCT: 1x1 islow/2x2 islow
+  #         ENT: huff
+  foreach(scale 2_1 15_8 13_8 11_8 9_8 7_8 3_4 5_8 1_2 3_8 1_4 1_8)
+    string(REGEX REPLACE "_" "/" scalearg ${scale})
+    add_bittest(djpeg 420m-islow-${scale}
+      "-dct;int;-scale;${scalearg};-nosmooth;-ppm"
+      testout_420m_islow_${scale}.ppm ${TESTIMAGES}/${TESTORIG}
+      ${MD5_PPM_420M_ISLOW_${scale}})
+  endforeach()
 
-    # Partial decode tests.  These tests are designed to cover all of the
-    # possible code paths in jpeg_skip_scanlines().
+  if(NOT WITH_12BIT)
+    # CC: YCC->RGB (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+    add_bittest(djpeg 420-islow-256 "-dct;int;-colors;256;-bmp"
+      testout_420_islow_256.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420_ISLOW_256})
+
+    # CC: YCC->RGB565  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+    add_bittest(djpeg 420-islow-565 "-dct;int;-rgb565;-dither;none;-bmp"
+      testout_420_islow_565.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420_ISLOW_565})
+
+    # CC: YCC->RGB565 (dithered)  SAMP: h2v2 fancy  IDCT: islow  ENT: huff
+    add_bittest(djpeg 420-islow-565D "-dct;int;-rgb565;-bmp"
+      testout_420_islow_565D.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420_ISLOW_565D})
+
+    # CC: YCC->RGB565  SAMP: h2v2 merged  IDCT: islow  ENT: huff
+    add_bittest(djpeg 420m-islow-565
+      "-dct;int;-nosmooth;-rgb565;-dither;none;-bmp"
+      testout_420m_islow_565.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420M_ISLOW_565})
+
+    # CC: YCC->RGB565 (dithered)  SAMP: h2v2 merged  IDCT: islow  ENT: huff
+    add_bittest(djpeg 420m-islow-565D "-dct;int;-nosmooth;-rgb565;-bmp"
+      testout_420m_islow_565D.bmp ${TESTIMAGES}/${TESTORIG}
+      ${MD5_BMP_420M_ISLOW_565D})
+  endif()
 
-    # Context rows: Yes  Intra-iMCU row: Yes  iMCU row prefetch: No   ENT: huff
-    add_bittest(djpeg 420-islow-skip15_31 "-dct;int;-skip;15,31;-ppm"
-      testout_420_islow_skip15,31.ppm ${TESTIMAGES}/${TESTORIG}
-      ${MD5_PPM_420_ISLOW_SKIP15_31})
+  # Partial decode tests.  These tests are designed to cover all of the
+  # possible code paths in jpeg_skip_scanlines().
 
-    # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: Yes  ENT: arith
-    if(WITH_ARITH_DEC)
-      add_bittest(djpeg 420-islow-ari-skip16_139 "-dct;int;-skip;16,139;-ppm"
-        testout_420_islow_ari_skip16,139.ppm ${TESTIMAGES}/testimgari.jpg
-        ${MD5_PPM_420_ISLOW_ARI_SKIP16_139})
-    endif()
+  # Context rows: Yes  Intra-iMCU row: Yes  iMCU row prefetch: No   ENT: huff
+  add_bittest(djpeg 420-islow-skip15_31 "-dct;int;-skip;15,31;-ppm"
+    testout_420_islow_skip15,31.ppm ${TESTIMAGES}/${TESTORIG}
+    ${MD5_PPM_420_ISLOW_SKIP15_31})
 
-    # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: prog huff
-    add_test(cjpeg-${libtype}-420-islow-prog
-      ${CMAKE_CROSSCOMPILING_EMULATOR} cjpeg${suffix} -dct int -prog
-        -outfile testout_420_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
-    add_bittest(djpeg 420-islow-prog-crop62x62_71_71
-      "-dct;int;-crop;62x62+71+71;-ppm"
-      testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
-      ${MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71} cjpeg-${libtype}-420-islow-prog)
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: Yes  ENT: arith
+  if(WITH_ARITH_DEC)
+    add_bittest(djpeg 420-islow-ari-skip16_139 "-dct;int;-skip;16,139;-ppm"
+      testout_420_islow_ari_skip16,139.ppm ${TESTIMAGES}/testimgari.jpg
+      ${MD5_PPM_420_ISLOW_ARI_SKIP16_139})
+  endif()
 
-    # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: arith
-    if(WITH_ARITH_DEC)
-      add_bittest(djpeg 420-islow-ari-crop53x53_4_4
-        "-dct;int;-crop;53x53+4+4;-ppm"
-        testout_420_islow_ari_crop53x53,4,4.ppm ${TESTIMAGES}/testimgari.jpg
-        ${MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4})
-    endif()
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: prog huff
+  add_test(cjpeg-${libtype}-420-islow-prog
+    ${CMAKE_CROSSCOMPILING_EMULATOR} cjpeg${suffix} -dct int -prog
+      -outfile testout_420_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
+  add_bittest(djpeg 420-islow-prog-crop62x62_71_71
+    "-dct;int;-crop;62x62+71+71;-ppm"
+    testout_420_islow_prog_crop62x62,71,71.ppm testout_420_islow_prog.jpg
+    ${MD5_PPM_420_ISLOW_PROG_CROP62x62_71_71} cjpeg-${libtype}-420-islow-prog)
+
+  # Context rows: Yes  Intra-iMCU row: No   iMCU row prefetch: No   ENT: arith
+  if(WITH_ARITH_DEC)
+    add_bittest(djpeg 420-islow-ari-crop53x53_4_4
+      "-dct;int;-crop;53x53+4+4;-ppm"
+      testout_420_islow_ari_crop53x53,4,4.ppm ${TESTIMAGES}/testimgari.jpg
+      ${MD5_PPM_420_ISLOW_ARI_CROP53x53_4_4})
+  endif()
 
-    # Context rows: No   Intra-iMCU row: Yes  ENT: huff
-    add_test(cjpeg-${libtype}-444-islow
-      ${CMAKE_CROSSCOMPILING_EMULATOR} cjpeg${suffix} -dct int -sample 1x1
-        -outfile testout_444_islow.jpg ${TESTIMAGES}/testorig.ppm)
-    add_bittest(djpeg 444-islow-skip1_6 "-dct;int;-skip;1,6;-ppm"
-      testout_444_islow_skip1,6.ppm testout_444_islow.jpg
-      ${MD5_PPM_444_ISLOW_SKIP1_6} cjpeg-${libtype}-444-islow)
-
-    # Context rows: No   Intra-iMCU row: No   ENT: prog huff
-    add_test(cjpeg-${libtype}-444-islow-prog
-      ${CMAKE_CROSSCOMPILING_EMULATOR} cjpeg${suffix} -dct int -prog -sample 1x1
-        -outfile testout_444_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
-    add_bittest(djpeg 444-islow-prog-crop98x98_13_13
-      "-dct;int;-crop;98x98+13+13;-ppm"
-      testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
-      ${MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13} cjpeg-${libtype}-444-islow-prog)
-
-    # Context rows: No   Intra-iMCU row: No   ENT: arith
-    if(WITH_ARITH_ENC)
-      add_test(cjpeg-${libtype}-444-islow-ari
-        ${CMAKE_CROSSCOMPILING_EMULATOR} cjpeg${suffix} -dct int -arithmetic
-          -sample 1x1 -outfile testout_444_islow_ari.jpg
-          ${TESTIMAGES}/testorig.ppm)
-      if(WITH_ARITH_DEC)
-        add_bittest(djpeg 444-islow-ari-crop37x37_0_0
-          "-dct;int;-crop;37x37+0+0;-ppm"
-          testout_444_islow_ari_crop37x37,0,0.ppm testout_444_islow_ari.jpg
-          ${MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0} cjpeg-${libtype}-444-islow-ari)
-      endif()
+  # Context rows: No   Intra-iMCU row: Yes  ENT: huff
+  add_test(cjpeg-${libtype}-444-islow
+    ${CMAKE_CROSSCOMPILING_EMULATOR} cjpeg${suffix} -dct int -sample 1x1
+      -outfile testout_444_islow.jpg ${TESTIMAGES}/testorig.ppm)
+  add_bittest(djpeg 444-islow-skip1_6 "-dct;int;-skip;1,6;-ppm"
+    testout_444_islow_skip1,6.ppm testout_444_islow.jpg
+    ${MD5_PPM_444_ISLOW_SKIP1_6} cjpeg-${libtype}-444-islow)
+
+  # Context rows: No   Intra-iMCU row: No   ENT: prog huff
+  add_test(cjpeg-${libtype}-444-islow-prog
+    ${CMAKE_CROSSCOMPILING_EMULATOR} cjpeg${suffix} -dct int -prog -sample 1x1
+      -outfile testout_444_islow_prog.jpg ${TESTIMAGES}/testorig.ppm)
+  add_bittest(djpeg 444-islow-prog-crop98x98_13_13
+    "-dct;int;-crop;98x98+13+13;-ppm"
+    testout_444_islow_prog_crop98x98,13,13.ppm testout_444_islow_prog.jpg
+    ${MD5_PPM_444_ISLOW_PROG_CROP98x98_13_13} cjpeg-${libtype}-444-islow-prog)
+
+  # Context rows: No   Intra-iMCU row: No   ENT: arith
+  if(WITH_ARITH_ENC)
+    add_test(cjpeg-${libtype}-444-islow-ari
+      ${CMAKE_CROSSCOMPILING_EMULATOR} cjpeg${suffix} -dct int -arithmetic
+        -sample 1x1 -outfile testout_444_islow_ari.jpg
+        ${TESTIMAGES}/testorig.ppm)
+    if(WITH_ARITH_DEC)
+      add_bittest(djpeg 444-islow-ari-crop37x37_0_0
+        "-dct;int;-crop;37x37+0+0;-ppm"
+        testout_444_islow_ari_crop37x37,0,0.ppm testout_444_islow_ari.jpg
+        ${MD5_PPM_444_ISLOW_ARI_CROP37x37_0_0} cjpeg-${libtype}-444-islow-ari)
     endif()
+  endif()
 
-    add_bittest(jpegtran crop "-crop;120x90+20+50;-transpose;-perfect"
-      testout_crop.jpg ${TESTIMAGES}/${TESTORIG}
-      ${MD5_JPEG_CROP})
+  add_bittest(jpegtran crop "-crop;120x90+20+50;-transpose;-perfect"
+    testout_crop.jpg ${TESTIMAGES}/${TESTORIG}
+    ${MD5_JPEG_CROP})
 
-  endforeach()
+endforeach()
 
-  add_custom_target(testclean COMMAND ${CMAKE_COMMAND} -P
-    ${CMAKE_CURRENT_SOURCE_DIR}/cmakescripts/testclean.cmake)
+add_custom_target(testclean COMMAND ${CMAKE_COMMAND} -P
+  ${CMAKE_CURRENT_SOURCE_DIR}/cmakescripts/testclean.cmake)
 
-  configure_file(croptest.in croptest @ONLY)
-  add_custom_target(croptest
-    COMMAND echo croptest
-    COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/croptest)
+configure_file(croptest.in croptest @ONLY)
+add_custom_target(croptest
+  COMMAND echo croptest
+  COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/croptest)
 
-  if(WITH_TURBOJPEG)
-    configure_file(tjbenchtest.in tjbenchtest @ONLY)
-    configure_file(tjexampletest.in tjexampletest @ONLY)
-    if(WIN32)
-      set(BASH bash)
-    endif()
-    if(WITH_JAVA)
-      configure_file(tjbenchtest.java.in tjbenchtest.java @ONLY)
-      configure_file(tjexampletest.java.in tjexampletest.java @ONLY)
-      add_custom_target(tjtest
-        COMMAND echo tjbenchtest
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
-        COMMAND echo tjbenchtest -alloc
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -alloc
-        COMMAND echo tjbenchtest -yuv
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv
-        COMMAND echo tjbenchtest -yuv -alloc
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv -alloc
-        COMMAND echo tjbenchtest -progressive
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive
-        COMMAND echo tjbenchtest -progressive -yuv
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive -yuv
-        COMMAND echo tjexampletest
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest
-        COMMAND echo tjbenchtest.java
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
-        COMMAND echo tjbenchtest.java -yuv
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java -yuv
-        COMMAND echo tjbenchtest.java -progressive
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java -progressive
-        COMMAND echo tjexampletest.java -progressive -yuv
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
-          -progressive -yuv
-        COMMAND echo tjexampletest.java
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest.java
-        DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
-          ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
-          ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest)
-    else()
-      add_custom_target(tjtest
-        COMMAND echo tjbenchtest
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
-        COMMAND echo tjbenchtest -alloc
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -alloc
-        COMMAND echo tjbenchtest -yuv
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv
-        COMMAND echo tjbenchtest -yuv -alloc
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv -alloc
-        COMMAND echo tjbenchtest -progressive
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive
-        COMMAND echo tjbenchtest -progressive -yuv
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive -yuv
-        COMMAND echo tjexampletest
-        COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest
-        DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest)
-    endif()
+if(WITH_TURBOJPEG)
+  configure_file(tjbenchtest.in tjbenchtest @ONLY)
+  configure_file(tjexampletest.in tjexampletest @ONLY)
+  if(WIN32)
+    set(BASH bash)
+  endif()
+  if(WITH_JAVA)
+    configure_file(tjbenchtest.java.in tjbenchtest.java @ONLY)
+    configure_file(tjexampletest.java.in tjexampletest.java @ONLY)
+    add_custom_target(tjtest
+      COMMAND echo tjbenchtest
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
+      COMMAND echo tjbenchtest -alloc
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -alloc
+      COMMAND echo tjbenchtest -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv
+      COMMAND echo tjbenchtest -yuv -alloc
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv -alloc
+      COMMAND echo tjbenchtest -progressive
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive
+      COMMAND echo tjbenchtest -progressive -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive -yuv
+      COMMAND echo tjexampletest
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest
+      COMMAND echo tjbenchtest.java
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
+      COMMAND echo tjbenchtest.java -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java -yuv
+      COMMAND echo tjbenchtest.java -progressive
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java -progressive
+      COMMAND echo tjbenchtest.java -progressive -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
+        -progressive -yuv
+      COMMAND echo tjexampletest.java
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest.java
+      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
+        ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest.java
+        ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest)
+  else()
+    add_custom_target(tjtest
+      COMMAND echo tjbenchtest
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest
+      COMMAND echo tjbenchtest -alloc
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -alloc
+      COMMAND echo tjbenchtest -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv
+      COMMAND echo tjbenchtest -yuv -alloc
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -yuv -alloc
+      COMMAND echo tjbenchtest -progressive
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive
+      COMMAND echo tjbenchtest -progressive -yuv
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest -progressive -yuv
+      COMMAND echo tjexampletest
+      COMMAND ${BASH} ${CMAKE_CURRENT_BINARY_DIR}/tjexampletest
+      DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/tjbenchtest)
   endif()
-endif() # End TESTS
+endif()
+endif()
 
 ###############################################################################
 # INSTALLATION
 ###############################################################################
-
 if(JTURBO_ENABLE_INSTALL)
-  set(EXE ${CMAKE_EXECUTABLE_SUFFIX})
+set(EXE ${CMAKE_EXECUTABLE_SUFFIX})
 
-  if(WITH_TURBOJPEG)
-    if(ENABLE_SHARED)
-      install(TARGETS turbojpeg tjbench
-        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
-      if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
-        CMAKE_C_LINKER_SUPPORTS_PDB)
-        install(FILES "$<TARGET_PDB_FILE:turbojpeg>"
-          DESTINATION ${CMAKE_INSTALL_BINDIR} OPTIONAL)
-      endif()
-    endif()
-    if(ENABLE_STATIC)
-      install(TARGETS turbojpeg-static ARCHIVE
-        DESTINATION ${CMAKE_INSTALL_LIBDIR})
-      if(NOT ENABLE_SHARED)
-        if(MSVC_IDE OR XCODE)
-          set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
-        else()
-          set(DIR ${CMAKE_CURRENT_BINARY_DIR})
-        endif()
-        install(PROGRAMS ${DIR}/tjbench-static${EXE}
-          DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME tjbench${EXE})
-      endif()
+if(WITH_TURBOJPEG)
+  if(ENABLE_SHARED)
+    install(TARGETS turbojpeg EXPORT ${CMAKE_PROJECT_NAME}Targets
+      INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+      LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    install(TARGETS tjbench
+      RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
+      CMAKE_C_LINKER_SUPPORTS_PDB)
+      install(FILES "$<TARGET_PDB_FILE:turbojpeg>"
+        DESTINATION ${CMAKE_INSTALL_BINDIR} OPTIONAL)
     endif()
-    install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/turbojpeg.h
-      DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
   endif()
-
   if(ENABLE_STATIC)
-    install(TARGETS ${libname} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+    install(TARGETS turbojpeg-static EXPORT ${CMAKE_PROJECT_NAME}Targets
+      INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+      ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
     if(NOT ENABLE_SHARED)
       if(MSVC_IDE OR XCODE)
         set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
       else()
         set(DIR ${CMAKE_CURRENT_BINARY_DIR})
       endif()
-      install(PROGRAMS ${DIR}/cjpeg-static${EXE}
-        DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME cjpeg${EXE})
-      install(PROGRAMS ${DIR}/djpeg-static${EXE}
-        DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME djpeg${EXE})
-      install(PROGRAMS ${DIR}/jpegtran-static${EXE}
-        DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME jpegtran${EXE})
+      install(PROGRAMS ${DIR}/tjbench-static${EXE}
+        DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME tjbench${EXE})
     endif()
   endif()
+  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/turbojpeg.h
+    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+endif()
 
-  install(TARGETS rdjpgcom wrjpgcom RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
-
-  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.ijg
-    ${CMAKE_CURRENT_SOURCE_DIR}/README.md ${CMAKE_CURRENT_SOURCE_DIR}/example.txt
-    ${CMAKE_CURRENT_SOURCE_DIR}/tjexample.c
-    ${CMAKE_CURRENT_SOURCE_DIR}/libjpeg.txt
-    ${CMAKE_CURRENT_SOURCE_DIR}/structure.txt
-    ${CMAKE_CURRENT_SOURCE_DIR}/usage.txt ${CMAKE_CURRENT_SOURCE_DIR}/wizard.txt
-    ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR})
-  if(WITH_JAVA)
-    install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/java/TJExample.java
-      DESTINATION ${CMAKE_INSTALL_DOCDIR})
-  endif()
-
-  if(UNIX OR MINGW)
-    install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cjpeg.1
-      ${CMAKE_CURRENT_SOURCE_DIR}/djpeg.1 ${CMAKE_CURRENT_SOURCE_DIR}/jpegtran.1
-      ${CMAKE_CURRENT_SOURCE_DIR}/rdjpgcom.1
-      ${CMAKE_CURRENT_SOURCE_DIR}/wrjpgcom.1
-      DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
+if(ENABLE_STATIC)
+  install(TARGETS jpeg-static EXPORT ${CMAKE_PROJECT_NAME}Targets
+    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+  if(NOT ENABLE_SHARED)
+    if(MSVC_IDE OR XCODE)
+      set(DIR "${CMAKE_CURRENT_BINARY_DIR}/\${CMAKE_INSTALL_CONFIG_NAME}")
+    else()
+      set(DIR ${CMAKE_CURRENT_BINARY_DIR})
+    endif()
+    install(PROGRAMS ${DIR}/cjpeg-static${EXE}
+      DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME cjpeg${EXE})
+    install(PROGRAMS ${DIR}/djpeg-static${EXE}
+      DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME djpeg${EXE})
+    install(PROGRAMS ${DIR}/jpegtran-static${EXE}
+      DESTINATION ${CMAKE_INSTALL_BINDIR} RENAME jpegtran${EXE})
   endif()
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libjpeg.pc
-    ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libturbojpeg.pc
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
-
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jconfig.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/jerror.h ${CMAKE_CURRENT_SOURCE_DIR}/jmorecfg.h
-    ${CMAKE_CURRENT_SOURCE_DIR}/jpeglib.h
-    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+endif()
 
-  include(cmakescripts/BuildPackages.cmake)
+install(TARGETS rdjpgcom wrjpgcom RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 
-  configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmakescripts/cmake_uninstall.cmake.in"
-    "cmake_uninstall.cmake" IMMEDIATE @ONLY)
+install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/README.ijg
+  ${CMAKE_CURRENT_SOURCE_DIR}/README.md ${CMAKE_CURRENT_SOURCE_DIR}/example.txt
+  ${CMAKE_CURRENT_SOURCE_DIR}/tjexample.c
+  ${CMAKE_CURRENT_SOURCE_DIR}/libjpeg.txt
+  ${CMAKE_CURRENT_SOURCE_DIR}/structure.txt
+  ${CMAKE_CURRENT_SOURCE_DIR}/usage.txt ${CMAKE_CURRENT_SOURCE_DIR}/wizard.txt
+  ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md DESTINATION ${CMAKE_INSTALL_DOCDIR})
+if(WITH_JAVA)
+  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/java/TJExample.java
+    DESTINATION ${CMAKE_INSTALL_DOCDIR})
+endif()
 
-  add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P cmake_uninstall.cmake)
+if(UNIX OR MINGW)
+  install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/cjpeg.1
+    ${CMAKE_CURRENT_SOURCE_DIR}/djpeg.1 ${CMAKE_CURRENT_SOURCE_DIR}/jpegtran.1
+    ${CMAKE_CURRENT_SOURCE_DIR}/rdjpgcom.1
+    ${CMAKE_CURRENT_SOURCE_DIR}/wrjpgcom.1
+    DESTINATION ${CMAKE_INSTALL_MANDIR}/man1)
+endif()
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libjpeg.pc
+  ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/libturbojpeg.pc
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
+install(FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/${CMAKE_PROJECT_NAME}Config.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/pkgscripts/${CMAKE_PROJECT_NAME}ConfigVersion.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
+install(EXPORT ${CMAKE_PROJECT_NAME}Targets
+  NAMESPACE ${CMAKE_PROJECT_NAME}::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${CMAKE_PROJECT_NAME})
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/jconfig.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/jerror.h ${CMAKE_CURRENT_SOURCE_DIR}/jmorecfg.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/jpeglib.h
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+include(cmakescripts/BuildPackages.cmake)
+
+configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmakescripts/cmake_uninstall.cmake.in"
+  "cmake_uninstall.cmake" IMMEDIATE @ONLY)
+
+add_custom_target(uninstall COMMAND ${CMAKE_COMMAND} -P cmake_uninstall.cmake)
 endif()
diff --git a/external/jpeg/ChangeLog.md b/external/jpeg/ChangeLog.md
index 7b5f8754643d..498b8f2729da 100644
--- a/external/jpeg/ChangeLog.md
+++ b/external/jpeg/ChangeLog.md
@@ -1,3 +1,199 @@
+2.1.0
+=====
+
+### Significant changes relative to 2.1 beta1
+
+1. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress certain progressive JPEG images with one or more component planes of
+width 8 or less caused a buffer overrun.
+
+2. Fixed a regression introduced by 2.1 beta1[6(b)] whereby attempting to
+decompress a specially-crafted malformed progressive JPEG image caused the
+block smoothing algorithm to read from uninitialized memory.
+
+3. Fixed an issue in the Arm Neon SIMD Huffman encoders that caused the
+encoders to generate incorrect results when using the Clang compiler with
+Visual Studio.
+
+4. Fixed a floating point exception (CVE-2021-20205) that occurred when
+attempting to compress a specially-crafted malformed GIF image with a specified
+image width of 0 using cjpeg.
+
+5. Fixed a regression introduced by 2.0 beta1[15] whereby attempting to
+generate a progressive JPEG image on an SSE2-capable CPU using a scan script
+containing one or more scans with lengths divisible by 32 and non-zero
+successive approximation low bit positions would, under certain circumstances,
+result in an error ("Missing Huffman code table entry") and an invalid JPEG
+image.
+
+6. Introduced a new flag (`TJFLAG_LIMITSCANS` in the TurboJPEG C API and
+`TJ.FLAG_LIMIT_SCANS` in the TurboJPEG Java API) and a corresponding TJBench
+command-line argument (`-limitscans`) that causes the TurboJPEG decompression
+and transform functions/operations to return/throw an error if a progressive
+JPEG image contains an unreasonably large number of scans.  This allows
+applications that use the TurboJPEG API to guard against an exploit of the
+progressive JPEG format described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+7. The PPM reader now throws an error, rather than segfaulting (due to a buffer
+overrun) or generating incorrect pixels, if an application attempts to use the
+`tjLoadImage()` function to load a 16-bit binary PPM file (a binary PPM file
+with a maximum value greater than 255) into a grayscale image buffer or to load
+a 16-bit binary PGM file into an RGB image buffer.
+
+8. Fixed an issue in the PPM reader that caused incorrect pixels to be
+generated when using the `tjLoadImage()` function to load a 16-bit binary PPM
+file into an extended RGB image buffer.
+
+9. Fixed an issue whereby, if a JPEG buffer was automatically re-allocated by
+one of the TurboJPEG compression or transform functions and an error
+subsequently occurred during compression or transformation, the JPEG buffer
+pointer passed by the application was not updated when the function returned.
+
+
+2.0.90 (2.1 beta1)
+==================
+
+### Significant changes relative to 2.0.6
+
+1. The build system, x86-64 SIMD extensions, and accelerated Huffman codec now
+support the x32 ABI on Linux, which allows for using x86-64 instructions with
+32-bit pointers.  The x32 ABI is generally enabled by adding `-mx32` to the
+compiler flags.
+
+     Caveats:
+     - CMake 3.9.0 or later is required in order for the build system to
+automatically detect an x32 build.
+     - Java does not support the x32 ABI, and thus the TurboJPEG Java API will
+automatically be disabled with x32 builds.
+
+2. Added Loongson MMI SIMD implementations of the RGB-to-grayscale, 4:2:2 fancy
+chroma upsampling, 4:2:2 and 4:2:0 merged chroma upsampling/color conversion,
+and fast integer DCT/IDCT algorithms.  Relative to libjpeg-turbo 2.0.x, this
+speeds up:
+
+     - the compression of RGB source images into grayscale JPEG images by
+approximately 20%
+     - the decompression of 4:2:2 JPEG images by approximately 40-60% when
+using fancy upsampling
+     - the decompression of 4:2:2 and 4:2:0 JPEG images by approximately
+15-20% when using merged upsampling
+     - the compression of RGB source images by approximately 30-45% when using
+the fast integer DCT
+     - the decompression of JPEG images into RGB destination images by
+approximately 2x when using the fast integer IDCT
+
+    The overall decompression speedup for RGB images is now approximately
+2.3-3.7x (compared to 2-3.5x with libjpeg-turbo 2.0.x.)
+
+3. 32-bit (Armv7 or Armv7s) iOS builds of libjpeg-turbo are no longer
+supported, and the libjpeg-turbo build system can no longer be used to package
+such builds.  32-bit iOS apps cannot run in iOS 11 and later, and the App Store
+no longer allows them.
+
+4. 32-bit (i386) OS X/macOS builds of libjpeg-turbo are no longer supported,
+and the libjpeg-turbo build system can no longer be used to package such
+builds.  32-bit Mac applications cannot run in macOS 10.15 "Catalina" and
+later, and the App Store no longer allows them.
+
+5. The SSE2 (x86 SIMD) and C Huffman encoding algorithms have been
+significantly optimized, resulting in a measured average overall compression
+speedup of 12-28% for 64-bit code and 22-52% for 32-bit code on various Intel
+and AMD CPUs, as well as a measured average overall compression speedup of
+0-23% on platforms that do not have a SIMD-accelerated Huffman encoding
+implementation.
+
+6. The block smoothing algorithm that is applied by default when decompressing
+progressive Huffman-encoded JPEG images has been improved in the following
+ways:
+
+     - The algorithm is now more fault-tolerant.  Previously, if a particular
+scan was incomplete, then the smoothing parameters for the incomplete scan
+would be applied to the entire output image, including the parts of the image
+that were generated by the prior (complete) scan.  Visually, this had the
+effect of removing block smoothing from lower-frequency scans if they were
+followed by an incomplete higher-frequency scan.  libjpeg-turbo now applies
+block smoothing parameters to each iMCU row based on which scan generated the
+pixels in that row, rather than always using the block smoothing parameters for
+the most recent scan.
+     - When applying block smoothing to DC scans, a Gaussian-like kernel with a
+5x5 window is used to reduce the "blocky" appearance.
+
+7. Added SIMD acceleration for progressive Huffman encoding on Arm platforms.
+This speeds up the compression of full-color progressive JPEGs by about 30-40%
+on average (relative to libjpeg-turbo 2.0.x) when using modern Arm CPUs.
+
+8. Added configure-time and run-time auto-detection of Loongson MMI SIMD
+instructions, so that the Loongson MMI SIMD extensions can be included in any
+MIPS64 libjpeg-turbo build.
+
+9. Added fault tolerance features to djpeg and jpegtran, mainly to demonstrate
+methods by which applications can guard against the exploits of the JPEG format
+described in the report
+["Two Issues with the JPEG Standard"](https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf).
+
+     - Both programs now accept a `-maxscans` argument, which can be used to
+limit the number of allowable scans in the input file.
+     - Both programs now accept a `-strict` argument, which can be used to
+treat all warnings as fatal.
+
+10. CMake package config files are now included for both the libjpeg and
+TurboJPEG API libraries.  This facilitates using libjpeg-turbo with CMake's
+`find_package()` function.  For example:
+
+        find_package(libjpeg-turbo CONFIG REQUIRED)
+
+        add_executable(libjpeg_program libjpeg_program.c)
+        target_link_libraries(libjpeg_program PUBLIC libjpeg-turbo::jpeg)
+
+        add_executable(libjpeg_program_static libjpeg_program.c)
+        target_link_libraries(libjpeg_program_static PUBLIC
+          libjpeg-turbo::jpeg-static)
+
+        add_executable(turbojpeg_program turbojpeg_program.c)
+        target_link_libraries(turbojpeg_program PUBLIC
+          libjpeg-turbo::turbojpeg)
+
+        add_executable(turbojpeg_program_static turbojpeg_program.c)
+        target_link_libraries(turbojpeg_program_static PUBLIC
+          libjpeg-turbo::turbojpeg-static)
+
+11. Since the Unisys LZW patent has long expired, cjpeg and djpeg can now
+read/write both LZW-compressed and uncompressed GIF files (feature ported from
+jpeg-6a and jpeg-9d.)
+
+12. jpegtran now includes the `-wipe` and `-drop` options from jpeg-9a and
+jpeg-9d, as well as the ability to expand the image size using the `-crop`
+option.  Refer to jpegtran.1 or usage.txt for more details.
+
+13. Added a complete intrinsics implementation of the Arm Neon SIMD extensions,
+thus providing SIMD acceleration on Arm platforms for all of the algorithms
+that are SIMD-accelerated on x86 platforms.  This new implementation is
+significantly faster in some cases than the old GAS implementation--
+depending on the algorithms used, the type of CPU core, and the compiler.  GCC,
+as of this writing, does not provide a full or optimal set of Neon intrinsics,
+so for performance reasons, the default when building libjpeg-turbo with GCC is
+to continue using the GAS implementation of the following algorithms:
+
+     - 32-bit RGB-to-YCbCr color conversion
+     - 32-bit fast and accurate inverse DCT
+     - 64-bit RGB-to-YCbCr and YCbCr-to-RGB color conversion
+     - 64-bit accurate forward and inverse DCT
+     - 64-bit Huffman encoding
+
+    A new CMake variable (`NEON_INTRINSICS`) can be used to override this
+default.
+
+    Since the new intrinsics implementation includes SIMD acceleration
+for merged upsampling/color conversion, 1.5.1[5] is no longer necessary and has
+been reverted.
+
+14. The Arm Neon SIMD extensions can now be built using Visual Studio.
+
+15. The build system can now be used to generate a universal x86-64 + Armv8
+libjpeg-turbo SDK package for both iOS and macOS.
+
+
 2.0.6
 =====
 
diff --git a/external/jpeg/LICENSE.md b/external/jpeg/LICENSE.md
index 99c9aadcc47c..a1cdad52faf4 100644
--- a/external/jpeg/LICENSE.md
+++ b/external/jpeg/LICENSE.md
@@ -91,7 +91,7 @@ best of our understanding.
 The Modified (3-clause) BSD License
 ===================================
 
-Copyright (C)2009-2020 D. R. Commander.  All Rights Reserved.
+Copyright (C)2009-2021 D. R. Commander.  All Rights Reserved.<br>
 Copyright (C)2015 Viktor Szathmáry.  All Rights Reserved.
 
 Redistribution and use in source and binary forms, with or without
diff --git a/external/jpeg/README.ijg b/external/jpeg/README.ijg
new file mode 100644
index 000000000000..9453c195010f
--- /dev/null
+++ b/external/jpeg/README.ijg
@@ -0,0 +1,258 @@
+libjpeg-turbo note:  This file has been modified by The libjpeg-turbo Project
+to include only information relevant to libjpeg-turbo, to wordsmith certain
+sections, and to remove impolitic language that existed in the libjpeg v8
+README.  It is included only for reference.  Please see README.md for
+information specific to libjpeg-turbo.
+
+
+The Independent JPEG Group's JPEG software
+==========================================
+
+This distribution contains a release of the Independent JPEG Group's free JPEG
+software.  You are welcome to redistribute this software and to use it for any
+purpose, subject to the conditions under LEGAL ISSUES, below.
+
+This software is the work of Tom Lane, Guido Vollbeding, Philip Gladstone,
+Bill Allombert, Jim Boucher, Lee Crocker, Bob Friesenhahn, Ben Jackson,
+Julian Minguillon, Luis Ortiz, George Phillips, Davide Rossi, Ge' Weijers,
+and other members of the Independent JPEG Group.
+
+IJG is not affiliated with the ISO/IEC JTC1/SC29/WG1 standards committee
+(also known as JPEG, together with ITU-T SG16).
+
+
+DOCUMENTATION ROADMAP
+=====================
+
+This file contains the following sections:
+
+OVERVIEW            General description of JPEG and the IJG software.
+LEGAL ISSUES        Copyright, lack of warranty, terms of distribution.
+REFERENCES          Where to learn more about JPEG.
+ARCHIVE LOCATIONS   Where to find newer versions of this software.
+FILE FORMAT COMPATIBILITY  Which JPEG-related file formats are supported.
+TO DO               Where to send bug reports and offers of help.
+
+Other documentation files in the distribution are:
+
+User documentation:
+  usage.txt         Usage instructions for cjpeg, djpeg, jpegtran,
+                    rdjpgcom, and wrjpgcom.
+  *.1               Unix-style man pages for programs (same info as usage.txt).
+  wizard.txt        Advanced usage instructions for JPEG wizards only.
+  change.log        Version-to-version change highlights.
+Programmer and internal documentation:
+  libjpeg.txt       How to use the JPEG library in your own programs.
+  example.txt       Sample code for calling the JPEG library.
+  structure.txt     Overview of the JPEG library's internal structure.
+  coderules.txt     Coding style rules --- please read if you contribute code.
+
+Please read at least usage.txt.  Some information can also be found in the JPEG
+FAQ (Frequently Asked Questions) article.  See ARCHIVE LOCATIONS below to find
+out where to obtain the FAQ article.
+
+If you want to understand how the JPEG code works, we suggest reading one or
+more of the REFERENCES, then looking at the documentation files (in roughly
+the order listed) before diving into the code.
+
+
+OVERVIEW
+========
+
+This package contains C software to implement JPEG image encoding, decoding,
+and transcoding.  JPEG (pronounced "jay-peg") is a standardized compression
+method for full-color and grayscale images.  JPEG's strong suit is compressing
+photographic images or other types of images that have smooth color and
+brightness transitions between neighboring pixels.  Images with sharp lines or
+other abrupt features may not compress well with JPEG, and a higher JPEG
+quality may have to be used to avoid visible compression artifacts with such
+images.
+
+JPEG is lossy, meaning that the output pixels are not necessarily identical to
+the input pixels.  However, on photographic content and other "smooth" images,
+very good compression ratios can be obtained with no visible compression
+artifacts, and extremely high compression ratios are possible if you are
+willing to sacrifice image quality (by reducing the "quality" setting in the
+compressor.)
+
+This software implements JPEG baseline, extended-sequential, and progressive
+compression processes.  Provision is made for supporting all variants of these
+processes, although some uncommon parameter settings aren't implemented yet.
+We have made no provision for supporting the hierarchical or lossless
+processes defined in the standard.
+
+We provide a set of library routines for reading and writing JPEG image files,
+plus two sample applications "cjpeg" and "djpeg", which use the library to
+perform conversion between JPEG and some other popular image file formats.
+The library is intended to be reused in other applications.
+
+In order to support file conversion and viewing software, we have included
+considerable functionality beyond the bare JPEG coding/decoding capability;
+for example, the color quantization modules are not strictly part of JPEG
+decoding, but they are essential for output to colormapped file formats or
+colormapped displays.  These extra functions can be compiled out of the
+library if not required for a particular application.
+
+We have also included "jpegtran", a utility for lossless transcoding between
+different JPEG processes, and "rdjpgcom" and "wrjpgcom", two simple
+applications for inserting and extracting textual comments in JFIF files.
+
+The emphasis in designing this software has been on achieving portability and
+flexibility, while also making it fast enough to be useful.  In particular,
+the software is not intended to be read as a tutorial on JPEG.  (See the
+REFERENCES section for introductory material.)  Rather, it is intended to
+be reliable, portable, industrial-strength code.  We do not claim to have
+achieved that goal in every aspect of the software, but we strive for it.
+
+We welcome the use of this software as a component of commercial products.
+No royalty is required, but we do ask for an acknowledgement in product
+documentation, as described under LEGAL ISSUES.
+
+
+LEGAL ISSUES
+============
+
+In plain English:
+
+1. We don't promise that this software works.  (But if you find any bugs,
+   please let us know!)
+2. You can use this software for whatever you want.  You don't have to pay us.
+3. You may not pretend that you wrote this software.  If you use it in a
+   program, you must acknowledge somewhere in your documentation that
+   you've used the IJG code.
+
+In legalese:
+
+The authors make NO WARRANTY or representation, either express or implied,
+with respect to this software, its quality, accuracy, merchantability, or
+fitness for a particular purpose.  This software is provided "AS IS", and you,
+its user, assume the entire risk as to its quality and accuracy.
+
+This software is copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
+All Rights Reserved except as specified below.
+
+Permission is hereby granted to use, copy, modify, and distribute this
+software (or portions thereof) for any purpose, without fee, subject to these
+conditions:
+(1) If any part of the source code for this software is distributed, then this
+README file must be included, with this copyright and no-warranty notice
+unaltered; and any additions, deletions, or changes to the original files
+must be clearly indicated in accompanying documentation.
+(2) If only executable code is distributed, then the accompanying
+documentation must state that "this software is based in part on the work of
+the Independent JPEG Group".
+(3) Permission for use of this software is granted only if the user accepts
+full responsibility for any undesirable consequences; the authors accept
+NO LIABILITY for damages of any kind.
+
+These conditions apply to any software derived from or based on the IJG code,
+not just to the unmodified library.  If you use our work, you ought to
+acknowledge us.
+
+Permission is NOT granted for the use of any IJG author's name or company name
+in advertising or publicity relating to this software or products derived from
+it.  This software may be referred to only as "the Independent JPEG Group's
+software".
+
+We specifically permit and encourage the use of this software as the basis of
+commercial products, provided that all warranty or liability claims are
+assumed by the product vendor.
+
+
+REFERENCES
+==========
+
+We recommend reading one or more of these references before trying to
+understand the innards of the JPEG software.
+
+The best short technical introduction to the JPEG compression algorithm is
+        Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+        Communications of the ACM, April 1991 (vol. 34 no. 4), pp. 30-44.
+(Adjacent articles in that issue discuss MPEG motion picture compression,
+applications of JPEG, and related topics.)  If you don't have the CACM issue
+handy, a PDF file containing a revised version of Wallace's article is
+available at http://www.ijg.org/files/Wallace.JPEG.pdf.  The file (actually
+a preprint for an article that appeared in IEEE Trans. Consumer Electronics)
+omits the sample images that appeared in CACM, but it includes corrections
+and some added material.  Note: the Wallace article is copyright ACM and IEEE,
+and it may not be used for commercial purposes.
+
+A somewhat less technical, more leisurely introduction to JPEG can be found in
+"The Data Compression Book" by Mark Nelson and Jean-loup Gailly, published by
+M&T Books (New York), 2nd ed. 1996, ISBN 1-55851-434-1.  This book provides
+good explanations and example C code for a multitude of compression methods
+including JPEG.  It is an excellent source if you are comfortable reading C
+code but don't know much about data compression in general.  The book's JPEG
+sample code is far from industrial-strength, but when you are ready to look
+at a full implementation, you've got one here...
+
+The best currently available description of JPEG is the textbook "JPEG Still
+Image Data Compression Standard" by William B. Pennebaker and Joan L.
+Mitchell, published by Van Nostrand Reinhold, 1993, ISBN 0-442-01272-1.
+Price US$59.95, 638 pp.  The book includes the complete text of the ISO JPEG
+standards (DIS 10918-1 and draft DIS 10918-2).
+
+The original JPEG standard is divided into two parts, Part 1 being the actual
+specification, while Part 2 covers compliance testing methods.  Part 1 is
+titled "Digital Compression and Coding of Continuous-tone Still Images,
+Part 1: Requirements and guidelines" and has document numbers ISO/IEC IS
+10918-1, ITU-T T.81.  Part 2 is titled "Digital Compression and Coding of
+Continuous-tone Still Images, Part 2: Compliance testing" and has document
+numbers ISO/IEC IS 10918-2, ITU-T T.83.
+
+The JPEG standard does not specify all details of an interchangeable file
+format.  For the omitted details, we follow the "JFIF" conventions, revision
+1.02.  JFIF version 1 has been adopted as ISO/IEC 10918-5 (05/2013) and
+Recommendation ITU-T T.871 (05/2011): Information technology - Digital
+compression and coding of continuous-tone still images: JPEG File Interchange
+Format (JFIF).  It is available as a free download in PDF file format from
+https://www.iso.org/standard/54989.html and http://www.itu.int/rec/T-REC-T.871.
+A PDF file of the older JFIF 1.02 specification is available at
+http://www.w3.org/Graphics/JPEG/jfif3.pdf.
+
+The TIFF 6.0 file format specification can be obtained from
+http://mirrors.ctan.org/graphics/tiff/TIFF6.ps.gz.  The JPEG incorporation
+scheme found in the TIFF 6.0 spec of 3-June-92 has a number of serious
+problems.  IJG does not recommend use of the TIFF 6.0 design (TIFF Compression
+tag 6).  Instead, we recommend the JPEG design proposed by TIFF Technical Note
+#2 (Compression tag 7).  Copies of this Note can be obtained from
+http://www.ijg.org/files/.  It is expected that the next revision
+of the TIFF spec will replace the 6.0 JPEG design with the Note's design.
+Although IJG's own code does not support TIFF/JPEG, the free libtiff library
+uses our library to implement TIFF/JPEG per the Note.
+
+
+ARCHIVE LOCATIONS
+=================
+
+The "official" archive site for this software is www.ijg.org.
+The most recent released version can always be found there in
+directory "files".
+
+The JPEG FAQ (Frequently Asked Questions) article is a source of some
+general information about JPEG.  It is available at
+http://www.faqs.org/faqs/jpeg-faq.
+
+
+FILE FORMAT COMPATIBILITY
+=========================
+
+This software implements ITU T.81 | ISO/IEC 10918 with some extensions from
+ITU T.871 | ISO/IEC 10918-5 (JPEG File Interchange Format-- see REFERENCES).
+Informally, the term "JPEG image" or "JPEG file" most often refers to JFIF or
+a subset thereof, but there are other formats containing the name "JPEG" that
+are incompatible with the DCT-based JPEG standard or with JFIF (for instance,
+JPEG 2000 and JPEG XR).  This software therefore does not support these
+formats.  Indeed, one of the original reasons for developing this free software
+was to help force convergence on a common, interoperable format standard for
+JPEG files.
+
+JFIF is a minimal or "low end" representation.  TIFF/JPEG (TIFF revision 6.0 as
+modified by TIFF Technical Note #2) can be used for "high end" applications
+that need to record a lot of additional data about an image.
+
+
+TO DO
+=====
+
+Please send bug reports, offers of help, etc. to jpeg-info@jpegclub.org.
diff --git a/external/jpeg/README.md b/external/jpeg/README.md
index 90a4a43ee1de..01e391ea7c08 100644
--- a/external/jpeg/README.md
+++ b/external/jpeg/README.md
@@ -3,7 +3,7 @@ Background
 
 libjpeg-turbo is a JPEG image codec that uses SIMD instructions to accelerate
 baseline JPEG compression and decompression on x86, x86-64, Arm, PowerPC, and
-MIPS systems, as well as progressive JPEG compression on x86 and x86-64
+MIPS systems, as well as progressive JPEG compression on x86, x86-64, and Arm
 systems.  On such systems, libjpeg-turbo is generally 2-6x as fast as libjpeg,
 all else being equal.  On other types of systems, libjpeg-turbo can still
 outperform libjpeg by a significant amount, by virtue of its highly-optimized
diff --git a/external/jpeg/appveyor.yml b/external/jpeg/appveyor.yml
new file mode 100644
index 000000000000..1e5f55792fe6
--- /dev/null
+++ b/external/jpeg/appveyor.yml
@@ -0,0 +1,71 @@
+install:
+  - cmd: >-
+      if not exist c:\installers mkdir c:\installers
+
+      mkdir c:\temp
+
+      if not exist c:\installers\nasm-2.10.01-win32.zip curl -fSL -o c:\installers\nasm-2.10.01-win32.zip http://www.nasm.us/pub/nasm/releasebuilds/2.10.01/win32/nasm-2.10.01-win32.zip
+
+      7z x c:\installers\nasm-2.10.01-win32.zip -oc:\ > c:\installers\nasm.install.log
+
+      if not exist c:\installers\i686-6.4.0-release-posix-dwarf-rt_v5-rev0.7z curl -fSL -o c:\installers\i686-6.4.0-release-posix-dwarf-rt_v5-rev0.7z "https://sourceforge.net/projects/mingw-w64/files/Toolchains targetting Win32/Personal Builds/mingw-builds/6.4.0/threads-posix/dwarf/i686-6.4.0-release-posix-dwarf-rt_v5-rev0.7z"
+
+      md "c:\Program Files (x86)\mingw-w64\i686-6.4.0-posix-dwarf-rt_v5-rev0"
+
+      7z x c:\installers\i686-6.4.0-release-posix-dwarf-rt_v5-rev0.7z -o"c:\Program Files (x86)\mingw-w64\i686-6.4.0-posix-dwarf-rt_v5-rev0" > c:\installers\mingw32.install.log
+
+      if not exist c:\installers\x86_64-6.4.0-release-posix-seh-rt_v5-rev0.7z curl -fSL -o c:\installers\x86_64-6.4.0-release-posix-seh-rt_v5-rev0.7z "https://sourceforge.net/projects/mingw-w64/files/Toolchains targetting Win64/Personal Builds/mingw-builds/6.4.0/threads-posix/seh/x86_64-6.4.0-release-posix-seh-rt_v5-rev0.7z"
+
+      md "c:\Program Files\mingw-w64\x86_64-6.4.0-posix-seh-rt_v5-rev0"
+
+      7z x c:\installers\x86_64-6.4.0-release-posix-seh-rt_v5-rev0.7z -o"c:\Program Files\mingw-w64\x86_64-6.4.0-posix-seh-rt_v5-rev0" > c:\installers\mingw64.install.log
+
+      set PATH=c:\nasm-2.10.01;c:\Program Files (x86)\NSIS;c:\msys64\usr\bin;%PATH%
+
+      "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\amd64\vcvars64.bat"
+
+      set INCLUDE
+
+      set LIB
+
+      set PATH
+
+      set MSYSTEM=MINGW32
+
+      mklink /d "%ProgramData%\Oracle\Java32" "c:\Program Files (x86)\Java\jdk1.6.0"
+
+      git clone --depth=1 https://github.com/libjpeg-turbo/buildscripts.git -b %APPVEYOR_REPO_BRANCH% c:/buildscripts
+
+cache:
+  - c:\installers\nasm-2.10.01-win32.zip -> appveyor.yml
+  - c:\installers\i686-6.4.0-release-posix-dwarf-rt_v5-rev0.7z -> appveyor.yml
+  - c:\installers\x86_64-6.4.0-release-posix-seh-rt_v5-rev0.7z -> appveyor.yml
+
+build_script:
+  - cmd: >-
+      for /f %%i in ('"cygpath %CD%"') do set MINGWPATH=%%i
+
+      bash c:/buildscripts/buildljt -d %MINGWPATH% -b /c/ljt.nightly -v
+
+      move c:\ljt.nightly\files\*.tar.gz .
+
+      move c:\ljt.nightly\files\*.exe .
+
+      move c:\ljt.nightly\log-windows.txt .
+
+artifacts:
+  - path: '*.tar.gz'
+    name: Source tarball
+
+  - path: '*-gcc*.exe'
+    name: SDK for MinGW
+
+  - path: '*-vc*.exe'
+    name: SDK for Visual C++
+
+  - path: 'log-windows.txt'
+    name: Build log
+
+test: off
+
+deploy: off
diff --git a/external/jpeg/cderror.h b/external/jpeg/cderror.h
index 4f2c7a3e5f77..2844346ee38b 100644
--- a/external/jpeg/cderror.h
+++ b/external/jpeg/cderror.h
@@ -1,9 +1,11 @@
 /*
  * cderror.h
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * Modified 2009-2017 by Guido Vollbeding.
- * This file is part of the Independent JPEG Group's software.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -42,7 +44,7 @@ JMESSAGE(JMSG_FIRSTADDONCODE = 1000, NULL) /* Must be first entry! */
 
 #ifdef BMP_SUPPORTED
 JMESSAGE(JERR_BMP_BADCMAP, "Unsupported BMP colormap format")
-JMESSAGE(JERR_BMP_BADDEPTH, "Only 8- and 24-bit BMP files are supported")
+JMESSAGE(JERR_BMP_BADDEPTH, "Only 8-, 24-, and 32-bit BMP files are supported")
 JMESSAGE(JERR_BMP_BADHEADER, "Invalid BMP file: bad header length")
 JMESSAGE(JERR_BMP_BADPLANES, "Invalid BMP file: biPlanes not equal to 1")
 JMESSAGE(JERR_BMP_COLORSPACE, "BMP output must be grayscale or RGB")
@@ -50,9 +52,9 @@ JMESSAGE(JERR_BMP_COMPRESSED, "Sorry, compressed BMPs not yet supported")
 JMESSAGE(JERR_BMP_EMPTY, "Empty BMP image")
 JMESSAGE(JERR_BMP_NOT, "Not a BMP file - does not start with BM")
 JMESSAGE(JERR_BMP_OUTOFRANGE, "Numeric value out of range in BMP file")
-JMESSAGE(JTRC_BMP, "%ux%u 24-bit BMP image")
+JMESSAGE(JTRC_BMP, "%ux%u %d-bit BMP image")
 JMESSAGE(JTRC_BMP_MAPPED, "%ux%u 8-bit colormapped BMP image")
-JMESSAGE(JTRC_BMP_OS2, "%ux%u 24-bit OS2 BMP image")
+JMESSAGE(JTRC_BMP_OS2, "%ux%u %d-bit OS2 BMP image")
 JMESSAGE(JTRC_BMP_OS2_MAPPED, "%ux%u 8-bit colormapped OS2 BMP image")
 #endif /* BMP_SUPPORTED */
 
@@ -60,6 +62,7 @@ JMESSAGE(JTRC_BMP_OS2_MAPPED, "%ux%u 8-bit colormapped OS2 BMP image")
 JMESSAGE(JERR_GIF_BUG, "GIF output got confused")
 JMESSAGE(JERR_GIF_CODESIZE, "Bogus GIF codesize %d")
 JMESSAGE(JERR_GIF_COLORSPACE, "GIF output must be grayscale or RGB")
+JMESSAGE(JERR_GIF_EMPTY, "Empty GIF image")
 JMESSAGE(JERR_GIF_IMAGENOTFOUND, "Too few images in GIF file")
 JMESSAGE(JERR_GIF_NOT, "Not a GIF file")
 JMESSAGE(JTRC_GIF, "%ux%ux%d GIF image")
@@ -84,23 +87,6 @@ JMESSAGE(JTRC_PPM, "%ux%u PPM image")
 JMESSAGE(JTRC_PPM_TEXT, "%ux%u text PPM image")
 #endif /* PPM_SUPPORTED */
 
-#ifdef RLE_SUPPORTED
-JMESSAGE(JERR_RLE_BADERROR, "Bogus error code from RLE library")
-JMESSAGE(JERR_RLE_COLORSPACE, "RLE output must be grayscale or RGB")
-JMESSAGE(JERR_RLE_DIMENSIONS, "Image dimensions (%ux%u) too large for RLE")
-JMESSAGE(JERR_RLE_EMPTY, "Empty RLE file")
-JMESSAGE(JERR_RLE_EOF, "Premature EOF in RLE header")
-JMESSAGE(JERR_RLE_MEM, "Insufficient memory for RLE header")
-JMESSAGE(JERR_RLE_NOT, "Not an RLE file")
-JMESSAGE(JERR_RLE_TOOMANYCHANNELS, "Cannot handle %d output channels for RLE")
-JMESSAGE(JERR_RLE_UNSUPPORTED, "Cannot handle this RLE setup")
-JMESSAGE(JTRC_RLE, "%ux%u full-color RLE file")
-JMESSAGE(JTRC_RLE_FULLMAP, "%ux%u full-color RLE file with map of length %d")
-JMESSAGE(JTRC_RLE_GRAY, "%ux%u grayscale RLE file")
-JMESSAGE(JTRC_RLE_MAPGRAY, "%ux%u grayscale RLE file with map of length %d")
-JMESSAGE(JTRC_RLE_MAPPED, "%ux%u colormapped RLE file with map of length %d")
-#endif /* RLE_SUPPORTED */
-
 #ifdef TARGA_SUPPORTED
 JMESSAGE(JERR_TGA_BADCMAP, "Unsupported Targa colormap format")
 JMESSAGE(JERR_TGA_BADPARMS, "Invalid or unsupported Targa file")
diff --git a/external/jpeg/cdjpeg.c b/external/jpeg/cdjpeg.c
index e0e382d0cdc1..5278c1dbef3f 100644
--- a/external/jpeg/cdjpeg.c
+++ b/external/jpeg/cdjpeg.c
@@ -3,8 +3,8 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code relevant
- * to libjpeg-turbo.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -25,26 +25,37 @@
  * Optional progress monitor: display a percent-done figure on stderr.
  */
 
-#ifdef PROGRESS_REPORT
-
 METHODDEF(void)
 progress_monitor(j_common_ptr cinfo)
 {
   cd_progress_ptr prog = (cd_progress_ptr)cinfo->progress;
-  int total_passes = prog->pub.total_passes + prog->total_extra_passes;
-  int percent_done =
-    (int)(prog->pub.pass_counter * 100L / prog->pub.pass_limit);
-
-  if (percent_done != prog->percent_done) {
-    prog->percent_done = percent_done;
-    if (total_passes > 1) {
-      fprintf(stderr, "\rPass %d/%d: %3d%% ",
-              prog->pub.completed_passes + prog->completed_extra_passes + 1,
-              total_passes, percent_done);
-    } else {
-      fprintf(stderr, "\r %3d%% ", percent_done);
+
+  if (prog->max_scans != 0 && cinfo->is_decompressor) {
+    int scan_no = ((j_decompress_ptr)cinfo)->input_scan_number;
+
+    if (scan_no > (int)prog->max_scans) {
+      fprintf(stderr, "Scan number %d exceeds maximum scans (%u)\n", scan_no,
+              prog->max_scans);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  if (prog->report) {
+    int total_passes = prog->pub.total_passes + prog->total_extra_passes;
+    int percent_done =
+      (int)(prog->pub.pass_counter * 100L / prog->pub.pass_limit);
+
+    if (percent_done != prog->percent_done) {
+      prog->percent_done = percent_done;
+      if (total_passes > 1) {
+        fprintf(stderr, "\rPass %d/%d: %3d%% ",
+                prog->pub.completed_passes + prog->completed_extra_passes + 1,
+                total_passes, percent_done);
+      } else {
+        fprintf(stderr, "\r %3d%% ", percent_done);
+      }
+      fflush(stderr);
     }
-    fflush(stderr);
   }
 }
 
@@ -57,6 +68,8 @@ start_progress_monitor(j_common_ptr cinfo, cd_progress_ptr progress)
     progress->pub.progress_monitor = progress_monitor;
     progress->completed_extra_passes = 0;
     progress->total_extra_passes = 0;
+    progress->max_scans = 0;
+    progress->report = FALSE;
     progress->percent_done = -1;
     cinfo->progress = &progress->pub;
   }
@@ -73,8 +86,6 @@ end_progress_monitor(j_common_ptr cinfo)
   }
 }
 
-#endif
-
 
 /*
  * Case-insensitive matching of possibly-abbreviated keyword switches.
diff --git a/external/jpeg/cdjpeg.h b/external/jpeg/cdjpeg.h
index 8f357141cee9..082687ce06db 100644
--- a/external/jpeg/cdjpeg.h
+++ b/external/jpeg/cdjpeg.h
@@ -3,8 +3,9 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1997, Thomas G. Lane.
+ * Modified 2019 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
+ * Copyright (C) 2017, 2019, 2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -35,6 +36,9 @@ struct cjpeg_source_struct {
 
   JSAMPARRAY buffer;
   JDIMENSION buffer_height;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  JDIMENSION max_pixels;
+#endif
 };
 
 
@@ -56,9 +60,9 @@ struct djpeg_dest_struct {
   void (*finish_output) (j_decompress_ptr cinfo, djpeg_dest_ptr dinfo);
   /* Re-calculate buffer dimensions based on output dimensions (for use with
      partial image decompression.)  If this is NULL, then the output format
-     does not support partial image decompression (BMP and RLE, in particular,
-     cannot support partial decompression because they use an inversion buffer
-     to write the image in bottom-up order.) */
+     does not support partial image decompression (BMP, in particular, cannot
+     support partial decompression because it uses an inversion buffer to write
+     the image in bottom-up order.) */
   void (*calc_buffer_dimensions) (j_decompress_ptr cinfo,
                                   djpeg_dest_ptr dinfo);
 
@@ -87,6 +91,9 @@ struct cdjpeg_progress_mgr {
   struct jpeg_progress_mgr pub; /* fields known to JPEG library */
   int completed_extra_passes;   /* extra passes completed */
   int total_extra_passes;       /* total extra */
+  JDIMENSION max_scans;         /* abort if the number of scans exceeds this
+                                   value and the value is non-zero */
+  boolean report;               /* whether or not to report progress */
   /* last printed percentage stored here to avoid multiple printouts */
   int percent_done;
 };
@@ -101,11 +108,9 @@ EXTERN(cjpeg_source_ptr) jinit_read_bmp(j_compress_ptr cinfo,
 EXTERN(djpeg_dest_ptr) jinit_write_bmp(j_decompress_ptr cinfo, boolean is_os2,
                                        boolean use_inversion_array);
 EXTERN(cjpeg_source_ptr) jinit_read_gif(j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_gif(j_decompress_ptr cinfo);
+EXTERN(djpeg_dest_ptr) jinit_write_gif(j_decompress_ptr cinfo, boolean is_lzw);
 EXTERN(cjpeg_source_ptr) jinit_read_ppm(j_compress_ptr cinfo);
 EXTERN(djpeg_dest_ptr) jinit_write_ppm(j_decompress_ptr cinfo);
-EXTERN(cjpeg_source_ptr) jinit_read_rle(j_compress_ptr cinfo);
-EXTERN(djpeg_dest_ptr) jinit_write_rle(j_decompress_ptr cinfo);
 EXTERN(cjpeg_source_ptr) jinit_read_targa(j_compress_ptr cinfo);
 EXTERN(djpeg_dest_ptr) jinit_write_targa(j_decompress_ptr cinfo);
 
diff --git a/external/jpeg/cjpeg.1 b/external/jpeg/cjpeg.1
new file mode 100644
index 000000000000..569dc3fa3f9f
--- /dev/null
+++ b/external/jpeg/cjpeg.1
@@ -0,0 +1,360 @@
+.TH CJPEG 1 "4 November 2020"
+.SH NAME
+cjpeg \- compress an image file to a JPEG file
+.SH SYNOPSIS
+.B cjpeg
+[
+.I options
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B cjpeg
+compresses the named image file, or the standard input if no file is
+named, and produces a JPEG/JFIF file on the standard output.
+The currently supported input file formats are: PPM (PBMPLUS color
+format), PGM (PBMPLUS grayscale format), BMP, GIF, and Targa.
+.SH OPTIONS
+All switch names may be abbreviated; for example,
+.B \-grayscale
+may be written
+.B \-gray
+or
+.BR \-gr .
+Most of the "basic" switches can be abbreviated to as little as one letter.
+Upper and lower case are equivalent (thus
+.B \-BMP
+is the same as
+.BR \-bmp ).
+British spellings are also accepted (e.g.,
+.BR \-greyscale ),
+though for brevity these are not mentioned below.
+.PP
+The basic switches are:
+.TP
+.BI \-quality " N[,...]"
+Scale quantization tables to adjust image quality.  Quality is 0 (worst) to
+100 (best); default is 75.  (See below for more info.)
+.TP
+.B \-grayscale
+Create monochrome JPEG file from color input.  Be sure to use this switch when
+compressing a grayscale BMP or GIF file, because
+.B cjpeg
+isn't bright enough to notice whether a BMP or GIF file uses only shades of
+gray.  By saying
+.BR \-grayscale ,
+you'll get a smaller JPEG file that takes less time to process.
+.TP
+.B \-rgb
+Create RGB JPEG file.
+Using this switch suppresses the conversion from RGB
+colorspace input to the default YCbCr JPEG colorspace.
+.TP
+.B \-optimize
+Perform optimization of entropy encoding parameters.  Without this, default
+encoding parameters are used.
+.B \-optimize
+usually makes the JPEG file a little smaller, but
+.B cjpeg
+runs somewhat slower and needs much more memory.  Image quality and speed of
+decompression are unaffected by
+.BR \-optimize .
+.TP
+.B \-progressive
+Create progressive JPEG file (see below).
+.TP
+.B \-targa
+Input file is Targa format.  Targa files that contain an "identification"
+field will not be automatically recognized by
+.BR cjpeg ;
+for such files you must specify
+.B \-targa
+to make
+.B cjpeg
+treat the input as Targa format.
+For most Targa files, you won't need this switch.
+.PP
+The
+.B \-quality
+switch lets you trade off compressed file size against quality of the
+reconstructed image: the higher the quality setting, the larger the JPEG file,
+and the closer the output image will be to the original input.  Normally you
+want to use the lowest quality setting (smallest file) that decompresses into
+something visually indistinguishable from the original image.  For this
+purpose the quality setting should generally be between 50 and 95 (the default
+is 75) for photographic images.  If you see defects at
+.B \-quality
+75, then go up 5 or 10 counts at a time until you are happy with the output
+image.  (The optimal setting will vary from one image to another.)
+.PP
+.B \-quality
+100 will generate a quantization table of all 1's, minimizing loss in the
+quantization step (but there is still information loss in subsampling, as well
+as roundoff error.)  For most images, specifying a quality value above
+about 95 will increase the size of the compressed file dramatically, and while
+the quality gain from these higher quality values is measurable (using metrics
+such as PSNR or SSIM), it is rarely perceivable by human vision.
+.PP
+In the other direction, quality values below 50 will produce very small files
+of low image quality.  Settings around 5 to 10 might be useful in preparing an
+index of a large image library, for example.  Try
+.B \-quality
+2 (or so) for some amusing Cubist effects.  (Note: quality
+values below about 25 generate 2-byte quantization tables, which are
+considered optional in the JPEG standard.
+.B cjpeg
+emits a warning message when you give such a quality value, because some
+other JPEG programs may be unable to decode the resulting file.  Use
+.B \-baseline
+if you need to ensure compatibility at low quality values.)
+.PP
+The \fB\-quality\fR option has been extended in this version of \fBcjpeg\fR to
+support separate quality settings for luminance and chrominance (or, in
+general, separate settings for every quantization table slot.)  The principle
+is the same as chrominance subsampling:  since the human eye is more sensitive
+to spatial changes in brightness than spatial changes in color, the chrominance
+components can be quantized more than the luminance components without
+incurring any visible image quality loss.  However, unlike subsampling, this
+feature reduces data in the frequency domain instead of the spatial domain,
+which allows for more fine-grained control.  This option is useful in
+quality-sensitive applications, for which the artifacts generated by
+subsampling may be unacceptable.
+.PP
+The \fB\-quality\fR option accepts a comma-separated list of parameters, which
+respectively refer to the quality levels that should be assigned to the
+quantization table slots.  If there are more q-table slots than parameters,
+then the last parameter is replicated.  Thus, if only one quality parameter is
+given, this is used for both luminance and chrominance (slots 0 and 1,
+respectively), preserving the legacy behavior of cjpeg v6b and prior.
+More (or customized) quantization tables can be set with the \fB\-qtables\fR
+option and assigned to components with the \fB\-qslots\fR option (see the
+"wizard" switches below.)
+.PP
+JPEG files generated with separate luminance and chrominance quality are fully
+compliant with standard JPEG decoders.
+.PP
+.BR CAUTION:
+For this setting to be useful, be sure to pass an argument of \fB\-sample 1x1\fR
+to \fBcjpeg\fR to disable chrominance subsampling.  Otherwise, the default
+subsampling level (2x2, AKA "4:2:0") will be used.
+.PP
+The
+.B \-progressive
+switch creates a "progressive JPEG" file.  In this type of JPEG file, the data
+is stored in multiple scans of increasing quality.  If the file is being
+transmitted over a slow communications link, the decoder can use the first
+scan to display a low-quality image very quickly, and can then improve the
+display with each subsequent scan.  The final image is exactly equivalent to a
+standard JPEG file of the same quality setting, and the total file size is
+about the same --- often a little smaller.
+.PP
+Switches for advanced users:
+.TP
+.B \-arithmetic
+Use arithmetic coding.
+.B Caution:
+arithmetic coded JPEG is not yet widely implemented, so many decoders will be
+unable to view an arithmetic coded JPEG file at all.
+.TP
+.B \-dct int
+Use accurate integer DCT method (default).
+.TP
+.B \-dct fast
+Use less accurate integer DCT method [legacy feature].
+When the Independent JPEG Group's software was first released in 1991, the
+compression time for a 1-megapixel JPEG image on a mainstream PC was measured
+in minutes.  Thus, the \fBfast\fR integer DCT algorithm provided noticeable
+performance benefits.  On modern CPUs running libjpeg-turbo, however, the
+compression time for a 1-megapixel JPEG image is measured in milliseconds, and
+thus the performance benefits of the \fBfast\fR algorithm are much less
+noticeable.  On modern x86/x86-64 CPUs that support AVX2 instructions, the
+\fBfast\fR and \fBint\fR methods have similar performance.  On other types of
+CPUs, the \fBfast\fR method is generally about 5-15% faster than the \fBint\fR
+method.
+
+For quality levels of 90 and below, there should be little or no perceptible
+quality difference between the two algorithms.  For quality levels above 90,
+however, the difference between the \fBfast\fR and \fBint\fR methods becomes
+more pronounced.  With quality=97, for instance, the \fBfast\fR method incurs
+generally about a 1-3 dB loss in PSNR relative to the \fBint\fR method, but
+this can be larger for some images.  Do not use the \fBfast\fR method with
+quality levels above 97.  The algorithm often degenerates at quality=98 and
+above and can actually produce a more lossy image than if lower quality levels
+had been used.  Also, in libjpeg-turbo, the \fBfast\fR method is not fully
+accelerated for quality levels above 97, so it will be slower than the
+\fBint\fR method.
+.TP
+.B \-dct float
+Use floating-point DCT method [legacy feature].
+The \fBfloat\fR method does not produce significantly more accurate results
+than the \fBint\fR method, and it is much slower.  The \fBfloat\fR method may
+also give different results on different machines due to varying roundoff
+behavior, whereas the integer methods should give the same results on all
+machines.
+.TP
+.BI \-icc " file"
+Embed ICC color management profile contained in the specified file.
+.TP
+.BI \-restart " N"
+Emit a JPEG restart marker every N MCU rows, or every N MCU blocks if "B" is
+attached to the number.
+.B \-restart 0
+(the default) means no restart markers.
+.TP
+.BI \-smooth " N"
+Smooth the input image to eliminate dithering noise.  N, ranging from 1 to
+100, indicates the strength of smoothing.  0 (the default) means no smoothing.
+.TP
+.BI \-maxmemory " N"
+Set limit for amount of memory to use in processing large images.  Value is
+in thousands of bytes, or millions of bytes if "M" is attached to the
+number.  For example,
+.B \-max 4m
+selects 4000000 bytes.  If more space is needed, an error will occur.
+.TP
+.BI \-outfile " name"
+Send output image to the named file, not to standard output.
+.TP
+.BI \-memdst
+Compress to memory instead of a file.  This feature was implemented mainly as a
+way of testing the in-memory destination manager (jpeg_mem_dest()), but it is
+also useful for benchmarking, since it reduces the I/O overhead.
+.TP
+.BI \-report
+Report compression progress.
+.TP
+.B \-verbose
+Enable debug printout.  More
+.BR \-v 's
+give more output.  Also, version information is printed at startup.
+.TP
+.B \-debug
+Same as
+.BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
+.PP
+The
+.B \-restart
+option inserts extra markers that allow a JPEG decoder to resynchronize after
+a transmission error.  Without restart markers, any damage to a compressed
+file will usually ruin the image from the point of the error to the end of the
+image; with restart markers, the damage is usually confined to the portion of
+the image up to the next restart marker.  Of course, the restart markers
+occupy extra space.  We recommend
+.B \-restart 1
+for images that will be transmitted across unreliable networks such as Usenet.
+.PP
+The
+.B \-smooth
+option filters the input to eliminate fine-scale noise.  This is often useful
+when converting dithered images to JPEG: a moderate smoothing factor of 10 to
+50 gets rid of dithering patterns in the input file, resulting in a smaller
+JPEG file and a better-looking image.  Too large a smoothing factor will
+visibly blur the image, however.
+.PP
+Switches for wizards:
+.TP
+.B \-baseline
+Force baseline-compatible quantization tables to be generated.  This clamps
+quantization values to 8 bits even at low quality settings.  (This switch is
+poorly named, since it does not ensure that the output is actually baseline
+JPEG.  For example, you can use
+.B \-baseline
+and
+.B \-progressive
+together.)
+.TP
+.BI \-qtables " file"
+Use the quantization tables given in the specified text file.
+.TP
+.BI \-qslots " N[,...]"
+Select which quantization table to use for each color component.
+.TP
+.BI \-sample " HxV[,...]"
+Set JPEG sampling factors for each color component.
+.TP
+.BI \-scans " file"
+Use the scan script given in the specified text file.
+.PP
+The "wizard" switches are intended for experimentation with JPEG.  If you
+don't know what you are doing, \fBdon't use them\fR.  These switches are
+documented further in the file wizard.txt.
+.SH EXAMPLES
+.LP
+This example compresses the PPM file foo.ppm with a quality factor of
+60 and saves the output as foo.jpg:
+.IP
+.B cjpeg \-quality
+.I 60 foo.ppm
+.B >
+.I foo.jpg
+.SH HINTS
+Color GIF files are not the ideal input for JPEG; JPEG is really intended for
+compressing full-color (24-bit) images.  In particular, don't try to convert
+cartoons, line drawings, and other images that have only a few distinct
+colors.  GIF works great on these, JPEG does not.  If you want to convert a
+GIF to JPEG, you should experiment with
+.BR cjpeg 's
+.B \-quality
+and
+.B \-smooth
+options to get a satisfactory conversion.
+.B \-smooth 10
+or so is often helpful.
+.PP
+Avoid running an image through a series of JPEG compression/decompression
+cycles.  Image quality loss will accumulate; after ten or so cycles the image
+may be noticeably worse than it was after one cycle.  It's best to use a
+lossless format while manipulating an image, then convert to JPEG format when
+you are ready to file the image away.
+.PP
+The
+.B \-optimize
+option to
+.B cjpeg
+is worth using when you are making a "final" version for posting or archiving.
+It's also a win when you are using low quality settings to make very small
+JPEG files; the percentage improvement is often a lot more than it is on
+larger files.  (At present,
+.B \-optimize
+mode is always selected when generating progressive JPEG files.)
+.SH ENVIRONMENT
+.TP
+.B JPEGMEM
+If this environment variable is set, its value is the default memory limit.
+The value is specified as described for the
+.B \-maxmemory
+switch.
+.B JPEGMEM
+overrides the default value specified when the program was compiled, and
+itself is overridden by an explicit
+.BR \-maxmemory .
+.SH SEE ALSO
+.BR djpeg (1),
+.BR jpegtran (1),
+.BR rdjpgcom (1),
+.BR wrjpgcom (1)
+.br
+.BR ppm (5),
+.BR pgm (5)
+.br
+Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
+.SH AUTHOR
+Independent JPEG Group
+.PP
+This file was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo, to wordsmith certain sections, and to describe
+features not present in libjpeg.
+.SH ISSUES
+Not all variants of BMP and Targa file formats are supported.
+.PP
+The
+.B \-targa
+switch is not a bug, it's a feature.  (It would be a bug if the Targa format
+designers had not been clueless.)
diff --git a/external/jpeg/cjpeg.c b/external/jpeg/cjpeg.c
index 56781953b679..c99a133e12a3 100644
--- a/external/jpeg/cjpeg.c
+++ b/external/jpeg/cjpeg.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1998, Thomas G. Lane.
  * Modified 2003-2011 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2013-2014, 2017, 2020, D. R. Commander.
+ * Copyright (C) 2010, 2013-2014, 2017, 2019-2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -27,6 +27,9 @@
  * works regardless of which command line style is used.
  */
 
+#ifdef CJPEG_FUZZER
+#define JPEG_INTERNALS
+#endif
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 #include "jversion.h"           /* for version message */
 #include "jconfigint.h"
@@ -69,9 +72,9 @@ static const char * const cdjpeg_message_table[] = {
  *     2) assume we can push back more than one character (works in
  *        some C implementations, but unportable);
  *     3) provide our own buffering (breaks input readers that want to use
- *        stdio directly, such as the RLE library);
+ *        stdio directly);
  * or  4) don't put back the data, and modify the input_init methods to assume
- *        they start reading after the start of file (also breaks RLE library).
+ *        they start reading after the start of file.
  * #1 is attractive for MS-DOS but is untenable on Unix.
  *
  * The most portable solution for file types that can't be identified by their
@@ -117,10 +120,6 @@ select_file_type(j_compress_ptr cinfo, FILE *infile)
   case 'P':
     return jinit_read_ppm(cinfo);
 #endif
-#ifdef RLE_SUPPORTED
-  case 'R':
-    return jinit_read_rle(cinfo);
-#endif
 #ifdef TARGA_SUPPORTED
   case 0x00:
     return jinit_read_targa(cinfo);
@@ -147,6 +146,46 @@ static const char *progname;    /* program name for error messages */
 static char *icc_filename;      /* for -icc switch */
 static char *outfilename;       /* for -outfile switch */
 boolean memdst;                 /* for -memdst switch */
+boolean report;                 /* for -report switch */
+
+
+#ifdef CJPEG_FUZZER
+
+#include <setjmp.h>
+
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;
+  jmp_buf setjmp_buffer;
+};
+
+void my_error_exit(j_common_ptr cinfo)
+{
+  struct my_error_mgr *myerr = (struct my_error_mgr *)cinfo->err;
+
+  longjmp(myerr->setjmp_buffer, 1);
+}
+
+static void my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0)
+    cinfo->err->num_warnings++;
+}
+
+#define HANDLE_ERROR() { \
+  if (cinfo.global_state > CSTATE_START) { \
+    if (memdst && outbuffer) \
+      (*cinfo.dest->term_destination) (&cinfo); \
+    jpeg_abort_compress(&cinfo); \
+  } \
+  jpeg_destroy_compress(&cinfo); \
+  if (input_file != stdin && input_file != NULL) \
+    fclose(input_file); \
+  if (memdst) \
+    free(outbuffer); \
+  return EXIT_FAILURE; \
+}
+
+#endif
 
 
 LOCAL(void)
@@ -200,6 +239,7 @@ usage(void)
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   fprintf(stderr, "  -memdst        Compress to memory instead of file (useful for benchmarking)\n");
 #endif
+  fprintf(stderr, "  -report        Report compression progress\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
@@ -244,6 +284,7 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
   icc_filename = NULL;
   outfilename = NULL;
   memdst = FALSE;
+  report = FALSE;
   cinfo->err->trace_level = 0;
 
   /* Scan command line options, adjust parameters */
@@ -395,6 +436,9 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
       qtablefile = argv[argn];
       /* We postpone actually reading the file in case -quality comes later. */
 
+    } else if (keymatch(arg, "report", 3)) {
+      report = TRUE;
+
     } else if (keymatch(arg, "restart", 1)) {
       /* Restart interval in MCU rows (or in MCUs with 'b'). */
       long lval;
@@ -504,13 +548,16 @@ int
 main(int argc, char **argv)
 {
   struct jpeg_compress_struct cinfo;
+#ifdef CJPEG_FUZZER
+  struct my_error_mgr myerr;
+  struct jpeg_error_mgr &jerr = myerr.pub;
+#else
   struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
-  struct cdjpeg_progress_mgr progress;
 #endif
+  struct cdjpeg_progress_mgr progress;
   int file_index;
   cjpeg_source_ptr src_mgr;
-  FILE *input_file;
+  FILE *input_file = NULL;
   FILE *icc_file;
   JOCTET *icc_profile = NULL;
   long icc_len = 0;
@@ -628,13 +675,24 @@ main(int argc, char **argv)
     fclose(icc_file);
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&cinfo, &progress);
+#ifdef CJPEG_FUZZER
+  jerr.error_exit = my_error_exit;
+  jerr.emit_message = my_emit_message;
+  if (setjmp(myerr.setjmp_buffer))
+    HANDLE_ERROR()
 #endif
 
+  if (report) {
+    start_progress_monitor((j_common_ptr)&cinfo, &progress);
+    progress.report = report;
+  }
+
   /* Figure out the input file format, and set up to read it. */
   src_mgr = select_file_type(&cinfo, input_file);
   src_mgr->input_file = input_file;
+#ifdef CJPEG_FUZZER
+  src_mgr->max_pixels = 1048576;
+#endif
 
   /* Read the input file header to obtain file size & colorspace. */
   (*src_mgr->start_input) (&cinfo, src_mgr);
@@ -653,6 +711,11 @@ main(int argc, char **argv)
 #endif
     jpeg_stdio_dest(&cinfo, output_file);
 
+#ifdef CJPEG_FUZZER
+  if (setjmp(myerr.setjmp_buffer))
+    HANDLE_ERROR()
+#endif
+
   /* Start compressor */
   jpeg_start_compress(&cinfo, TRUE);
 
@@ -676,18 +739,18 @@ main(int argc, char **argv)
   if (output_file != stdout && output_file != NULL)
     fclose(output_file);
 
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&cinfo);
-#endif
+  if (report)
+    end_progress_monitor((j_common_ptr)&cinfo);
 
   if (memdst) {
+#ifndef CJPEG_FUZZER
     fprintf(stderr, "Compressed size:  %lu bytes\n", outsize);
+#endif
     free(outbuffer);
   }
 
   free(icc_profile);
 
   /* All done. */
-  exit(jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
-  return 0;                     /* suppress no-return-value warnings */
+  return (jerr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
 }
diff --git a/external/jpeg/cmakescripts/BuildPackages.cmake b/external/jpeg/cmakescripts/BuildPackages.cmake
deleted file mode 100644
index 277c72fbc872..000000000000
--- a/external/jpeg/cmakescripts/BuildPackages.cmake
+++ /dev/null
@@ -1,189 +0,0 @@
-# This file is included from the top-level CMakeLists.txt.  We just store it
-# here to avoid cluttering up that file.
-
-set(PKGNAME ${CMAKE_PROJECT_NAME} CACHE STRING
-  "Distribution package name (default: ${CMAKE_PROJECT_NAME})")
-set(PKGVENDOR "The ${CMAKE_PROJECT_NAME} Project" CACHE STRING
-  "Vendor name to be included in distribution package descriptions (default: The ${CMAKE_PROJECT_NAME} Project)")
-set(PKGURL "http://www.${CMAKE_PROJECT_NAME}.org" CACHE STRING
-  "URL of project web site to be included in distribution package descriptions (default: http://www.${CMAKE_PROJECT_NAME}.org)")
-set(PKGEMAIL "information@${CMAKE_PROJECT_NAME}.org" CACHE STRING
-  "E-mail of project maintainer to be included in distribution package descriptions (default: information@${CMAKE_PROJECT_NAME}.org")
-set(PKGID "com.${CMAKE_PROJECT_NAME}.${PKGNAME}" CACHE STRING
-  "Globally unique package identifier (reverse DNS notation) (default: com.${CMAKE_PROJECT_NAME}.${PKGNAME})")
-
-
-###############################################################################
-# Linux RPM and DEB
-###############################################################################
-
-if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
-
-set(RPMARCH ${CMAKE_SYSTEM_PROCESSOR})
-if(CPU_TYPE STREQUAL "x86_64")
-  set(DEBARCH amd64)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv7*")
-  set(RPMARCH armv7hl)
-  set(DEBARCH armhf)
-elseif(CPU_TYPE STREQUAL "arm64")
-  set(DEBARCH ${CPU_TYPE})
-elseif(CPU_TYPE STREQUAL "arm")
-  if(CMAKE_C_COMPILER MATCHES "gnueabihf")
-    set(RPMARCH armv7hl)
-    set(DEBARCH armhf)
-  else()
-    set(RPMARCH armel)
-    set(DEBARCH armel)
-  endif()
-elseif(CMAKE_SYSTEM_PROCESSOR_LC STREQUAL "ppc64le")
-  set(DEBARCH ppc64el)
-elseif(CPU_TYPE STREQUAL "powerpc" AND BITS EQUAL 32)
-  set(RPMARCH ppc)
-  set(DEBARCH ppc)
-else()
-  set(DEBARCH ${CMAKE_SYSTEM_PROCESSOR})
-endif()
-message(STATUS "RPM architecture = ${RPMARCH}, DEB architecture = ${DEBARCH}")
-
-# Re-set CMAKE_POSITION_INDEPENDENT_CODE so that the RPM spec file works
-# properly
-boolean_number(CMAKE_POSITION_INDEPENDENT_CODE)
-
-configure_file(release/makerpm.in pkgscripts/makerpm)
-configure_file(release/rpm.spec.in pkgscripts/rpm.spec @ONLY)
-
-add_custom_target(rpm pkgscripts/makerpm
-  SOURCES pkgscripts/makerpm)
-
-configure_file(release/makesrpm.in pkgscripts/makesrpm)
-
-add_custom_target(srpm pkgscripts/makesrpm
-  SOURCES pkgscripts/makesrpm
-  DEPENDS dist)
-
-configure_file(release/makedpkg.in pkgscripts/makedpkg)
-configure_file(release/deb-control.in pkgscripts/deb-control)
-
-add_custom_target(deb pkgscripts/makedpkg
-  SOURCES pkgscripts/makedpkg)
-
-endif() # Linux
-
-
-###############################################################################
-# Windows installer (NullSoft Installer)
-###############################################################################
-
-if(WIN32)
-
-if(MSVC)
-  set(INST_PLATFORM "Visual C++")
-  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-vc)
-  set(INST_REG_NAME ${CMAKE_PROJECT_NAME})
-elseif(MINGW)
-  set(INST_PLATFORM GCC)
-  set(INST_NAME ${CMAKE_PROJECT_NAME}-${VERSION}-gcc)
-  set(INST_REG_NAME ${CMAKE_PROJECT_NAME}-gcc)
-  set(INST_DEFS -DGCC)
-endif()
-
-if(BITS EQUAL 64)
-  set(INST_PLATFORM "${INST_PLATFORM} 64-bit")
-  set(INST_NAME ${INST_NAME}64)
-  set(INST_REG_NAME ${INST_REG_NAME}64)
-  set(INST_DEFS ${INST_DEFS} -DWIN64)
-endif()
-
-if(WITH_JAVA)
-  set(INST_DEFS ${INST_DEFS} -DJAVA)
-endif()
-
-if(MSVC_IDE)
-  set(INST_DEFS ${INST_DEFS} "-DBUILDDIR=${CMAKE_CFG_INTDIR}\\")
-else()
-  set(INST_DEFS ${INST_DEFS} "-DBUILDDIR=")
-endif()
-
-string(REGEX REPLACE "/" "\\\\" INST_DIR ${CMAKE_INSTALL_PREFIX})
-
-configure_file(release/installer.nsi.in installer.nsi @ONLY)
-
-if(WITH_JAVA)
-  set(JAVA_DEPEND turbojpeg-java)
-endif()
-add_custom_target(installer
-  makensis -nocd ${INST_DEFS} installer.nsi
-  DEPENDS jpeg jpeg-static turbojpeg turbojpeg-static rdjpgcom wrjpgcom
-    cjpeg djpeg jpegtran tjbench ${JAVA_DEPEND}
-  SOURCES installer.nsi)
-
-endif() # WIN32
-
-
-###############################################################################
-# Cygwin Package
-###############################################################################
-
-if(CYGWIN)
-
-configure_file(release/makecygwinpkg.in pkgscripts/makecygwinpkg)
-
-add_custom_target(cygwinpkg pkgscripts/makecygwinpkg)
-
-endif() # CYGWIN
-
-
-###############################################################################
-# Mac DMG
-###############################################################################
-
-if(APPLE)
-
-set(DEFAULT_OSX_32BIT_BUILD ${CMAKE_SOURCE_DIR}/osxx86)
-set(OSX_32BIT_BUILD ${DEFAULT_OSX_32BIT_BUILD} CACHE PATH
-  "Directory containing 32-bit (i386) Mac build to include in universal binaries (default: ${DEFAULT_OSX_32BIT_BUILD})")
-set(DEFAULT_IOS_ARMV7_BUILD ${CMAKE_SOURCE_DIR}/iosarmv7)
-set(IOS_ARMV7_BUILD ${DEFAULT_IOS_ARMV7_BUILD} CACHE PATH
-  "Directory containing Armv7 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV7_BUILD})")
-set(DEFAULT_IOS_ARMV7S_BUILD ${CMAKE_SOURCE_DIR}/iosarmv7s)
-set(IOS_ARMV7S_BUILD ${DEFAULT_IOS_ARMV7S_BUILD} CACHE PATH
-  "Directory containing Armv7s iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV7S_BUILD})")
-set(DEFAULT_IOS_ARMV8_BUILD ${CMAKE_SOURCE_DIR}/iosarmv8)
-set(IOS_ARMV8_BUILD ${DEFAULT_IOS_ARMV8_BUILD} CACHE PATH
-  "Directory containing Armv8 iOS build to include in universal binaries (default: ${DEFAULT_IOS_ARMV8_BUILD})")
-
-set(OSX_APP_CERT_NAME "" CACHE STRING
-  "Name of the Developer ID Application certificate (in the macOS keychain) that should be used to sign the libjpeg-turbo DMG.  Leave this blank to generate an unsigned DMG.")
-set(OSX_INST_CERT_NAME "" CACHE STRING
-  "Name of the Developer ID Installer certificate (in the macOS keychain) that should be used to sign the libjpeg-turbo installer package.  Leave this blank to generate an unsigned package.")
-
-configure_file(release/makemacpkg.in pkgscripts/makemacpkg)
-configure_file(release/Distribution.xml.in pkgscripts/Distribution.xml)
-configure_file(release/uninstall.in pkgscripts/uninstall)
-
-add_custom_target(dmg pkgscripts/makemacpkg
-  SOURCES pkgscripts/makemacpkg)
-
-add_custom_target(udmg pkgscripts/makemacpkg universal
-  SOURCES pkgscripts/makemacpkg)
-
-endif() # APPLE
-
-
-###############################################################################
-# Generic
-###############################################################################
-
-add_custom_target(dist
-  COMMAND git archive --prefix=${CMAKE_PROJECT_NAME}-${VERSION}/ HEAD |
-    gzip > ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}-${VERSION}.tar.gz
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-
-configure_file(release/maketarball.in pkgscripts/maketarball)
-
-add_custom_target(tarball pkgscripts/maketarball
-  SOURCES pkgscripts/maketarball)
-
-configure_file(release/libjpeg.pc.in pkgscripts/libjpeg.pc @ONLY)
-
-configure_file(release/libturbojpeg.pc.in pkgscripts/libturbojpeg.pc @ONLY)
diff --git a/external/jpeg/cmakescripts/GNUInstallDirs.cmake b/external/jpeg/cmakescripts/GNUInstallDirs.cmake
deleted file mode 100644
index 7c4119659124..000000000000
--- a/external/jpeg/cmakescripts/GNUInstallDirs.cmake
+++ /dev/null
@@ -1,416 +0,0 @@
-#.rst:
-# GNUInstallDirs
-# --------------
-#
-# Define GNU standard installation directories
-#
-# Provides install directory variables as defined by the
-# `GNU Coding Standards`_.
-#
-# .. _`GNU Coding Standards`: https://www.gnu.org/prep/standards/html_node/Directory-Variables.html
-#
-# Result Variables
-# ^^^^^^^^^^^^^^^^
-#
-# Inclusion of this module defines the following variables:
-#
-# ``CMAKE_INSTALL_<dir>``
-#
-#   Destination for files of a given type.  This value may be passed to
-#   the ``DESTINATION`` options of :command:`install` commands for the
-#   corresponding file type.
-#
-# ``CMAKE_INSTALL_FULL_<dir>``
-#
-#   The absolute path generated from the corresponding ``CMAKE_INSTALL_<dir>``
-#   value.  If the value is not already an absolute path, an absolute path
-#   is constructed typically by prepending the value of the
-#   :variable:`CMAKE_INSTALL_PREFIX` variable.  However, there are some
-#   `special cases`_ as documented below.
-#
-# where ``<dir>`` is one of:
-#
-# ``BINDIR``
-#   user executables (``bin``)
-# ``SBINDIR``
-#   system admin executables (``sbin``)
-# ``LIBEXECDIR``
-#   program executables (``libexec``)
-# ``SYSCONFDIR``
-#   read-only single-machine data (``etc``)
-# ``SHAREDSTATEDIR``
-#   modifiable architecture-independent data (``com``)
-# ``LOCALSTATEDIR``
-#   modifiable single-machine data (``var``)
-# ``LIBDIR``
-#   object code libraries (``lib`` or ``lib64``
-#   or ``lib/<multiarch-tuple>`` on Debian)
-# ``INCLUDEDIR``
-#   C header files (``include``)
-# ``OLDINCLUDEDIR``
-#   C header files for non-gcc (``/usr/include``)
-# ``DATAROOTDIR``
-#   read-only architecture-independent data root (``share``)
-# ``DATADIR``
-#   read-only architecture-independent data (``DATAROOTDIR``)
-# ``INFODIR``
-#   info documentation (``DATAROOTDIR/info``)
-# ``LOCALEDIR``
-#   locale-dependent data (``DATAROOTDIR/locale``)
-# ``MANDIR``
-#   man documentation (``DATAROOTDIR/man``)
-# ``DOCDIR``
-#   documentation root (``DATAROOTDIR/doc/PROJECT_NAME``)
-#
-# If the includer does not define a value the above-shown default will be
-# used and the value will appear in the cache for editing by the user.
-#
-# Special Cases
-# ^^^^^^^^^^^^^
-#
-# The following values of :variable:`CMAKE_INSTALL_PREFIX` are special:
-#
-# ``/``
-#
-#   For ``<dir>`` other than the ``SYSCONFDIR`` and ``LOCALSTATEDIR``,
-#   the value of ``CMAKE_INSTALL_<dir>`` is prefixed with ``usr/`` if
-#   it is not user-specified as an absolute path.  For example, the
-#   ``INCLUDEDIR`` value ``include`` becomes ``usr/include``.
-#   This is required by the `GNU Coding Standards`_, which state:
-#
-#     When building the complete GNU system, the prefix will be empty
-#     and ``/usr`` will be a symbolic link to ``/``.
-#
-# ``/usr``
-#
-#   For ``<dir>`` equal to ``SYSCONFDIR`` or ``LOCALSTATEDIR``, the
-#   ``CMAKE_INSTALL_FULL_<dir>`` is computed by prepending just ``/``
-#   to the value of ``CMAKE_INSTALL_<dir>`` if it is not user-specified
-#   as an absolute path.  For example, the ``SYSCONFDIR`` value ``etc``
-#   becomes ``/etc``.  This is required by the `GNU Coding Standards`_.
-#
-# ``/opt/...``
-#
-#   For ``<dir>`` equal to ``SYSCONFDIR`` or ``LOCALSTATEDIR``, the
-#   ``CMAKE_INSTALL_FULL_<dir>`` is computed by *appending* the prefix
-#   to the value of ``CMAKE_INSTALL_<dir>`` if it is not user-specified
-#   as an absolute path.  For example, the ``SYSCONFDIR`` value ``etc``
-#   becomes ``/etc/opt/...``.  This is defined by the
-#   `Filesystem Hierarchy Standard`_.
-#
-# .. _`Filesystem Hierarchy Standard`: https://refspecs.linuxfoundation.org/FHS_3.0/fhs/index.html
-#
-# Macros
-# ^^^^^^
-#
-# .. command:: GNUInstallDirs_get_absolute_install_dir
-#
-#   ::
-#
-#     GNUInstallDirs_get_absolute_install_dir(absvar var)
-#
-#   Set the given variable ``absvar`` to the absolute path contained
-#   within the variable ``var``.  This is to allow the computation of an
-#   absolute path, accounting for all the special cases documented
-#   above.  While this macro is used to compute the various
-#   ``CMAKE_INSTALL_FULL_<dir>`` variables, it is exposed publicly to
-#   allow users who create additional path variables to also compute
-#   absolute paths where necessary, using the same logic.
-
-#=============================================================================
-# Copyright 2016, 2019 D. R. Commander
-# Copyright 2016 Dmitry Marakasov
-# Copyright 2016 Roger Leigh
-# Copyright 2015 Alex Turbov
-# Copyright 2014 Rolf Eike Beer
-# Copyright 2014 Daniele E. Domenichelli
-# Copyright 2013 Dimitri John Ledkov
-# Copyright 2011 Alex Neundorf
-# Copyright 2011 Eric NOULARD
-# Copyright 2011, 2013-2015 Kitware, Inc.
-# Copyright 2011 Nikita Krupen'ko
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# * Redistributions of source code must retain the above copyright
-#   notice, this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright
-#   notice, this list of conditions and the following disclaimer in the
-#   documentation and/or other materials provided with the distribution.
-#
-# * Neither the names of Kitware, Inc., the Insight Software Consortium,
-#   nor the names of their contributors may be used to endorse or promote
-#   products derived from this software without specific prior written
-#   permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-#=============================================================================
-
-# Installation directories
-#
-
-macro(GNUInstallDirs_set_install_dir var docstring)
-  # If CMAKE_INSTALL_PREFIX changes and CMAKE_INSTALL_*DIR is still set to the
-  # default value, then modify it accordingly.  This presumes that the default
-  # value may change based on the prefix.
-
-  set(_GNUInstallDirs_CMAKE_INSTALL_FORCE_${var} "")
-  if(NOT DEFINED CMAKE_INSTALL_${var})
-    set(_GNUInstallDirs_CMAKE_INSTALL_DEFAULT_${var} 1 CACHE INTERNAL
-      "CMAKE_INSTALL_${var} has default value")
-  elseif(DEFINED _GNUInstallDirs_CMAKE_INSTALL_LAST_DEFAULT_${var} AND
-    NOT "${_GNUInstallDirs_CMAKE_INSTALL_LAST_DEFAULT_${var}}" STREQUAL
-      "${CMAKE_INSTALL_DEFAULT_${var}}" AND
-    _GNUInstallDirs_CMAKE_INSTALL_DEFAULT_${var} AND
-    "${_GNUInstallDirs_CMAKE_INSTALL_LAST_${var}}" STREQUAL
-      "${CMAKE_INSTALL_${var}}")
-    set(_GNUInstallDirs_CMAKE_INSTALL_FORCE_${var} "FORCE")
-  endif()
-
-  set(CMAKE_INSTALL_${var} "${CMAKE_INSTALL_DEFAULT_${var}}" CACHE PATH
-    "${docstring} (Default: ${CMAKE_INSTALL_DEFAULT_${var}})"
-    ${_GNUInstallDirs_CMAKE_INSTALL_FORCE_${var}})
-
-  if(NOT CMAKE_INSTALL_${var} STREQUAL CMAKE_INSTALL_DEFAULT_${var})
-    unset(_GNUInstallDirs_CMAKE_INSTALL_DEFAULT_${var} CACHE)
-  endif()
-
-  # Save for next run
-  set(_GNUInstallDirs_CMAKE_INSTALL_LAST_${var} "${CMAKE_INSTALL_${var}}"
-    CACHE INTERNAL "CMAKE_INSTALL_${var} during last run")
-  set(_GNUInstallDirs_CMAKE_INSTALL_LAST_DEFAULT_${var}
-    "${CMAKE_INSTALL_DEFAULT_${var}}" CACHE INTERNAL
-    "CMAKE_INSTALL_DEFAULT_${var} during last run")
-endmacro()
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_BINDIR)
-  set(CMAKE_INSTALL_DEFAULT_BINDIR "bin")
-endif()
-GNUInstallDirs_set_install_dir(BINDIR
-  "Directory into which user executables should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_SBINDIR)
-  set(CMAKE_INSTALL_DEFAULT_SBINDIR "sbin")
-endif()
-GNUInstallDirs_set_install_dir(SBINDIR
-  "Directory into which system admin executables should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_LIBEXECDIR)
-  set(CMAKE_INSTALL_DEFAULT_LIBEXECDIR "libexec")
-endif()
-GNUInstallDirs_set_install_dir(LIBEXECDIR
-  "Directory under which executables run by other programs should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_SYSCONFDIR)
-  set(CMAKE_INSTALL_DEFAULT_SYSCONFDIR "etc")
-endif()
-GNUInstallDirs_set_install_dir(SYSCONFDIR
-  "Directory into which machine-specific read-only ASCII data and configuration files should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_SHAREDSTATEDIR)
-  set(CMAKE_INSTALL_DEFAULT_SHAREDSTATEDIR "com")
-endif()
-GNUInstallDirs_set_install_dir(SHAREDSTATEDIR
-  "Directory into which architecture-independent run-time-modifiable data files should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_LOCALSTATEDIR)
-  set(CMAKE_INSTALL_DEFAULT_LOCALSTATEDIR "var")
-endif()
-GNUInstallDirs_set_install_dir(LOCALSTATEDIR
-  "Directory into which machine-specific run-time-modifiable data files should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_LIBDIR)
-  set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib")
-  # Override this default 'lib' with 'lib64' iff:
-  #  - we are on Linux system but NOT cross-compiling
-  #  - we are NOT on debian
-  #  - we are on a 64 bits system
-  # reason is: amd64 ABI: http://www.x86-64.org/documentation/abi.pdf
-  # For Debian with multiarch, use 'lib/${CMAKE_LIBRARY_ARCHITECTURE}' if
-  # CMAKE_LIBRARY_ARCHITECTURE is set (which contains e.g. "i386-linux-gnu"
-  # and CMAKE_INSTALL_PREFIX is "/usr"
-  # See http://wiki.debian.org/Multiarch
-  if(CMAKE_SYSTEM_NAME MATCHES "^(Linux|kFreeBSD|GNU)$"
-      AND NOT CMAKE_CROSSCOMPILING)
-    if (EXISTS "/etc/debian_version") # is this a debian system ?
-      if(CMAKE_LIBRARY_ARCHITECTURE)
-        if("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/?$")
-          set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib/${CMAKE_LIBRARY_ARCHITECTURE}")
-        endif()
-      endif()
-    else() # not debian, rely on CMAKE_SIZEOF_VOID_P:
-      if(NOT DEFINED CMAKE_SIZEOF_VOID_P)
-        message(AUTHOR_WARNING
-          "Unable to determine default CMAKE_INSTALL_LIBDIR directory because no target architecture is known. "
-          "Please enable at least one language before including GNUInstallDirs.")
-      else()
-        if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-          set(CMAKE_INSTALL_DEFAULT_LIBDIR "lib64")
-        endif()
-      endif()
-    endif()
-  endif()
-endif()
-GNUInstallDirs_set_install_dir(LIBDIR
-  "Directory into which object files and object code libraries should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_INCLUDEDIR)
-  set(CMAKE_INSTALL_DEFAULT_INCLUDEDIR "include")
-endif()
-GNUInstallDirs_set_install_dir(INCLUDEDIR
-  "Directory into which C header files should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_OLDINCLUDEDIR)
-  set(CMAKE_INSTALL_DEFAULT_OLDINCLUDEDIR "/usr/include")
-endif()
-GNUInstallDirs_set_install_dir(OLDINCLUDEDIR
-  PATH "Directory into which C header files for non-GCC compilers should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_DATAROOTDIR)
-  set(CMAKE_INSTALL_DEFAULT_DATAROOTDIR "share")
-endif()
-GNUInstallDirs_set_install_dir(DATAROOTDIR
-  "The root of the directory tree for read-only architecture-independent data files")
-
-#-----------------------------------------------------------------------------
-# Values whose defaults are relative to DATAROOTDIR.  Store empty values in
-# the cache and store the defaults in local variables if the cache values are
-# not set explicitly.  This auto-updates the defaults as DATAROOTDIR changes.
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_DATADIR)
-  set(CMAKE_INSTALL_DEFAULT_DATADIR "<CMAKE_INSTALL_DATAROOTDIR>")
-endif()
-GNUInstallDirs_set_install_dir(DATADIR
-  "The directory under which read-only architecture-independent data files should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_INFODIR)
-  if(CMAKE_SYSTEM_NAME MATCHES "^(.*BSD|DragonFly)$")
-    set(CMAKE_INSTALL_DEFAULT_INFODIR "info")
-  else()
-    set(CMAKE_INSTALL_DEFAULT_INFODIR "<CMAKE_INSTALL_DATAROOTDIR>/info")
-  endif()
-endif()
-GNUInstallDirs_set_install_dir(INFODIR
-  "The directory into which info documentation files should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_MANDIR)
-  if(CMAKE_SYSTEM_NAME MATCHES "^(.*BSD|DragonFly)$")
-    set(CMAKE_INSTALL_DEFAULT_MANDIR "man")
-  else()
-    set(CMAKE_INSTALL_DEFAULT_MANDIR "<CMAKE_INSTALL_DATAROOTDIR>/man")
-  endif()
-endif()
-GNUInstallDirs_set_install_dir(MANDIR
-  "The directory under which man pages should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_LOCALEDIR)
-  set(CMAKE_INSTALL_DEFAULT_LOCALEDIR "<CMAKE_INSTALL_DATAROOTDIR>/locale")
-endif()
-GNUInstallDirs_set_install_dir(LOCALEDIR
-  "The directory under which locale-specific message catalogs should be installed")
-
-if(NOT DEFINED CMAKE_INSTALL_DEFAULT_DOCDIR)
-  set(CMAKE_INSTALL_DEFAULT_DOCDIR "<CMAKE_INSTALL_DATAROOTDIR>/doc/${PROJECT_NAME}")
-endif()
-GNUInstallDirs_set_install_dir(DOCDIR
-  "The directory into which documentation files (other than info files) should be installed")
-
-#-----------------------------------------------------------------------------
-
-mark_as_advanced(
-  CMAKE_INSTALL_BINDIR
-  CMAKE_INSTALL_SBINDIR
-  CMAKE_INSTALL_LIBEXECDIR
-  CMAKE_INSTALL_SYSCONFDIR
-  CMAKE_INSTALL_SHAREDSTATEDIR
-  CMAKE_INSTALL_LOCALSTATEDIR
-  CMAKE_INSTALL_LIBDIR
-  CMAKE_INSTALL_INCLUDEDIR
-  CMAKE_INSTALL_OLDINCLUDEDIR
-  CMAKE_INSTALL_DATAROOTDIR
-  CMAKE_INSTALL_DATADIR
-  CMAKE_INSTALL_INFODIR
-  CMAKE_INSTALL_LOCALEDIR
-  CMAKE_INSTALL_MANDIR
-  CMAKE_INSTALL_DOCDIR
-  )
-
-macro(GNUInstallDirs_get_absolute_install_dir absvar var)
-  string(REGEX REPLACE "[<>]" "@" ${var} "${${var}}")
-  # Handle the specific case of an empty CMAKE_INSTALL_DATAROOTDIR
-  if(NOT CMAKE_INSTALL_DATAROOTDIR AND
-    ${var} MATCHES "\@CMAKE_INSTALL_DATAROOTDIR\@/")
-    string(CONFIGURE "${${var}}" ${var} @ONLY)
-    string(REGEX REPLACE "^/" "" ${var} "${${var}}")
-  else()
-    string(CONFIGURE "${${var}}" ${var} @ONLY)
-  endif()
-  if(NOT IS_ABSOLUTE "${${var}}")
-    # Handle special cases:
-    # - CMAKE_INSTALL_PREFIX == /
-    # - CMAKE_INSTALL_PREFIX == /usr
-    # - CMAKE_INSTALL_PREFIX == /opt/...
-    if("${CMAKE_INSTALL_PREFIX}" STREQUAL "/")
-      if("${dir}" STREQUAL "SYSCONFDIR" OR "${dir}" STREQUAL "LOCALSTATEDIR")
-        set(${absvar} "/${${var}}")
-      else()
-        if (NOT "${${var}}" MATCHES "^usr/")
-          set(${var} "usr/${${var}}")
-        endif()
-        set(${absvar} "/${${var}}")
-      endif()
-    elseif("${CMAKE_INSTALL_PREFIX}" MATCHES "^/usr/?$")
-      if("${dir}" STREQUAL "SYSCONFDIR" OR "${dir}" STREQUAL "LOCALSTATEDIR")
-        set(${absvar} "/${${var}}")
-      else()
-        set(${absvar} "${CMAKE_INSTALL_PREFIX}/${${var}}")
-      endif()
-    elseif("${CMAKE_INSTALL_PREFIX}" MATCHES "^/opt/.*")
-      if("${dir}" STREQUAL "SYSCONFDIR" OR "${dir}" STREQUAL "LOCALSTATEDIR")
-        set(${absvar} "/${${var}}${CMAKE_INSTALL_PREFIX}")
-      else()
-        set(${absvar} "${CMAKE_INSTALL_PREFIX}/${${var}}")
-      endif()
-    else()
-      set(${absvar} "${CMAKE_INSTALL_PREFIX}/${${var}}")
-    endif()
-  else()
-    set(${absvar} "${${var}}")
-  endif()
-  string(REGEX REPLACE "/$" "" ${absvar} "${${absvar}}")
-endmacro()
-
-# Result directories
-#
-foreach(dir
-    BINDIR
-    SBINDIR
-    LIBEXECDIR
-    SYSCONFDIR
-    SHAREDSTATEDIR
-    LOCALSTATEDIR
-    LIBDIR
-    INCLUDEDIR
-    OLDINCLUDEDIR
-    DATAROOTDIR
-    DATADIR
-    INFODIR
-    LOCALEDIR
-    MANDIR
-    DOCDIR
-    )
-  GNUInstallDirs_get_absolute_install_dir(CMAKE_INSTALL_FULL_${dir} CMAKE_INSTALL_${dir})
-endforeach()
diff --git a/external/jpeg/cmakescripts/cmake_uninstall.cmake.in b/external/jpeg/cmakescripts/cmake_uninstall.cmake.in
deleted file mode 100644
index 6726a0d6a8d2..000000000000
--- a/external/jpeg/cmakescripts/cmake_uninstall.cmake.in
+++ /dev/null
@@ -1,24 +0,0 @@
-# This code is from the CMake FAQ
-
-if (NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt")
-  message(FATAL_ERROR "Cannot find install manifest: \"@CMAKE_BINARY_DIR@/install_manifest.txt\"")
-endif(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt")
-
-file(READ "@CMAKE_BINARY_DIR@/install_manifest.txt" files)
-string(REGEX REPLACE "\n" ";" files "${files}")
-list(REVERSE files)
-foreach (file ${files})
-  message(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"")
-    if (EXISTS "$ENV{DESTDIR}${file}")
-      execute_process(
-        COMMAND "@CMAKE_COMMAND@" -E remove "$ENV{DESTDIR}${file}"
-        OUTPUT_VARIABLE rm_out
-        RESULT_VARIABLE rm_retval
-      )
-    if(NOT ${rm_retval} EQUAL 0)
-      message(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"")
-    endif (NOT ${rm_retval} EQUAL 0)
-  else (EXISTS "$ENV{DESTDIR}${file}")
-    message(STATUS "File \"$ENV{DESTDIR}${file}\" does not exist.")
-  endif (EXISTS "$ENV{DESTDIR}${file}")
-endforeach(file)
diff --git a/external/jpeg/cmakescripts/testclean.cmake b/external/jpeg/cmakescripts/testclean.cmake
deleted file mode 100644
index fc3fc25e9658..000000000000
--- a/external/jpeg/cmakescripts/testclean.cmake
+++ /dev/null
@@ -1,41 +0,0 @@
-file(GLOB FILES
-  testout*
-  *_GRAY_*.bmp
-  *_GRAY_*.png
-  *_GRAY_*.ppm
-  *_GRAY_*.jpg
-  *_GRAY.yuv
-  *_420_*.bmp
-  *_420_*.png
-  *_420_*.ppm
-  *_420_*.jpg
-  *_420.yuv
-  *_422_*.bmp
-  *_422_*.png
-  *_422_*.ppm
-  *_422_*.jpg
-  *_422.yuv
-  *_444_*.bmp
-  *_444_*.png
-  *_444_*.ppm
-  *_444_*.jpg
-  *_444.yuv
-  *_440_*.bmp
-  *_440_*.png
-  *_440_*.ppm
-  *_440_*.jpg
-  *_440.yuv
-  *_411_*.bmp
-  *_411_*.png
-  *_411_*.ppm
-  *_411_*.jpg
-  *_411.yuv
-  tjbenchtest*.log
-  tjexampletest*.log)
-
-if(NOT FILES STREQUAL "")
-  message(STATUS "Removing test files")
-  file(REMOVE ${FILES})
-else()
-  message(STATUS "No files to remove")
-endif()
diff --git a/external/jpeg/coderules.txt b/external/jpeg/coderules.txt
new file mode 100644
index 000000000000..a2f593adab7c
--- /dev/null
+++ b/external/jpeg/coderules.txt
@@ -0,0 +1,78 @@
+IJG JPEG LIBRARY:  CODING RULES
+
+This file was part of the Independent JPEG Group's software:
+Copyright (C) 1991-1996, Thomas G. Lane.
+It was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo.
+For conditions of distribution and use, see the accompanying README.ijg file.
+
+
+Since numerous people will be contributing code and bug fixes, it's important
+to establish a common coding style.  The goal of using similar coding styles
+is much more important than the details of just what that style is.
+
+In general we follow the recommendations of "Recommended C Style and Coding
+Standards" revision 6.1 (Cannon et al. as modified by Spencer, Keppel and
+Brader).  This document is available in the IJG FTP archive (see
+jpeg/doc/cstyle.ms.tbl.Z, or cstyle.txt.Z for those without nroff/tbl).
+
+Block comments should be laid out thusly:
+
+/*
+ *  Block comments in this style.
+ */
+
+We indent statements in K&R style, e.g.,
+        if (test) {
+          then-part;
+        } else {
+          else-part;
+        }
+with two spaces per indentation level.  (This indentation convention is
+handled automatically by GNU Emacs and many other text editors.)
+
+Multi-word names should be written in lower case with underscores, e.g.,
+multi_word_name (not multiWordName).  Preprocessor symbols and enum constants
+are similar but upper case (MULTI_WORD_NAME).  Names should be unique within
+the first fifteen characters.
+
+Note that each function definition must begin with GLOBAL(type), LOCAL(type),
+or METHODDEF(type).  These macros expand to "static type" or just "type" as
+appropriate.  They provide a readable indication of the routine's usage and
+can readily be changed for special needs.  (For instance, special linkage
+keywords can be inserted for use in Windows DLLs.)
+
+A similar solution is used for external function declarations (see the EXTERN
+macro).
+
+
+The JPEG library is intended to be used within larger programs.  Furthermore,
+we want it to be reentrant so that it can be used by applications that process
+multiple images concurrently.  The following rules support these requirements:
+
+1. Avoid direct use of file I/O, "malloc", error report printouts, etc.;
+pass these through the common routines provided.
+
+2. Minimize global namespace pollution.  Functions should be declared static
+wherever possible.  (Note that our method-based calling conventions help this
+a lot: in many modules only the initialization function will ever need to be
+called directly, so only that function need be externally visible.)  All
+global function names should begin with "jpeg_".
+
+3. Don't use global variables; anything that must be used in another module
+should be in the common data structures.
+
+4. Don't use static variables except for read-only constant tables.  Variables
+that should be private to a module can be placed into private structures (see
+the system architecture document, structure.txt).
+
+5. Source file names should begin with "j" for files that are part of the
+library proper; source files that are not part of the library, such as cjpeg.c
+and djpeg.c, do not begin with "j".  Keep compression and decompression code in
+separate source files --- some applications may want only one half of the
+library.
+
+Note: these rules (particularly #4) are not followed religiously in the
+modules that are used in cjpeg/djpeg but are not part of the JPEG library
+proper.  Those modules are not really intended to be used in other
+applications.
diff --git a/external/jpeg/djpeg.1 b/external/jpeg/djpeg.1
new file mode 100644
index 000000000000..31431b9829d8
--- /dev/null
+++ b/external/jpeg/djpeg.1
@@ -0,0 +1,320 @@
+.TH DJPEG 1 "4 November 2020"
+.SH NAME
+djpeg \- decompress a JPEG file to an image file
+.SH SYNOPSIS
+.B djpeg
+[
+.I options
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B djpeg
+decompresses the named JPEG file, or the standard input if no file is named,
+and produces an image file on the standard output.  PBMPLUS (PPM/PGM), BMP,
+GIF, or Targa output format can be selected.
+.SH OPTIONS
+All switch names may be abbreviated; for example,
+.B \-grayscale
+may be written
+.B \-gray
+or
+.BR \-gr .
+Most of the "basic" switches can be abbreviated to as little as one letter.
+Upper and lower case are equivalent (thus
+.B \-BMP
+is the same as
+.BR \-bmp ).
+British spellings are also accepted (e.g.,
+.BR \-greyscale ),
+though for brevity these are not mentioned below.
+.PP
+The basic switches are:
+.TP
+.BI \-colors " N"
+Reduce image to at most N colors.  This reduces the number of colors used in
+the output image, so that it can be displayed on a colormapped display or
+stored in a colormapped file format.  For example, if you have an 8-bit
+display, you'd need to reduce to 256 or fewer colors.
+.TP
+.BI \-quantize " N"
+Same as
+.BR \-colors .
+.B \-colors
+is the recommended name;
+.B \-quantize
+is provided only for backwards compatibility.
+.TP
+.B \-fast
+Select recommended processing options for fast, low quality output.  (The
+default options are chosen for highest quality output.)  Currently, this is
+equivalent to \fB\-dct fast \-nosmooth \-onepass \-dither ordered\fR.
+.TP
+.B \-grayscale
+Force grayscale output even if JPEG file is color.  Useful for viewing on
+monochrome displays; also,
+.B djpeg
+runs noticeably faster in this mode.
+.TP
+.B \-rgb
+Force RGB output even if JPEG file is grayscale.
+.TP
+.BI \-scale " M/N"
+Scale the output image by a factor M/N.  Currently the scale factor must be
+M/8, where M is an integer between 1 and 16 inclusive, or any reduced fraction
+thereof (such as 1/2, 3/4, etc.)  Scaling is handy if the image is larger than
+your screen; also,
+.B djpeg
+runs much faster when scaling down the output.
+.TP
+.B \-bmp
+Select BMP output format (Windows flavor).  8-bit colormapped format is
+emitted if
+.B \-colors
+or
+.B \-grayscale
+is specified, or if the JPEG file is grayscale; otherwise, 24-bit full-color
+format is emitted.
+.TP
+.B \-gif
+Select GIF output format (LZW-compressed).  Since GIF does not support more
+than 256 colors,
+.B \-colors 256
+is assumed (unless you specify a smaller number of colors).  If you specify
+.BR \-fast ,
+the default number of colors is 216.
+.TP
+.B \-gif0
+Select GIF output format (uncompressed).  Since GIF does not support more than
+256 colors,
+.B \-colors 256
+is assumed (unless you specify a smaller number of colors).  If you specify
+.BR \-fast ,
+the default number of colors is 216.
+.TP
+.B \-os2
+Select BMP output format (OS/2 1.x flavor).  8-bit colormapped format is
+emitted if
+.B \-colors
+or
+.B \-grayscale
+is specified, or if the JPEG file is grayscale; otherwise, 24-bit full-color
+format is emitted.
+.TP
+.B \-pnm
+Select PBMPLUS (PPM/PGM) output format (this is the default format).
+PGM is emitted if the JPEG file is grayscale or if
+.B \-grayscale
+is specified; otherwise PPM is emitted.
+.TP
+.B \-targa
+Select Targa output format.  Grayscale format is emitted if the JPEG file is
+grayscale or if
+.B \-grayscale
+is specified; otherwise, colormapped format is emitted if
+.B \-colors
+is specified; otherwise, 24-bit full-color format is emitted.
+.PP
+Switches for advanced users:
+.TP
+.B \-dct int
+Use accurate integer DCT method (default).
+.TP
+.B \-dct fast
+Use less accurate integer DCT method [legacy feature].
+When the Independent JPEG Group's software was first released in 1991, the
+decompression time for a 1-megapixel JPEG image on a mainstream PC was measured
+in minutes.  Thus, the \fBfast\fR integer DCT algorithm provided noticeable
+performance benefits.  On modern CPUs running libjpeg-turbo, however, the
+decompression time for a 1-megapixel JPEG image is measured in milliseconds,
+and thus the performance benefits of the \fBfast\fR algorithm are much less
+noticeable.  On modern x86/x86-64 CPUs that support AVX2 instructions, the
+\fBfast\fR and \fBint\fR methods have similar performance.  On other types of
+CPUs, the \fBfast\fR method is generally about 5-15% faster than the \fBint\fR
+method.
+
+If the JPEG image was compressed using a quality level of 85 or below, then
+there should be little or no perceptible quality difference between the two
+algorithms.  When decompressing images that were compressed using quality
+levels above 85, however, the difference between the \fBfast\fR and \fBint\fR
+methods becomes more pronounced.  With images compressed using quality=97, for
+instance, the \fBfast\fR method incurs generally about a 4-6 dB loss in PSNR
+relative to the \fBint\fR method, but this can be larger for some images.  If
+you can avoid it, do not use the \fBfast\fR method when decompressing images
+that were compressed using quality levels above 97.  The algorithm often
+degenerates for such images and can actually produce a more lossy output image
+than if the JPEG image had been compressed using lower quality levels.
+.TP
+.B \-dct float
+Use floating-point DCT method [legacy feature].
+The \fBfloat\fR method does not produce significantly more accurate results
+than the \fBint\fR method, and it is much slower.  The \fBfloat\fR method may
+also give different results on different machines due to varying roundoff
+behavior, whereas the integer methods should give the same results on all
+machines.
+.TP
+.B \-dither fs
+Use Floyd-Steinberg dithering in color quantization.
+.TP
+.B \-dither ordered
+Use ordered dithering in color quantization.
+.TP
+.B \-dither none
+Do not use dithering in color quantization.
+By default, Floyd-Steinberg dithering is applied when quantizing colors; this
+is slow but usually produces the best results.  Ordered dither is a compromise
+between speed and quality; no dithering is fast but usually looks awful.  Note
+that these switches have no effect unless color quantization is being done.
+Ordered dither is only available in
+.B \-onepass
+mode.
+.TP
+.BI \-icc " file"
+Extract ICC color management profile to the specified file.
+.TP
+.BI \-map " file"
+Quantize to the colors used in the specified image file.  This is useful for
+producing multiple files with identical color maps, or for forcing a
+predefined set of colors to be used.  The
+.I file
+must be a GIF or PPM file.  This option overrides
+.B \-colors
+and
+.BR \-onepass .
+.TP
+.B \-nosmooth
+Use a faster, lower-quality upsampling routine.
+.TP
+.B \-onepass
+Use one-pass instead of two-pass color quantization.  The one-pass method is
+faster and needs less memory, but it produces a lower-quality image.
+.B \-onepass
+is ignored unless you also say
+.B \-colors
+.IR N .
+Also, the one-pass method is always used for grayscale output (the two-pass
+method is no improvement then).
+.TP
+.BI \-maxmemory " N"
+Set limit for amount of memory to use in processing large images.  Value is
+in thousands of bytes, or millions of bytes if "M" is attached to the
+number.  For example,
+.B \-max 4m
+selects 4000000 bytes.  If more space is needed, an error will occur.
+.TP
+.BI \-maxscans " N"
+Abort if the JPEG image contains more than
+.I N
+scans.  This feature demonstrates a method by which applications can guard
+against denial-of-service attacks instigated by specially-crafted malformed
+JPEG images containing numerous scans with missing image data or image data
+consisting only of "EOB runs" (a feature of progressive JPEG images that allows
+potentially hundreds of thousands of adjoining zero-value pixels to be
+represented using only a few bytes.)  Attempting to decompress such malformed
+JPEG images can cause excessive CPU activity, since the decompressor must fully
+process each scan (even if the scan is corrupt) before it can proceed to the
+next scan.
+.TP
+.BI \-outfile " name"
+Send output image to the named file, not to standard output.
+.TP
+.BI \-memsrc
+Load input file into memory before decompressing.  This feature was implemented
+mainly as a way of testing the in-memory source manager (jpeg_mem_src().)
+.TP
+.BI \-report
+Report decompression progress.
+.TP
+.BI \-skip " Y0,Y1"
+Decompress all rows of the JPEG image except those between Y0 and Y1
+(inclusive.)  Note that if decompression scaling is being used, then Y0 and Y1
+are relative to the scaled image dimensions.
+.TP
+.BI \-crop " WxH+X+Y"
+Decompress only a rectangular subregion of the image, starting at point X,Y
+with width W and height H.  If necessary, X will be shifted left to the nearest
+iMCU boundary, and the width will be increased accordingly.  Note that if
+decompression scaling is being used, then X, Y, W, and H are relative to the
+scaled image dimensions.  Currently this option only works with the
+PBMPLUS (PPM/PGM), GIF, and Targa output formats.
+.TP
+.BI \-strict
+Treat all warnings as fatal.  This feature also demonstrates a method by which
+applications can guard against attacks instigated by specially-crafted
+malformed JPEG images.  Enabling this option will cause the decompressor to
+abort if the JPEG image contains incomplete or corrupt image data.
+.TP
+.B \-verbose
+Enable debug printout.  More
+.BR \-v 's
+give more output.  Also, version information is printed at startup.
+.TP
+.B \-debug
+Same as
+.BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
+.SH EXAMPLES
+.LP
+This example decompresses the JPEG file foo.jpg, quantizes it to
+256 colors, and saves the output in 8-bit BMP format in foo.bmp:
+.IP
+.B djpeg \-colors 256 \-bmp
+.I foo.jpg
+.B >
+.I foo.bmp
+.SH HINTS
+To get a quick preview of an image, use the
+.B \-grayscale
+and/or
+.B \-scale
+switches.
+.B \-grayscale \-scale 1/8
+is the fastest case.
+.PP
+Several options are available that trade off image quality to gain speed.
+.B \-fast
+turns on the recommended settings.
+.PP
+.B \-dct fast
+and/or
+.B \-nosmooth
+gain speed at a small sacrifice in quality.
+When producing a color-quantized image,
+.B \-onepass \-dither ordered
+is fast but much lower quality than the default behavior.
+.B \-dither none
+may give acceptable results in two-pass mode, but is seldom tolerable in
+one-pass mode.
+.SH ENVIRONMENT
+.TP
+.B JPEGMEM
+If this environment variable is set, its value is the default memory limit.
+The value is specified as described for the
+.B \-maxmemory
+switch.
+.B JPEGMEM
+overrides the default value specified when the program was compiled, and
+itself is overridden by an explicit
+.BR \-maxmemory .
+.SH SEE ALSO
+.BR cjpeg (1),
+.BR jpegtran (1),
+.BR rdjpgcom (1),
+.BR wrjpgcom (1)
+.br
+.BR ppm (5),
+.BR pgm (5)
+.br
+Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
+.SH AUTHOR
+Independent JPEG Group
+.PP
+This file was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo, to wordsmith certain sections, and to describe
+features not present in libjpeg.
diff --git a/external/jpeg/djpeg.c b/external/jpeg/djpeg.c
index 693aadb8ca7b..d47984e64071 100644
--- a/external/jpeg/djpeg.c
+++ b/external/jpeg/djpeg.c
@@ -3,9 +3,9 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * Modified 2013 by Guido Vollbeding.
+ * Modified 2013-2019 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2013-2017, 2020, D. R. Commander.
+ * Copyright (C) 2010-2011, 2013-2017, 2019-2020, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -68,10 +68,10 @@ static const char * const cdjpeg_message_table[] = {
 
 typedef enum {
   FMT_BMP,                      /* BMP format (Windows flavor) */
-  FMT_GIF,                      /* GIF format */
+  FMT_GIF,                      /* GIF format (LZW-compressed) */
+  FMT_GIF0,                     /* GIF format (uncompressed) */
   FMT_OS2,                      /* BMP format (OS/2 flavor) */
   FMT_PPM,                      /* PPM/PGM (PBMPLUS formats) */
-  FMT_RLE,                      /* RLE format */
   FMT_TARGA,                    /* Targa format */
   FMT_TIFF                      /* TIFF format */
 } IMAGE_FORMATS;
@@ -94,11 +94,14 @@ static IMAGE_FORMATS requested_fmt;
 
 static const char *progname;    /* program name for error messages */
 static char *icc_filename;      /* for -icc switch */
+JDIMENSION max_scans;           /* for -maxscans switch */
 static char *outfilename;       /* for -outfile switch */
 boolean memsrc;                 /* for -memsrc switch */
+boolean report;                 /* for -report switch */
 boolean skip, crop;
 JDIMENSION skip_start, skip_end;
 JDIMENSION crop_x, crop_y, crop_width, crop_height;
+boolean strict;                 /* for -strict switch */
 #define INPUT_BUF_SIZE  4096
 
 
@@ -127,8 +130,10 @@ usage(void)
           (DEFAULT_FMT == FMT_BMP ? " (default)" : ""));
 #endif
 #ifdef GIF_SUPPORTED
-  fprintf(stderr, "  -gif           Select GIF output format%s\n",
+  fprintf(stderr, "  -gif           Select GIF output format (LZW-compressed)%s\n",
           (DEFAULT_FMT == FMT_GIF ? " (default)" : ""));
+  fprintf(stderr, "  -gif0          Select GIF output format (uncompressed)%s\n",
+          (DEFAULT_FMT == FMT_GIF0 ? " (default)" : ""));
 #endif
 #ifdef BMP_SUPPORTED
   fprintf(stderr, "  -os2           Select BMP output format (OS/2 style)%s\n",
@@ -138,10 +143,6 @@ usage(void)
   fprintf(stderr, "  -pnm           Select PBMPLUS (PPM/PGM) output format%s\n",
           (DEFAULT_FMT == FMT_PPM ? " (default)" : ""));
 #endif
-#ifdef RLE_SUPPORTED
-  fprintf(stderr, "  -rle           Select Utah RLE output format%s\n",
-          (DEFAULT_FMT == FMT_RLE ? " (default)" : ""));
-#endif
 #ifdef TARGA_SUPPORTED
   fprintf(stderr, "  -targa         Select Targa output format%s\n",
           (DEFAULT_FMT == FMT_TARGA ? " (default)" : ""));
@@ -171,14 +172,16 @@ usage(void)
   fprintf(stderr, "  -onepass       Use 1-pass quantization (fast, low quality)\n");
 #endif
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+  fprintf(stderr, "  -maxscans N    Maximum number of scans to allow in input file\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
   fprintf(stderr, "  -memsrc        Load input file into memory before decompressing\n");
 #endif
-
+  fprintf(stderr, "  -report        Report decompression progress\n");
   fprintf(stderr, "  -skip Y0,Y1    Decompress all rows except those between Y0 and Y1 (inclusive)\n");
   fprintf(stderr, "  -crop WxH+X+Y  Decompress only a rectangular subregion of the image\n");
   fprintf(stderr, "                 [requires PBMPLUS (PPM/PGM), GIF, or Targa output format]\n");
+  fprintf(stderr, "  -strict        Treat all warnings as fatal\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   exit(EXIT_FAILURE);
@@ -203,10 +206,13 @@ parse_switches(j_decompress_ptr cinfo, int argc, char **argv,
   /* Set up default JPEG parameters. */
   requested_fmt = DEFAULT_FMT;  /* set default output file format */
   icc_filename = NULL;
+  max_scans = 0;
   outfilename = NULL;
   memsrc = FALSE;
+  report = FALSE;
   skip = FALSE;
   crop = FALSE;
+  strict = FALSE;
   cinfo->err->trace_level = 0;
 
   /* Scan command line options, adjust parameters */
@@ -224,7 +230,7 @@ parse_switches(j_decompress_ptr cinfo, int argc, char **argv,
     arg++;                      /* advance past switch marker character */
 
     if (keymatch(arg, "bmp", 1)) {
-      /* BMP output format. */
+      /* BMP output format (Windows flavor). */
       requested_fmt = FMT_BMP;
 
     } else if (keymatch(arg, "colors", 1) || keymatch(arg, "colours", 1) ||
@@ -295,9 +301,13 @@ parse_switches(j_decompress_ptr cinfo, int argc, char **argv,
       cinfo->do_fancy_upsampling = FALSE;
 
     } else if (keymatch(arg, "gif", 1)) {
-      /* GIF output format. */
+      /* GIF output format (LZW-compressed). */
       requested_fmt = FMT_GIF;
 
+    } else if (keymatch(arg, "gif0", 4)) {
+      /* GIF output format (uncompressed). */
+      requested_fmt = FMT_GIF0;
+
     } else if (keymatch(arg, "grayscale", 2) ||
                keymatch(arg, "greyscale", 2)) {
       /* Force monochrome output. */
@@ -351,6 +361,12 @@ parse_switches(j_decompress_ptr cinfo, int argc, char **argv,
         lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
+    } else if (keymatch(arg, "maxscans", 4)) {
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%u", &max_scans) != 1)
+        usage();
+
     } else if (keymatch(arg, "nosmooth", 3)) {
       /* Suppress fancy upsampling */
       cinfo->do_fancy_upsampling = FALSE;
@@ -383,9 +399,8 @@ parse_switches(j_decompress_ptr cinfo, int argc, char **argv,
       /* PPM/PGM output format. */
       requested_fmt = FMT_PPM;
 
-    } else if (keymatch(arg, "rle", 1)) {
-      /* RLE output format. */
-      requested_fmt = FMT_RLE;
+    } else if (keymatch(arg, "report", 2)) {
+      report = TRUE;
 
     } else if (keymatch(arg, "scale", 2)) {
       /* Scale the output image by a fraction M/N. */
@@ -413,6 +428,9 @@ parse_switches(j_decompress_ptr cinfo, int argc, char **argv,
         usage();
       crop = TRUE;
 
+    } else if (keymatch(arg, "strict", 2)) {
+      strict = TRUE;
+
     } else if (keymatch(arg, "targa", 1)) {
       /* Targa output format. */
       requested_fmt = FMT_TARGA;
@@ -444,7 +462,7 @@ jpeg_getc(j_decompress_ptr cinfo)
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   }
   datasrc->bytes_in_buffer--;
-  return GETJOCTET(*datasrc->next_input_byte++);
+  return *datasrc->next_input_byte++;
 }
 
 
@@ -499,6 +517,19 @@ print_text_marker(j_decompress_ptr cinfo)
 }
 
 
+METHODDEF(void)
+my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0) {
+    /* Warning (negative msg_level): treat it as fatal via error_exit */
+    cinfo->err->error_exit(cinfo);
+  } else {
+    if (cinfo->err->trace_level >= msg_level)  /* trace msg: honor verbosity */
+      cinfo->err->output_message(cinfo);
+  }
+}
+
+
 /*
  * The main program.
  */
@@ -508,9 +539,7 @@ main(int argc, char **argv)
 {
   struct jpeg_decompress_struct cinfo;
   struct jpeg_error_mgr jerr;
-#ifdef PROGRESS_REPORT
   struct cdjpeg_progress_mgr progress;
-#endif
   int file_index;
   djpeg_dest_ptr dest_mgr = NULL;
   FILE *input_file;
@@ -557,6 +586,9 @@ main(int argc, char **argv)
 
   file_index = parse_switches(&cinfo, argc, argv, 0, FALSE);
 
+  if (strict)
+    jerr.emit_message = my_emit_message;
+
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have either -outfile switch or explicit output file name */
   if (outfilename == NULL) {
@@ -603,9 +635,11 @@ main(int argc, char **argv)
     output_file = write_stdout();
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&cinfo, &progress);
-#endif
+  if (report || max_scans != 0) {
+    start_progress_monitor((j_common_ptr)&cinfo, &progress);
+    progress.report = report;
+    progress.max_scans = max_scans;
+  }
 
   /* Specify data source for decompression */
 #if JPEG_LIB_VERSION >= 80 || defined(MEM_SRCDST_SUPPORTED)
@@ -653,7 +687,10 @@ main(int argc, char **argv)
 #endif
 #ifdef GIF_SUPPORTED
   case FMT_GIF:
-    dest_mgr = jinit_write_gif(&cinfo);
+    dest_mgr = jinit_write_gif(&cinfo, TRUE);
+    break;
+  case FMT_GIF0:
+    dest_mgr = jinit_write_gif(&cinfo, FALSE);
     break;
 #endif
 #ifdef PPM_SUPPORTED
@@ -661,11 +698,6 @@ main(int argc, char **argv)
     dest_mgr = jinit_write_ppm(&cinfo);
     break;
 #endif
-#ifdef RLE_SUPPORTED
-  case FMT_RLE:
-    dest_mgr = jinit_write_rle(&cinfo);
-    break;
-#endif
 #ifdef TARGA_SUPPORTED
   case FMT_TARGA:
     dest_mgr = jinit_write_targa(&cinfo);
@@ -781,12 +813,11 @@ main(int argc, char **argv)
     }
   }
 
-#ifdef PROGRESS_REPORT
   /* Hack: count final pass as done in case finish_output does an extra pass.
    * The library won't have updated completed_passes.
    */
-  progress.pub.completed_passes = progress.pub.total_passes;
-#endif
+  if (report || max_scans != 0)
+    progress.pub.completed_passes = progress.pub.total_passes;
 
   if (icc_filename != NULL) {
     FILE *icc_file;
@@ -825,9 +856,8 @@ main(int argc, char **argv)
   if (output_file != stdout)
     fclose(output_file);
 
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&cinfo);
-#endif
+  if (report || max_scans != 0)
+    end_progress_monitor((j_common_ptr)&cinfo);
 
   if (memsrc)
     free(inbuffer);
diff --git a/external/jpeg/doxygen-extra.css b/external/jpeg/doxygen-extra.css
new file mode 100644
index 000000000000..f1bd4c269531
--- /dev/null
+++ b/external/jpeg/doxygen-extra.css
@@ -0,0 +1,3 @@
+code {
+	color: #4665A2;
+}
diff --git a/external/jpeg/doxygen.config b/external/jpeg/doxygen.config
new file mode 100644
index 000000000000..6deb064ecabc
--- /dev/null
+++ b/external/jpeg/doxygen.config
@@ -0,0 +1,16 @@
+PROJECT_NAME = TurboJPEG
+PROJECT_NUMBER = 2.1
+OUTPUT_DIRECTORY = doc/
+USE_WINDOWS_ENCODING = NO
+OPTIMIZE_OUTPUT_FOR_C = YES
+WARN_NO_PARAMDOC = YES
+GENERATE_LATEX = NO
+FILE_PATTERNS = turbojpeg.h
+HIDE_UNDOC_MEMBERS = YES
+VERBATIM_HEADERS = NO
+EXTRACT_STATIC = YES
+JAVADOC_AUTOBRIEF = YES
+MAX_INITIALIZER_LINES = 0
+ALWAYS_DETAILED_SEC = YES
+HTML_TIMESTAMP = NO
+HTML_EXTRA_STYLESHEET = doxygen-extra.css
diff --git a/external/jpeg/example.txt b/external/jpeg/example.txt
new file mode 100644
index 000000000000..bc0ba49d2921
--- /dev/null
+++ b/external/jpeg/example.txt
@@ -0,0 +1,464 @@
+/*
+ * example.txt
+ *
+ * This file illustrates how to use the IJG code as a subroutine library
+ * to read or write JPEG image files.  You should look at this code in
+ * conjunction with the documentation file libjpeg.txt.
+ *
+ * This code will not do anything useful as-is, but it may be helpful as a
+ * skeleton for constructing routines that call the JPEG library.
+ *
+ * We present these routines in the same coding style used in the JPEG code
+ * (ANSI function definitions, etc); but you are of course free to code your
+ * routines in a different style if you prefer.
+ */
+
+/* This example was part of the original libjpeg documentation and has been
+ * unchanged since 1994.  It is, as described in libjpeg.txt, "heavily
+ * commented skeleton code for calling the JPEG library."  It is not meant to
+ * be compiled as a standalone program, since it has no main() function and
+ * does not compress from/decompress to a real image buffer (corollary:
+ * put_scanline_someplace() is not a real function.)  First-time users of
+ * libjpeg-turbo would be better served by looking at tjexample.c, which uses
+ * the more straightforward TurboJPEG API, or at cjpeg.c and djpeg.c, which are
+ * examples of libjpeg API usage that can be (and are) compiled into standalone
+ * programs.  Note that this example, as well as the examples in cjpeg.c and
+ * djpeg.c, interleaves disk I/O with JPEG compression/decompression, so none of
+ * these examples is suitable for benchmarking purposes.
+ */
+
+#include <stdio.h>
+
+/*
+ * Include file for users of JPEG library.
+ * You will need to have included system headers that define at least
+ * the typedefs FILE and size_t before you can include jpeglib.h.
+ * (stdio.h is sufficient on ANSI-conforming systems.)
+ * You may also wish to include "jerror.h".
+ */
+
+#include "jpeglib.h"
+
+/*
+ * <setjmp.h> is used for the optional error recovery mechanism shown in
+ * the second part of the example.
+ */
+
+#include <setjmp.h>
+
+
+
+/******************** JPEG COMPRESSION SAMPLE INTERFACE *******************/
+
+/* This half of the example shows how to feed data into the JPEG compressor.
+ * We present a minimal version that does not worry about refinements such
+ * as error recovery (the JPEG code will just exit() if it gets an error).
+ */
+
+
+/*
+ * IMAGE DATA FORMATS:
+ *
+ * The standard input image format is a rectangular array of pixels, with
+ * each pixel having the same number of "component" values (color channels).
+ * Each pixel row is an array of JSAMPLEs (which typically are unsigned chars).
+ * If you are working with color data, then the color values for each pixel
+ * must be adjacent in the row; for example, R,G,B,R,G,B,R,G,B,... for 24-bit
+ * RGB color.
+ *
+ * For this example, we'll assume that this data structure matches the way
+ * our application has stored the image in memory, so we can just pass a
+ * pointer to our image buffer.  In particular, let's say that the image is
+ * RGB color and is described by:
+ */
+
+extern JSAMPLE *image_buffer;   /* Points to large array of R,G,B-order data */
+extern int image_height;        /* Number of rows in image */
+extern int image_width;         /* Number of columns in image */
+
+
+/*
+ * Sample routine for JPEG compression.  We assume that the target file name
+ * and a compression quality factor are passed in.
+ */
+
+GLOBAL(void)
+write_JPEG_file(char *filename, int quality)
+{
+  /* This struct contains the JPEG compression parameters and pointers to
+   * working space (which is allocated as needed by the JPEG library).
+   * It is possible to have several such structures, representing multiple
+   * compression/decompression processes, in existence at once.  We refer
+   * to any one struct (and its associated working data) as a "JPEG object".
+   */
+  struct jpeg_compress_struct cinfo;
+  /* This struct represents a JPEG error handler.  It is declared separately
+   * because applications often want to supply a specialized error handler
+   * (see the second half of this file for an example).  But here we just
+   * take the easy way out and use the standard error handler, which will
+   * print a message on stderr and call exit() if compression fails.
+   * Note that this struct must live as long as the main JPEG parameter
+   * struct, to avoid dangling-pointer problems.
+   */
+  struct jpeg_error_mgr jerr;
+  /* More stuff */
+  FILE *outfile;                /* target file */
+  JSAMPROW row_pointer[1];      /* pointer to JSAMPLE row[s] */
+  int row_stride;               /* physical row width in image buffer */
+
+  /* Step 1: allocate and initialize JPEG compression object */
+
+  /* We have to set up the error handler first, in case the initialization
+   * step fails.  (Unlikely, but it could happen if you are out of memory.)
+   * This routine fills in the contents of struct jerr, and returns jerr's
+   * address which we place into the link field in cinfo.
+   */
+  cinfo.err = jpeg_std_error(&jerr);
+  /* Now we can initialize the JPEG compression object. */
+  jpeg_create_compress(&cinfo);
+
+  /* Step 2: specify data destination (eg, a file) */
+  /* Note: steps 2 and 3 can be done in either order. */
+
+  /* Here we use the library-supplied code to send compressed data to a
+   * stdio stream.  You can also write your own code to do something else.
+   * VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
+   * requires it in order to write binary files.
+   */
+  if ((outfile = fopen(filename, "wb")) == NULL) {
+    fprintf(stderr, "can't open %s\n", filename);
+    exit(1);
+  }
+  jpeg_stdio_dest(&cinfo, outfile);
+
+  /* Step 3: set parameters for compression */
+
+  /* First we supply a description of the input image.
+   * Four fields of the cinfo struct must be filled in:
+   */
+  cinfo.image_width = image_width;      /* image width and height, in pixels */
+  cinfo.image_height = image_height;
+  cinfo.input_components = 3;           /* # of color components per pixel */
+  cinfo.in_color_space = JCS_RGB;       /* colorspace of input image */
+  /* Now use the library's routine to set default compression parameters.
+   * (You must set at least cinfo.in_color_space before calling this,
+   * since the defaults depend on the source color space.)
+   */
+  jpeg_set_defaults(&cinfo);
+  /* Now you can set any non-default parameters you wish to.
+   * Here we just illustrate the use of quality (quantization table) scaling:
+   */
+  jpeg_set_quality(&cinfo, quality, TRUE /* limit to baseline-JPEG values */);
+
+  /* Step 4: Start compressor */
+
+  /* TRUE ensures that we will write a complete interchange-JPEG file.
+   * Pass TRUE unless you are very sure of what you're doing.
+   */
+  jpeg_start_compress(&cinfo, TRUE);
+
+  /* Step 5: while (scan lines remain to be written) */
+  /*           jpeg_write_scanlines(...); */
+
+  /* Here we use the library's state variable cinfo.next_scanline as the
+   * loop counter, so that we don't have to keep track ourselves.
+   * To keep things simple, we pass one scanline per call; you can pass
+   * more if you wish, though.
+   */
+  row_stride = image_width * 3; /* JSAMPLEs per row in image_buffer */
+
+  while (cinfo.next_scanline < cinfo.image_height) {
+    /* jpeg_write_scanlines expects an array of pointers to scanlines; we pass
+     * a one-element array, but you could pass several rows per call.  The
+     * row offset is computed in size_t so that it does not wrap at 32 bits.
+     */
+    row_pointer[0] = &image_buffer[(size_t)cinfo.next_scanline * row_stride];
+    (void)jpeg_write_scanlines(&cinfo, row_pointer, 1);
+  }
+
+  /* Step 6: Finish compression */
+
+  jpeg_finish_compress(&cinfo);
+  /* After finish_compress, we can close the output file. */
+  fclose(outfile);
+
+  /* Step 7: release JPEG compression object */
+
+  /* This is an important step since it will release a good deal of memory. */
+  jpeg_destroy_compress(&cinfo);
+
+  /* And we're done! */
+}
+
+
+/*
+ * SOME FINE POINTS:
+ *
+ * In the above loop, we ignored the return value of jpeg_write_scanlines,
+ * which is the number of scanlines actually written.  We could get away
+ * with this because we were only relying on the value of cinfo.next_scanline,
+ * which will be incremented correctly.  If you maintain additional loop
+ * variables then you should be careful to increment them properly.
+ * Actually, for output to a stdio stream you needn't worry, because
+ * then jpeg_write_scanlines will write all the lines passed (or else exit
+ * with a fatal error).  Partial writes can only occur if you use a data
+ * destination module that can demand suspension of the compressor.
+ * (If you don't know what that's for, you don't need it.)
+ *
+ * If the compressor requires full-image buffers (for entropy-coding
+ * optimization or a multi-scan JPEG file), it will create temporary
+ * files for anything that doesn't fit within the maximum-memory setting.
+ * (Note that temp files are NOT needed if you use the default parameters.)
+ * On some systems you may need to set up a signal handler to ensure that
+ * temporary files are deleted if the program is interrupted.  See libjpeg.txt.
+ *
+ * Scanlines MUST be supplied in top-to-bottom order if you want your JPEG
+ * files to be compatible with everyone else's.  If you cannot readily read
+ * your data in that order, you'll need an intermediate array to hold the
+ * image.  See rdtarga.c or rdbmp.c for examples of handling bottom-to-top
+ * source data using the JPEG code's internal virtual-array mechanisms.
+ */
+
+
+
+/******************** JPEG DECOMPRESSION SAMPLE INTERFACE *******************/
+
+/* This half of the example shows how to read data from the JPEG decompressor.
+ * It's a bit more refined than the above, in that we show:
+ *   (a) how to modify the JPEG library's standard error-reporting behavior;
+ *   (b) how to allocate workspace using the library's memory manager.
+ *
+ * Just to make this example a little different from the first one, we'll
+ * assume that we do not intend to put the whole image into an in-memory
+ * buffer, but to send it line-by-line someplace else.  We need a one-
+ * scanline-high JSAMPLE array as a work buffer, and we will let the JPEG
+ * memory manager allocate it for us.  This approach is actually quite useful
+ * because we don't need to remember to deallocate the buffer separately: it
+ * will go away automatically when the JPEG object is cleaned up.
+ */
+
+
+/*
+ * ERROR HANDLING:
+ *
+ * The JPEG library's standard error handler (jerror.c) is divided into
+ * several "methods" which you can override individually.  This lets you
+ * adjust the behavior without duplicating a lot of code, which you might
+ * have to update with each future release.
+ *
+ * Our example here shows how to override the "error_exit" method so that
+ * control is returned to the library's caller when a fatal error occurs,
+ * rather than calling exit() as the standard error_exit method does.
+ *
+ * We use C's setjmp/longjmp facility to return control.  This means that the
+ * routine which calls the JPEG library must first execute a setjmp() call to
+ * establish the return point.  We want the replacement error_exit to do a
+ * longjmp().  But we need to make the setjmp buffer accessible to the
+ * error_exit routine.  To do this, we make a private extension of the
+ * standard JPEG error handler object.  (If we were using C++, we'd say we
+ * were making a subclass of the regular error handler.)
+ *
+ * Here's the extended error handler struct:
+ */
+
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;    /* "public" fields; first, as cinfo->err is cast to my_error_ptr */
+
+  jmp_buf setjmp_buffer;        /* setjmp context that my_error_exit longjmp()s to */
+};
+
+typedef struct my_error_mgr *my_error_ptr;
+
+/*
+ * Here's the routine that will replace the standard error_exit method:
+ */
+
+METHODDEF(void)                /* never returns normally; longjmp()s instead */
+my_error_exit(j_common_ptr cinfo)
+{
+  /* cinfo->err really points to a my_error_mgr struct, so coerce pointer */
+  my_error_ptr myerr = (my_error_ptr)cinfo->err;
+
+  /* Always display the message via the handler's output_message method. */
+  /* We could postpone this until after returning, if we chose. */
+  (*cinfo->err->output_message) (cinfo);
+
+  /* Return control to the setjmp point, where setjmp() now yields 1 */
+  longjmp(myerr->setjmp_buffer, 1);
+}
+
+
+METHODDEF(int) do_read_JPEG_file(struct jpeg_decompress_struct *cinfo,
+                                 char *filename);
+
+/*
+ * Sample routine for JPEG decompression.  We assume that the source file name
+ * is passed in.  We want to return 1 on success, 0 on error.
+ */
+
+GLOBAL(int)
+read_JPEG_file(char *filename)
+{
+  /* JPEG decompression parameters and pointers to working space (allocated
+   * as needed by the JPEG library).  It is declared in this frame and only
+   * passed by pointer to do_read_JPEG_file(), which contains the setjmp(). */
+  struct jpeg_decompress_struct cinfo;
+
+  return do_read_JPEG_file(&cinfo, filename);
+}
+
+/*
+ * We call the libjpeg API from within a separate function, because modifying
+ * the local non-volatile jpeg_decompress_struct instance below the setjmp()
+ * return point and then accessing the instance after setjmp() returns would
+ * result in undefined behavior that may potentially overwrite all or part of
+ * the structure.
+ */
+
+METHODDEF(int)                 /* 1 = success, 0 = fopen() or JPEG error */
+do_read_JPEG_file(struct jpeg_decompress_struct *cinfo, char *filename)
+{
+  /* We use our private extension JPEG error handler.
+   * Note that this struct must live as long as the main JPEG parameter
+   * struct, to avoid dangling-pointer problems.
+   */
+  struct my_error_mgr jerr;
+  /* Remaining locals: input stream and one-row output work buffer */
+  FILE *infile;                 /* source file */
+  JSAMPARRAY buffer;            /* Output row buffer */
+  int row_stride;               /* physical row width in output buffer */
+
+  /* In this example we want to open the input file before doing anything else,
+   * so that the setjmp() error recovery below can assume the file is open.
+   * VERY IMPORTANT: use "b" option to fopen() if you are on a machine that
+   * requires it in order to read binary files.
+   */
+
+  if ((infile = fopen(filename, "rb")) == NULL) {
+    fprintf(stderr, "can't open %s\n", filename);
+    return 0;
+  }
+
+  /* Step 1: allocate and initialize JPEG decompression object */
+
+  /* We set up the normal JPEG error routines, then override error_exit. */
+  cinfo->err = jpeg_std_error(&jerr.pub);
+  jerr.pub.error_exit = my_error_exit;
+  /* Establish the setjmp return context for my_error_exit to use. */
+  if (setjmp(jerr.setjmp_buffer)) {
+    /* If we get here, the JPEG code has signaled an error.
+     * We need to clean up the JPEG object, close the input file, and return.
+     */
+    jpeg_destroy_decompress(cinfo);
+    fclose(infile);
+    return 0;
+  }
+  /* Now we can initialize the JPEG decompression object. */
+  jpeg_create_decompress(cinfo);
+
+  /* Step 2: specify data source (eg, a file) */
+
+  jpeg_stdio_src(cinfo, infile);
+
+  /* Step 3: read file parameters with jpeg_read_header() */
+
+  (void)jpeg_read_header(cinfo, TRUE);
+  /* We can ignore the return value from jpeg_read_header since
+   *   (a) suspension is not possible with the stdio data source, and
+   *   (b) we passed TRUE to reject a tables-only JPEG file as an error.
+   * See libjpeg.txt for more info.
+   */
+
+  /* Step 4: set parameters for decompression */
+
+  /* In this example, we don't need to change any of the defaults set by
+   * jpeg_read_header(), so we do nothing here.
+   */
+
+  /* Step 5: Start decompressor */
+
+  (void)jpeg_start_decompress(cinfo);
+  /* We can ignore the return value since suspension is not possible
+   * with the stdio data source.
+   */
+
+  /* We may need to do some setup of our own at this point before reading
+   * the data.  After jpeg_start_decompress() we have the correct scaled
+   * output image dimensions available, as well as the output colormap
+   * if we asked for color quantization.
+   * In this example, we need to make an output work buffer of the right size.
+   */
+  /* JSAMPLEs per row in output buffer */
+  row_stride = cinfo->output_width * cinfo->output_components;
+  /* Make a one-row-high sample array that will go away when done with image */
+  buffer = (*cinfo->mem->alloc_sarray)
+                ((j_common_ptr)cinfo, JPOOL_IMAGE, row_stride, 1);
+
+  /* Step 6: while (scan lines remain to be read) */
+  /*           jpeg_read_scanlines(...); */
+
+  /* Here we use the library's state variable cinfo->output_scanline as the
+   * loop counter, so that we don't have to keep track ourselves.
+   */
+  while (cinfo->output_scanline < cinfo->output_height) {
+    /* jpeg_read_scanlines expects an array of pointers to scanlines.
+     * Here the array is only one element long, but you could ask for
+     * more than one scanline at a time if that's more convenient.
+     */
+    (void)jpeg_read_scanlines(cinfo, buffer, 1);
+    /* Assume put_scanline_someplace wants a pointer and sample count. */
+    put_scanline_someplace(buffer[0], row_stride);
+  }
+
+  /* Step 7: Finish decompression */
+
+  (void)jpeg_finish_decompress(cinfo);
+  /* We can ignore the return value since suspension is not possible
+   * with the stdio data source.
+   */
+
+  /* Step 8: Release JPEG decompression object */
+
+  /* This is an important step since it will release a good deal of memory. */
+  jpeg_destroy_decompress(cinfo);
+
+  /* After finish_decompress, we can close the input file.
+   * Here we postpone it until after no more JPEG errors are possible,
+   * so as to simplify the setjmp error logic above.  (Actually, I don't
+   * think that jpeg_destroy can do an error exit, but why assume anything...)
+   */
+  fclose(infile);
+
+  /* At this point you may want to check to see whether any corrupt-data
+   * warnings occurred (test whether jerr.pub.num_warnings is nonzero).
+   */
+
+  /* And we're done! */
+  return 1;
+}
+
+
+/*
+ * SOME FINE POINTS:
+ *
+ * In the above code, we ignored the return value of jpeg_read_scanlines,
+ * which is the number of scanlines actually read.  We could get away with
+ * this because we asked for only one line at a time and we weren't using
+ * a suspending data source.  See libjpeg.txt for more info.
+ *
+ * We cheated a bit by calling alloc_sarray() after jpeg_start_decompress();
+ * we should have done it beforehand to ensure that the space would be
+ * counted against the JPEG max_memory setting.  In some systems the above
+ * code would risk an out-of-memory error.  However, in general we don't
+ * know the output image dimensions before jpeg_start_decompress(), unless we
+ * call jpeg_calc_output_dimensions().  See libjpeg.txt for more about this.
+ *
+ * Scanlines are returned in the same order as they appear in the JPEG file,
+ * which is standardly top-to-bottom.  If you must emit data bottom-to-top,
+ * you can use one of the virtual arrays provided by the JPEG memory manager
+ * to invert the data.  See wrbmp.c for an example.
+ *
+ * As with compression, some operating modes may require temporary files.
+ * On some systems you may need to set up a signal handler to ensure that
+ * temporary files are deleted if the program is interrupted.  See libjpeg.txt.
+ */
diff --git a/external/jpeg/jccolext.c b/external/jpeg/jccolext.c
index 19c955c9d6af..303b322ce674 100644
--- a/external/jpeg/jccolext.c
+++ b/external/jpeg/jccolext.c
@@ -48,9 +48,9 @@ rgb_ycc_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -100,9 +100,9 @@ rgb_gray_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr[RGB_RED]);
-      g = GETJSAMPLE(inptr[RGB_GREEN]);
-      b = GETJSAMPLE(inptr[RGB_BLUE]);
+      r = inptr[RGB_RED];
+      g = inptr[RGB_GREEN];
+      b = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
       /* Y */
       outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
@@ -135,9 +135,9 @@ rgb_rgb_convert_internal(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr2 = output_buf[2][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr0[col] = GETJSAMPLE(inptr[RGB_RED]);
-      outptr1[col] = GETJSAMPLE(inptr[RGB_GREEN]);
-      outptr2[col] = GETJSAMPLE(inptr[RGB_BLUE]);
+      outptr0[col] = inptr[RGB_RED];
+      outptr1[col] = inptr[RGB_GREEN];
+      outptr2[col] = inptr[RGB_BLUE];
       inptr += RGB_PIXELSIZE;
     }
   }
diff --git a/external/jpeg/jccolor.c b/external/jpeg/jccolor.c
index 036f6016d18c..bdc563c723ca 100644
--- a/external/jpeg/jccolor.c
+++ b/external/jpeg/jccolor.c
@@ -392,11 +392,11 @@ cmyk_ycck_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr3 = output_buf[3][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      r = MAXJSAMPLE - GETJSAMPLE(inptr[0]);
-      g = MAXJSAMPLE - GETJSAMPLE(inptr[1]);
-      b = MAXJSAMPLE - GETJSAMPLE(inptr[2]);
+      r = MAXJSAMPLE - inptr[0];
+      g = MAXJSAMPLE - inptr[1];
+      b = MAXJSAMPLE - inptr[2];
       /* K passes through as-is */
-      outptr3[col] = inptr[3];  /* don't need GETJSAMPLE here */
+      outptr3[col] = inptr[3];
       inptr += 4;
       /* If the inputs are 0..MAXJSAMPLE, the outputs of these equations
        * must be too; we do not need an explicit range-limiting operation.
@@ -438,7 +438,7 @@ grayscale_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     outptr = output_buf[0][output_row];
     output_row++;
     for (col = 0; col < num_cols; col++) {
-      outptr[col] = inptr[0];   /* don't need GETJSAMPLE() here */
+      outptr[col] = inptr[0];
       inptr += instride;
     }
   }
@@ -497,7 +497,7 @@ null_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
         inptr = *input_buf;
         outptr = output_buf[ci][output_row];
         for (col = 0; col < num_cols; col++) {
-          outptr[col] = inptr[ci]; /* don't need GETJSAMPLE() here */
+          outptr[col] = inptr[ci];
           inptr += nc;
         }
       }
diff --git a/external/jpeg/jcdctmgr.c b/external/jpeg/jcdctmgr.c
index c04058e6cec9..7dae17a6e149 100644
--- a/external/jpeg/jcdctmgr.c
+++ b/external/jpeg/jcdctmgr.c
@@ -381,19 +381,19 @@ convsamp(JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
     elemptr = sample_data[elemr] + start_col;
 
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
-    *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
+    *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = GETJSAMPLE(*elemptr++) - CENTERJSAMPLE;
+        *workspaceptr++ = (*elemptr++) - CENTERJSAMPLE;
     }
 #endif
   }
@@ -533,20 +533,19 @@ convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
   for (elemr = 0; elemr < DCTSIZE; elemr++) {
     elemptr = sample_data[elemr] + start_col;
 #if DCTSIZE == 8                /* unroll the inner loop */
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
-    *workspaceptr++ = (FAST_FLOAT)(GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
+    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
 #else
     {
       register int elemc;
       for (elemc = DCTSIZE; elemc > 0; elemc--)
-        *workspaceptr++ = (FAST_FLOAT)
-                          (GETJSAMPLE(*elemptr++) - CENTERJSAMPLE);
+        *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - CENTERJSAMPLE);
     }
 #endif
   }
diff --git a/external/jpeg/jchuff.c b/external/jpeg/jchuff.c
index db85ce114f8b..2bce767ebd70 100644
--- a/external/jpeg/jchuff.c
+++ b/external/jpeg/jchuff.c
@@ -4,8 +4,10 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2014-2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2009-2011, 2014-2016, 2018-2021, D. R. Commander.
  * Copyright (C) 2015, Matthieu Darbois.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -42,15 +44,19 @@
  * flags (this defines __thumb__).
  */
 
-/* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || \
+    defined(_M_ARM64)
 #if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
 
 #ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
+#else
 #define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
+#endif
 #define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
 #else
 #include "jpeg_nbits_table.h"
@@ -65,31 +71,42 @@
  * but must not be updated permanently until we complete the MCU.
  */
 
-typedef struct {
-  size_t put_buffer;                    /* current bit-accumulation buffer */
-  int put_bits;                         /* # of bits now in it */
-  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
-} savable_state;
+#if defined(__x86_64__) && defined(__ILP32__)
+typedef unsigned long long bit_buf_type;
+#else
+typedef size_t bit_buf_type;
+#endif
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
+/* NOTE: The more optimal Huffman encoding algorithm is only used by the
+ * intrinsics implementation of the Arm Neon SIMD extensions, which is why we
+ * retain the old Huffman encoder behavior when using the GAS implementation.
  */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
+#if defined(WITH_SIMD) && !(defined(__arm__) || defined(__aarch64__) || \
+                            defined(_M_ARM) || defined(_M_ARM64))
+typedef unsigned long long simd_bit_buf_type;
 #else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).put_buffer = (src).put_buffer, \
-   (dest).put_bits = (src).put_bits, \
-   (dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
+typedef bit_buf_type simd_bit_buf_type;
 #endif
+
+#if (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 8) || defined(_WIN64) || \
+    (defined(__x86_64__) && defined(__ILP32__))
+#define BIT_BUF_SIZE  64
+#elif (defined(SIZEOF_SIZE_T) && SIZEOF_SIZE_T == 4) || defined(_WIN32)
+#define BIT_BUF_SIZE  32
+#else
+#error Cannot determine word size
 #endif
+#define SIMD_BIT_BUF_SIZE  (sizeof(simd_bit_buf_type) * 8)
 
+typedef struct {
+  union {
+    bit_buf_type c;
+    simd_bit_buf_type simd;
+  } put_buffer;                         /* current bit accumulation buffer */
+  int free_bits;                        /* # of bits available in it */
+                                        /* (Neon GAS: # of bits now in it) */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
+} savable_state;
 
 typedef struct {
   struct jpeg_entropy_encoder pub; /* public fields */
@@ -123,6 +140,7 @@ typedef struct {
   size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
   savable_state cur;            /* Current bit buffer & DC state */
   j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;
 } working_state;
 
 
@@ -201,8 +219,17 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
   }
 
   /* Initialize bit buffer to empty */
-  entropy->saved.put_buffer = 0;
-  entropy->saved.put_bits = 0;
+  if (entropy->simd) {
+    entropy->saved.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    entropy->saved.free_bits = 0;
+#else
+    entropy->saved.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    entropy->saved.put_buffer.c = 0;
+    entropy->saved.free_bits = BIT_BUF_SIZE;
+  }
 
   /* Initialize restart stuff */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -287,6 +314,7 @@ jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
    * this lets us detect duplicate VAL entries here, and later
    * allows emit_bits to detect any attempt to emit such symbols.
    */
+  MEMZERO(dtbl->ehufco, sizeof(dtbl->ehufco));
   MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
 
   /* This is also a convenient place to check for out-of-range
@@ -334,94 +362,94 @@ dump_buffer(working_state *state)
 
 /* Outputting bits to the file */
 
-/* These macros perform the same task as the emit_bits() function in the
- * original libjpeg code.  In addition to reducing overhead by explicitly
- * inlining the code, additional performance is achieved by taking into
- * account the size of the bit buffer and waiting until it is almost full
- * before emptying it.  This mostly benefits 64-bit platforms, since 6
- * bytes can be stored in a 64-bit bit buffer before it has to be emptied.
+/* Output byte b and, speculatively, an additional 0 byte.  0xFF must be
+ * encoded as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the
+ * byte is 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and
+ * the speculative 0 byte will be overwritten by the next byte.
  */
-
-#define EMIT_BYTE() { \
-  JOCTET c; \
-  put_bits -= 8; \
-  c = (JOCTET)GETJOCTET(put_buffer >> put_bits); \
-  *buffer++ = c; \
-  if (c == 0xFF)  /* need to stuff a zero byte? */ \
-    *buffer++ = 0; \
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0; \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF); \
 }
 
-#define PUT_BITS(code, size) { \
-  put_bits += size; \
-  put_buffer = (put_buffer << size) | code; \
-}
-
-#if SIZEOF_SIZE_T != 8 && !defined(_WIN64)
-
-#define CHECKBUF15() { \
-  if (put_bits > 15) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
+/* Output the entire bit buffer.  If there are no 0xFF bytes in it, then write
+ * directly to the output buffer.  Otherwise, use the EMIT_BYTE() macro to
+ * encode 0xFF as 0xFF 0x00.
+ */
+#if BIT_BUF_SIZE == 64
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    buffer[0] = (JOCTET)(put_buffer >> 56); \
+    buffer[1] = (JOCTET)(put_buffer >> 48); \
+    buffer[2] = (JOCTET)(put_buffer >> 40); \
+    buffer[3] = (JOCTET)(put_buffer >> 32); \
+    buffer[4] = (JOCTET)(put_buffer >> 24); \
+    buffer[5] = (JOCTET)(put_buffer >> 16); \
+    buffer[6] = (JOCTET)(put_buffer >> 8); \
+    buffer[7] = (JOCTET)(put_buffer); \
+    buffer += 8; \
   } \
 }
 
-#endif
-
-#define CHECKBUF31() { \
-  if (put_bits > 31) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-  } \
-}
+#else
 
-#define CHECKBUF47() { \
-  if (put_bits > 47) { \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
-    EMIT_BYTE() \
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    buffer[0] = (JOCTET)(put_buffer >> 24); \
+    buffer[1] = (JOCTET)(put_buffer >> 16); \
+    buffer[2] = (JOCTET)(put_buffer >> 8); \
+    buffer[3] = (JOCTET)(put_buffer); \
+    buffer += 4; \
   } \
 }
 
-#if !defined(_WIN32) && !defined(SIZEOF_SIZE_T)
-#error Cannot determine word size
 #endif
 
-#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
-
-#define EMIT_BITS(code, size) { \
-  CHECKBUF47() \
-  PUT_BITS(code, size) \
-}
-
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG)1) << nbits) - 1; \
-  CHECKBUF31() \
-  PUT_BITS(code, size) \
-  PUT_BITS(temp2, nbits) \
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE; \
+  put_buffer = code; \
 }
 
-#else
-
-#define EMIT_BITS(code, size) { \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size; \
+  if (free_bits < 0) \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
 }
 
-#define EMIT_CODE(code, size) { \
-  temp2 &= (((JLONG)1) << nbits) - 1; \
-  PUT_BITS(code, size) \
-  CHECKBUF15() \
-  PUT_BITS(temp2, nbits) \
-  CHECKBUF15() \
+#define PUT_CODE(code, size) { \
+  temp &= (((JLONG)1) << nbits) - 1; \
+  temp |= code << nbits; \
+  nbits += size; \
+  PUT_BITS(temp, nbits) \
 }
 
-#endif
-
 
 /* Although it is exceedingly rare, it is possible for a Huffman-encoded
  * coefficient block to be larger than the 128-byte unencoded block.  For each
@@ -444,6 +472,7 @@ dump_buffer(working_state *state)
 
 #define STORE_BUFFER() { \
   if (localbuf) { \
+    size_t bytes, bytestocopy; \
     bytes = buffer - _buffer; \
     buffer = _buffer; \
     while (bytes > 0) { \
@@ -466,20 +495,46 @@ dump_buffer(working_state *state)
 LOCAL(boolean)
 flush_bits(working_state *state)
 {
-  JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  JOCTET _buffer[BUFSIZE], *buffer, temp;
+  simd_bit_buf_type put_buffer;  int put_bits;
+  int localbuf = 0;
+
+  if (state->simd) {
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    put_bits = state->cur.free_bits;
+#else
+    put_bits = SIMD_BIT_BUF_SIZE - state->cur.free_bits;
+#endif
+    put_buffer = state->cur.put_buffer.simd;
+  } else {
+    put_bits = BIT_BUF_SIZE - state->cur.free_bits;
+    put_buffer = state->cur.put_buffer.c;
+  }
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
   LOAD_BUFFER()
 
-  /* fill any partial byte with ones */
-  PUT_BITS(0x7F, 7)
-  while (put_bits >= 8) EMIT_BYTE()
+  while (put_bits >= 8) {
+    put_bits -= 8;
+    temp = (JOCTET)(put_buffer >> put_bits);
+    EMIT_BYTE(temp)
+  }
+  if (put_bits) {
+    /* fill partial byte with ones */
+    temp = (JOCTET)((put_buffer << (8 - put_bits)) | (0xFF >> put_bits));
+    EMIT_BYTE(temp)
+  }
 
-  state->cur.put_buffer = 0;    /* and reset bit-buffer to empty */
-  state->cur.put_bits = 0;
+  if (state->simd) {                    /* and reset bit buffer to empty */
+    state->cur.put_buffer.simd = 0;
+#if defined(__aarch64__) && !defined(NEON_INTRINSICS)
+    state->cur.free_bits = 0;
+#else
+    state->cur.free_bits = SIMD_BIT_BUF_SIZE;
+#endif
+  } else {
+    state->cur.put_buffer.c = 0;
+    state->cur.free_bits = BIT_BUF_SIZE;
+  }
   STORE_BUFFER()
 
   return TRUE;
@@ -493,7 +548,7 @@ encode_one_block_simd(working_state *state, JCOEFPTR block, int last_dc_val,
                       c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
   LOAD_BUFFER()
 
@@ -509,53 +564,41 @@ LOCAL(boolean)
 encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 {
-  int temp, temp2, temp3;
-  int nbits;
-  int r, code, size;
+  int temp, nbits, free_bits;
+  bit_buf_type put_buffer;
   JOCTET _buffer[BUFSIZE], *buffer;
-  size_t put_buffer;  int put_bits;
-  int code_0xf0 = actbl->ehufco[0xf0], size_0xf0 = actbl->ehufsi[0xf0];
-  size_t bytes, bytestocopy;  int localbuf = 0;
+  int localbuf = 0;
 
-  put_buffer = state->cur.put_buffer;
-  put_bits = state->cur.put_bits;
+  free_bits = state->cur.free_bits;
+  put_buffer = state->cur.put_buffer.c;
   LOAD_BUFFER()
 
   /* Encode the DC coefficient difference per section F.1.2.1 */
 
-  temp = temp2 = block[0] - last_dc_val;
+  temp = block[0] - last_dc_val;
 
   /* This is a well-known technique for obtaining the absolute value without a
    * branch.  It is derived from an assembly language technique presented in
    * "How to Optimize for the Pentium Processors", Copyright (c) 1996, 1997 by
-   * Agner Fog.
+   * Agner Fog.  This code assumes we are on a two's complement machine.
    */
-  temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-  temp ^= temp3;
-  temp -= temp3;
-
-  /* For a negative input, want temp2 = bitwise complement of abs(input) */
-  /* This code assumes we are on a two's complement machine */
-  temp2 += temp3;
+  nbits = temp >> (CHAR_BIT * sizeof(int) - 1);
+  temp += nbits;
+  nbits ^= temp;
 
   /* Find the number of bits needed for the magnitude of the coefficient */
-  nbits = JPEG_NBITS(temp);
-
-  /* Emit the Huffman-coded symbol for the number of bits */
-  code = dctbl->ehufco[nbits];
-  size = dctbl->ehufsi[nbits];
-  EMIT_BITS(code, size)
+  nbits = JPEG_NBITS(nbits);
 
-  /* Mask off any extra bits in code */
-  temp2 &= (((JLONG)1) << nbits) - 1;
-
-  /* Emit that number of bits of the value, if positive, */
-  /* or the complement of its magnitude, if negative. */
-  EMIT_BITS(temp2, nbits)
+  /* Emit the Huffman-coded symbol for the number of bits.
+   * Emit that number of bits of the value, if positive,
+   * or the complement of its magnitude, if negative.
+   */
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits])
 
   /* Encode the AC coefficients per section F.1.2.2 */
 
-  r = 0;                        /* r = run length of zeros */
+  {
+    int r = 0;                  /* r = run length of zeros */
 
 /* Manually unroll the k loop to eliminate the counter variable.  This
  * improves performance greatly on systems with a limited number of
@@ -563,51 +606,46 @@ encode_one_block(working_state *state, JCOEFPTR block, int last_dc_val,
  */
 #define kloop(jpeg_natural_order_of_k) { \
   if ((temp = block[jpeg_natural_order_of_k]) == 0) { \
-    r++; \
+    r += 16; \
   } else { \
-    temp2 = temp; \
     /* Branch-less absolute value, bitwise complement, etc., same as above */ \
-    temp3 = temp >> (CHAR_BIT * sizeof(int) - 1); \
-    temp ^= temp3; \
-    temp -= temp3; \
-    temp2 += temp3; \
-    nbits = JPEG_NBITS_NONZERO(temp); \
+    nbits = temp >> (CHAR_BIT * sizeof(int) - 1); \
+    temp += nbits; \
+    nbits ^= temp; \
+    nbits = JPEG_NBITS_NONZERO(nbits); \
     /* if run length > 15, must emit special run-length-16 codes (0xF0) */ \
-    while (r > 15) { \
-      EMIT_BITS(code_0xf0, size_0xf0) \
-      r -= 16; \
+    while (r >= 16 * 16) { \
+      r -= 16 * 16; \
+      PUT_BITS(actbl->ehufco[0xf0], actbl->ehufsi[0xf0]) \
     } \
     /* Emit Huffman symbol for run length / number of bits */ \
-    temp3 = (r << 4) + nbits; \
-    code = actbl->ehufco[temp3]; \
-    size = actbl->ehufsi[temp3]; \
-    EMIT_CODE(code, size) \
+    r += nbits; \
+    PUT_CODE(actbl->ehufco[r], actbl->ehufsi[r]) \
     r = 0; \
   } \
 }
 
-  /* One iteration for each value in jpeg_natural_order[] */
-  kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
-  kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
-  kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
-  kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
-  kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
-  kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
-  kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
-  kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
-  kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
-  kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
-  kloop(55);  kloop(62);  kloop(63);
-
-  /* If the last coef(s) were zero, emit an end-of-block code */
-  if (r > 0) {
-    code = actbl->ehufco[0];
-    size = actbl->ehufsi[0];
-    EMIT_BITS(code, size)
+    /* One iteration for each value in jpeg_natural_order[] */
+    kloop(1);   kloop(8);   kloop(16);  kloop(9);   kloop(2);   kloop(3);
+    kloop(10);  kloop(17);  kloop(24);  kloop(32);  kloop(25);  kloop(18);
+    kloop(11);  kloop(4);   kloop(5);   kloop(12);  kloop(19);  kloop(26);
+    kloop(33);  kloop(40);  kloop(48);  kloop(41);  kloop(34);  kloop(27);
+    kloop(20);  kloop(13);  kloop(6);   kloop(7);   kloop(14);  kloop(21);
+    kloop(28);  kloop(35);  kloop(42);  kloop(49);  kloop(56);  kloop(57);
+    kloop(50);  kloop(43);  kloop(36);  kloop(29);  kloop(22);  kloop(15);
+    kloop(23);  kloop(30);  kloop(37);  kloop(44);  kloop(51);  kloop(58);
+    kloop(59);  kloop(52);  kloop(45);  kloop(38);  kloop(31);  kloop(39);
+    kloop(46);  kloop(53);  kloop(60);  kloop(61);  kloop(54);  kloop(47);
+    kloop(55);  kloop(62);  kloop(63);
+
+    /* If the last coef(s) were zero, emit an end-of-block code */
+    if (r > 0) {
+      PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+    }
   }
 
-  state->cur.put_buffer = put_buffer;
-  state->cur.put_bits = put_bits;
+  state->cur.put_buffer.c = put_buffer;
+  state->cur.free_bits = free_bits;
   STORE_BUFFER()
 
   return TRUE;
@@ -654,8 +692,9 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Load up working state */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Emit restart marker if needed */
   if (cinfo->restart_interval) {
@@ -694,7 +733,7 @@ encode_mcu_huff(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Completed MCU, so update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 
   /* Update restart-interval state too */
   if (cinfo->restart_interval) {
@@ -723,8 +762,9 @@ finish_pass_huff(j_compress_ptr cinfo)
   /* Load up working state ... flush_bits needs it */
   state.next_output_byte = cinfo->dest->next_output_byte;
   state.free_in_buffer = cinfo->dest->free_in_buffer;
-  ASSIGN_STATE(state.cur, entropy->saved);
+  state.cur = entropy->saved;
   state.cinfo = cinfo;
+  state.simd = entropy->simd;
 
   /* Flush out the last data */
   if (!flush_bits(&state))
@@ -733,7 +773,7 @@ finish_pass_huff(j_compress_ptr cinfo)
   /* Update state */
   cinfo->dest->next_output_byte = state.next_output_byte;
   cinfo->dest->free_in_buffer = state.free_in_buffer;
-  ASSIGN_STATE(entropy->saved, state.cur);
+  entropy->saved = state.cur;
 }
 
 
diff --git a/external/jpeg/jconfig.h.in b/external/jpeg/jconfig.h.in
index 18a69a48142a..d4284d97b812 100644
--- a/external/jpeg/jconfig.h.in
+++ b/external/jpeg/jconfig.h.in
@@ -61,11 +61,6 @@
    unsigned. */
 #cmakedefine RIGHT_SHIFT_IS_UNSIGNED 1
 
-/* Define to 1 if type `char' is unsigned and you are not using gcc.  */
-#ifndef __CHAR_UNSIGNED__
-  #cmakedefine __CHAR_UNSIGNED__ 1
-#endif
-
 /* Define to empty if `const' does not conform to ANSI C. */
 /* #undef const */
 
diff --git a/external/jpeg/jconfig.txt b/external/jpeg/jconfig.txt
new file mode 100644
index 000000000000..21f35c13f5a0
--- /dev/null
+++ b/external/jpeg/jconfig.txt
@@ -0,0 +1,136 @@
+/*
+ * jconfig.txt
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1994, Thomas G. Lane.
+ * It was modified by The libjpeg-turbo Project to include only code relevant
+ * to libjpeg-turbo.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ *
+ * This file documents the configuration options that are required to
+ * customize the JPEG software for a particular system.
+ *
+ * The actual configuration options for a particular installation are stored
+ * in jconfig.h.  On many machines, jconfig.h can be generated automatically
+ * or copied from one of the "canned" jconfig files that we supply.  But if
+ * you need to generate a jconfig.h file by hand, this file tells you how.
+ *
+ * DO NOT EDIT THIS FILE --- IT WON'T ACCOMPLISH ANYTHING.
+ * EDIT A COPY NAMED JCONFIG.H.
+ */
+
+
+/*
+ * These symbols indicate the properties of your machine or compiler.
+ * #define the symbol if yes, #undef it if no.
+ */
+
+/* Does your compiler support the declaration "unsigned char" ?
+ * How about "unsigned short" ?
+ */
+#define HAVE_UNSIGNED_CHAR
+#define HAVE_UNSIGNED_SHORT
+
+/* Define "void" as "char" if your compiler doesn't know about type void.
+ * NOTE: be sure to define void such that "void *" represents the most general
+ * pointer type, e.g., that returned by malloc().
+ */
+/* #define void char */
+
+/* Define "const" as empty if your compiler doesn't know the "const" keyword.
+ */
+/* #define const */
+
+/* Define this if your system has an ANSI-conforming <stddef.h> file.
+ */
+#define HAVE_STDDEF_H
+
+/* Define this if your system has an ANSI-conforming <stdlib.h> file.
+ */
+#define HAVE_STDLIB_H
+
+/* Define this if your system does not have an ANSI/SysV <string.h>,
+ * but does have a BSD-style <strings.h>.
+ */
+#undef NEED_BSD_STRINGS
+
+/* Define this if your system does not provide typedef size_t in any of the
+ * ANSI-standard places (stddef.h, stdlib.h, or stdio.h), but places it in
+ * <sys/types.h> instead.
+ */
+#undef NEED_SYS_TYPES_H
+
+/* Although a real ANSI C compiler can deal perfectly well with pointers to
+ * unspecified structures (see "incomplete types" in the spec), a few pre-ANSI
+ * and pseudo-ANSI compilers get confused.  To keep one of these bozos happy,
+ * define INCOMPLETE_TYPES_BROKEN.  This is not recommended unless you
+ * actually get "missing structure definition" warnings or errors while
+ * compiling the JPEG code.
+ */
+#undef INCOMPLETE_TYPES_BROKEN
+
+/* Define "boolean" as unsigned char, not int, on Windows systems.
+ */
+#ifdef _WIN32
+#ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
+typedef unsigned char boolean;
+#endif
+#define HAVE_BOOLEAN            /* prevent jmorecfg.h from redefining it */
+#endif
+
+
+/*
+ * The following options affect code selection within the JPEG library,
+ * but they don't need to be visible to applications using the library.
+ * To minimize application namespace pollution, the symbols won't be
+ * defined unless JPEG_INTERNALS has been defined.
+ */
+
+#ifdef JPEG_INTERNALS
+
+/* Define this if your compiler implements ">>" on signed values as a logical
+ * (unsigned) shift; leave it undefined if ">>" is a signed (arithmetic) shift,
+ * which is the normal and rational definition.
+ */
+#undef RIGHT_SHIFT_IS_UNSIGNED
+
+
+#endif /* JPEG_INTERNALS */
+
+
+/*
+ * The remaining options do not affect the JPEG library proper,
+ * but only the sample applications cjpeg/djpeg (see cjpeg.c, djpeg.c).
+ * Other applications can ignore these.
+ */
+
+#ifdef JPEG_CJPEG_DJPEG
+
+/* These defines indicate which image (non-JPEG) file formats are allowed. */
+
+#define BMP_SUPPORTED           /* BMP image file format */
+#define GIF_SUPPORTED           /* GIF image file format */
+#define PPM_SUPPORTED           /* PBMPLUS PPM/PGM image file format */
+#define TARGA_SUPPORTED         /* Targa image file format */
+
+/* Define this if you want to name both input and output files on the command
+ * line, rather than using stdout and optionally stdin.  You MUST do this if
+ * your system can't cope with binary I/O to stdin/stdout.  See comments at
+ * head of cjpeg.c or djpeg.c.
+ */
+#undef TWO_FILE_COMMANDLINE
+
+/* By default, we open image files with fopen(..., "rb") or fopen(..., "wb").
+ * This is necessary on systems that distinguish text files from binary files,
+ * and is harmless on most systems that don't.  If you have one of the rare
+ * systems that complains about the "b" spec, define this symbol.
+ */
+#undef DONT_USE_B_MODE
+
+/* Define this if you want percent-done progress reports from cjpeg/djpeg.
+ */
+#undef PROGRESS_REPORT
+
+
+#endif /* JPEG_CJPEG_DJPEG */
diff --git a/external/jpeg/jcphuff.c b/external/jpeg/jcphuff.c
index a8b94bed84b8..bd14fc27d5e2 100644
--- a/external/jpeg/jcphuff.c
+++ b/external/jpeg/jcphuff.c
@@ -4,8 +4,9 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2011, 2015, 2018, D. R. Commander.
+ * Copyright (C) 2011, 2015, 2018, 2021, D. R. Commander.
  * Copyright (C) 2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -51,15 +52,19 @@
  * flags (this defines __thumb__).
  */
 
-/* NOTE: Both GCC and Clang define __GNUC__ */
-#if defined(__GNUC__) && (defined(__arm__) || defined(__aarch64__))
+#if defined(__arm__) || defined(__aarch64__) || defined(_M_ARM) || \
+    defined(_M_ARM64)
 #if !defined(__thumb__) || defined(__thumb2__)
 #define USE_CLZ_INTRINSIC
 #endif
 #endif
 
 #ifdef USE_CLZ_INTRINSIC
+#if defined(_MSC_VER) && !defined(__clang__)
+#define JPEG_NBITS_NONZERO(x)  (32 - _CountLeadingZeros(x))
+#else
 #define JPEG_NBITS_NONZERO(x)  (32 - __builtin_clz(x))
+#endif
 #define JPEG_NBITS(x)          (x ? JPEG_NBITS_NONZERO(x) : 0)
 #else
 #include "jpeg_nbits_table.h"
@@ -169,24 +174,26 @@ INLINE
 METHODDEF(int)
 count_zeroes(size_t *x)
 {
-  int result;
 #if defined(HAVE_BUILTIN_CTZL)
+  int result;
   result = __builtin_ctzl(*x);
   *x >>= result;
 #elif defined(HAVE_BITSCANFORWARD64)
+  unsigned long result;
   _BitScanForward64(&result, *x);
   *x >>= result;
 #elif defined(HAVE_BITSCANFORWARD)
+  unsigned long result;
   _BitScanForward(&result, *x);
   *x >>= result;
 #else
-  result = 0;
+  int result = 0;
   while ((*x & 1) == 0) {
     ++result;
     *x >>= 1;
   }
 #endif
-  return result;
+  return (int)result;
 }
 
 
@@ -860,7 +867,7 @@ encode_mcu_AC_refine_prepare(const JCOEF *block,
 
 #define ENCODE_COEFS_AC_REFINE(label) { \
   while (zerobits) { \
-    int idx = count_zeroes(&zerobits); \
+    idx = count_zeroes(&zerobits); \
     r += idx; \
     cabsvalue += idx; \
     signbits >>= idx; \
@@ -917,7 +924,7 @@ METHODDEF(boolean)
 encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 {
   phuff_entropy_ptr entropy = (phuff_entropy_ptr)cinfo->entropy;
-  register int temp, r;
+  register int temp, r, idx;
   char *BR_buffer;
   unsigned int BR;
   int Sl = cinfo->Se - cinfo->Ss + 1;
@@ -968,7 +975,7 @@ encode_mcu_AC_refine(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
 
   if (zerobits) {
     int diff = ((absvalues + DCTSIZE2 / 2) - cabsvalue);
-    int idx = count_zeroes(&zerobits);
+    idx = count_zeroes(&zerobits);
     signbits >>= idx;
     idx += diff;
     r += idx;
diff --git a/external/jpeg/jcsample.c b/external/jpeg/jcsample.c
index bd27b84e068a..e8515ebf0fce 100644
--- a/external/jpeg/jcsample.c
+++ b/external/jpeg/jcsample.c
@@ -6,7 +6,7 @@
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
- * Copyright (C) 2015, D. R. Commander.
+ * Copyright (C) 2015, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -103,7 +103,7 @@ expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
@@ -174,7 +174,7 @@ int_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
       for (v = 0; v < v_expand; v++) {
         inptr = input_data[inrow + v] + outcol_h;
         for (h = 0; h < h_expand; h++) {
-          outvalue += (JLONG)GETJSAMPLE(*inptr++);
+          outvalue += (JLONG)(*inptr++);
         }
       }
       *outptr++ = (JSAMPLE)((outvalue + numpix2) / numpix);
@@ -237,8 +237,7 @@ h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     inptr = input_data[outrow];
     bias = 0;                   /* bias = 0,1,0,1,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
-      *outptr++ =
-        (JSAMPLE)((GETJSAMPLE(*inptr) + GETJSAMPLE(inptr[1]) + bias) >> 1);
+      *outptr++ = (JSAMPLE)((inptr[0] + inptr[1] + bias) >> 1);
       bias ^= 1;                /* 0=>1, 1=>0 */
       inptr += 2;
     }
@@ -277,8 +276,7 @@ h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     bias = 1;                   /* bias = 1,2,1,2,... for successive samples */
     for (outcol = 0; outcol < output_cols; outcol++) {
       *outptr++ =
-        (JSAMPLE)((GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                   GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]) + bias) >> 2);
+        (JSAMPLE)((inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1] + bias) >> 2);
       bias ^= 3;                /* 1=>2, 2=>1 */
       inptr0 += 2;  inptr1 += 2;
     }
@@ -337,33 +335,25 @@ h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     below_ptr = input_data[inrow + 2];
 
     /* Special case for first column: pretend column -1 is same as column 0 */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[2]) +
-               GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[2]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[0] + inptr0[2] + inptr1[0] + inptr1[2];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[2]) +
-                GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[2]);
+    neighsum += above_ptr[0] + above_ptr[2] + below_ptr[0] + below_ptr[2];
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     inptr0 += 2;  inptr1 += 2;  above_ptr += 2;  below_ptr += 2;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
       /* sum of pixels directly mapped to this output element */
-      membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                  GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
+      membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
       /* sum of edge-neighbor pixels */
-      neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-                 GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-                 GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[2]) +
-                 GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[2]);
+      neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+                 inptr0[-1] + inptr0[2] + inptr1[-1] + inptr1[2];
       /* The edge-neighbors count twice as much as corner-neighbors */
       neighsum += neighsum;
       /* Add in the corner-neighbors */
-      neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[2]) +
-                  GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[2]);
+      neighsum += above_ptr[-1] + above_ptr[2] + below_ptr[-1] + below_ptr[2];
       /* form final output scaled up by 2^16 */
       membersum = membersum * memberscale + neighsum * neighscale;
       /* round, descale and output it */
@@ -372,15 +362,11 @@ h2v2_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr0) + GETJSAMPLE(inptr0[1]) +
-                GETJSAMPLE(*inptr1) + GETJSAMPLE(inptr1[1]);
-    neighsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(above_ptr[1]) +
-               GETJSAMPLE(*below_ptr) + GETJSAMPLE(below_ptr[1]) +
-               GETJSAMPLE(inptr0[-1]) + GETJSAMPLE(inptr0[1]) +
-               GETJSAMPLE(inptr1[-1]) + GETJSAMPLE(inptr1[1]);
+    membersum = inptr0[0] + inptr0[1] + inptr1[0] + inptr1[1];
+    neighsum = above_ptr[0] + above_ptr[1] + below_ptr[0] + below_ptr[1] +
+               inptr0[-1] + inptr0[1] + inptr1[-1] + inptr1[1];
     neighsum += neighsum;
-    neighsum += GETJSAMPLE(above_ptr[-1]) + GETJSAMPLE(above_ptr[1]) +
-                GETJSAMPLE(below_ptr[-1]) + GETJSAMPLE(below_ptr[1]);
+    neighsum += above_ptr[-1] + above_ptr[1] + below_ptr[-1] + below_ptr[1];
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE)((membersum + 32768) >> 16);
 
@@ -429,21 +415,18 @@ fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     below_ptr = input_data[outrow + 1];
 
     /* Special case for first column */
-    colsum = GETJSAMPLE(*above_ptr++) + GETJSAMPLE(*below_ptr++) +
-             GETJSAMPLE(*inptr);
-    membersum = GETJSAMPLE(*inptr++);
-    nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                 GETJSAMPLE(*inptr);
+    colsum = (*above_ptr++) + (*below_ptr++) + inptr[0];
+    membersum = *inptr++;
+    nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
     neighsum = colsum + (colsum - membersum) + nextcolsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
     lastcolsum = colsum;  colsum = nextcolsum;
 
     for (colctr = output_cols - 2; colctr > 0; colctr--) {
-      membersum = GETJSAMPLE(*inptr++);
+      membersum = *inptr++;
       above_ptr++;  below_ptr++;
-      nextcolsum = GETJSAMPLE(*above_ptr) + GETJSAMPLE(*below_ptr) +
-                   GETJSAMPLE(*inptr);
+      nextcolsum = above_ptr[0] + below_ptr[0] + inptr[0];
       neighsum = lastcolsum + (colsum - membersum) + nextcolsum;
       membersum = membersum * memberscale + neighsum * neighscale;
       *outptr++ = (JSAMPLE)((membersum + 32768) >> 16);
@@ -451,7 +434,7 @@ fullsize_smooth_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
     }
 
     /* Special case for last column */
-    membersum = GETJSAMPLE(*inptr);
+    membersum = *inptr;
     neighsum = lastcolsum + (colsum - membersum) + colsum;
     membersum = membersum * memberscale + neighsum * neighscale;
     *outptr = (JSAMPLE)((membersum + 32768) >> 16);
diff --git a/external/jpeg/jdapistd.c b/external/jpeg/jdapistd.c
index 38bd1110d9b3..695a6200992d 100644
--- a/external/jpeg/jdapistd.c
+++ b/external/jpeg/jdapistd.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1994-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2015-2018, 2020, D. R. Commander.
+ * Copyright (C) 2010, 2015-2020, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -319,6 +319,8 @@ read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
 {
   JDIMENSION n;
   my_master_ptr master = (my_master_ptr)cinfo->master;
+  JSAMPLE dummy_sample[1] = { 0 };
+  JSAMPROW dummy_row = dummy_sample;
   JSAMPARRAY scanlines = NULL;
   void (*color_convert) (j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                          JDIMENSION input_row, JSAMPARRAY output_buf,
@@ -329,6 +331,10 @@ read_and_discard_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
   if (cinfo->cconvert && cinfo->cconvert->color_convert) {
     color_convert = cinfo->cconvert->color_convert;
     cinfo->cconvert->color_convert = noop_convert;
+    /* This just prevents UBSan from complaining about adding 0 to a NULL
+     * pointer.  The pointer isn't actually used.
+     */
+    scanlines = &dummy_row;
   }
 
   if (cinfo->cquantize && cinfo->cquantize->color_quantize) {
@@ -532,6 +538,8 @@ jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines)
          * decoded coefficients.  This is ~5% faster for large subsets, but
          * it's tough to tell a difference for smaller images.
          */
+        if (!cinfo->entropy->insufficient_data)
+          cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
         (*cinfo->entropy->decode_mcu) (cinfo, NULL);
       }
     }
diff --git a/external/jpeg/jdarith.c b/external/jpeg/jdarith.c
index 6002481e242c..7f0d3a785c39 100644
--- a/external/jpeg/jdarith.c
+++ b/external/jpeg/jdarith.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Developed 1997-2015 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2018, D. R. Commander.
+ * Copyright (C) 2015-2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -80,7 +80,7 @@ get_byte(j_decompress_ptr cinfo)
     if (!(*src->fill_input_buffer) (cinfo))
       ERREXIT(cinfo, JERR_CANT_SUSPEND);
   src->bytes_in_buffer--;
-  return GETJOCTET(*src->next_input_byte++);
+  return *src->next_input_byte++;
 }
 
 
@@ -665,8 +665,16 @@ start_pass(j_decompress_ptr cinfo)
     for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
       int coefi, cindex = cinfo->cur_comp_info[ci]->component_index;
       int *coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+      int *prev_coef_bit_ptr =
+        &cinfo->coef_bits[cindex + cinfo->num_components][0];
       if (cinfo->Ss && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
         WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+      for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+        if (cinfo->input_scan_number > 1)
+          prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+        else
+          prev_coef_bit_ptr[coefi] = 0;
+      }
       for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
         int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
         if (cinfo->Ah != expected)
@@ -727,6 +735,7 @@ start_pass(j_decompress_ptr cinfo)
   entropy->c = 0;
   entropy->a = 0;
   entropy->ct = -16;    /* force reading 2 initial bytes to fill C */
+  entropy->pub.insufficient_data = FALSE;
 
   /* Initialize restart counter */
   entropy->restarts_to_go = cinfo->restart_interval;
@@ -763,7 +772,7 @@ jinit_arith_decoder(j_decompress_ptr cinfo)
     int *coef_bit_ptr, ci;
     cinfo->coef_bits = (int (*)[DCTSIZE2])
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components * DCTSIZE2 *
+                                  cinfo->num_components * 2 * DCTSIZE2 *
                                   sizeof(int));
     coef_bit_ptr = &cinfo->coef_bits[0][0];
     for (ci = 0; ci < cinfo->num_components; ci++)
diff --git a/external/jpeg/jdcoefct.c b/external/jpeg/jdcoefct.c
index 2ba6aa11e4d2..15e6cded628e 100644
--- a/external/jpeg/jdcoefct.c
+++ b/external/jpeg/jdcoefct.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2010, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010, 2015-2016, 2019-2020, D. R. Commander.
  * Copyright (C) 2015, 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -102,6 +102,8 @@ decompress_onepass(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
       jzero_far((void *)coef->MCU_buffer[0],
                 (size_t)(cinfo->blocks_in_MCU * sizeof(JBLOCK)));
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
         coef->MCU_vert_offset = yoffset;
@@ -227,6 +229,8 @@ consume_data(j_decompress_ptr cinfo)
           }
         }
       }
+      if (!cinfo->entropy->insufficient_data)
+        cinfo->master->last_good_iMCU_row = cinfo->input_iMCU_row;
       /* Try to fetch the MCU. */
       if (!(*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
         /* Suspension forced; update state counters and exit */
@@ -326,19 +330,22 @@ decompress_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
 #ifdef BLOCK_SMOOTHING_SUPPORTED
 
 /*
- * This code applies interblock smoothing as described by section K.8
- * of the JPEG standard: the first 5 AC coefficients are estimated from
- * the DC values of a DCT block and its 8 neighboring blocks.
+ * This code applies interblock smoothing; the first 9 AC coefficients are
+ * estimated from the DC values of a DCT block and its 24 neighboring blocks.
  * We apply smoothing only for progressive JPEG decoding, and only if
  * the coefficients it can estimate are not yet known to full precision.
  */
 
-/* Natural-order array positions of the first 5 zigzag-order coefficients */
+/* Natural-order array positions of the first 9 zigzag-order coefficients */
 #define Q01_POS  1
 #define Q10_POS  8
 #define Q20_POS  16
 #define Q11_POS  9
 #define Q02_POS  2
+#define Q03_POS  3
+#define Q12_POS  10
+#define Q21_POS  17
+#define Q30_POS  24
 
 /*
  * Determine whether block smoothing is applicable and safe.
@@ -356,8 +363,8 @@ smoothing_ok(j_decompress_ptr cinfo)
   int ci, coefi;
   jpeg_component_info *compptr;
   JQUANT_TBL *qtable;
-  int *coef_bits;
-  int *coef_bits_latch;
+  int *coef_bits, *prev_coef_bits;
+  int *coef_bits_latch, *prev_coef_bits_latch;
 
   if (!cinfo->progressive_mode || cinfo->coef_bits == NULL)
     return FALSE;
@@ -366,34 +373,47 @@ smoothing_ok(j_decompress_ptr cinfo)
   if (coef->coef_bits_latch == NULL)
     coef->coef_bits_latch = (int *)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                  cinfo->num_components *
+                                  cinfo->num_components * 2 *
                                   (SAVED_COEFS * sizeof(int)));
   coef_bits_latch = coef->coef_bits_latch;
+  prev_coef_bits_latch =
+    &coef->coef_bits_latch[cinfo->num_components * SAVED_COEFS];
 
   for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
        ci++, compptr++) {
     /* All components' quantization values must already be latched. */
     if ((qtable = compptr->quant_table) == NULL)
       return FALSE;
-    /* Verify DC & first 5 AC quantizers are nonzero to avoid zero-divide. */
+    /* Verify DC & first 9 AC quantizers are nonzero to avoid zero-divide. */
     if (qtable->quantval[0] == 0 ||
         qtable->quantval[Q01_POS] == 0 ||
         qtable->quantval[Q10_POS] == 0 ||
         qtable->quantval[Q20_POS] == 0 ||
         qtable->quantval[Q11_POS] == 0 ||
-        qtable->quantval[Q02_POS] == 0)
+        qtable->quantval[Q02_POS] == 0 ||
+        qtable->quantval[Q03_POS] == 0 ||
+        qtable->quantval[Q12_POS] == 0 ||
+        qtable->quantval[Q21_POS] == 0 ||
+        qtable->quantval[Q30_POS] == 0)
       return FALSE;
     /* DC values must be at least partly known for all components. */
     coef_bits = cinfo->coef_bits[ci];
+    prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
     if (coef_bits[0] < 0)
       return FALSE;
+    coef_bits_latch[0] = coef_bits[0];
     /* Block smoothing is helpful if some AC coefficients remain inaccurate. */
-    for (coefi = 1; coefi <= 5; coefi++) {
+    for (coefi = 1; coefi < SAVED_COEFS; coefi++) {
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bits_latch[coefi] = prev_coef_bits[coefi];
+      else
+        prev_coef_bits_latch[coefi] = -1;
       coef_bits_latch[coefi] = coef_bits[coefi];
       if (coef_bits[coefi] != 0)
         smoothing_useful = TRUE;
     }
     coef_bits_latch += SAVED_COEFS;
+    prev_coef_bits_latch += SAVED_COEFS;
   }
 
   return smoothing_useful;
@@ -412,17 +432,20 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
   JDIMENSION block_num, last_block_column;
   int ci, block_row, block_rows, access_rows;
   JBLOCKARRAY buffer;
-  JBLOCKROW buffer_ptr, prev_block_row, next_block_row;
+  JBLOCKROW buffer_ptr, prev_prev_block_row, prev_block_row;
+  JBLOCKROW next_block_row, next_next_block_row;
   JSAMPARRAY output_ptr;
   JDIMENSION output_col;
   jpeg_component_info *compptr;
   inverse_DCT_method_ptr inverse_DCT;
-  boolean first_row, last_row;
+  boolean change_dc;
   JCOEF *workspace;
   int *coef_bits;
   JQUANT_TBL *quanttbl;
-  JLONG Q00, Q01, Q02, Q10, Q11, Q20, num;
-  int DC1, DC2, DC3, DC4, DC5, DC6, DC7, DC8, DC9;
+  JLONG Q00, Q01, Q02, Q03 = 0, Q10, Q11, Q12 = 0, Q20, Q21 = 0, Q30 = 0, num;
+  int DC01, DC02, DC03, DC04, DC05, DC06, DC07, DC08, DC09, DC10, DC11, DC12,
+      DC13, DC14, DC15, DC16, DC17, DC18, DC19, DC20, DC21, DC22, DC23, DC24,
+      DC25;
   int Al, pred;
 
   /* Keep a local variable to avoid looking it up more than once */
@@ -434,10 +457,10 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     if (cinfo->input_scan_number == cinfo->output_scan_number) {
       /* If input is working on current scan, we ordinarily want it to
        * have completed the current row.  But if input scan is DC,
-       * we want it to keep one row ahead so that next block row's DC
+       * we want it to keep two rows ahead so that next two block rows' DC
        * values are up to date.
        */
-      JDIMENSION delta = (cinfo->Ss == 0) ? 1 : 0;
+      JDIMENSION delta = (cinfo->Ss == 0) ? 2 : 0;
       if (cinfo->input_iMCU_row > cinfo->output_iMCU_row + delta)
         break;
     }
@@ -452,34 +475,53 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     if (!compptr->component_needed)
       continue;
     /* Count non-dummy DCT block rows in this iMCU row. */
-    if (cinfo->output_iMCU_row < last_iMCU_row) {
+    if (cinfo->output_iMCU_row < last_iMCU_row - 1) {
+      block_rows = compptr->v_samp_factor;
+      access_rows = block_rows * 3; /* this and next two iMCU rows */
+    } else if (cinfo->output_iMCU_row < last_iMCU_row) {
       block_rows = compptr->v_samp_factor;
       access_rows = block_rows * 2; /* this and next iMCU row */
-      last_row = FALSE;
     } else {
       /* NB: can't use last_row_height here; it is input-side-dependent! */
       block_rows = (int)(compptr->height_in_blocks % compptr->v_samp_factor);
       if (block_rows == 0) block_rows = compptr->v_samp_factor;
       access_rows = block_rows; /* this iMCU row only */
-      last_row = TRUE;
     }
     /* Align the virtual buffer for this component. */
-    if (cinfo->output_iMCU_row > 0) {
-      access_rows += compptr->v_samp_factor; /* prior iMCU row too */
+    if (cinfo->output_iMCU_row > 1) {
+      access_rows += 2 * compptr->v_samp_factor; /* prior two iMCU rows too */
+      buffer = (*cinfo->mem->access_virt_barray)
+        ((j_common_ptr)cinfo, coef->whole_image[ci],
+         (cinfo->output_iMCU_row - 2) * compptr->v_samp_factor,
+         (JDIMENSION)access_rows, FALSE);
+      buffer += 2 * compptr->v_samp_factor; /* point to current iMCU row */
+    } else if (cinfo->output_iMCU_row > 0) {
       buffer = (*cinfo->mem->access_virt_barray)
         ((j_common_ptr)cinfo, coef->whole_image[ci],
          (cinfo->output_iMCU_row - 1) * compptr->v_samp_factor,
          (JDIMENSION)access_rows, FALSE);
       buffer += compptr->v_samp_factor; /* point to current iMCU row */
-      first_row = FALSE;
     } else {
       buffer = (*cinfo->mem->access_virt_barray)
         ((j_common_ptr)cinfo, coef->whole_image[ci],
          (JDIMENSION)0, (JDIMENSION)access_rows, FALSE);
-      first_row = TRUE;
     }
-    /* Fetch component-dependent info */
-    coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+    /* Fetch component-dependent info.
+     * If the current scan is incomplete, then we use the component-dependent
+     * info from the previous scan.
+     */
+    if (cinfo->output_iMCU_row > cinfo->master->last_good_iMCU_row)
+      coef_bits =
+        coef->coef_bits_latch + ((ci + cinfo->num_components) * SAVED_COEFS);
+    else
+      coef_bits = coef->coef_bits_latch + (ci * SAVED_COEFS);
+
+    /* We only do DC interpolation if no AC coefficient data is available. */
+    change_dc =
+      coef_bits[1] == -1 && coef_bits[2] == -1 && coef_bits[3] == -1 &&
+      coef_bits[4] == -1 && coef_bits[5] == -1 && coef_bits[6] == -1 &&
+      coef_bits[7] == -1 && coef_bits[8] == -1 && coef_bits[9] == -1;
+
     quanttbl = compptr->quant_table;
     Q00 = quanttbl->quantval[0];
     Q01 = quanttbl->quantval[Q01_POS];
@@ -487,27 +529,51 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
     Q20 = quanttbl->quantval[Q20_POS];
     Q11 = quanttbl->quantval[Q11_POS];
     Q02 = quanttbl->quantval[Q02_POS];
+    if (change_dc) {
+      Q03 = quanttbl->quantval[Q03_POS];
+      Q12 = quanttbl->quantval[Q12_POS];
+      Q21 = quanttbl->quantval[Q21_POS];
+      Q30 = quanttbl->quantval[Q30_POS];
+    }
     inverse_DCT = cinfo->idct->inverse_DCT[ci];
     output_ptr = output_buf[ci];
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
       buffer_ptr = buffer[block_row] + cinfo->master->first_MCU_col[ci];
-      if (first_row && block_row == 0)
+
+      if (block_row > 0 || cinfo->output_iMCU_row > 0)
+        prev_block_row =
+          buffer[block_row - 1] + cinfo->master->first_MCU_col[ci];
+      else
         prev_block_row = buffer_ptr;
+
+      if (block_row > 1 || cinfo->output_iMCU_row > 1)
+        prev_prev_block_row =
+          buffer[block_row - 2] + cinfo->master->first_MCU_col[ci];
+      else
+        prev_prev_block_row = prev_block_row;
+
+      if (block_row < block_rows - 1 || cinfo->output_iMCU_row < last_iMCU_row)
+        next_block_row =
+          buffer[block_row + 1] + cinfo->master->first_MCU_col[ci];
       else
-        prev_block_row = buffer[block_row - 1] +
-                         cinfo->master->first_MCU_col[ci];
-      if (last_row && block_row == block_rows - 1)
         next_block_row = buffer_ptr;
+
+      if (block_row < block_rows - 2 ||
+          cinfo->output_iMCU_row + 1 < last_iMCU_row)  /* avoids 0 - 1 wrap */
+        next_next_block_row =
+          buffer[block_row + 2] + cinfo->master->first_MCU_col[ci];
       else
-        next_block_row = buffer[block_row + 1] +
-                         cinfo->master->first_MCU_col[ci];
+        next_next_block_row = next_block_row;
+
       /* We fetch the surrounding DC values using a sliding-register approach.
-       * Initialize all nine here so as to do the right thing on narrow pics.
+       * Initialize all 25 here so as to do the right thing on narrow pics.
        */
-      DC1 = DC2 = DC3 = (int)prev_block_row[0][0];
-      DC4 = DC5 = DC6 = (int)buffer_ptr[0][0];
-      DC7 = DC8 = DC9 = (int)next_block_row[0][0];
+      DC01 = DC02 = DC03 = DC04 = DC05 = (int)prev_prev_block_row[0][0];
+      DC06 = DC07 = DC08 = DC09 = DC10 = (int)prev_block_row[0][0];
+      DC11 = DC12 = DC13 = DC14 = DC15 = (int)buffer_ptr[0][0];
+      DC16 = DC17 = DC18 = DC19 = DC20 = (int)next_block_row[0][0];
+      DC21 = DC22 = DC23 = DC24 = DC25 = (int)next_next_block_row[0][0];
       output_col = 0;
       last_block_column = compptr->width_in_blocks - 1;
       for (block_num = cinfo->master->first_MCU_col[ci];
@@ -515,18 +581,39 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         /* Fetch current DCT block into workspace so we can modify it. */
         jcopy_block_row(buffer_ptr, (JBLOCKROW)workspace, (JDIMENSION)1);
         /* Update DC values */
-        if (block_num < last_block_column) {
-          DC3 = (int)prev_block_row[1][0];
-          DC6 = (int)buffer_ptr[1][0];
-          DC9 = (int)next_block_row[1][0];
+        if (block_num == cinfo->master->first_MCU_col[ci] &&
+            block_num < last_block_column) {
+          DC04 = (int)prev_prev_block_row[1][0];
+          DC09 = (int)prev_block_row[1][0];
+          DC14 = (int)buffer_ptr[1][0];
+          DC19 = (int)next_block_row[1][0];
+          DC24 = (int)next_next_block_row[1][0];
         }
-        /* Compute coefficient estimates per K.8.
-         * An estimate is applied only if coefficient is still zero,
-         * and is not known to be fully accurate.
+        if (block_num + 1 < last_block_column) {
+          DC05 = (int)prev_prev_block_row[2][0];
+          DC10 = (int)prev_block_row[2][0];
+          DC15 = (int)buffer_ptr[2][0];
+          DC20 = (int)next_block_row[2][0];
+          DC25 = (int)next_next_block_row[2][0];
+        }
+        /* If DC interpolation is enabled, compute coefficient estimates using
+         * a Gaussian-like kernel, keeping the averages of the DC values.
+         *
+         * If DC interpolation is disabled, compute coefficient estimates using
+         * an algorithm similar to the one described in Section K.8 of the JPEG
+         * standard, except applied to a 5x5 window rather than a 3x3 window.
+         *
+         * An estimate is applied only if the coefficient is still zero and is
+         * not known to be fully accurate.
          */
         /* AC01 */
         if ((Al = coef_bits[1]) != 0 && workspace[1] == 0) {
-          num = 36 * Q00 * (DC4 - DC6);
+          num = Q00 * (change_dc ?
+                (-DC01 - DC02 + DC04 + DC05 - 3 * DC06 + 13 * DC07 -
+                 13 * DC09 + 3 * DC10 - 3 * DC11 + 38 * DC12 - 38 * DC14 +
+                 3 * DC15 - 3 * DC16 + 13 * DC17 - 13 * DC19 + 3 * DC20 -
+                 DC21 - DC22 + DC24 + DC25) :
+                (-7 * DC11 + 50 * DC12 - 50 * DC14 + 7 * DC15));
           if (num >= 0) {
             pred = (int)(((Q01 << 7) + num) / (Q01 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -541,7 +628,12 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         }
         /* AC10 */
         if ((Al = coef_bits[2]) != 0 && workspace[8] == 0) {
-          num = 36 * Q00 * (DC2 - DC8);
+          num = Q00 * (change_dc ?
+                (-DC01 - 3 * DC02 - 3 * DC03 - 3 * DC04 - DC05 - DC06 +
+                 13 * DC07 + 38 * DC08 + 13 * DC09 - DC10 + DC16 -
+                 13 * DC17 - 38 * DC18 - 13 * DC19 + DC20 + DC21 +
+                 3 * DC22 + 3 * DC23 + 3 * DC24 + DC25) :
+                (-7 * DC03 + 50 * DC08 - 50 * DC18 + 7 * DC23));
           if (num >= 0) {
             pred = (int)(((Q10 << 7) + num) / (Q10 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -556,7 +648,10 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         }
         /* AC20 */
         if ((Al = coef_bits[3]) != 0 && workspace[16] == 0) {
-          num = 9 * Q00 * (DC2 + DC8 - 2 * DC5);
+          num = Q00 * (change_dc ?
+                (DC03 + 2 * DC07 + 7 * DC08 + 2 * DC09 - 5 * DC12 - 14 * DC13 -
+                 5 * DC14 + 2 * DC17 + 7 * DC18 + 2 * DC19 + DC23) :
+                (-DC03 + 13 * DC08 - 24 * DC13 + 13 * DC18 - DC23));
           if (num >= 0) {
             pred = (int)(((Q20 << 7) + num) / (Q20 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -571,7 +666,11 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         }
         /* AC11 */
         if ((Al = coef_bits[4]) != 0 && workspace[9] == 0) {
-          num = 5 * Q00 * (DC1 - DC3 - DC7 + DC9);
+          num = Q00 * (change_dc ?
+                (-DC01 + DC05 + 9 * DC07 - 9 * DC09 - 9 * DC17 +
+                 9 * DC19 + DC21 - DC25) :
+                (DC10 + DC16 - 10 * DC17 + 10 * DC19 - DC02 - DC20 + DC22 -
+                 DC24 + DC04 - DC06 + 10 * DC07 - 10 * DC09));
           if (num >= 0) {
             pred = (int)(((Q11 << 7) + num) / (Q11 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -586,7 +685,10 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
         }
         /* AC02 */
         if ((Al = coef_bits[5]) != 0 && workspace[2] == 0) {
-          num = 9 * Q00 * (DC4 + DC6 - 2 * DC5);
+          num = Q00 * (change_dc ?
+                (2 * DC07 - 5 * DC08 + 2 * DC09 + DC11 + 7 * DC12 - 14 * DC13 +
+                 7 * DC14 + DC15 + 2 * DC17 - 5 * DC18 + 2 * DC19) :
+                (-DC11 + 13 * DC12 - 24 * DC13 + 13 * DC14 - DC15));
           if (num >= 0) {
             pred = (int)(((Q02 << 7) + num) / (Q02 << 8));
             if (Al > 0 && pred >= (1 << Al))
@@ -599,14 +701,96 @@ decompress_smooth_data(j_decompress_ptr cinfo, JSAMPIMAGE output_buf)
           }
           workspace[2] = (JCOEF)pred;
         }
+        if (change_dc) {
+          /* AC03 */
+          if ((Al = coef_bits[6]) != 0 && workspace[3] == 0) {
+            num = Q00 * (DC07 - DC09 + 2 * DC12 - 2 * DC14 + DC17 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q03 << 7) + num) / (Q03 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q03 << 7) - num) / (Q03 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[3] = (JCOEF)pred;
+          }
+          /* AC12 */
+          if ((Al = coef_bits[7]) != 0 && workspace[10] == 0) {
+            num = Q00 * (DC07 - 3 * DC08 + DC09 - DC17 + 3 * DC18 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q12 << 7) + num) / (Q12 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q12 << 7) - num) / (Q12 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[10] = (JCOEF)pred;
+          }
+          /* AC21 */
+          if ((Al = coef_bits[8]) != 0 && workspace[17] == 0) {
+            num = Q00 * (DC07 - DC09 - 3 * DC12 + 3 * DC14 + DC17 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q21 << 7) + num) / (Q21 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q21 << 7) - num) / (Q21 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[17] = (JCOEF)pred;
+          }
+          /* AC30 */
+          if ((Al = coef_bits[9]) != 0 && workspace[24] == 0) {
+            num = Q00 * (DC07 + 2 * DC08 + DC09 - DC17 - 2 * DC18 - DC19);
+            if (num >= 0) {
+              pred = (int)(((Q30 << 7) + num) / (Q30 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+            } else {
+              pred = (int)(((Q30 << 7) - num) / (Q30 << 8));
+              if (Al > 0 && pred >= (1 << Al))
+                pred = (1 << Al) - 1;
+              pred = -pred;
+            }
+            workspace[24] = (JCOEF)pred;
+          }
+          /* coef_bits[0] is non-negative.  Otherwise this function would not
+           * be called.
+           */
+          num = Q00 *
+                (-2 * DC01 - 6 * DC02 - 8 * DC03 - 6 * DC04 - 2 * DC05 -
+                 6 * DC06 + 6 * DC07 + 42 * DC08 + 6 * DC09 - 6 * DC10 -
+                 8 * DC11 + 42 * DC12 + 152 * DC13 + 42 * DC14 - 8 * DC15 -
+                 6 * DC16 + 6 * DC17 + 42 * DC18 + 6 * DC19 - 6 * DC20 -
+                 2 * DC21 - 6 * DC22 - 8 * DC23 - 6 * DC24 - 2 * DC25);
+          if (num >= 0) {
+            pred = (int)(((Q00 << 7) + num) / (Q00 << 8));
+          } else {
+            pred = (int)(((Q00 << 7) - num) / (Q00 << 8));
+            pred = -pred;
+          }
+          workspace[0] = (JCOEF)pred;
+        }  /* change_dc */
+
         /* OK, do the IDCT */
         (*inverse_DCT) (cinfo, compptr, (JCOEFPTR)workspace, output_ptr,
                         output_col);
         /* Advance for next column */
-        DC1 = DC2;  DC2 = DC3;
-        DC4 = DC5;  DC5 = DC6;
-        DC7 = DC8;  DC8 = DC9;
-        buffer_ptr++, prev_block_row++, next_block_row++;
+        DC01 = DC02;  DC02 = DC03;  DC03 = DC04;  DC04 = DC05;
+        DC06 = DC07;  DC07 = DC08;  DC08 = DC09;  DC09 = DC10;
+        DC11 = DC12;  DC12 = DC13;  DC13 = DC14;  DC14 = DC15;
+        DC16 = DC17;  DC17 = DC18;  DC18 = DC19;  DC19 = DC20;
+        DC21 = DC22;  DC22 = DC23;  DC23 = DC24;  DC24 = DC25;
+        buffer_ptr++, prev_block_row++, next_block_row++,
+          prev_prev_block_row++, next_next_block_row++;
         output_col += compptr->_DCT_scaled_size;
       }
       output_ptr += compptr->_DCT_scaled_size;
@@ -655,7 +839,7 @@ jinit_d_coef_controller(j_decompress_ptr cinfo, boolean need_full_buffer)
 #ifdef BLOCK_SMOOTHING_SUPPORTED
       /* If block smoothing could be used, need a bigger window */
       if (cinfo->progressive_mode)
-        access_rows *= 3;
+        access_rows *= 5;
 #endif
       coef->whole_image[ci] = (*cinfo->mem->request_virt_barray)
         ((j_common_ptr)cinfo, JPOOL_IMAGE, TRUE,
diff --git a/external/jpeg/jdcoefct.h b/external/jpeg/jdcoefct.h
index c4d1943dd4db..9a0e78066364 100644
--- a/external/jpeg/jdcoefct.h
+++ b/external/jpeg/jdcoefct.h
@@ -5,6 +5,7 @@
  * Copyright (C) 1994-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2020, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  */
@@ -51,7 +52,7 @@ typedef struct {
 #ifdef BLOCK_SMOOTHING_SUPPORTED
   /* When doing block smoothing, we latch coefficient Al values here */
   int *coef_bits_latch;
-#define SAVED_COEFS  6          /* we save coef_bits[0..5] */
+#define SAVED_COEFS  10         /* we save coef_bits[0..9] */
 #endif
 } my_coef_controller;
 
diff --git a/external/jpeg/jdcol565.c b/external/jpeg/jdcol565.c
index 40068ef84fd2..53c7bd9187d4 100644
--- a/external/jpeg/jdcol565.c
+++ b/external/jpeg/jdcol565.c
@@ -45,9 +45,9 @@ ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr = *output_buf++;
 
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -58,18 +58,18 @@ ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
       b = range_limit[y + Cbbtab[cb]];
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -80,9 +80,9 @@ ycc_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[y + Crrtab[cr]];
       g = range_limit[y + ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
                                             SCALEBITS))];
@@ -125,9 +125,9 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -139,9 +139,9 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -150,9 +150,9 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      y  = GETJSAMPLE(*inptr0++);
-      cb = GETJSAMPLE(*inptr1++);
-      cr = GETJSAMPLE(*inptr2++);
+      y  = *inptr0++;
+      cb = *inptr1++;
+      cr = *inptr2++;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -165,9 +165,9 @@ ycc_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       outptr += 4;
     }
     if (num_cols & 1) {
-      y  = GETJSAMPLE(*inptr0);
-      cb = GETJSAMPLE(*inptr1);
-      cr = GETJSAMPLE(*inptr2);
+      y  = *inptr0;
+      cb = *inptr1;
+      cr = *inptr2;
       r = range_limit[DITHER_565_R(y + Crrtab[cr], d0)];
       g = range_limit[DITHER_565_G(y +
                                    ((int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr],
@@ -202,32 +202,32 @@ rgb_rgb565_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = GETJSAMPLE(*inptr0++);
-      g = GETJSAMPLE(*inptr1++);
-      b = GETJSAMPLE(*inptr2++);
+      r = *inptr0++;
+      g = *inptr1++;
+      b = *inptr2++;
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
       WRITE_TWO_ALIGNED_PIXELS(outptr, rgb);
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = GETJSAMPLE(*inptr0);
-      g = GETJSAMPLE(*inptr1);
-      b = GETJSAMPLE(*inptr2);
+      r = *inptr0;
+      g = *inptr1;
+      b = *inptr2;
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
     }
@@ -259,24 +259,24 @@ rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     if (PACK_NEED_ALIGNMENT(outptr)) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
       outptr += 2;
       num_cols--;
     }
     for (col = 0; col < (num_cols >> 1); col++) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_SHORT_565(r, g, b);
 
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0++), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1++), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2++), d0)];
+      r = range_limit[DITHER_565_R(*inptr0++, d0)];
+      g = range_limit[DITHER_565_G(*inptr1++, d0)];
+      b = range_limit[DITHER_565_B(*inptr2++, d0)];
       d0 = DITHER_ROTATE(d0);
       rgb = PACK_TWO_PIXELS(rgb, PACK_SHORT_565(r, g, b));
 
@@ -284,9 +284,9 @@ rgb_rgb565D_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
       outptr += 4;
     }
     if (num_cols & 1) {
-      r = range_limit[DITHER_565_R(GETJSAMPLE(*inptr0), d0)];
-      g = range_limit[DITHER_565_G(GETJSAMPLE(*inptr1), d0)];
-      b = range_limit[DITHER_565_B(GETJSAMPLE(*inptr2), d0)];
+      r = range_limit[DITHER_565_R(*inptr0, d0)];
+      g = range_limit[DITHER_565_G(*inptr1, d0)];
+      b = range_limit[DITHER_565_B(*inptr2, d0)];
       rgb = PACK_SHORT_565(r, g, b);
       *(INT16 *)outptr = (INT16)rgb;
     }
diff --git a/external/jpeg/jdcolext.c b/external/jpeg/jdcolext.c
index 72a530107036..863c7a2fbc76 100644
--- a/external/jpeg/jdcolext.c
+++ b/external/jpeg/jdcolext.c
@@ -53,9 +53,9 @@ ycc_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[RGB_RED] =   range_limit[y + Crrtab[cr]];
       outptr[RGB_GREEN] = range_limit[y +
@@ -93,7 +93,6 @@ gray_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     inptr = input_buf[0][input_row++];
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = outptr[RGB_GREEN] = outptr[RGB_BLUE] = inptr[col];
       /* Set unused byte to 0xFF so it can be interpreted as an opaque */
       /* alpha channel value */
@@ -128,7 +127,6 @@ rgb_rgb_convert_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      /* We can dispense with GETJSAMPLE() here */
       outptr[RGB_RED] = inptr0[col];
       outptr[RGB_GREEN] = inptr1[col];
       outptr[RGB_BLUE] = inptr2[col];
diff --git a/external/jpeg/jdcolor.c b/external/jpeg/jdcolor.c
index d3ae40c7da9a..8da2b4eaf2e9 100644
--- a/external/jpeg/jdcolor.c
+++ b/external/jpeg/jdcolor.c
@@ -341,9 +341,9 @@ rgb_gray_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      r = GETJSAMPLE(inptr0[col]);
-      g = GETJSAMPLE(inptr1[col]);
-      b = GETJSAMPLE(inptr2[col]);
+      r = inptr0[col];
+      g = inptr1[col];
+      b = inptr2[col];
       /* Y */
       outptr[col] = (JSAMPLE)((ctab[r + R_Y_OFF] + ctab[g + G_Y_OFF] +
                                ctab[b + B_Y_OFF]) >> SCALEBITS);
@@ -550,9 +550,9 @@ ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     input_row++;
     outptr = *output_buf++;
     for (col = 0; col < num_cols; col++) {
-      y  = GETJSAMPLE(inptr0[col]);
-      cb = GETJSAMPLE(inptr1[col]);
-      cr = GETJSAMPLE(inptr2[col]);
+      y  = inptr0[col];
+      cb = inptr1[col];
+      cr = inptr2[col];
       /* Range-limiting is essential due to noise introduced by DCT losses. */
       outptr[0] = range_limit[MAXJSAMPLE - (y + Crrtab[cr])];   /* red */
       outptr[1] = range_limit[MAXJSAMPLE - (y +                 /* green */
@@ -560,7 +560,7 @@ ycck_cmyk_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                                                  SCALEBITS)))];
       outptr[2] = range_limit[MAXJSAMPLE - (y + Cbbtab[cb])];   /* blue */
       /* K passes through unchanged */
-      outptr[3] = inptr3[col];  /* don't need GETJSAMPLE here */
+      outptr[3] = inptr3[col];
       outptr += 4;
     }
   }
diff --git a/external/jpeg/jdhuff.c b/external/jpeg/jdhuff.c
index a1128178b0a9..f786c1054735 100644
--- a/external/jpeg/jdhuff.c
+++ b/external/jpeg/jdhuff.c
@@ -5,6 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2009-2011, 2016, 2018-2019, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -39,24 +40,6 @@ typedef struct {
   int last_dc_val[MAX_COMPS_IN_SCAN]; /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -325,7 +308,7 @@ jpeg_fill_bit_buffer(bitread_working_state *state,
         bytes_in_buffer = cinfo->src->bytes_in_buffer;
       }
       bytes_in_buffer--;
-      c = GETJOCTET(*next_input_byte++);
+      c = *next_input_byte++;
 
       /* If it's 0xFF, check and discard stuffed zero byte */
       if (c == 0xFF) {
@@ -342,7 +325,7 @@ jpeg_fill_bit_buffer(bitread_working_state *state,
             bytes_in_buffer = cinfo->src->bytes_in_buffer;
           }
           bytes_in_buffer--;
-          c = GETJOCTET(*next_input_byte++);
+          c = *next_input_byte++;
         } while (c == 0xFF);
 
         if (c == 0) {
@@ -405,8 +388,8 @@ jpeg_fill_bit_buffer(bitread_working_state *state,
 
 #define GET_BYTE { \
   register int c0, c1; \
-  c0 = GETJOCTET(*buffer++); \
-  c1 = GETJOCTET(*buffer); \
+  c0 = *buffer++; \
+  c1 = *buffer; \
   /* Pre-execute most common case */ \
   get_buffer = (get_buffer << 8) | c0; \
   bits_left += 8; \
@@ -423,7 +406,7 @@ jpeg_fill_bit_buffer(bitread_working_state *state,
   } \
 }
 
-#if SIZEOF_SIZE_T == 8 || defined(_WIN64)
+#if SIZEOF_SIZE_T == 8 || defined(_WIN64) || (defined(__x86_64__) && defined(__ILP32__))
 
 /* Pre-fetch 48 bytes, because the holding register is 64-bit */
 #define FILL_BIT_BUFFER_FAST \
@@ -557,6 +540,12 @@ process_restart(j_decompress_ptr cinfo)
 }
 
 
+#if defined(__has_feature)
+#if __has_feature(undefined_behavior_sanitizer)
+__attribute__((no_sanitize("signed-integer-overflow"),
+               no_sanitize("unsigned-integer-overflow")))
+#endif
+#endif
 LOCAL(boolean)
 decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 {
@@ -568,7 +557,7 @@ decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
   /* Load up working state */
   BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(state, entropy->saved);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -589,11 +578,15 @@ decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
     if (entropy->dc_needed[blkn]) {
       /* Convert DC difference to actual value, update last_dc_val */
       int ci = cinfo->MCU_membership[blkn];
-      /* This is really just
-       *   s += state.last_dc_val[ci];
-       * It is written this way in order to shut up UBSan.
+      /* Certain malformed JPEG images produce repeated DC coefficient
+       * differences of 2047 or -2047, which causes state.last_dc_val[ci] to
+       * grow until it overflows or underflows a 32-bit signed integer.  This
+       * behavior is, to the best of our understanding, innocuous, and it is
+       * unclear how to work around it without potentially affecting
+       * performance.  Thus, we (hopefully temporarily) suppress UBSan integer
+       * overflow errors for this function.
        */
-      s = (int)((unsigned int)s + (unsigned int)state.last_dc_val[ci]);
+      s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       if (block) {
         /* Output the DC coefficient (assumes jpeg_natural_order[0] = 0) */
@@ -653,7 +646,7 @@ decode_mcu_slow(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
   /* Completed MCU, so update state */
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  entropy->saved = state;
   return TRUE;
 }
 
@@ -671,7 +664,7 @@ decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   /* Load up working state */
   BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
   buffer = (JOCTET *)br_state.next_input_byte;
-  ASSIGN_STATE(state, entropy->saved);
+  state = entropy->saved;
 
   for (blkn = 0; blkn < cinfo->blocks_in_MCU; blkn++) {
     JBLOCKROW block = MCU_data ? MCU_data[blkn] : NULL;
@@ -688,7 +681,7 @@ decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     if (entropy->dc_needed[blkn]) {
       int ci = cinfo->MCU_membership[blkn];
-      s = (int)((unsigned int)s + (unsigned int)state.last_dc_val[ci]);
+      s += state.last_dc_val[ci];
       state.last_dc_val[ci] = s;
       if (block)
         (*block)[0] = (JCOEF)s;
@@ -740,7 +733,7 @@ decode_mcu_fast(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   br_state.bytes_in_buffer -= (buffer - br_state.next_input_byte);
   br_state.next_input_byte = buffer;
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-  ASSIGN_STATE(entropy->saved, state);
+  entropy->saved = state;
   return TRUE;
 }
 
@@ -795,7 +788,8 @@ decode_mcu(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
diff --git a/external/jpeg/jdhuff.h b/external/jpeg/jdhuff.h
index 6a8d90f4027c..cfa0b7f55888 100644
--- a/external/jpeg/jdhuff.h
+++ b/external/jpeg/jdhuff.h
@@ -4,7 +4,8 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010-2011, 2015-2016, D. R. Commander.
+ * Copyright (C) 2010-2011, 2015-2016, 2021, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -78,6 +79,11 @@ EXTERN(void) jpeg_make_d_derived_tbl(j_decompress_ptr cinfo, boolean isDC,
 typedef size_t bit_buf_type;            /* type of bit-extraction buffer */
 #define BIT_BUF_SIZE  64                /* size of buffer in bits */
 
+#elif defined(__x86_64__) && defined(__ILP32__)
+
+typedef unsigned long long bit_buf_type; /* type of bit-extraction buffer */
+#define BIT_BUF_SIZE  64                 /* size of buffer in bits */
+
 #else
 
 typedef unsigned long bit_buf_type;     /* type of bit-extraction buffer */
@@ -228,7 +234,10 @@ slowlabel: \
       s |= GET_BITS(1); \
       nb++; \
     } \
-    s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
+    if (nb > 16) \
+      s = 0; \
+    else \
+      s = htbl->pub->huffval[(int)(s + htbl->valoffset[nb]) & 0xFF]; \
   }
 
 /* Out-of-line case for Huffman code fetching */
diff --git a/external/jpeg/jdicc.c b/external/jpeg/jdicc.c
index 7224695816b0..a1a5b867ae2b 100644
--- a/external/jpeg/jdicc.c
+++ b/external/jpeg/jdicc.c
@@ -38,18 +38,18 @@ marker_is_icc(jpeg_saved_marker_ptr marker)
     marker->marker == ICC_MARKER &&
     marker->data_length >= ICC_OVERHEAD_LEN &&
     /* verify the identifying string */
-    GETJOCTET(marker->data[0]) == 0x49 &&
-    GETJOCTET(marker->data[1]) == 0x43 &&
-    GETJOCTET(marker->data[2]) == 0x43 &&
-    GETJOCTET(marker->data[3]) == 0x5F &&
-    GETJOCTET(marker->data[4]) == 0x50 &&
-    GETJOCTET(marker->data[5]) == 0x52 &&
-    GETJOCTET(marker->data[6]) == 0x4F &&
-    GETJOCTET(marker->data[7]) == 0x46 &&
-    GETJOCTET(marker->data[8]) == 0x49 &&
-    GETJOCTET(marker->data[9]) == 0x4C &&
-    GETJOCTET(marker->data[10]) == 0x45 &&
-    GETJOCTET(marker->data[11]) == 0x0;
+    marker->data[0] == 0x49 &&
+    marker->data[1] == 0x43 &&
+    marker->data[2] == 0x43 &&
+    marker->data[3] == 0x5F &&
+    marker->data[4] == 0x50 &&
+    marker->data[5] == 0x52 &&
+    marker->data[6] == 0x4F &&
+    marker->data[7] == 0x46 &&
+    marker->data[8] == 0x49 &&
+    marker->data[9] == 0x4C &&
+    marker->data[10] == 0x45 &&
+    marker->data[11] == 0x0;
 }
 
 
@@ -102,12 +102,12 @@ jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
   for (marker = cinfo->marker_list; marker != NULL; marker = marker->next) {
     if (marker_is_icc(marker)) {
       if (num_markers == 0)
-        num_markers = GETJOCTET(marker->data[13]);
-      else if (num_markers != GETJOCTET(marker->data[13])) {
+        num_markers = marker->data[13];
+      else if (num_markers != marker->data[13]) {
         WARNMS(cinfo, JWRN_BOGUS_ICC);  /* inconsistent num_markers fields */
         return FALSE;
       }
-      seq_no = GETJOCTET(marker->data[12]);
+      seq_no = marker->data[12];
       if (seq_no <= 0 || seq_no > num_markers) {
         WARNMS(cinfo, JWRN_BOGUS_ICC);  /* bogus sequence number */
         return FALSE;
@@ -154,7 +154,7 @@ jpeg_read_icc_profile(j_decompress_ptr cinfo, JOCTET **icc_data_ptr,
       JOCTET FAR *src_ptr;
       JOCTET *dst_ptr;
       unsigned int length;
-      seq_no = GETJOCTET(marker->data[12]);
+      seq_no = marker->data[12];
       dst_ptr = icc_data + data_offset[seq_no];
       src_ptr = marker->data + ICC_OVERHEAD_LEN;
       length = data_length[seq_no];
diff --git a/external/jpeg/jdmarker.c b/external/jpeg/jdmarker.c
index c9c7ef639947..b964c3a1a6ac 100644
--- a/external/jpeg/jdmarker.c
+++ b/external/jpeg/jdmarker.c
@@ -151,7 +151,7 @@ typedef my_marker_reader *my_marker_ptr;
 #define INPUT_BYTE(cinfo, V, action) \
   MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V = GETJOCTET(*next_input_byte++); )
+            V = *next_input_byte++; )
 
 /* As above, but read two bytes interpreted as an unsigned 16-bit integer.
  * V should be declared unsigned int or perhaps JLONG.
@@ -159,10 +159,10 @@ typedef my_marker_reader *my_marker_ptr;
 #define INPUT_2BYTES(cinfo, V, action) \
   MAKESTMT( MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V = ((unsigned int)GETJOCTET(*next_input_byte++)) << 8; \
+            V = ((unsigned int)(*next_input_byte++)) << 8; \
             MAKE_BYTE_AVAIL(cinfo, action); \
             bytes_in_buffer--; \
-            V += GETJOCTET(*next_input_byte++); )
+            V += *next_input_byte++; )
 
 
 /*
@@ -608,18 +608,18 @@ examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
   JLONG totallen = (JLONG)datalen + remaining;
 
   if (datalen >= APP0_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x4A &&
-      GETJOCTET(data[1]) == 0x46 &&
-      GETJOCTET(data[2]) == 0x49 &&
-      GETJOCTET(data[3]) == 0x46 &&
-      GETJOCTET(data[4]) == 0) {
+      data[0] == 0x4A &&
+      data[1] == 0x46 &&
+      data[2] == 0x49 &&
+      data[3] == 0x46 &&
+      data[4] == 0) {
     /* Found JFIF APP0 marker: save info */
     cinfo->saw_JFIF_marker = TRUE;
-    cinfo->JFIF_major_version = GETJOCTET(data[5]);
-    cinfo->JFIF_minor_version = GETJOCTET(data[6]);
-    cinfo->density_unit = GETJOCTET(data[7]);
-    cinfo->X_density = (GETJOCTET(data[8]) << 8) + GETJOCTET(data[9]);
-    cinfo->Y_density = (GETJOCTET(data[10]) << 8) + GETJOCTET(data[11]);
+    cinfo->JFIF_major_version = data[5];
+    cinfo->JFIF_minor_version = data[6];
+    cinfo->density_unit = data[7];
+    cinfo->X_density = (data[8] << 8) + data[9];
+    cinfo->Y_density = (data[10] << 8) + data[11];
     /* Check version.
      * Major version must be 1, anything else signals an incompatible change.
      * (We used to treat this as an error, but now it's a nonfatal warning,
@@ -634,24 +634,22 @@ examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
              cinfo->JFIF_major_version, cinfo->JFIF_minor_version,
              cinfo->X_density, cinfo->Y_density, cinfo->density_unit);
     /* Validate thumbnail dimensions and issue appropriate messages */
-    if (GETJOCTET(data[12]) | GETJOCTET(data[13]))
-      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL,
-               GETJOCTET(data[12]), GETJOCTET(data[13]));
+    if (data[12] | data[13])
+      TRACEMS2(cinfo, 1, JTRC_JFIF_THUMBNAIL, data[12], data[13]);
     totallen -= APP0_DATA_LEN;
-    if (totallen !=
-        ((JLONG)GETJOCTET(data[12]) * (JLONG)GETJOCTET(data[13]) * (JLONG)3))
+    if (totallen != ((JLONG)data[12] * (JLONG)data[13] * (JLONG)3))
       TRACEMS1(cinfo, 1, JTRC_JFIF_BADTHUMBNAILSIZE, (int)totallen);
   } else if (datalen >= 6 &&
-             GETJOCTET(data[0]) == 0x4A &&
-             GETJOCTET(data[1]) == 0x46 &&
-             GETJOCTET(data[2]) == 0x58 &&
-             GETJOCTET(data[3]) == 0x58 &&
-             GETJOCTET(data[4]) == 0) {
+             data[0] == 0x4A &&
+             data[1] == 0x46 &&
+             data[2] == 0x58 &&
+             data[3] == 0x58 &&
+             data[4] == 0) {
     /* Found JFIF "JFXX" extension APP0 marker */
     /* The library doesn't actually do anything with these,
      * but we try to produce a helpful trace message.
      */
-    switch (GETJOCTET(data[5])) {
+    switch (data[5]) {
     case 0x10:
       TRACEMS1(cinfo, 1, JTRC_THUMB_JPEG, (int)totallen);
       break;
@@ -662,8 +660,7 @@ examine_app0(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
       TRACEMS1(cinfo, 1, JTRC_THUMB_RGB, (int)totallen);
       break;
     default:
-      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION,
-               GETJOCTET(data[5]), (int)totallen);
+      TRACEMS2(cinfo, 1, JTRC_JFIF_EXTENSION, data[5], (int)totallen);
       break;
     }
   } else {
@@ -684,16 +681,16 @@ examine_app14(j_decompress_ptr cinfo, JOCTET *data, unsigned int datalen,
   unsigned int version, flags0, flags1, transform;
 
   if (datalen >= APP14_DATA_LEN &&
-      GETJOCTET(data[0]) == 0x41 &&
-      GETJOCTET(data[1]) == 0x64 &&
-      GETJOCTET(data[2]) == 0x6F &&
-      GETJOCTET(data[3]) == 0x62 &&
-      GETJOCTET(data[4]) == 0x65) {
+      data[0] == 0x41 &&
+      data[1] == 0x64 &&
+      data[2] == 0x6F &&
+      data[3] == 0x62 &&
+      data[4] == 0x65) {
     /* Found Adobe APP14 marker */
-    version = (GETJOCTET(data[5]) << 8) + GETJOCTET(data[6]);
-    flags0 = (GETJOCTET(data[7]) << 8) + GETJOCTET(data[8]);
-    flags1 = (GETJOCTET(data[9]) << 8) + GETJOCTET(data[10]);
-    transform = GETJOCTET(data[11]);
+    version = (data[5] << 8) + data[6];
+    flags0 = (data[7] << 8) + data[8];
+    flags1 = (data[9] << 8) + data[10];
+    transform = data[11];
     TRACEMS4(cinfo, 1, JTRC_ADOBE, version, flags0, flags1, transform);
     cinfo->saw_Adobe_marker = TRUE;
     cinfo->Adobe_transform = (UINT8)transform;
diff --git a/external/jpeg/jdmaster.c b/external/jpeg/jdmaster.c
index b20906438e49..cbc8774b1f2b 100644
--- a/external/jpeg/jdmaster.c
+++ b/external/jpeg/jdmaster.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2002-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2009-2011, 2016, D. R. Commander.
+ * Copyright (C) 2009-2011, 2016, 2019, D. R. Commander.
  * Copyright (C) 2013, Linaro Limited.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -22,7 +22,6 @@
 #include "jpeglib.h"
 #include "jpegcomp.h"
 #include "jdmaster.h"
-#include "jsimd.h"
 
 
 /*
@@ -70,17 +69,6 @@ use_merged_upsample(j_decompress_ptr cinfo)
       cinfo->comp_info[1]._DCT_scaled_size != cinfo->_min_DCT_scaled_size ||
       cinfo->comp_info[2]._DCT_scaled_size != cinfo->_min_DCT_scaled_size)
     return FALSE;
-#ifdef WITH_SIMD
-  /* If YCbCr-to-RGB color conversion is SIMD-accelerated but merged upsampling
-     isn't, then disabling merged upsampling is likely to be faster when
-     decompressing YCbCr JPEG images. */
-  if (!jsimd_can_h2v2_merged_upsample() && !jsimd_can_h2v1_merged_upsample() &&
-      jsimd_can_ycc_rgb() && cinfo->jpeg_color_space == JCS_YCbCr &&
-      (cinfo->out_color_space == JCS_RGB ||
-       (cinfo->out_color_space >= JCS_EXT_RGB &&
-        cinfo->out_color_space <= JCS_EXT_ARGB)))
-    return FALSE;
-#endif
   /* ??? also need to test for upsample-time rescaling, when & if supported */
   return TRUE;                  /* by golly, it'll work... */
 #else
@@ -580,6 +568,7 @@ master_selection(j_decompress_ptr cinfo)
    */
   cinfo->master->first_iMCU_col = 0;
   cinfo->master->last_iMCU_col = cinfo->MCUs_per_row - 1;
+  cinfo->master->last_good_iMCU_row = 0;
 
 #ifdef D_MULTISCAN_FILES_SUPPORTED
   /* If jpeg_start_decompress will read the whole file, initialize
diff --git a/external/jpeg/jdmrg565.c b/external/jpeg/jdmrg565.c
index 53f1e1670006..980a4e216e4d 100644
--- a/external/jpeg/jdmrg565.c
+++ b/external/jpeg/jdmrg565.c
@@ -43,20 +43,20 @@ h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -68,12 +68,12 @@ h2v1_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -115,21 +115,21 @@ h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -142,12 +142,12 @@ h2v1_merged_upsample_565D_internal(j_decompress_ptr cinfo,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -189,20 +189,20 @@ h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -211,13 +211,13 @@ h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -229,20 +229,20 @@ h2v2_merged_upsample_565_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
     rgb = PACK_SHORT_565(r, g, b);
     *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[y + cred];
     g = range_limit[y + cgreen];
     b = range_limit[y + cblue];
@@ -287,21 +287,21 @@ h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     d0 = DITHER_ROTATE(d0);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
@@ -311,14 +311,14 @@ h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
     WRITE_TWO_PIXELS(outptr0, rgb);
     outptr0 += 4;
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
     d1 = DITHER_ROTATE(d1);
     rgb = PACK_SHORT_565(r, g, b);
 
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
@@ -331,20 +331,20 @@ h2v2_merged_upsample_565D_internal(j_decompress_ptr cinfo,
 
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
 
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     r = range_limit[DITHER_565_R(y + cred, d0)];
     g = range_limit[DITHER_565_G(y + cgreen, d0)];
     b = range_limit[DITHER_565_B(y + cblue, d0)];
     rgb = PACK_SHORT_565(r, g, b);
     *(INT16 *)outptr0 = (INT16)rgb;
 
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     r = range_limit[DITHER_565_R(y + cred, d1)];
     g = range_limit[DITHER_565_G(y + cgreen, d1)];
     b = range_limit[DITHER_565_B(y + cblue, d1)];
diff --git a/external/jpeg/jdmrgext.c b/external/jpeg/jdmrgext.c
index c9a44d8219c2..9bf4f1a307f3 100644
--- a/external/jpeg/jdmrgext.c
+++ b/external/jpeg/jdmrgext.c
@@ -46,13 +46,13 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   /* Loop for each pair of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 2 Y values and emit 2 pixels */
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -60,7 +60,7 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr[RGB_ALPHA] = 0xFF;
 #endif
     outptr += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr0++);
+    y  = *inptr0++;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -71,12 +71,12 @@ h2v1_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr0);
+    y  = *inptr0;
     outptr[RGB_RED] =   range_limit[y + cred];
     outptr[RGB_GREEN] = range_limit[y + cgreen];
     outptr[RGB_BLUE] =  range_limit[y + cblue];
@@ -120,13 +120,13 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   /* Loop for each group of output pixels */
   for (col = cinfo->output_width >> 1; col > 0; col--) {
     /* Do the chroma part of the calculation */
-    cb = GETJSAMPLE(*inptr1++);
-    cr = GETJSAMPLE(*inptr2++);
+    cb = *inptr1++;
+    cr = *inptr2++;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
     /* Fetch 4 Y values and emit 4 pixels */
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -134,7 +134,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr00++);
+    y  = *inptr00++;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
@@ -142,7 +142,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
     outptr0 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -150,7 +150,7 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     outptr1[RGB_ALPHA] = 0xFF;
 #endif
     outptr1 += RGB_PIXELSIZE;
-    y  = GETJSAMPLE(*inptr01++);
+    y  = *inptr01++;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
@@ -161,19 +161,19 @@ h2v2_merged_upsample_internal(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
   }
   /* If image width is odd, do the last output column separately */
   if (cinfo->output_width & 1) {
-    cb = GETJSAMPLE(*inptr1);
-    cr = GETJSAMPLE(*inptr2);
+    cb = *inptr1;
+    cr = *inptr2;
     cred = Crrtab[cr];
     cgreen = (int)RIGHT_SHIFT(Cbgtab[cb] + Crgtab[cr], SCALEBITS);
     cblue = Cbbtab[cb];
-    y  = GETJSAMPLE(*inptr00);
+    y  = *inptr00;
     outptr0[RGB_RED] =   range_limit[y + cred];
     outptr0[RGB_GREEN] = range_limit[y + cgreen];
     outptr0[RGB_BLUE] =  range_limit[y + cblue];
 #ifdef RGB_ALPHA
     outptr0[RGB_ALPHA] = 0xFF;
 #endif
-    y  = GETJSAMPLE(*inptr01);
+    y  = *inptr01;
     outptr1[RGB_RED] =   range_limit[y + cred];
     outptr1[RGB_GREEN] = range_limit[y + cgreen];
     outptr1[RGB_BLUE] =  range_limit[y + cblue];
diff --git a/external/jpeg/jdphuff.c b/external/jpeg/jdphuff.c
index 9e82636bbd12..c6d82ca14b8c 100644
--- a/external/jpeg/jdphuff.c
+++ b/external/jpeg/jdphuff.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1995-1997, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, 2018, D. R. Commander.
+ * Copyright (C) 2015-2016, 2018-2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -41,25 +41,6 @@ typedef struct {
   int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef for each component */
 } savable_state;
 
-/* This macro is to work around compilers with missing or broken
- * structure assignment.  You'll need to fix this code if you have
- * such a compiler and you change MAX_COMPS_IN_SCAN.
- */
-
-#ifndef NO_STRUCT_ASSIGN
-#define ASSIGN_STATE(dest, src)  ((dest) = (src))
-#else
-#if MAX_COMPS_IN_SCAN == 4
-#define ASSIGN_STATE(dest, src) \
-  ((dest).EOBRUN = (src).EOBRUN, \
-   (dest).last_dc_val[0] = (src).last_dc_val[0], \
-   (dest).last_dc_val[1] = (src).last_dc_val[1], \
-   (dest).last_dc_val[2] = (src).last_dc_val[2], \
-   (dest).last_dc_val[3] = (src).last_dc_val[3])
-#endif
-#endif
-
-
 typedef struct {
   struct jpeg_entropy_decoder pub; /* public fields */
 
@@ -102,7 +83,7 @@ start_pass_phuff_decoder(j_decompress_ptr cinfo)
   boolean is_DC_band, bad;
   int ci, coefi, tbl;
   d_derived_tbl **pdtbl;
-  int *coef_bit_ptr;
+  int *coef_bit_ptr, *prev_coef_bit_ptr;
   jpeg_component_info *compptr;
 
   is_DC_band = (cinfo->Ss == 0);
@@ -143,8 +124,15 @@ start_pass_phuff_decoder(j_decompress_ptr cinfo)
   for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
     int cindex = cinfo->cur_comp_info[ci]->component_index;
     coef_bit_ptr = &cinfo->coef_bits[cindex][0];
+    prev_coef_bit_ptr = &cinfo->coef_bits[cindex + cinfo->num_components][0];
     if (!is_DC_band && coef_bit_ptr[0] < 0) /* AC without prior DC scan */
       WARNMS2(cinfo, JWRN_BOGUS_PROGRESSION, cindex, 0);
+    for (coefi = MIN(cinfo->Ss, 1); coefi <= MAX(cinfo->Se, 9); coefi++) {
+      if (cinfo->input_scan_number > 1)
+        prev_coef_bit_ptr[coefi] = coef_bit_ptr[coefi];
+      else
+        prev_coef_bit_ptr[coefi] = 0;
+    }
     for (coefi = cinfo->Ss; coefi <= cinfo->Se; coefi++) {
       int expected = (coef_bit_ptr[coefi] < 0) ? 0 : coef_bit_ptr[coefi];
       if (cinfo->Ah != expected)
@@ -323,7 +311,7 @@ decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     /* Load up working state */
     BITREAD_LOAD_STATE(cinfo, entropy->bitstate);
-    ASSIGN_STATE(state, entropy->saved);
+    state = entropy->saved;
 
     /* Outer loop handles each block in the MCU */
 
@@ -356,11 +344,12 @@ decode_mcu_DC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
 
     /* Completed MCU, so update state */
     BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
-    ASSIGN_STATE(entropy->saved, state);
+    entropy->saved = state;
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -444,7 +433,8 @@ decode_mcu_AC_first(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -495,7 +485,8 @@ decode_mcu_DC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   BITREAD_SAVE_STATE(cinfo, entropy->bitstate);
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 }
@@ -638,7 +629,8 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
   }
 
   /* Account for restart interval (no-op if not using restarts) */
-  entropy->restarts_to_go--;
+  if (cinfo->restart_interval)
+    entropy->restarts_to_go--;
 
   return TRUE;
 
@@ -676,7 +668,7 @@ jinit_phuff_decoder(j_decompress_ptr cinfo)
   /* Create progression status table */
   cinfo->coef_bits = (int (*)[DCTSIZE2])
     (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                cinfo->num_components * DCTSIZE2 *
+                                cinfo->num_components * 2 * DCTSIZE2 *
                                 sizeof(int));
   coef_bit_ptr = &cinfo->coef_bits[0][0];
   for (ci = 0; ci < cinfo->num_components; ci++)
diff --git a/external/jpeg/jdsample.c b/external/jpeg/jdsample.c
index 50a68b301318..eaad72a03089 100644
--- a/external/jpeg/jdsample.c
+++ b/external/jpeg/jdsample.c
@@ -8,7 +8,7 @@
  * Copyright (C) 2010, 2015-2016, D. R. Commander.
  * Copyright (C) 2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, Google, Inc.
- * Copyright (C) 2019, Arm Limited.
+ * Copyright (C) 2019-2020, Arm Limited.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -177,7 +177,7 @@ int_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       for (h = h_expand; h > 0; h--) {
         *outptr++ = invalue;
       }
@@ -213,7 +213,7 @@ h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[inrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -242,7 +242,7 @@ h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     outptr = output_data[outrow];
     outend = outptr + cinfo->output_width;
     while (outptr < outend) {
-      invalue = *inptr++;       /* don't need GETJSAMPLE() here */
+      invalue = *inptr++;
       *outptr++ = invalue;
       *outptr++ = invalue;
     }
@@ -283,20 +283,20 @@ h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     inptr = input_data[inrow];
     outptr = output_data[inrow];
     /* Special case for first column */
-    invalue = GETJSAMPLE(*inptr++);
+    invalue = *inptr++;
     *outptr++ = (JSAMPLE)invalue;
-    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(*inptr) + 2) >> 2);
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[0] + 2) >> 2);
 
     for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
       /* General case: 3/4 * nearer pixel + 1/4 * further pixel */
-      invalue = GETJSAMPLE(*inptr++) * 3;
-      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(inptr[-2]) + 1) >> 2);
-      *outptr++ = (JSAMPLE)((invalue + GETJSAMPLE(*inptr) + 2) >> 2);
+      invalue = (*inptr++) * 3;
+      *outptr++ = (JSAMPLE)((invalue + inptr[-2] + 1) >> 2);
+      *outptr++ = (JSAMPLE)((invalue + inptr[0] + 2) >> 2);
     }
 
     /* Special case for last column */
-    invalue = GETJSAMPLE(*inptr);
-    *outptr++ = (JSAMPLE)((invalue * 3 + GETJSAMPLE(inptr[-1]) + 1) >> 2);
+    invalue = *inptr;
+    *outptr++ = (JSAMPLE)((invalue * 3 + inptr[-1] + 1) >> 2);
     *outptr++ = (JSAMPLE)invalue;
   }
 }
@@ -338,7 +338,7 @@ h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
       outptr = output_data[outrow++];
 
       for (colctr = 0; colctr < compptr->downsampled_width; colctr++) {
-        thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        thiscolsum = (*inptr0++) * 3 + (*inptr1++);
         *outptr++ = (JSAMPLE)((thiscolsum + bias) >> 2);
       }
     }
@@ -381,8 +381,8 @@ h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
       outptr = output_data[outrow++];
 
       /* Special case for first column */
-      thiscolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
-      nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+      thiscolsum = (*inptr0++) * 3 + (*inptr1++);
+      nextcolsum = (*inptr0++) * 3 + (*inptr1++);
       *outptr++ = (JSAMPLE)((thiscolsum * 4 + 8) >> 4);
       *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
       lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
@@ -390,7 +390,7 @@ h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
       for (colctr = compptr->downsampled_width - 2; colctr > 0; colctr--) {
         /* General case: 3/4 * nearer pixel + 1/4 * further pixel in each */
         /* dimension, thus 9/16, 3/16, 3/16, 1/16 overall */
-        nextcolsum = GETJSAMPLE(*inptr0++) * 3 + GETJSAMPLE(*inptr1++);
+        nextcolsum = (*inptr0++) * 3 + (*inptr1++);
         *outptr++ = (JSAMPLE)((thiscolsum * 3 + lastcolsum + 8) >> 4);
         *outptr++ = (JSAMPLE)((thiscolsum * 3 + nextcolsum + 7) >> 4);
         lastcolsum = thiscolsum;  thiscolsum = nextcolsum;
@@ -477,7 +477,13 @@ jinit_upsampler(j_decompress_ptr cinfo)
     } else if (h_in_group == h_out_group &&
                v_in_group * 2 == v_out_group && do_fancy) {
       /* Non-fancy upsampling is handled by the generic method */
-      upsample->methods[ci] = h1v2_fancy_upsample;
+#if defined(__arm__) || defined(__aarch64__) || \
+    defined(_M_ARM) || defined(_M_ARM64)
+      if (jsimd_can_h1v2_fancy_upsample())
+        upsample->methods[ci] = jsimd_h1v2_fancy_upsample;
+      else
+#endif
+        upsample->methods[ci] = h1v2_fancy_upsample;
       upsample->pub.need_context_rows = TRUE;
     } else if (h_in_group * 2 == h_out_group &&
                v_in_group * 2 == v_out_group) {
diff --git a/external/jpeg/jerror.h b/external/jpeg/jerror.h
index 933a3690fdf4..4476df2c934b 100644
--- a/external/jpeg/jerror.h
+++ b/external/jpeg/jerror.h
@@ -207,6 +207,10 @@ JMESSAGE(JWRN_ARITH_BAD_CODE, "Corrupt JPEG data: bad arithmetic code")
 #endif
 #endif
 JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
+#if JPEG_LIB_VERSION < 70
+JMESSAGE(JERR_BAD_DROP_SAMPLING,
+         "Component index %d: mismatching sampling ratio %d:%d, %d:%d, %c")
+#endif
 
 #ifdef JMAKE_ENUM_LIST
 
@@ -252,6 +256,15 @@ JMESSAGE(JWRN_BOGUS_ICC, "Corrupt JPEG data: bad ICC marker")
    (cinfo)->err->msg_parm.i[2] = (p3), \
    (cinfo)->err->msg_parm.i[3] = (p4), \
    (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
+#define ERREXIT6(cinfo, code, p1, p2, p3, p4, p5, p6) \
+  ((cinfo)->err->msg_code = (code), \
+   (cinfo)->err->msg_parm.i[0] = (p1), \
+   (cinfo)->err->msg_parm.i[1] = (p2), \
+   (cinfo)->err->msg_parm.i[2] = (p3), \
+   (cinfo)->err->msg_parm.i[3] = (p4), \
+   (cinfo)->err->msg_parm.i[4] = (p5), \
+   (cinfo)->err->msg_parm.i[5] = (p6), \
+   (*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
 #define ERREXITS(cinfo, code, str) \
   ((cinfo)->err->msg_code = (code), \
    strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
diff --git a/external/jpeg/jidctint.c b/external/jpeg/jidctint.c
index 50f385da3329..bb0874801920 100644
--- a/external/jpeg/jidctint.c
+++ b/external/jpeg/jidctint.c
@@ -3,7 +3,7 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1998, Thomas G. Lane.
- * Modification developed 2002-2009 by Guido Vollbeding.
+ * Modification developed 2002-2018 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, 2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -417,7 +417,7 @@ jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 /*
  * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 7x7 output block.
+ * producing a reduced-size 7x7 output block.
  *
  * Optimized algorithm with 12 multiplications in the 1-D kernel.
  * cK represents sqrt(2) * cos(K*pi/14).
@@ -1258,7 +1258,7 @@ jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 
 /*
  * Perform dequantization and inverse DCT on one block of coefficients,
- * producing a 11x11 output block.
+ * producing an 11x11 output block.
  *
  * Optimized algorithm with 24 multiplications in the 1-D kernel.
  * cK represents sqrt(2) * cos(K*pi/22).
@@ -2398,7 +2398,7 @@ jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
     tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
     tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
     /* Add fudge factor here for final descale. */
-    tmp0 += 1 << (CONST_BITS - PASS1_BITS - 1);
+    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
 
     z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
     tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
diff --git a/external/jpeg/jmorecfg.h b/external/jpeg/jmorecfg.h
index aa29f0f9f13e..fb3a9cf411cc 100644
--- a/external/jpeg/jmorecfg.h
+++ b/external/jpeg/jmorecfg.h
@@ -43,25 +43,11 @@
 
 #if BITS_IN_JSAMPLE == 8
 /* JSAMPLE should be the smallest type that will hold the values 0..255.
- * You can use a signed char by having GETJSAMPLE mask it with 0xFF.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JSAMPLE;
 #define GETJSAMPLE(value)  ((int)(value))
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JSAMPLE;
-#ifdef __CHAR_UNSIGNED__
-#define GETJSAMPLE(value)  ((int)(value))
-#else
-#define GETJSAMPLE(value)  ((int)(value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 #define MAXJSAMPLE      255
 #define CENTERJSAMPLE   128
 
@@ -97,22 +83,9 @@ typedef short JCOEF;
  * managers, this is also the data type passed to fread/fwrite.
  */
 
-#ifdef HAVE_UNSIGNED_CHAR
-
 typedef unsigned char JOCTET;
 #define GETJOCTET(value)  (value)
 
-#else /* not HAVE_UNSIGNED_CHAR */
-
-typedef char JOCTET;
-#ifdef __CHAR_UNSIGNED__
-#define GETJOCTET(value)  (value)
-#else
-#define GETJOCTET(value)  ((value) & 0xFF)
-#endif /* __CHAR_UNSIGNED__ */
-
-#endif /* HAVE_UNSIGNED_CHAR */
-
 
 /* These typedefs are used for various table entries and so forth.
  * They must be at least as wide as specified; but making them too big
@@ -123,15 +96,7 @@ typedef char JOCTET;
 
 /* UINT8 must hold at least the values 0..255. */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char UINT8;
-#else /* not HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char UINT8;
-#else /* not __CHAR_UNSIGNED__ */
-typedef short UINT8;
-#endif /* __CHAR_UNSIGNED__ */
-#endif /* HAVE_UNSIGNED_CHAR */
 
 /* UINT16 must hold at least the values 0..65535. */
 
diff --git a/external/jpeg/jpegint.h b/external/jpeg/jpegint.h
index ad36ca8b5605..195fbcb9b675 100644
--- a/external/jpeg/jpegint.h
+++ b/external/jpeg/jpegint.h
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 1997-2009 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2016, D. R. Commander.
+ * Copyright (C) 2015-2016, 2019, D. R. Commander.
  * Copyright (C) 2015, Google, Inc.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
@@ -158,6 +158,9 @@ struct jpeg_decomp_master {
   JDIMENSION first_MCU_col[MAX_COMPONENTS];
   JDIMENSION last_MCU_col[MAX_COMPONENTS];
   boolean jinit_upsampler_no_alloc;
+
+  /* Last iMCU row that was successfully decoded */
+  JDIMENSION last_good_iMCU_row;
 };
 
 /* Input control module */
diff --git a/external/jpeg/jpegtran.1 b/external/jpeg/jpegtran.1
new file mode 100644
index 000000000000..da7a26697432
--- /dev/null
+++ b/external/jpeg/jpegtran.1
@@ -0,0 +1,358 @@
+.TH JPEGTRAN 1 "26 October 2020"
+.SH NAME
+jpegtran \- lossless transformation of JPEG files
+.SH SYNOPSIS
+.B jpegtran
+[
+.I options
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B jpegtran
+performs various useful transformations of JPEG files.
+It can translate the coded representation from one variant of JPEG to another,
+for example from baseline JPEG to progressive JPEG or vice versa.  It can also
+perform some rearrangements of the image data, for example turning an image
+from landscape to portrait format by rotation.
+.PP
+For EXIF files and JPEG files containing Exif data, you may prefer to use
+.B exiftran
+instead.
+.PP
+.B jpegtran
+works by rearranging the compressed data (DCT coefficients), without
+ever fully decoding the image.  Therefore, its transformations are lossless:
+there is no image degradation at all, which would not be true if you used
+.B djpeg
+followed by
+.B cjpeg
+to accomplish the same conversion.  But by the same token,
+.B jpegtran
+cannot perform lossy operations such as changing the image quality.  However,
+while the image data is losslessly transformed, metadata can be removed.  See
+the
+.B \-copy
+option for specifics.
+.PP
+.B jpegtran
+reads the named JPEG/JFIF file, or the standard input if no file is
+named, and produces a JPEG/JFIF file on the standard output.
+.SH OPTIONS
+All switch names may be abbreviated; for example,
+.B \-optimize
+may be written
+.B \-opt
+or
+.BR \-o .
+Upper and lower case are equivalent.
+British spellings are also accepted (e.g.,
+.BR \-optimise ),
+though for brevity these are not mentioned below.
+.PP
+To specify the coded JPEG representation used in the output file,
+.B jpegtran
+accepts a subset of the switches recognized by
+.BR cjpeg :
+.TP
+.B \-optimize
+Perform optimization of entropy encoding parameters.
+.TP
+.B \-progressive
+Create progressive JPEG file.
+.TP
+.BI \-restart " N"
+Emit a JPEG restart marker every N MCU rows, or every N MCU blocks if "B" is
+attached to the number.
+.TP
+.B \-arithmetic
+Use arithmetic coding.
+.TP
+.BI \-scans " file"
+Use the scan script given in the specified text file.
+.PP
+See
+.BR cjpeg (1)
+for more details about these switches.
+If you specify none of these switches, you get a plain baseline-JPEG output
+file.  The quality setting and so forth are determined by the input file.
+.PP
+The image can be losslessly transformed by giving one of these switches:
+.TP
+.B \-flip horizontal
+Mirror image horizontally (left-right).
+.TP
+.B \-flip vertical
+Mirror image vertically (top-bottom).
+.TP
+.B \-rotate 90
+Rotate image 90 degrees clockwise.
+.TP
+.B \-rotate 180
+Rotate image 180 degrees.
+.TP
+.B \-rotate 270
+Rotate image 270 degrees clockwise (or 90 ccw).
+.TP
+.B \-transpose
+Transpose image (across UL-to-LR axis).
+.TP
+.B \-transverse
+Transverse transpose (across UR-to-LL axis).
+.PP
+The transpose transformation has no restrictions regarding image dimensions.
+The other transformations operate rather oddly if the image dimensions are not
+a multiple of the iMCU size (usually 8 or 16 pixels), because they can only
+transform complete blocks of DCT coefficient data in the desired way.
+.PP
+.BR jpegtran 's
+default behavior when transforming an odd-size image is designed
+to preserve exact reversibility and mathematical consistency of the
+transformation set.  As stated, transpose is able to flip the entire image
+area.  Horizontal mirroring leaves any partial iMCU column at the right edge
+untouched, but is able to flip all rows of the image.  Similarly, vertical
+mirroring leaves any partial iMCU row at the bottom edge untouched, but is
+able to flip all columns.  The other transforms can be built up as sequences
+of transpose and flip operations; for consistency, their actions on edge
+pixels are defined to be the same as the end result of the corresponding
+transpose-and-flip sequence.
+.PP
+For practical use, you may prefer to discard any untransformable edge pixels
+rather than having a strange-looking strip along the right and/or bottom edges
+of a transformed image.  To do this, add the
+.B \-trim
+switch:
+.TP
+.B \-trim
+Drop non-transformable edge blocks.
+.IP
+Obviously, a transformation with
+.B \-trim
+is not reversible, so strictly speaking
+.B jpegtran
+with this switch is not lossless.  Also, the expected mathematical
+equivalences between the transformations no longer hold.  For example,
+.B \-rot 270 \-trim
+trims only the bottom edge, but
+.B \-rot 90 \-trim
+followed by
+.B \-rot 180 \-trim
+trims both edges.
+.TP
+.B \-perfect
+If you are only interested in perfect transformations, add the
+.B \-perfect
+switch.  This causes
+.B jpegtran
+to fail with an error if the transformation is not perfect.
+.IP
+For example, you may want to do
+.IP
+.B (jpegtran \-rot 90 \-perfect
+.I foo.jpg
+.B || djpeg
+.I foo.jpg
+.B | pnmflip \-r90 | cjpeg)
+.IP
+to do a perfect rotation, if available, or an approximated one if not.
+.PP
+This version of \fBjpegtran\fR also offers a lossless crop option, which
+discards data outside of a given image region but losslessly preserves what is
+inside.  Like the rotate and flip transforms, lossless crop is restricted by
+the current JPEG format; the upper left corner of the selected region must fall
+on an iMCU boundary.  If it doesn't, then it is silently moved up and/or left
+to the nearest iMCU boundary (the lower right corner is unchanged).  Thus, the
+output image covers at least the requested region, but it may cover more.  The
+adjustment of the region dimensions may be optionally disabled by attaching an
+\&'f' character ("force") to the width or height number.
+.PP
+The image can be losslessly cropped by giving the switch:
+.TP
+.B \-crop WxH+X+Y
+Crop the image to a rectangular region of width W and height H, starting at
+point X,Y.  The lossless crop feature discards data outside of a given image
+region but losslessly preserves what is inside.  Like the rotate and flip
+transforms, lossless crop is restricted by the current JPEG format; the upper
+left corner of the selected region must fall on an iMCU boundary.  If it
+doesn't, then it is silently moved up and/or left to the nearest iMCU boundary
+(the lower right corner is unchanged).
+.PP
+If W or H is larger than the width/height of the input image, then the output
+image is expanded in size, and the expanded region is filled in with zeros
+(neutral gray).  Attaching an 'f' character ("flatten") to the width number
+will cause each block in the expanded region to be filled in with the DC
+coefficient of the nearest block in the input image rather than grayed out.
+Attaching an 'r' character ("reflect") to the width number will cause the
+expanded region to be filled in with repeated reflections of the input image
+rather than grayed out.
+.PP
+A complementary lossless wipe option is provided to discard (gray out) data
+inside a given image region while losslessly preserving what is outside:
+.TP
+.B \-wipe WxH+X+Y
+Wipe (gray out) a rectangular region of width W and height H from the input
+image, starting at point X,Y.
+.PP
+Attaching an 'f' character ("flatten") to the width number will cause the
+region to be filled with the average of adjacent blocks rather than grayed out.
+If the wipe region and the region outside the wipe region, when adjusted to the
+nearest iMCU boundary, form two horizontally adjacent rectangles, then
+attaching an 'r' character ("reflect") to the width number will cause the wipe
+region to be filled with repeated reflections of the outside region rather than
+grayed out.
+.PP
+A lossless drop option is also provided, which allows another JPEG image to be
+inserted ("dropped") into the input image data at a given position, replacing
+the existing image data at that position:
+.TP
+.B \-drop +X+Y filename
+Drop (insert) another image at point X,Y
+.PP
+Both the input image and the drop image must have the same subsampling level.
+It is best if they also have the same quantization (quality).  Otherwise, the
+quantization of the output image will be adapted to accommodate the higher of
+the input image quality and the drop image quality.  The trim option can be
+used with the drop option to requantize the drop image to match the input
+image.  Note that a grayscale image can be dropped into a full-color image or
+vice versa, as long as the full-color image has no vertical subsampling.  If
+the input image is grayscale and the drop image is full-color, then the
+chrominance channels from the drop image will be discarded.
+.PP
+Other not-strictly-lossless transformation switches are:
+.TP
+.B \-grayscale
+Force grayscale output.
+.IP
+This option discards the chrominance channels if the input image is YCbCr
+(i.e., a standard color JPEG), resulting in a grayscale JPEG file.  The
+luminance channel is preserved exactly, so this is a better method of reducing
+to grayscale than decompression, conversion, and recompression.  This switch
+is particularly handy for fixing a monochrome picture that was mistakenly
+encoded as a color JPEG.  (In such a case, the space savings from getting rid
+of the near-empty chroma channels won't be large; but the decoding time for
+a grayscale JPEG is substantially less than that for a color JPEG.)
+.PP
+.B jpegtran
+also recognizes these switches that control what to do with "extra" markers,
+such as comment blocks:
+.TP
+.B \-copy none
+Copy no extra markers from source file.  This setting suppresses all
+comments and other metadata in the source file.
+.TP
+.B \-copy comments
+Copy only comment markers.  This setting copies comments from the source file
+but discards any other metadata.
+.TP
+.B \-copy all
+Copy all extra markers.  This setting preserves miscellaneous markers
+found in the source file, such as JFIF thumbnails, Exif data, and Photoshop
+settings.  In some files, these extra markers can be sizable.  Note that this
+option will copy thumbnails as-is; they will not be transformed.
+.PP
+The default behavior is \fB\-copy comments\fR.  (Note: in IJG releases v6 and
+v6a, \fBjpegtran\fR always did the equivalent of \fB\-copy none\fR.)
+.PP
+Additional switches recognized by jpegtran are:
+.TP
+.BI \-icc " file"
+Embed ICC color management profile contained in the specified file.  Note that
+this will cause \fBjpegtran\fR to ignore any APP2 markers in the input file,
+even if \fB\-copy all\fR is specified.
+.TP
+.BI \-maxmemory " N"
+Set limit for amount of memory to use in processing large images.  Value is
+in thousands of bytes, or millions of bytes if "M" is attached to the
+number.  For example,
+.B \-max 4m
+selects 4000000 bytes.  If more space is needed, an error will occur.
+.TP
+.BI \-maxscans " N"
+Abort if the input image contains more than
+.I N
+scans.  This feature demonstrates a method by which applications can guard
+against denial-of-service attacks instigated by specially-crafted malformed
+JPEG images containing numerous scans with missing image data or image data
+consisting only of "EOB runs" (a feature of progressive JPEG images that allows
+potentially hundreds of thousands of adjoining zero-value pixels to be
+represented using only a few bytes).  Attempting to transform such malformed
+JPEG images can cause excessive CPU activity, since the decompressor must fully
+process each scan (even if the scan is corrupt) before it can proceed to the
+next scan.
+.TP
+.BI \-outfile " name"
+Send output image to the named file, not to standard output.
+.TP
+.B \-report
+Report transformation progress.
+.TP
+.B \-strict
+Treat all warnings as fatal.  This feature also demonstrates a method by which
+applications can guard against attacks instigated by specially-crafted
+malformed JPEG images.  Enabling this option will cause the decompressor to
+abort if the input image contains incomplete or corrupt image data.
+.TP
+.B \-verbose
+Enable debug printout.  More
+.BR \-v 's
+give more output.  Also, version information is printed at startup.
+.TP
+.B \-debug
+Same as
+.BR \-verbose .
+.TP
+.B \-version
+Print version information and exit.
+.SH EXAMPLES
+.LP
+This example converts a baseline JPEG file to progressive form:
+.IP
+.B jpegtran \-progressive
+.I foo.jpg
+.B >
+.I fooprog.jpg
+.PP
+This example rotates an image 90 degrees clockwise, discarding any
+unrotatable edge pixels:
+.IP
+.B jpegtran \-rot 90 \-trim
+.I foo.jpg
+.B >
+.I foo90.jpg
+.SH ENVIRONMENT
+.TP
+.B JPEGMEM
+If this environment variable is set, its value is the default memory limit.
+The value is specified as described for the
+.B \-maxmemory
+switch.
+.B JPEGMEM
+overrides the default value specified when the program was compiled, and
+itself is overridden by an explicit
+.BR \-maxmemory .
+.SH SEE ALSO
+.BR cjpeg (1),
+.BR djpeg (1),
+.BR rdjpgcom (1),
+.BR wrjpgcom (1)
+.br
+Wallace, Gregory K.  "The JPEG Still Picture Compression Standard",
+Communications of the ACM, April 1991 (vol. 34, no. 4), pp. 30-44.
+.SH AUTHOR
+Independent JPEG Group
+.PP
+This file was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo and to wordsmith certain sections.
+.SH BUGS
+The transform options can't transform odd-size images perfectly.  Use
+.B \-trim
+or
+.B \-perfect
+if you don't like the results.
+.PP
+The entire image is read into memory and then written out again, even in
+cases where this isn't really necessary.  Expect swapping on large images,
+especially when using the more complex transform options.
diff --git a/external/jpeg/jpegtran.c b/external/jpeg/jpegtran.c
index e1fe41a39a15..244996dd7812 100644
--- a/external/jpeg/jpegtran.c
+++ b/external/jpeg/jpegtran.c
@@ -2,9 +2,9 @@
  * jpegtran.c
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1995-2010, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1995-2019, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2014, 2017, 2020, D. R. Commander.
+ * Copyright (C) 2010, 2014, 2017, 2019-2020, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -41,7 +41,11 @@
 
 static const char *progname;    /* program name for error messages */
 static char *icc_filename;      /* for -icc switch */
+JDIMENSION max_scans;           /* for -maxscans switch */
 static char *outfilename;       /* for -outfile switch */
+static char *dropfilename;      /* for -drop switch */
+boolean report;                 /* for -report switch */
+boolean strict;                 /* for -strict switch */
 static JCOPY_OPTION copyoption; /* -copy switch */
 static jpeg_transform_info transformoption; /* image transformation options */
 
@@ -70,8 +74,9 @@ usage(void)
   fprintf(stderr, "Switches for modifying the image:\n");
 #if TRANSFORMS_SUPPORTED
   fprintf(stderr, "  -crop WxH+X+Y  Crop to a rectangular region\n");
-  fprintf(stderr, "  -grayscale     Reduce to grayscale (omit color data)\n");
+  fprintf(stderr, "  -drop +X+Y filename          Drop (insert) another image\n");
   fprintf(stderr, "  -flip [horizontal|vertical]  Mirror image (left-right or top-bottom)\n");
+  fprintf(stderr, "  -grayscale     Reduce to grayscale (omit color data)\n");
   fprintf(stderr, "  -perfect       Fail if there is non-transformable edge blocks\n");
   fprintf(stderr, "  -rotate [90|180|270]         Rotate image (degrees clockwise)\n");
 #endif
@@ -79,6 +84,8 @@ usage(void)
   fprintf(stderr, "  -transpose     Transpose image\n");
   fprintf(stderr, "  -transverse    Transverse transpose image\n");
   fprintf(stderr, "  -trim          Drop non-transformable edge blocks\n");
+  fprintf(stderr, "                 with -drop: Requantize drop file to match source file\n");
+  fprintf(stderr, "  -wipe WxH+X+Y  Wipe (gray out) a rectangular region\n");
 #endif
   fprintf(stderr, "Switches for advanced users:\n");
 #ifdef C_ARITH_CODING_SUPPORTED
@@ -87,7 +94,10 @@ usage(void)
   fprintf(stderr, "  -icc FILE      Embed ICC profile contained in FILE\n");
   fprintf(stderr, "  -restart N     Set restart interval in rows, or in blocks with B\n");
   fprintf(stderr, "  -maxmemory N   Maximum memory to use (in kbytes)\n");
+  fprintf(stderr, "  -maxscans N    Maximum number of scans to allow in input file\n");
   fprintf(stderr, "  -outfile name  Specify name for output file\n");
+  fprintf(stderr, "  -report        Report transformation progress\n");
+  fprintf(stderr, "  -strict        Treat all warnings as fatal\n");
   fprintf(stderr, "  -verbose  or  -debug   Emit debug output\n");
   fprintf(stderr, "  -version       Print version information and exit\n");
   fprintf(stderr, "Switches for wizards:\n");
@@ -141,7 +151,10 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
   /* Set up default JPEG parameters. */
   simple_progressive = FALSE;
   icc_filename = NULL;
+  max_scans = 0;
   outfilename = NULL;
+  report = FALSE;
+  strict = FALSE;
   copyoption = JCOPYOPT_DEFAULT;
   transformoption.transform = JXFORM_NONE;
   transformoption.perfect = FALSE;
@@ -193,7 +206,8 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
 #if TRANSFORMS_SUPPORTED
       if (++argn >= argc)       /* advance to next argument */
         usage();
-      if (!jtransform_parse_crop_spec(&transformoption, argv[argn])) {
+      if (transformoption.crop /* reject multiple crop/drop/wipe requests */ ||
+          !jtransform_parse_crop_spec(&transformoption, argv[argn])) {
         fprintf(stderr, "%s: bogus -crop argument '%s'\n",
                 progname, argv[argn]);
         exit(EXIT_FAILURE);
@@ -202,6 +216,26 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
       select_transform(JXFORM_NONE);    /* force an error */
 #endif
 
+    } else if (keymatch(arg, "drop", 2)) {
+#if TRANSFORMS_SUPPORTED
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (transformoption.crop /* reject multiple crop/drop/wipe requests */ ||
+          !jtransform_parse_crop_spec(&transformoption, argv[argn]) ||
+          transformoption.crop_width_set != JCROP_UNSET ||
+          transformoption.crop_height_set != JCROP_UNSET) {
+        fprintf(stderr, "%s: bogus -drop argument '%s'\n",
+                progname, argv[argn]);
+        exit(EXIT_FAILURE);
+      }
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      dropfilename = argv[argn];
+      select_transform(JXFORM_DROP);
+#else
+      select_transform(JXFORM_NONE);    /* force an error */
+#endif
+
     } else if (keymatch(arg, "debug", 1) || keymatch(arg, "verbose", 1)) {
       /* Enable debug printouts. */
       /* On first -d, print version identification */
@@ -261,6 +295,12 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
         lval *= 1000L;
       cinfo->mem->max_memory_to_use = lval * 1000L;
 
+    } else if (keymatch(arg, "maxscans", 4)) {
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (sscanf(argv[argn], "%u", &max_scans) != 1)
+        usage();
+
     } else if (keymatch(arg, "optimize", 1) || keymatch(arg, "optimise", 1)) {
       /* Enable entropy parm optimization. */
 #ifdef ENTROPY_OPT_SUPPORTED
@@ -293,6 +333,9 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
       exit(EXIT_FAILURE);
 #endif
 
+    } else if (keymatch(arg, "report", 3)) {
+      report = TRUE;
+
     } else if (keymatch(arg, "restart", 1)) {
       /* Restart interval in MCU rows (or in MCUs with 'b'). */
       long lval;
@@ -338,6 +381,9 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
       exit(EXIT_FAILURE);
 #endif
 
+    } else if (keymatch(arg, "strict", 2)) {
+      strict = TRUE;
+
     } else if (keymatch(arg, "transpose", 1)) {
       /* Transpose (across UL-to-LR axis). */
       select_transform(JXFORM_TRANSPOSE);
@@ -350,6 +396,21 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
       /* Trim off any partial edge MCUs that the transform can't handle. */
       transformoption.trim = TRUE;
 
+    } else if (keymatch(arg, "wipe", 1)) {
+#if TRANSFORMS_SUPPORTED
+      if (++argn >= argc)       /* advance to next argument */
+        usage();
+      if (transformoption.crop /* reject multiple crop/drop/wipe requests */ ||
+          !jtransform_parse_crop_spec(&transformoption, argv[argn])) {
+        fprintf(stderr, "%s: bogus -wipe argument '%s'\n",
+                progname, argv[argn]);
+        exit(EXIT_FAILURE);
+      }
+      select_transform(JXFORM_WIPE);
+#else
+      select_transform(JXFORM_NONE);    /* force an error */
+#endif
+
     } else {
       usage();                  /* bogus switch */
     }
@@ -375,6 +436,19 @@ parse_switches(j_compress_ptr cinfo, int argc, char **argv,
 }
 
 
+METHODDEF(void)                 /* emit_message hook installed for -strict */
+my_emit_message(j_common_ptr cinfo, int msg_level)
+{
+  if (msg_level < 0) {
+    /* Negative level marks a warning: treat it as fatal via error_exit */
+    cinfo->err->error_exit(cinfo);
+  } else {                      /* otherwise print only if trace_level allows */
+    if (cinfo->err->trace_level >= msg_level)
+      cinfo->err->output_message(cinfo);
+  }
+}
+
+
 /*
  * The main program.
  */
@@ -383,11 +457,14 @@ int
 main(int argc, char **argv)
 {
   struct jpeg_decompress_struct srcinfo;
+#if TRANSFORMS_SUPPORTED
+  struct jpeg_decompress_struct dropinfo;
+  struct jpeg_error_mgr jdroperr;
+  FILE *drop_file;
+#endif
   struct jpeg_compress_struct dstinfo;
   struct jpeg_error_mgr jsrcerr, jdsterr;
-#ifdef PROGRESS_REPORT
-  struct cdjpeg_progress_mgr progress;
-#endif
+  struct cdjpeg_progress_mgr src_progress, dst_progress;
   jvirt_barray_ptr *src_coef_arrays;
   jvirt_barray_ptr *dst_coef_arrays;
   int file_index;
@@ -420,13 +497,16 @@ main(int argc, char **argv)
    * values read here are mostly ignored; we will rescan the switches after
    * opening the input file.  Also note that most of the switches affect the
    * destination JPEG object, so we parse into that and then copy over what
-   * needs to affects the source too.
+   * needs to affect the source too.
    */
 
   file_index = parse_switches(&dstinfo, argc, argv, 0, FALSE);
   jsrcerr.trace_level = jdsterr.trace_level;
   srcinfo.mem->max_memory_to_use = dstinfo.mem->max_memory_to_use;
 
+  if (strict)
+    jsrcerr.emit_message = my_emit_message;
+
 #ifdef TWO_FILE_COMMANDLINE
   /* Must have either -outfile switch or explicit output file name */
   if (outfilename == NULL) {
@@ -492,8 +572,29 @@ main(int argc, char **argv)
       copyoption = JCOPYOPT_ALL_EXCEPT_ICC;
   }
 
-#ifdef PROGRESS_REPORT
-  start_progress_monitor((j_common_ptr)&dstinfo, &progress);
+  if (report) {
+    start_progress_monitor((j_common_ptr)&dstinfo, &dst_progress);
+    dst_progress.report = report;
+  }
+  if (report || max_scans != 0) {
+    start_progress_monitor((j_common_ptr)&srcinfo, &src_progress);
+    src_progress.report = report;
+    src_progress.max_scans = max_scans;
+  }
+#if TRANSFORMS_SUPPORTED
+  /* Open the drop file. */
+  if (dropfilename != NULL) {
+    if ((drop_file = fopen(dropfilename, READ_BINARY)) == NULL) {
+      fprintf(stderr, "%s: can't open %s for reading\n", progname,
+              dropfilename);
+      exit(EXIT_FAILURE);
+    }
+    dropinfo.err = jpeg_std_error(&jdroperr);
+    jpeg_create_decompress(&dropinfo);
+    jpeg_stdio_src(&dropinfo, drop_file);
+  } else {
+    drop_file = NULL;
+  }
 #endif
 
   /* Specify data source for decompression */
@@ -505,6 +606,17 @@ main(int argc, char **argv)
   /* Read file header */
   (void)jpeg_read_header(&srcinfo, TRUE);
 
+#if TRANSFORMS_SUPPORTED
+  if (dropfilename != NULL) {
+    (void)jpeg_read_header(&dropinfo, TRUE);
+    transformoption.crop_width = dropinfo.image_width;
+    transformoption.crop_width_set = JCROP_POS;
+    transformoption.crop_height = dropinfo.image_height;
+    transformoption.crop_height_set = JCROP_POS;
+    transformoption.drop_ptr = &dropinfo;
+  }
+#endif
+
   /* Any space needed by a transform option must be requested before
    * jpeg_read_coefficients so that memory allocation will be done right.
    */
@@ -520,6 +632,12 @@ main(int argc, char **argv)
   /* Read source file as DCT coefficients */
   src_coef_arrays = jpeg_read_coefficients(&srcinfo);
 
+#if TRANSFORMS_SUPPORTED
+  if (dropfilename != NULL) {
+    transformoption.drop_coef_arrays = jpeg_read_coefficients(&dropinfo);
+  }
+#endif
+
   /* Initialize destination compression parameters from source values */
   jpeg_copy_critical_parameters(&srcinfo, &dstinfo);
 
@@ -580,20 +698,36 @@ main(int argc, char **argv)
   /* Finish compression and release memory */
   jpeg_finish_compress(&dstinfo);
   jpeg_destroy_compress(&dstinfo);
+#if TRANSFORMS_SUPPORTED
+  if (dropfilename != NULL) {
+    (void)jpeg_finish_decompress(&dropinfo);
+    jpeg_destroy_decompress(&dropinfo);
+  }
+#endif
   (void)jpeg_finish_decompress(&srcinfo);
   jpeg_destroy_decompress(&srcinfo);
 
   /* Close output file, if we opened it */
   if (fp != stdout)
     fclose(fp);
-
-#ifdef PROGRESS_REPORT
-  end_progress_monitor((j_common_ptr)&dstinfo);
+#if TRANSFORMS_SUPPORTED
+  if (drop_file != NULL)
+    fclose(drop_file);
 #endif
 
+  if (report)
+    end_progress_monitor((j_common_ptr)&dstinfo);
+  if (report || max_scans != 0)
+    end_progress_monitor((j_common_ptr)&srcinfo);
+
   free(icc_profile);
 
   /* All done. */
+#if TRANSFORMS_SUPPORTED
+  if (dropfilename != NULL)
+    exit(jsrcerr.num_warnings + jdroperr.num_warnings +
+         jdsterr.num_warnings ? EXIT_WARNING : EXIT_SUCCESS);
+#endif
   exit(jsrcerr.num_warnings + jdsterr.num_warnings ?
        EXIT_WARNING : EXIT_SUCCESS);
   return 0;                     /* suppress no-return-value warnings */
diff --git a/external/jpeg/jquant1.c b/external/jpeg/jquant1.c
index 40bbb28cc7f6..73b83e16e5cc 100644
--- a/external/jpeg/jquant1.c
+++ b/external/jpeg/jquant1.c
@@ -479,7 +479,7 @@ color_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     for (col = width; col > 0; col--) {
       pixcode = 0;
       for (ci = 0; ci < nc; ci++) {
-        pixcode += GETJSAMPLE(colorindex[ci][GETJSAMPLE(*ptrin++)]);
+        pixcode += colorindex[ci][*ptrin++];
       }
       *ptrout++ = (JSAMPLE)pixcode;
     }
@@ -506,9 +506,9 @@ color_quantize3(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     ptrin = input_buf[row];
     ptrout = output_buf[row];
     for (col = width; col > 0; col--) {
-      pixcode  = GETJSAMPLE(colorindex0[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex1[GETJSAMPLE(*ptrin++)]);
-      pixcode += GETJSAMPLE(colorindex2[GETJSAMPLE(*ptrin++)]);
+      pixcode  = colorindex0[*ptrin++];
+      pixcode += colorindex1[*ptrin++];
+      pixcode += colorindex2[*ptrin++];
       *ptrout++ = (JSAMPLE)pixcode;
     }
   }
@@ -552,7 +552,7 @@ quantize_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
          * required amount of padding.
          */
         *output_ptr +=
-          colorindex_ci[GETJSAMPLE(*input_ptr) + dither[col_index]];
+          colorindex_ci[*input_ptr + dither[col_index]];
         input_ptr += nc;
         output_ptr++;
         col_index = (col_index + 1) & ODITHER_MASK;
@@ -595,12 +595,9 @@ quantize3_ord_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     col_index = 0;
 
     for (col = width; col > 0; col--) {
-      pixcode  =
-        GETJSAMPLE(colorindex0[GETJSAMPLE(*input_ptr++) + dither0[col_index]]);
-      pixcode +=
-        GETJSAMPLE(colorindex1[GETJSAMPLE(*input_ptr++) + dither1[col_index]]);
-      pixcode +=
-        GETJSAMPLE(colorindex2[GETJSAMPLE(*input_ptr++) + dither2[col_index]]);
+      pixcode  = colorindex0[(*input_ptr++) + dither0[col_index]];
+      pixcode += colorindex1[(*input_ptr++) + dither1[col_index]];
+      pixcode += colorindex2[(*input_ptr++) + dither2[col_index]];
       *output_ptr++ = (JSAMPLE)pixcode;
       col_index = (col_index + 1) & ODITHER_MASK;
     }
@@ -677,15 +674,15 @@ quantize_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
          * The maximum error is +- MAXJSAMPLE; this sets the required size
          * of the range_limit array.
          */
-        cur += GETJSAMPLE(*input_ptr);
-        cur = GETJSAMPLE(range_limit[cur]);
+        cur += *input_ptr;
+        cur = range_limit[cur];
         /* Select output value, accumulate into output code for this pixel */
-        pixcode = GETJSAMPLE(colorindex_ci[cur]);
+        pixcode = colorindex_ci[cur];
         *output_ptr += (JSAMPLE)pixcode;
         /* Compute actual representation error at this pixel */
         /* Note: we can do this even though we don't have the final */
         /* pixel code, because the colormap is orthogonal. */
-        cur -= GETJSAMPLE(colormap_ci[pixcode]);
+        cur -= colormap_ci[pixcode];
         /* Compute error fractions to be propagated to adjacent pixels.
          * Add these into the running sums, and simultaneously shift the
          * next-line error sums left by 1 column.
diff --git a/external/jpeg/jquant2.c b/external/jpeg/jquant2.c
index 6570613bb9f2..44efb18cadf1 100644
--- a/external/jpeg/jquant2.c
+++ b/external/jpeg/jquant2.c
@@ -215,9 +215,9 @@ prescan_quantize(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     ptr = input_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the histogram */
-      histp = &histogram[GETJSAMPLE(ptr[0]) >> C0_SHIFT]
-                        [GETJSAMPLE(ptr[1]) >> C1_SHIFT]
-                        [GETJSAMPLE(ptr[2]) >> C2_SHIFT];
+      histp = &histogram[ptr[0] >> C0_SHIFT]
+                        [ptr[1] >> C1_SHIFT]
+                        [ptr[2] >> C2_SHIFT];
       /* increment, check for overflow and undo increment if so. */
       if (++(*histp) <= 0)
         (*histp)--;
@@ -665,7 +665,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 
   for (i = 0; i < numcolors; i++) {
     /* We compute the squared-c0-distance term, then add in the other two. */
-    x = GETJSAMPLE(cinfo->colormap[0][i]);
+    x = cinfo->colormap[0][i];
     if (x < minc0) {
       tdist = (x - minc0) * C0_SCALE;
       min_dist = tdist * tdist;
@@ -688,7 +688,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[1][i]);
+    x = cinfo->colormap[1][i];
     if (x < minc1) {
       tdist = (x - minc1) * C1_SCALE;
       min_dist += tdist * tdist;
@@ -710,7 +710,7 @@ find_nearby_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
       }
     }
 
-    x = GETJSAMPLE(cinfo->colormap[2][i]);
+    x = cinfo->colormap[2][i];
     if (x < minc2) {
       tdist = (x - minc2) * C2_SCALE;
       min_dist += tdist * tdist;
@@ -788,13 +788,13 @@ find_best_colors(j_decompress_ptr cinfo, int minc0, int minc1, int minc2,
 #define STEP_C2  ((1 << C2_SHIFT) * C2_SCALE)
 
   for (i = 0; i < numcolors; i++) {
-    icolor = GETJSAMPLE(colorlist[i]);
+    icolor = colorlist[i];
     /* Compute (square of) distance from minc0/c1/c2 to this color */
-    inc0 = (minc0 - GETJSAMPLE(cinfo->colormap[0][icolor])) * C0_SCALE;
+    inc0 = (minc0 - cinfo->colormap[0][icolor]) * C0_SCALE;
     dist0 = inc0 * inc0;
-    inc1 = (minc1 - GETJSAMPLE(cinfo->colormap[1][icolor])) * C1_SCALE;
+    inc1 = (minc1 - cinfo->colormap[1][icolor]) * C1_SCALE;
     dist0 += inc1 * inc1;
-    inc2 = (minc2 - GETJSAMPLE(cinfo->colormap[2][icolor])) * C2_SCALE;
+    inc2 = (minc2 - cinfo->colormap[2][icolor]) * C2_SCALE;
     dist0 += inc2 * inc2;
     /* Form the initial difference increments */
     inc0 = inc0 * (2 * STEP_C0) + STEP_C0 * STEP_C0;
@@ -879,7 +879,7 @@ fill_inverse_cmap(j_decompress_ptr cinfo, int c0, int c1, int c2)
     for (ic1 = 0; ic1 < BOX_C1_ELEMS; ic1++) {
       cachep = &histogram[c0 + ic0][c1 + ic1][c2];
       for (ic2 = 0; ic2 < BOX_C2_ELEMS; ic2++) {
-        *cachep++ = (histcell)(GETJSAMPLE(*cptr++) + 1);
+        *cachep++ = (histcell)((*cptr++) + 1);
       }
     }
   }
@@ -909,9 +909,9 @@ pass2_no_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
     outptr = output_buf[row];
     for (col = width; col > 0; col--) {
       /* get pixel value and index into the cache */
-      c0 = GETJSAMPLE(*inptr++) >> C0_SHIFT;
-      c1 = GETJSAMPLE(*inptr++) >> C1_SHIFT;
-      c2 = GETJSAMPLE(*inptr++) >> C2_SHIFT;
+      c0 = (*inptr++) >> C0_SHIFT;
+      c1 = (*inptr++) >> C1_SHIFT;
+      c2 = (*inptr++) >> C2_SHIFT;
       cachep = &histogram[c0][c1][c2];
       /* If we have not seen this color before, find nearest colormap entry */
       /* and update the cache */
@@ -996,12 +996,12 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
        * The maximum error is +- MAXJSAMPLE (or less with error limiting);
        * this sets the required size of the range_limit array.
        */
-      cur0 += GETJSAMPLE(inptr[0]);
-      cur1 += GETJSAMPLE(inptr[1]);
-      cur2 += GETJSAMPLE(inptr[2]);
-      cur0 = GETJSAMPLE(range_limit[cur0]);
-      cur1 = GETJSAMPLE(range_limit[cur1]);
-      cur2 = GETJSAMPLE(range_limit[cur2]);
+      cur0 += inptr[0];
+      cur1 += inptr[1];
+      cur2 += inptr[2];
+      cur0 = range_limit[cur0];
+      cur1 = range_limit[cur1];
+      cur2 = range_limit[cur2];
       /* Index into the cache with adjusted pixel value */
       cachep =
         &histogram[cur0 >> C0_SHIFT][cur1 >> C1_SHIFT][cur2 >> C2_SHIFT];
@@ -1015,9 +1015,9 @@ pass2_fs_dither(j_decompress_ptr cinfo, JSAMPARRAY input_buf,
         register int pixcode = *cachep - 1;
         *outptr = (JSAMPLE)pixcode;
         /* Compute representation error for this pixel */
-        cur0 -= GETJSAMPLE(colormap0[pixcode]);
-        cur1 -= GETJSAMPLE(colormap1[pixcode]);
-        cur2 -= GETJSAMPLE(colormap2[pixcode]);
+        cur0 -= colormap0[pixcode];
+        cur1 -= colormap1[pixcode];
+        cur2 -= colormap2[pixcode];
       }
       /* Compute error fractions to be propagated to adjacent pixels.
        * Add these into the running sums, and simultaneously shift the
diff --git a/external/jpeg/jsimd.h b/external/jpeg/jsimd.h
index 51e2b8c89de3..6c203655ef84 100644
--- a/external/jpeg/jsimd.h
+++ b/external/jpeg/jsimd.h
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, 2014, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -75,6 +76,7 @@ EXTERN(void) jsimd_int_upsample(j_decompress_ptr cinfo,
 
 EXTERN(int) jsimd_can_h2v2_fancy_upsample(void);
 EXTERN(int) jsimd_can_h2v1_fancy_upsample(void);
+EXTERN(int) jsimd_can_h1v2_fancy_upsample(void);
 
 EXTERN(void) jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo,
                                        jpeg_component_info *compptr,
@@ -84,6 +86,10 @@ EXTERN(void) jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo,
                                        jpeg_component_info *compptr,
                                        JSAMPARRAY input_data,
                                        JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo,
+                                       jpeg_component_info *compptr,
+                                       JSAMPARRAY input_data,
+                                       JSAMPARRAY *output_data_ptr);
 
 EXTERN(int) jsimd_can_h2v2_merged_upsample(void);
 EXTERN(int) jsimd_can_h2v1_merged_upsample(void);
diff --git a/external/jpeg/jsimd_none.c b/external/jpeg/jsimd_none.c
index 3cb6c80f8aab..5b38a9fb5c99 100644
--- a/external/jpeg/jsimd_none.c
+++ b/external/jpeg/jsimd_none.c
@@ -4,6 +4,7 @@
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2011, 2014, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -169,6 +170,12 @@ jsimd_can_h2v1_fancy_upsample(void)
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  return 0;  /* always 0; jsimd_h1v2_fancy_upsample() here is an empty stub */
+}
+
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
@@ -181,6 +188,12 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
 {
 }
 
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{  /* empty stub: every argument is ignored and nothing is written */
+}  /* NOTE(review): presumably unreachable while the _can_ query returns 0 */
+
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
diff --git a/external/jpeg/jversion.h b/external/jpeg/jversion.h
index 4462b941048d..2ab534af4147 100644
--- a/external/jpeg/jversion.h
+++ b/external/jpeg/jversion.h
@@ -2,9 +2,9 @@
  * jversion.h
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2010, 2012-2020, D. R. Commander.
+ * Copyright (C) 2010, 2012-2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -37,9 +37,9 @@
  */
 
 #define JCOPYRIGHT \
-  "Copyright (C) 2009-2020 D. R. Commander\n" \
+  "Copyright (C) 2009-2021 D. R. Commander\n" \
   "Copyright (C) 2015, 2020 Google, Inc.\n" \
-  "Copyright (C) 2019 Arm Limited\n" \
+  "Copyright (C) 2019-2020 Arm Limited\n" \
   "Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
   "Copyright (C) 2011-2016 Siarhei Siamashka\n" \
   "Copyright (C) 2015 Intel Corporation\n" \
@@ -48,7 +48,7 @@
   "Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
   "Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
   "Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
-  "Copyright (C) 1991-2017 Thomas G. Lane, Guido Vollbeding"
+  "Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
 
 #define JCOPYRIGHT_SHORT \
-  "Copyright (C) 1991-2020 The libjpeg-turbo Project and many others"
+  "Copyright (C) 1991-2021 The libjpeg-turbo Project and many others"
diff --git a/external/jpeg/libjpeg.txt b/external/jpeg/libjpeg.txt
new file mode 100644
index 000000000000..3c680b5fe242
--- /dev/null
+++ b/external/jpeg/libjpeg.txt
@@ -0,0 +1,3162 @@
+USING THE IJG JPEG LIBRARY
+
+This file was part of the Independent JPEG Group's software:
+Copyright (C) 1994-2013, Thomas G. Lane, Guido Vollbeding.
+libjpeg-turbo Modifications:
+Copyright (C) 2010, 2014-2018, 2020, D. R. Commander.
+Copyright (C) 2015, Google, Inc.
+For conditions of distribution and use, see the accompanying README.ijg file.
+
+
+This file describes how to use the IJG JPEG library within an application
+program.  Read it if you want to write a program that uses the library.
+
+The file example.txt provides heavily commented skeleton code for calling the
+JPEG library.  Also see jpeglib.h (the include file to be used by application
+programs) for full details about data structures and function parameter lists.
+The library source code, of course, is the ultimate reference.
+
+Note that there have been *major* changes from the application interface
+presented by IJG version 4 and earlier versions.  The old design had several
+inherent limitations, and it had accumulated a lot of cruft as we added
+features while trying to minimize application-interface changes.  We have
+sacrificed backward compatibility in the version 5 rewrite, but we think the
+improvements justify this.
+
+
+TABLE OF CONTENTS
+-----------------
+
+Overview:
+        Functions provided by the library
+        Outline of typical usage
+Basic library usage:
+        Data formats
+        Compression details
+        Decompression details
+        Partial image decompression
+        Mechanics of usage: include files, linking, etc
+Advanced features:
+        Compression parameter selection
+        Decompression parameter selection
+        Special color spaces
+        Error handling
+        Compressed data handling (source and destination managers)
+        I/O suspension
+        Progressive JPEG support
+        Buffered-image mode
+        Abbreviated datastreams and multiple images
+        Special markers
+        ICC profiles
+        Raw (downsampled) image data
+        Really raw data: DCT coefficients
+        Progress monitoring
+        Memory management
+        Memory usage
+        Library compile-time options
+        Portability considerations
+
+You should read at least the overview and basic usage sections before trying
+to program with the library.  The sections on advanced features can be read
+if and when you need them.
+
+
+OVERVIEW
+========
+
+Functions provided by the library
+---------------------------------
+
+The IJG JPEG library provides C code to read and write JPEG-compressed image
+files.  The surrounding application program receives or supplies image data a
+scanline at a time, using a straightforward uncompressed image format.  All
+details of color conversion and other preprocessing/postprocessing can be
+handled by the library.
+
+The library includes a substantial amount of code that is not covered by the
+JPEG standard but is necessary for typical applications of JPEG.  These
+functions preprocess the image before JPEG compression or postprocess it after
+decompression.  They include colorspace conversion, downsampling/upsampling,
+and color quantization.  The application indirectly selects use of this code
+by specifying the format in which it wishes to supply or receive image data.
+For example, if colormapped output is requested, then the decompression
+library automatically invokes color quantization.
+
+A wide range of quality vs. speed tradeoffs are possible in JPEG processing,
+and even more so in decompression postprocessing.  The decompression library
+provides multiple implementations that cover most of the useful tradeoffs,
+ranging from very-high-quality down to fast-preview operation.  On the
+compression side we have generally not provided low-quality choices, since
+compression is normally less time-critical.  It should be understood that the
+low-quality modes may not meet the JPEG standard's accuracy requirements;
+nonetheless, they are useful for viewers.
+
+A word about functions *not* provided by the library.  We handle a subset of
+the ISO JPEG standard; most baseline, extended-sequential, and progressive
+JPEG processes are supported.  (Our subset includes all features now in common
+use.)  Unsupported ISO options include:
+        * Hierarchical storage
+        * Lossless JPEG
+        * DNL marker
+        * Nonintegral subsampling ratios
+We support both 8- and 12-bit data precision, but this is a compile-time
+choice rather than a run-time choice; hence it is difficult to use both
+precisions in a single application.
+
+By itself, the library handles only interchange JPEG datastreams --- in
+particular the widely used JFIF file format.  The library can be used by
+surrounding code to process interchange or abbreviated JPEG datastreams that
+are embedded in more complex file formats.  (For example, this library is
+used by the free LIBTIFF library to support JPEG compression in TIFF.)
+
+
+Outline of typical usage
+------------------------
+
+The rough outline of a JPEG compression operation is:
+
+        Allocate and initialize a JPEG compression object
+        Specify the destination for the compressed data (eg, a file)
+        Set parameters for compression, including image size & colorspace
+        jpeg_start_compress(...);
+        while (scan lines remain to be written)
+                jpeg_write_scanlines(...);
+        jpeg_finish_compress(...);
+        Release the JPEG compression object
+
+A JPEG compression object holds parameters and working state for the JPEG
+library.  We make creation/destruction of the object separate from starting
+or finishing compression of an image; the same object can be re-used for a
+series of image compression operations.  This makes it easy to re-use the
+same parameter settings for a sequence of images.  Re-use of a JPEG object
+also has important implications for processing abbreviated JPEG datastreams,
+as discussed later.
+
+The image data to be compressed is supplied to jpeg_write_scanlines() from
+in-memory buffers.  If the application is doing file-to-file compression,
+reading image data from the source file is the application's responsibility.
+The library emits compressed data by calling a "data destination manager",
+which typically will write the data into a file; but the application can
+provide its own destination manager to do something else.
+
+Similarly, the rough outline of a JPEG decompression operation is:
+
+        Allocate and initialize a JPEG decompression object
+        Specify the source of the compressed data (eg, a file)
+        Call jpeg_read_header() to obtain image info
+        Set parameters for decompression
+        jpeg_start_decompress(...);
+        while (scan lines remain to be read)
+                jpeg_read_scanlines(...);
+        jpeg_finish_decompress(...);
+        Release the JPEG decompression object
+
+This is comparable to the compression outline except that reading the
+datastream header is a separate step.  This is helpful because information
+about the image's size, colorspace, etc is available when the application
+selects decompression parameters.  For example, the application can choose an
+output scaling ratio that will fit the image into the available screen size.
+
+The decompression library obtains compressed data by calling a data source
+manager, which typically will read the data from a file; but other behaviors
+can be obtained with a custom source manager.  Decompressed data is delivered
+into in-memory buffers passed to jpeg_read_scanlines().
+
+It is possible to abort an incomplete compression or decompression operation
+by calling jpeg_abort(); or, if you do not need to retain the JPEG object,
+simply release it by calling jpeg_destroy().
+
+JPEG compression and decompression objects are two separate struct types.
+However, they share some common fields, and certain routines such as
+jpeg_destroy() can work on either type of object.
+
+The JPEG library has no static variables: all state is in the compression
+or decompression object.  Therefore it is possible to process multiple
+compression and decompression operations concurrently, using multiple JPEG
+objects.
+
+Both compression and decompression can be done in an incremental memory-to-
+memory fashion, if suitable source/destination managers are used.  See the
+section on "I/O suspension" for more details.
+
+
+BASIC LIBRARY USAGE
+===================
+
+Data formats
+------------
+
+Before diving into procedural details, it is helpful to understand the
+image data format that the JPEG library expects or returns.
+
+The standard input image format is a rectangular array of pixels, with each
+pixel having the same number of "component" or "sample" values (color
+channels).  You must specify how many components there are and the colorspace
+interpretation of the components.  Most applications will use RGB data
+(three components per pixel) or grayscale data (one component per pixel).
+PLEASE NOTE THAT RGB DATA IS THREE SAMPLES PER PIXEL, GRAYSCALE ONLY ONE.
+A remarkable number of people manage to miss this, only to find that their
+programs don't work with grayscale JPEG files.
+
+There is no provision for colormapped input.  JPEG files are always full-color
+or full grayscale (or sometimes another colorspace such as CMYK).  You can
+feed in a colormapped image by expanding it to full-color format.  However,
+JPEG often doesn't work very well with source data that has been colormapped,
+because of dithering noise.  This is discussed in more detail in the JPEG FAQ
+and the other references mentioned in the README.ijg file.
+
+Pixels are stored by scanlines, with each scanline running from left to
+right.  The component values for each pixel are adjacent in the row; for
+example, R,G,B,R,G,B,R,G,B,... for 24-bit RGB color.  Each scanline is an
+array of data type JSAMPLE --- which is typically "unsigned char", unless
+you've changed jmorecfg.h.  (You can also change the RGB pixel layout, say
+to B,G,R order, by modifying jmorecfg.h.  But see the restrictions listed in
+that file before doing so.)
+
+A 2-D array of pixels is formed by making a list of pointers to the starts of
+scanlines; so the scanlines need not be physically adjacent in memory.  Even
+if you process just one scanline at a time, you must make a one-element
+pointer array to conform to this structure.  Pointers to JSAMPLE rows are of
+type JSAMPROW, and the pointer to the pointer array is of type JSAMPARRAY.
+
+The library accepts or supplies one or more complete scanlines per call.
+It is not possible to process part of a row at a time.  Scanlines are always
+processed top-to-bottom.  You can process an entire image in one call if you
+have it all in memory, but usually it's simplest to process one scanline at
+a time.
+
+For best results, source data values should have the precision specified by
+BITS_IN_JSAMPLE (normally 8 bits).  For instance, if you choose to compress
+data that's only 6 bits/channel, you should left-justify each value in a
+byte before passing it to the compressor.  If you need to compress data
+that has more than 8 bits/channel, compile with BITS_IN_JSAMPLE = 12.
+(See "Library compile-time options", later.)
+
+
+The data format returned by the decompressor is the same in all details,
+except that colormapped output is supported.  (Again, a JPEG file is never
+colormapped.  But you can ask the decompressor to perform on-the-fly color
+quantization to deliver colormapped output.)  If you request colormapped
+output then the returned data array contains a single JSAMPLE per pixel;
+its value is an index into a color map.  The color map is represented as
+a 2-D JSAMPARRAY in which each row holds the values of one color component,
+that is, colormap[i][j] is the value of the i'th color component for pixel
+value (map index) j.  Note that since the colormap indexes are stored in
+JSAMPLEs, the maximum number of colors is limited by the size of JSAMPLE
+(ie, at most 256 colors for an 8-bit JPEG library).
+
+
+Compression details
+-------------------
+
+Here we revisit the JPEG compression outline given in the overview.
+
+1. Allocate and initialize a JPEG compression object.
+
+A JPEG compression object is a "struct jpeg_compress_struct".  (It also has
+a bunch of subsidiary structures which are allocated via malloc(), but the
+application doesn't control those directly.)  This struct can be just a local
+variable in the calling routine, if a single routine is going to execute the
+whole JPEG compression sequence.  Otherwise it can be static or allocated
+from malloc().
+
+You will also need a structure representing a JPEG error handler.  The part
+of this that the library cares about is a "struct jpeg_error_mgr".  If you
+are providing your own error handler, you'll typically want to embed the
+jpeg_error_mgr struct in a larger structure; this is discussed later under
+"Error handling".  For now we'll assume you are just using the default error
+handler.  The default error handler will print JPEG error/warning messages
+on stderr, and it will call exit() if a fatal error occurs.
+
+You must initialize the error handler structure, store a pointer to it into
+the JPEG object's "err" field, and then call jpeg_create_compress() to
+initialize the rest of the JPEG object.
+
+Typical code for this step, if you are using the default error handler, is
+
+        struct jpeg_compress_struct cinfo;
+        struct jpeg_error_mgr jerr;
+        ...
+        cinfo.err = jpeg_std_error(&jerr);
+        jpeg_create_compress(&cinfo);
+
+jpeg_create_compress allocates a small amount of memory, so it could fail
+if you are out of memory.  In that case it will exit via the error handler;
+that's why the error handler must be initialized first.
+
+
+2. Specify the destination for the compressed data (eg, a file).
+
+As previously mentioned, the JPEG library delivers compressed data to a
+"data destination" module.  The library includes one data destination
+module which knows how to write to a stdio stream.  You can use your own
+destination module if you want to do something else, as discussed later.
+
+If you use the standard destination module, you must open the target stdio
+stream beforehand.  Typical code for this step looks like:
+
+        FILE *outfile;
+        ...
+        if ((outfile = fopen(filename, "wb")) == NULL) {
+            fprintf(stderr, "can't open %s\n", filename);
+            exit(1);
+        }
+        jpeg_stdio_dest(&cinfo, outfile);
+
+where the last line invokes the standard destination module.
+
+WARNING: it is critical that the binary compressed data be delivered to the
+output file unchanged.  On non-Unix systems the stdio library may perform
+newline translation or otherwise corrupt binary data.  To suppress this
+behavior, you may need to use a "b" option to fopen (as shown above), or use
+setmode() or another routine to put the stdio stream in binary mode.  See
+cjpeg.c and djpeg.c for code that has been found to work on many systems.
+
+You can select the data destination after setting other parameters (step 3),
+if that's more convenient.  You may not change the destination between
+calling jpeg_start_compress() and jpeg_finish_compress().
+
+
+3. Set parameters for compression, including image size & colorspace.
+
+You must supply information about the source image by setting the following
+fields in the JPEG object (cinfo structure):
+
+        image_width             Width of image, in pixels
+        image_height            Height of image, in pixels
+        input_components        Number of color channels (samples per pixel)
+        in_color_space          Color space of source image
+
+The image dimensions are, hopefully, obvious.  JPEG supports image dimensions
+of 1 to 64K pixels in either direction.  The input color space is typically
+RGB or grayscale, and input_components is 3 or 1 accordingly.  (See "Special
+color spaces", later, for more info.)  The in_color_space field must be
+assigned one of the J_COLOR_SPACE enum constants, typically JCS_RGB or
+JCS_GRAYSCALE.
+
+JPEG has a large number of compression parameters that determine how the
+image is encoded.  Most applications don't need or want to know about all
+these parameters.  You can set all the parameters to reasonable defaults by
+calling jpeg_set_defaults(); then, if there are particular values you want
+to change, you can do so after that.  The "Compression parameter selection"
+section tells about all the parameters.
+
+You must set in_color_space correctly before calling jpeg_set_defaults(),
+because the defaults depend on the source image colorspace.  However, the
+other three source image parameters need not be valid until you call
+jpeg_start_compress().  There's no harm in calling jpeg_set_defaults() more
+than once, if that happens to be convenient.
+
+Typical code for a 24-bit RGB source image is
+
+        cinfo.image_width = Width;      /* image width and height, in pixels */
+        cinfo.image_height = Height;
+        cinfo.input_components = 3;     /* # of color components per pixel */
+        cinfo.in_color_space = JCS_RGB; /* colorspace of input image */
+
+        jpeg_set_defaults(&cinfo);
+        /* Make optional parameter settings here */
+
+
+4. jpeg_start_compress(...);
+
+After you have established the data destination and set all the necessary
+source image info and other parameters, call jpeg_start_compress() to begin
+a compression cycle.  This will initialize internal state, allocate working
+storage, and emit the first few bytes of the JPEG datastream header.
+
+Typical code:
+
+        jpeg_start_compress(&cinfo, TRUE);
+
+The "TRUE" parameter ensures that a complete JPEG interchange datastream
+will be written.  This is appropriate in most cases.  If you think you might
+want to use an abbreviated datastream, read the section on abbreviated
+datastreams, below.
+
+Once you have called jpeg_start_compress(), you may not alter any JPEG
+parameters or other fields of the JPEG object until you have completed
+the compression cycle.
+
+
+5. while (scan lines remain to be written)
+        jpeg_write_scanlines(...);
+
+Now write all the required image data by calling jpeg_write_scanlines()
+one or more times.  You can pass one or more scanlines in each call, up
+to the total image height.  In most applications it is convenient to pass
+just one or a few scanlines at a time.  The expected format for the passed
+data is discussed under "Data formats", above.
+
+Image data should be written in top-to-bottom scanline order.
+Rec. ITU-T T.81 | ISO/IEC 10918-1 says, "Applications determine which edges of
+a source image are defined as top, bottom, left, and right."  However, if you
+want your files to be compatible with everyone else's, then top-to-bottom order
+must be used.  If the source data must be read in bottom-to-top order, then you
+can use the JPEG library's virtual array mechanism to invert the data
+efficiently.  Examples of this can be found in the sample application cjpeg.
+
+The library maintains a count of the number of scanlines written so far
+in the next_scanline field of the JPEG object.  Usually you can just use
+this variable as the loop counter, so that the loop test looks like
+"while (cinfo.next_scanline < cinfo.image_height)".
+
+Code for this step depends heavily on the way that you store the source data.
+example.txt shows the following code for the case of a full-size 2-D source
+array containing 3-byte RGB pixels:
+
+        JSAMPROW row_pointer[1];        /* pointer to a single row */
+        int row_stride;                 /* physical row width in buffer */
+
+        row_stride = image_width * 3;   /* JSAMPLEs per row in image_buffer */
+
+        while (cinfo.next_scanline < cinfo.image_height) {
+            row_pointer[0] = &image_buffer[cinfo.next_scanline * row_stride];
+            jpeg_write_scanlines(&cinfo, row_pointer, 1);
+        }
+
+jpeg_write_scanlines() returns the number of scanlines actually written.
+This will normally be equal to the number passed in, so you can usually
+ignore the return value.  It is different in just two cases:
+  * If you try to write more scanlines than the declared image height,
+    the additional scanlines are ignored.
+  * If you use a suspending data destination manager, output buffer overrun
+    will cause the compressor to return before accepting all the passed lines.
+    This feature is discussed under "I/O suspension", below.  The normal
+    stdio destination manager will NOT cause this to happen.
+In any case, the return value is the same as the change in the value of
+next_scanline.
+
+
+6. jpeg_finish_compress(...);
+
+After all the image data has been written, call jpeg_finish_compress() to
+complete the compression cycle.  This step is ESSENTIAL to ensure that the
+last bufferload of data is written to the data destination.
+jpeg_finish_compress() also releases working memory associated with the JPEG
+object.
+
+Typical code:
+
+        jpeg_finish_compress(&cinfo);
+
+If using the stdio destination manager, don't forget to close the output
+stdio stream (if necessary) afterwards.
+
+If you have requested a multi-pass operating mode, such as Huffman code
+optimization, jpeg_finish_compress() will perform the additional passes using
+data buffered by the first pass.  In this case jpeg_finish_compress() may take
+quite a while to complete.  With the default compression parameters, this will
+not happen.
+
+It is an error to call jpeg_finish_compress() before writing the necessary
+total number of scanlines.  If you wish to abort compression, call
+jpeg_abort() as discussed below.
+
+After completing a compression cycle, you may dispose of the JPEG object
+as discussed next, or you may use it to compress another image.  In that case
+return to step 2, 3, or 4 as appropriate.  If you do not change the
+destination manager, the new datastream will be written to the same target.
+If you do not change any JPEG parameters, the new datastream will be written
+with the same parameters as before.  Note that you can change the input image
+dimensions freely between cycles, but if you change the input colorspace, you
+should call jpeg_set_defaults() to adjust for the new colorspace; and then
+you'll need to repeat all of step 3.
+
+
+7. Release the JPEG compression object.
+
+When you are done with a JPEG compression object, destroy it by calling
+jpeg_destroy_compress().  This will free all subsidiary memory (regardless of
+the previous state of the object).  Or you can call jpeg_destroy(), which
+works for either compression or decompression objects --- this may be more
+convenient if you are sharing code between compression and decompression
+cases.  (Actually, these routines are equivalent except for the declared type
+of the passed pointer.  To avoid gripes from ANSI C compilers, jpeg_destroy()
+should be passed a j_common_ptr.)
+
+If you allocated the jpeg_compress_struct structure from malloc(), freeing
+it is your responsibility --- jpeg_destroy() won't.  Ditto for the error
+handler structure.
+
+Typical code:
+
+        jpeg_destroy_compress(&cinfo);
+
+
+8. Aborting.
+
+If you decide to abort a compression cycle before finishing, you can clean up
+in either of two ways:
+
+* If you don't need the JPEG object any more, just call
+  jpeg_destroy_compress() or jpeg_destroy() to release memory.  This is
+  legitimate at any point after calling jpeg_create_compress() --- in fact,
+  it's safe even if jpeg_create_compress() fails.
+
+* If you want to re-use the JPEG object, call jpeg_abort_compress(), or call
+  jpeg_abort() which works on both compression and decompression objects.
+  This will return the object to an idle state, releasing any working memory.
+  jpeg_abort() is allowed at any time after successful object creation.
+
+Note that cleaning up the data destination, if required, is your
+responsibility; neither of these routines will call term_destination().
+(See "Compressed data handling", below, for more about that.)
+
+jpeg_destroy() and jpeg_abort() are the only safe calls to make on a JPEG
+object that has reported an error by calling error_exit (see "Error handling"
+for more info).  The internal state of such an object is likely to be out of
+whack.  Either of these two routines will return the object to a known state.
+
+
+Decompression details
+---------------------
+
+Here we revisit the JPEG decompression outline given in the overview.
+
+1. Allocate and initialize a JPEG decompression object.
+
+This is just like initialization for compression, as discussed above,
+except that the object is a "struct jpeg_decompress_struct" and you
+call jpeg_create_decompress().  Error handling is exactly the same.
+
+Typical code:
+
+        struct jpeg_decompress_struct cinfo;
+        struct jpeg_error_mgr jerr;
+        ...
+        cinfo.err = jpeg_std_error(&jerr);
+        jpeg_create_decompress(&cinfo);
+
+(Both here and in the IJG code, we usually use variable name "cinfo" for
+both compression and decompression objects.)
+
+
+2. Specify the source of the compressed data (eg, a file).
+
+As previously mentioned, the JPEG library reads compressed data from a "data
+source" module.  The library includes one data source module which knows how
+to read from a stdio stream.  You can use your own source module if you want
+to do something else, as discussed later.
+
+If you use the standard source module, you must open the source stdio stream
+beforehand.  Typical code for this step looks like:
+
+        FILE *infile;
+        ...
+        if ((infile = fopen(filename, "rb")) == NULL) {
+            fprintf(stderr, "can't open %s\n", filename);
+            exit(1);
+        }
+        jpeg_stdio_src(&cinfo, infile);
+
+where the last line invokes the standard source module.
+
+WARNING: it is critical that the binary compressed data be read unchanged.
+On non-Unix systems the stdio library may perform newline translation or
+otherwise corrupt binary data.  To suppress this behavior, you may need to use
+a "b" option to fopen (as shown above), or use setmode() or another routine to
+put the stdio stream in binary mode.  See cjpeg.c and djpeg.c for code that
+has been found to work on many systems.
+
+You may not change the data source between calling jpeg_read_header() and
+jpeg_finish_decompress().  If you wish to read a series of JPEG images from
+a single source file, you should repeat the jpeg_read_header() to
+jpeg_finish_decompress() sequence without reinitializing either the JPEG
+object or the data source module; this prevents buffered input data from
+being discarded.
+
+
+3. Call jpeg_read_header() to obtain image info.
+
+Typical code for this step is just
+
+        jpeg_read_header(&cinfo, TRUE);
+
+This will read the source datastream header markers, up to the beginning
+of the compressed data proper.  On return, the image dimensions and other
+info have been stored in the JPEG object.  The application may wish to
+consult this information before selecting decompression parameters.
+
+More complex code is necessary if
+  * A suspending data source is used --- in that case jpeg_read_header()
+    may return before it has read all the header data.  See "I/O suspension",
+    below.  The normal stdio source manager will NOT cause this to happen.
+  * Abbreviated JPEG files are to be processed --- see the section on
+    abbreviated datastreams.  Standard applications that deal only in
+    interchange JPEG files need not be concerned with this case either.
+
+It is permissible to stop at this point if you just wanted to find out the
+image dimensions and other header info for a JPEG file.  In that case,
+call jpeg_destroy() when you are done with the JPEG object, or call
+jpeg_abort() to return it to an idle state before selecting a new data
+source and reading another header.
+
+
+4. Set parameters for decompression.
+
+jpeg_read_header() sets appropriate default decompression parameters based on
+the properties of the image (in particular, its colorspace).  However, you
+may well want to alter these defaults before beginning the decompression.
+For example, the default is to produce full color output from a color file.
+If you want colormapped output you must ask for it.  Other options allow the
+returned image to be scaled and allow various speed/quality tradeoffs to be
+selected.  "Decompression parameter selection", below, gives details.
+
+If the defaults are appropriate, nothing need be done at this step.
+
+Note that all default values are set by each call to jpeg_read_header().
+If you reuse a decompression object, you cannot expect your parameter
+settings to be preserved across cycles, as you can for compression.
+You must set desired parameter values each time.
+
+
+5. jpeg_start_decompress(...);
+
+Once the parameter values are satisfactory, call jpeg_start_decompress() to
+begin decompression.  This will initialize internal state, allocate working
+memory, and prepare for returning data.
+
+Typical code is just
+
+        jpeg_start_decompress(&cinfo);
+
+If you have requested a multi-pass operating mode, such as 2-pass color
+quantization, jpeg_start_decompress() will do everything needed before data
+output can begin.  In this case jpeg_start_decompress() may take quite a while
+to complete.  With a single-scan (non-progressive) JPEG file and default
+decompression parameters, this will not happen; jpeg_start_decompress() will
+return quickly.
+
+After this call, the final output image dimensions, including any requested
+scaling, are available in the JPEG object; so is the selected colormap, if
+colormapped output has been requested.  Useful fields include
+
+        output_width            image width and height, as scaled
+        output_height
+        out_color_components    # of color components in out_color_space
+        output_components       # of color components returned per pixel
+        colormap                the selected colormap, if any
+        actual_number_of_colors         number of entries in colormap
+
+output_components is 1 (a colormap index) when quantizing colors; otherwise it
+equals out_color_components.  It is the number of JSAMPLE values that will be
+emitted per pixel in the output arrays.
+
+Typically you will need to allocate data buffers to hold the incoming image.
+You will need output_width * output_components JSAMPLEs per scanline in your
+output buffer, and a total of output_height scanlines will be returned.
+
+Note: if you are using the JPEG library's internal memory manager to allocate
+data buffers (as djpeg does), then the manager's protocol requires that you
+request large buffers *before* calling jpeg_start_decompress().  This is a
+little tricky since the output_XXX fields are not normally valid then.  You
+can make them valid by calling jpeg_calc_output_dimensions() after setting the
+relevant parameters (scaling, output color space, and quantization flag).
+
+
+6. while (scan lines remain to be read)
+        jpeg_read_scanlines(...);
+
+Now you can read the decompressed image data by calling jpeg_read_scanlines()
+one or more times.  At each call, you pass in the maximum number of scanlines
+to be read (ie, the height of your working buffer); jpeg_read_scanlines()
+will return up to that many lines.  The return value is the number of lines
+actually read.  The format of the returned data is discussed under "Data
+formats", above.  Don't forget that grayscale and color JPEGs will return
+different data formats!
+
+Image data is returned in top-to-bottom scanline order.  If you must write
+out the image in bottom-to-top order, you can use the JPEG library's virtual
+array mechanism to invert the data efficiently.  Examples of this can be
+found in the sample application djpeg.
+
+The library maintains a count of the number of scanlines returned so far
+in the output_scanline field of the JPEG object.  Usually you can just use
+this variable as the loop counter, so that the loop test looks like
+"while (cinfo.output_scanline < cinfo.output_height)".  (Note that the test
+should NOT be against image_height, unless you never use scaling.  The
+image_height field is the height of the original unscaled image.)
+The return value always equals the change in the value of output_scanline.
+
+If you don't use a suspending data source, it is safe to assume that
+jpeg_read_scanlines() reads at least one scanline per call, until the
+bottom of the image has been reached.
+
+If you use a buffer larger than one scanline, it is NOT safe to assume that
+jpeg_read_scanlines() fills it.  (The current implementation returns only a
+few scanlines per call, no matter how large a buffer you pass.)  So you must
+always provide a loop that calls jpeg_read_scanlines() repeatedly until the
+whole image has been read.
+
+
+7. jpeg_finish_decompress(...);
+
+After all the image data has been read, call jpeg_finish_decompress() to
+complete the decompression cycle.  This causes working memory associated
+with the JPEG object to be released.
+
+Typical code:
+
+        jpeg_finish_decompress(&cinfo);
+
+If using the stdio source manager, don't forget to close the source stdio
+stream if necessary.
+
+It is an error to call jpeg_finish_decompress() before reading the correct
+total number of scanlines.  If you wish to abort decompression, call
+jpeg_abort() as discussed below.
+
+After completing a decompression cycle, you may dispose of the JPEG object as
+discussed next, or you may use it to decompress another image.  In that case
+return to step 2 or 3 as appropriate.  If you do not change the source
+manager, the next image will be read from the same source.
+
+
+8. Release the JPEG decompression object.
+
+When you are done with a JPEG decompression object, destroy it by calling
+jpeg_destroy_decompress() or jpeg_destroy().  The previous discussion of
+destroying compression objects applies here too.
+
+Typical code:
+
+        jpeg_destroy_decompress(&cinfo);
+
+
+9. Aborting.
+
+You can abort a decompression cycle by calling jpeg_destroy_decompress() or
+jpeg_destroy() if you don't need the JPEG object any more, or
+jpeg_abort_decompress() or jpeg_abort() if you want to reuse the object.
+The previous discussion of aborting compression cycles applies here too.
+
+
+Partial image decompression
+---------------------------
+
+Partial image decompression is convenient for performance-critical applications
+that wish to view only a portion of a large JPEG image without decompressing
+the whole thing.  It is also useful in memory-constrained environments (such as
+on mobile devices.)  This library provides the following functions to support
+partial image decompression:
+
+1. Skipping rows when decompressing
+
+        jpeg_skip_scanlines(j_decompress_ptr cinfo, JDIMENSION num_lines);
+
+This function provides application programmers with the ability to skip over
+multiple rows in the JPEG image.
+
+Suspending data sources are not supported by this function.  Calling
+jpeg_skip_scanlines() with a suspending data source will result in undefined
+behavior.  Two-pass color quantization is also not supported by this function.
+Calling jpeg_skip_scanlines() with two-pass color quantization enabled will
+result in an error.
+
+jpeg_skip_scanlines() will not allow skipping past the bottom of the image.  If
+the value of num_lines is large enough to skip past the bottom of the image,
+then the function will skip to the end of the image instead.
+
+If the value of num_lines is valid, then jpeg_skip_scanlines() will always
+skip all of the input rows requested.  There is no need to inspect the return
+value of the function in that case.
+
+Best results will be achieved by calling jpeg_skip_scanlines() for large chunks
+of rows.  The function should be viewed as a way to quickly jump to a
+particular vertical offset in the JPEG image in order to decode a subset of the
+image.  Used in this manner, it will provide significant performance
+improvements.
+
+Calling jpeg_skip_scanlines() for small values of num_lines has several
+potential drawbacks:
+    1) JPEG decompression occurs in blocks, so if jpeg_skip_scanlines() is
+       called from the middle of a decompression block, then it is likely that
+       much of the decompression work has already been done for the first
+       couple of rows that need to be skipped.
+    2) When this function returns, it must leave the decompressor in a state
+       such that it is ready to read the next line.  This may involve
+       decompressing a block that must be partially skipped.
+These issues are especially tricky for cases in which upsampling requires
+context rows.  In the worst case, jpeg_skip_scanlines() will perform similarly
+to jpeg_read_scanlines() (since it will actually call jpeg_read_scanlines().)
+
+2. Decompressing partial scanlines
+
+        jpeg_crop_scanline (j_decompress_ptr cinfo, JDIMENSION *xoffset,
+                            JDIMENSION *width)
+
+This function provides application programmers with the ability to decompress
+only a portion of each row in the JPEG image.  It must be called after
+jpeg_start_decompress() and before any calls to jpeg_read_scanlines() or
+jpeg_skip_scanlines().
+
+If xoffset and width do not form a valid subset of the image row, then this
+function will generate an error.  Note that if the output image is scaled, then
+xoffset and width are relative to the scaled image dimensions.
+
+xoffset and width are passed by reference because xoffset must fall on an iMCU
+boundary.  If it doesn't, then it will be moved left to the nearest iMCU
+boundary, and width will be increased accordingly.  If the calling program does
+not like the adjusted values of xoffset and width, then it can call
+jpeg_crop_scanline() again with new values (for instance, if it wants to move
+xoffset to the nearest iMCU boundary to the right instead of to the left.)
+
+After calling this function, cinfo->output_width will be set to the adjusted
+width.  This value should be used when allocating an output buffer to pass to
+jpeg_read_scanlines().
+
+The output image from a partial-width decompression will be identical to the
+corresponding image region from a full decode, with one exception:  The "fancy"
+(smooth) h2v2 (4:2:0) and h2v1 (4:2:2) upsampling algorithms fill in the
+missing chroma components by averaging the chroma components from neighboring
+pixels, except on the right and left edges of the image (where there are no
+neighboring pixels.)  When performing a partial-width decompression, these
+"fancy" upsampling algorithms may treat the left and right edges of the partial
+image region as if they are the left and right edges of the image, meaning that
+the upsampling algorithm may be simplified.  The result is that the pixels on
+the left or right edge of the partial image may not be exactly identical to the
+corresponding pixels in the original image.
+
+
+Mechanics of usage: include files, linking, etc
+-----------------------------------------------
+
+Applications using the JPEG library should include the header file jpeglib.h
+to obtain declarations of data types and routines.  Before including
+jpeglib.h, include system headers that define at least the typedefs FILE and
+size_t.  On ANSI-conforming systems, including <stdio.h> is sufficient; on
+older Unix systems, you may need <sys/types.h> to define size_t.
+
+If the application needs to refer to individual JPEG library error codes, also
+include jerror.h to define those symbols.
+
+jpeglib.h indirectly includes the files jconfig.h and jmorecfg.h.  If you are
+installing the JPEG header files in a system directory, you will want to
+install all four files: jpeglib.h, jerror.h, jconfig.h, jmorecfg.h.
+
+The most convenient way to include the JPEG code into your executable program
+is to prepare a library file ("libjpeg.a", or a corresponding name on non-Unix
+machines) and reference it at your link step.  If you use only half of the
+library (only compression or only decompression), only that much code will be
+included from the library, unless your linker is hopelessly brain-damaged.
+The supplied makefiles build libjpeg.a automatically (see install.txt).
+
+While you can build the JPEG library as a shared library if the whim strikes
+you, we don't really recommend it.  The trouble with shared libraries is that
+at some point you'll probably try to substitute a new version of the library
+without recompiling the calling applications.  That generally doesn't work
+because the parameter struct declarations usually change with each new
+version.  In other words, the library's API is *not* guaranteed binary
+compatible across versions; we only try to ensure source-code compatibility.
+(In hindsight, it might have been smarter to hide the parameter structs from
+applications and introduce a ton of access functions instead.  Too late now,
+however.)
+
+It may be worth pointing out that the core JPEG library does not actually
+require the stdio library: only the default source/destination managers and
+error handler need it.  You can use the library in a stdio-less environment
+if you replace those modules and use jmemnobs.c (or another memory manager of
+your own devising).  More info about the minimum system library requirements
+may be found in jinclude.h.
+
+
+ADVANCED FEATURES
+=================
+
+Compression parameter selection
+-------------------------------
+
+This section describes all the optional parameters you can set for JPEG
+compression, as well as the "helper" routines provided to assist in this
+task.  Proper setting of some parameters requires detailed understanding
+of the JPEG standard; if you don't know what a parameter is for, it's best
+not to mess with it!  See REFERENCES in the README.ijg file for pointers to
+more info about JPEG.
+
+It's a good idea to call jpeg_set_defaults() first, even if you plan to set
+all the parameters; that way your code is more likely to work with future JPEG
+libraries that have additional parameters.  For the same reason, we recommend
+you use a helper routine where one is provided, in preference to twiddling
+cinfo fields directly.
+
+The helper routines are:
+
+jpeg_set_defaults (j_compress_ptr cinfo)
+        This routine sets all JPEG parameters to reasonable defaults, using
+        only the input image's color space (field in_color_space, which must
+        already be set in cinfo).  Many applications will only need to use
+        this routine and perhaps jpeg_set_quality().
+
+jpeg_set_colorspace (j_compress_ptr cinfo, J_COLOR_SPACE colorspace)
+        Sets the JPEG file's colorspace (field jpeg_color_space) as specified,
+        and sets other color-space-dependent parameters appropriately.  See
+        "Special color spaces", below, before using this.  A large number of
+        parameters, including all per-component parameters, are set by this
+        routine; if you want to twiddle individual parameters you should call
+        jpeg_set_colorspace() before rather than after.
+
+jpeg_default_colorspace (j_compress_ptr cinfo)
+        Selects an appropriate JPEG colorspace based on cinfo->in_color_space,
+        and calls jpeg_set_colorspace().  This is actually a subroutine of
+        jpeg_set_defaults().  It's broken out in case you want to change
+        just the colorspace-dependent JPEG parameters.
+
+jpeg_set_quality (j_compress_ptr cinfo, int quality, boolean force_baseline)
+        Constructs JPEG quantization tables appropriate for the indicated
+        quality setting.  The quality value is expressed on the 0..100 scale
+        recommended by IJG (cjpeg's "-quality" switch uses this routine).
+        Note that the exact mapping from quality values to tables may change
+        in future IJG releases as more is learned about DCT quantization.
+        If the force_baseline parameter is TRUE, then the quantization table
+        entries are constrained to the range 1..255 for full JPEG baseline
+        compatibility.  In the current implementation, this only makes a
+        difference for quality settings below 25, and it effectively prevents
+        very small/low quality files from being generated.  The IJG decoder
+        is capable of reading the non-baseline files generated at low quality
+        settings when force_baseline is FALSE, but other decoders may not be.
+
+jpeg_set_linear_quality (j_compress_ptr cinfo, int scale_factor,
+                         boolean force_baseline)
+        Same as jpeg_set_quality() except that the generated tables are the
+        sample tables given in Annex K (Clause K.1) of
+        Rec. ITU-T T.81 (1992) | ISO/IEC 10918-1:1994, multiplied by the
+        specified scale factor (which is expressed as a percentage; thus
+        scale_factor = 100 reproduces the spec's tables).  Note that larger
+        scale factors give lower quality.  This entry point is useful for
+        conforming to the Adobe PostScript DCT conventions, but we do not
+        recommend linear scaling as a user-visible quality scale otherwise.
+        force_baseline again constrains the computed table entries to 1..255.
+
+int jpeg_quality_scaling (int quality)
+        Converts a value on the IJG-recommended quality scale to a linear
+        scaling percentage.  Note that this routine may change or go away
+        in future releases --- IJG may choose to adopt a scaling method that
+        can't be expressed as a simple scalar multiplier, in which case the
+        premise of this routine collapses.  Caveat user.
+
+jpeg_default_qtables (j_compress_ptr cinfo, boolean force_baseline)
+        [libjpeg v7+ API/ABI emulation only]
+        Set default quantization tables with linear q_scale_factor[] values
+        (see below).
+
+jpeg_add_quant_table (j_compress_ptr cinfo, int which_tbl,
+                      const unsigned int *basic_table,
+                      int scale_factor, boolean force_baseline)
+        Allows an arbitrary quantization table to be created.  which_tbl
+        indicates which table slot to fill.  basic_table points to an array
+        of 64 unsigned ints given in normal array order.  These values are
+        multiplied by scale_factor/100 and then clamped to the range 1..65535
+        (or to 1..255 if force_baseline is TRUE).
+        CAUTION: prior to library version 6a, jpeg_add_quant_table expected
+        the basic table to be given in JPEG zigzag order.  If you need to
+        write code that works with either older or newer versions of this
+        routine, you must check the library version number.  Something like
+        "#if JPEG_LIB_VERSION >= 61" is the right test.
+
+jpeg_simple_progression (j_compress_ptr cinfo)
+        Generates a default scan script for writing a progressive-JPEG file.
+        This is the recommended method of creating a progressive file,
+        unless you want to make a custom scan sequence.  You must ensure that
+        the JPEG color space is set correctly before calling this routine.
+
+
+Compression parameters (cinfo fields) include:
+
+boolean arith_code
+        If TRUE, use arithmetic coding.
+        If FALSE, use Huffman coding.
+
+J_DCT_METHOD dct_method
+        Selects the algorithm used for the DCT step.  Choices are:
+                JDCT_ISLOW: accurate integer method
+                JDCT_IFAST: less accurate integer method [legacy feature]
+                JDCT_FLOAT: floating-point method [legacy feature]
+                JDCT_DEFAULT: default method (normally JDCT_ISLOW)
+                JDCT_FASTEST: fastest method (normally JDCT_IFAST)
+        When the Independent JPEG Group's software was first released in 1991,
+        the compression time for a 1-megapixel JPEG image on a mainstream PC
+        was measured in minutes.  Thus, JDCT_IFAST provided noticeable
+        performance benefits.  On modern CPUs running libjpeg-turbo, however,
+        the compression time for a 1-megapixel JPEG image is measured in
+        milliseconds, and thus the performance benefits of JDCT_IFAST are much
+        less noticeable.  On modern x86/x86-64 CPUs that support AVX2
+        instructions, JDCT_IFAST and JDCT_ISLOW have similar performance.  On
+        other types of CPUs, JDCT_IFAST is generally about 5-15% faster than
+        JDCT_ISLOW.
+
+        For quality levels of 90 and below, there should be little or no
+        perceptible quality difference between the two algorithms.  For quality
+        levels above 90, however, the difference between JDCT_IFAST and
+        JDCT_ISLOW becomes more pronounced.  With quality=97, for instance,
+        JDCT_IFAST incurs generally about a 1-3 dB loss in PSNR relative to
+        JDCT_ISLOW, but this can be larger for some images.  Do not use
+        JDCT_IFAST with quality levels above 97.  The algorithm often
+        degenerates at quality=98 and above and can actually produce a more
+        lossy image than if lower quality levels had been used.  Also, in
+        libjpeg-turbo, JDCT_IFAST is not fully accelerated for quality levels
+        above 97, so it will be slower than JDCT_ISLOW.
+
+        JDCT_FLOAT does not produce significantly more accurate results than
+        JDCT_ISLOW, and it is much slower.  JDCT_FLOAT may also give different
+        results on different machines due to varying roundoff behavior, whereas
+        the integer methods should give the same results on all machines.
+
+J_COLOR_SPACE jpeg_color_space
+int num_components
+        The JPEG color space and corresponding number of components; see
+        "Special color spaces", below, for more info.  We recommend using
+        jpeg_set_colorspace() if you want to change these.
+
+boolean optimize_coding
+        TRUE causes the compressor to compute optimal Huffman coding tables
+        for the image.  This requires an extra pass over the data and
+        therefore costs a good deal of space and time.  The default is
+        FALSE, which tells the compressor to use the supplied or default
+        Huffman tables.  In most cases optimal tables save only a few percent
+        of file size compared to the default tables.  Note that when this is
+        TRUE, you need not supply Huffman tables at all, and any you do
+        supply will be overwritten.
+
+unsigned int restart_interval
+int restart_in_rows
+        To emit restart markers in the JPEG file, set one of these nonzero.
+        Set restart_interval to specify the exact interval in MCU blocks.
+        Set restart_in_rows to specify the interval in MCU rows.  (If
+        restart_in_rows is not 0, then restart_interval is set after the
+        image width in MCUs is computed.)  Defaults are zero (no restarts).
+        One restart marker per MCU row is often a good choice.
+        NOTE: the overhead of restart markers is higher in grayscale JPEG
+        files than in color files, and MUCH higher in progressive JPEGs.
+        If you use restarts, you may want to use larger intervals in those
+        cases.
+
+const jpeg_scan_info *scan_info
+int num_scans
+        By default, scan_info is NULL; this causes the compressor to write a
+        single-scan sequential JPEG file.  If not NULL, scan_info points to
+        an array of scan definition records of length num_scans.  The
+        compressor will then write a JPEG file having one scan for each scan
+        definition record.  This is used to generate noninterleaved or
+        progressive JPEG files.  The library checks that the scan array
+        defines a valid JPEG scan sequence.  (jpeg_simple_progression creates
+        a suitable scan definition array for progressive JPEG.)  This is
+        discussed further under "Progressive JPEG support".
+
+int smoothing_factor
+        If non-zero, the input image is smoothed; the value should range
+        from 1 for minimal smoothing to 100 for maximum smoothing.  Consult
+        jcsample.c for details of the smoothing algorithm.  The default is
+        zero.
+
+boolean write_JFIF_header
+        If TRUE, a JFIF APP0 marker is emitted.  jpeg_set_defaults() and
+        jpeg_set_colorspace() set this TRUE if a JFIF-legal JPEG color space
+        (i.e., YCbCr or grayscale) is selected, otherwise FALSE.
+
+UINT8 JFIF_major_version
+UINT8 JFIF_minor_version
+        The version number to be written into the JFIF marker.
+        jpeg_set_defaults() initializes the version to 1.01 (major=minor=1).
+        You should set it to 1.02 (major=1, minor=2) if you plan to write
+        any JFIF 1.02 extension markers.
+
+UINT8 density_unit
+UINT16 X_density
+UINT16 Y_density
+        The resolution information to be written into the JFIF marker;
+        not used otherwise.  density_unit may be 0 for unknown,
+        1 for dots/inch, or 2 for dots/cm.  The default values are 0,1,1
+        indicating square pixels of unknown size.
+
+boolean write_Adobe_marker
+        If TRUE, an Adobe APP14 marker is emitted.  jpeg_set_defaults() and
+        jpeg_set_colorspace() set this TRUE if JPEG color space RGB, CMYK,
+        or YCCK is selected, otherwise FALSE.  It is generally a bad idea
+        to set both write_JFIF_header and write_Adobe_marker.  In fact,
+        you probably shouldn't change the default settings at all --- the
+        default behavior ensures that the JPEG file's color space can be
+        recognized by the decoder.
+
+JQUANT_TBL *quant_tbl_ptrs[NUM_QUANT_TBLS]
+        Pointers to coefficient quantization tables, one per table slot,
+        or NULL if no table is defined for a slot.  Usually these should
+        be set via one of the above helper routines; jpeg_add_quant_table()
+        is general enough to define any quantization table.  The other
+        routines will set up table slot 0 for luminance quality and table
+        slot 1 for chrominance.
+
+int q_scale_factor[NUM_QUANT_TBLS]
+        [libjpeg v7+ API/ABI emulation only]
+        Linear quantization scaling factors (0-100, default 100)
+        for use with jpeg_default_qtables().
+        See rdswitch.c and cjpeg.c for an example of usage.
+        Note that the q_scale_factor[] values use "linear" scales, so JPEG
+        quality levels chosen by the user must be converted to these scales
+        using jpeg_quality_scaling().  Here is an example that corresponds to
+        cjpeg -quality 90,70:
+
+                jpeg_set_defaults(cinfo);
+
+                /* Set luminance quality 90. */
+                cinfo->q_scale_factor[0] = jpeg_quality_scaling(90);
+                /* Set chrominance quality 70. */
+                cinfo->q_scale_factor[1] = jpeg_quality_scaling(70);
+
+                jpeg_default_qtables(cinfo, force_baseline);
+
+        CAUTION: Setting separate quality levels for chrominance and luminance
+        is mainly useful only if chrominance subsampling is disabled.  2x2
+        chrominance subsampling (AKA "4:2:0") is the default, but you can
+        explicitly disable subsampling as follows:
+
+                cinfo->comp_info[0].v_samp_factor = 1;
+                cinfo->comp_info[0].h_samp_factor = 1;
+
+JHUFF_TBL *dc_huff_tbl_ptrs[NUM_HUFF_TBLS]
+JHUFF_TBL *ac_huff_tbl_ptrs[NUM_HUFF_TBLS]
+        Pointers to Huffman coding tables, one per table slot, or NULL if
+        no table is defined for a slot.  Slots 0 and 1 are filled with the
+        JPEG sample tables by jpeg_set_defaults().  If you need to allocate
+        more table structures, jpeg_alloc_huff_table() may be used.
+        Note that optimal Huffman tables can be computed for an image
+        by setting optimize_coding, as discussed above; there's seldom
+        any need to mess with providing your own Huffman tables.
+
+
+[libjpeg v7+ API/ABI emulation only]
+The actual dimensions of the JPEG image that will be written to the file are
+given by the following fields.  These are computed from the input image
+dimensions and the compression parameters by jpeg_start_compress().  You can
+also call jpeg_calc_jpeg_dimensions() to obtain the values that will result
+from the current parameter settings.  This can be useful if you are trying
+to pick a scaling ratio that will get close to a desired target size.
+
+JDIMENSION jpeg_width           Actual dimensions of output image.
+JDIMENSION jpeg_height
+
+
+Per-component parameters are stored in the struct cinfo.comp_info[i] for
+component number i.  Note that components here refer to components of the
+JPEG color space, *not* the source image color space.  A suitably large
+comp_info[] array is allocated by jpeg_set_defaults(); if you choose not
+to use that routine, it's up to you to allocate the array.
+
+int component_id
+        The one-byte identifier code to be recorded in the JPEG file for
+        this component.  For the standard color spaces, we recommend you
+        leave the default values alone.
+
+int h_samp_factor
+int v_samp_factor
+        Horizontal and vertical sampling factors for the component; must
+        be 1..4 according to the JPEG standard.  Note that larger sampling
+        factors indicate a higher-resolution component; many people find
+        this behavior quite unintuitive.  The default values are 2,2 for
+        luminance components and 1,1 for chrominance components, except
+        for grayscale where 1,1 is used.
+
+int quant_tbl_no
+        Quantization table number for component.  The default value is
+        0 for luminance components and 1 for chrominance components.
+
+int dc_tbl_no
+int ac_tbl_no
+        DC and AC entropy coding table numbers.  The default values are
+        0 for luminance components and 1 for chrominance components.
+
+int component_index
+        Must equal the component's index in comp_info[].  (Beginning in
+        release v6, the compressor library will fill this in automatically;
+        you don't have to.)
+
+
+Decompression parameter selection
+---------------------------------
+
+Decompression parameter selection is somewhat simpler than compression
+parameter selection, since all of the JPEG internal parameters are
+recorded in the source file and need not be supplied by the application.
+(Unless you are working with abbreviated files, in which case see
+"Abbreviated datastreams", below.)  Decompression parameters control
+the postprocessing done on the image to deliver it in a format suitable
+for the application's use.  Many of the parameters control speed/quality
+tradeoffs, in which faster decompression may be obtained at the price of
+a poorer-quality image.  The defaults select the highest quality (slowest)
+processing.
+
+The following fields in the JPEG object are set by jpeg_read_header() and
+may be useful to the application in choosing decompression parameters:
+
+JDIMENSION image_width                  Width and height of image
+JDIMENSION image_height
+int num_components                      Number of color components
+J_COLOR_SPACE jpeg_color_space          Colorspace of image
+boolean saw_JFIF_marker                 TRUE if a JFIF APP0 marker was seen
+  UINT8 JFIF_major_version              Version information from JFIF marker
+  UINT8 JFIF_minor_version
+  UINT8 density_unit                    Resolution data from JFIF marker
+  UINT16 X_density
+  UINT16 Y_density
+boolean saw_Adobe_marker                TRUE if an Adobe APP14 marker was seen
+  UINT8 Adobe_transform                 Color transform code from Adobe marker
+
+The JPEG color space, unfortunately, is something of a guess since the JPEG
+standard proper does not provide a way to record it.  In practice most files
+adhere to the JFIF or Adobe conventions, and the decoder will recognize these
+correctly.  See "Special color spaces", below, for more info.
+
+
+The decompression parameters that determine the basic properties of the
+returned image are:
+
+J_COLOR_SPACE out_color_space
+        Output color space.  jpeg_read_header() sets an appropriate default
+        based on jpeg_color_space; typically it will be RGB or grayscale.
+        The application can change this field to request output in a different
+        colorspace.  For example, set it to JCS_GRAYSCALE to get grayscale
+        output from a color file.  (This is useful for previewing: grayscale
+        output is faster than full color since the color components need not
+        be processed.)  Note that not all possible color space transforms are
+        currently implemented; you may need to extend jdcolor.c if you want an
+        unusual conversion.
+
+unsigned int scale_num, scale_denom
+        Scale the image by the fraction scale_num/scale_denom.  Default is
+        1/1, or no scaling.  Currently, the only supported scaling ratios
+        are M/8 with all M from 1 to 16, or any reduced fraction thereof (such
+        as 1/2, 3/4, etc.)  (The library design allows for arbitrary
+        scaling ratios but this is not likely to be implemented any time soon.)
+        Smaller scaling ratios permit significantly faster decoding since
+        fewer pixels need be processed and a simpler IDCT method can be used.
+
+boolean quantize_colors
+        If set TRUE, colormapped output will be delivered.  Default is FALSE,
+        meaning that full-color output will be delivered.
+
+The next three parameters are relevant only if quantize_colors is TRUE.
+
+int desired_number_of_colors
+        Maximum number of colors to use in generating a library-supplied color
+        map (the actual number of colors is returned in a different field).
+        Default 256.  Ignored when the application supplies its own color map.
+
+boolean two_pass_quantize
+        If TRUE, an extra pass over the image is made to select a custom color
+        map for the image.  This usually looks a lot better than the one-size-
+        fits-all colormap that is used otherwise.  Default is TRUE.  Ignored
+        when the application supplies its own color map.
+
+J_DITHER_MODE dither_mode
+        Selects color dithering method.  Supported values are:
+                JDITHER_NONE    no dithering: fast, very low quality
+                JDITHER_ORDERED ordered dither: moderate speed and quality
+                JDITHER_FS      Floyd-Steinberg dither: slow, high quality
+        Default is JDITHER_FS.  (At present, ordered dither is implemented
+        only in the single-pass, standard-colormap case.  If you ask for
+        ordered dither when two_pass_quantize is TRUE or when you supply
+        an external color map, you'll get F-S dithering.)
+
+When quantize_colors is TRUE, the target color map is described by the next
+two fields.  colormap is set to NULL by jpeg_read_header().  The application
+can supply a color map by setting colormap non-NULL and setting
+actual_number_of_colors to the map size.  Otherwise, jpeg_start_decompress()
+selects a suitable color map and sets these two fields itself.
+[Implementation restriction: at present, an externally supplied colormap is
+only accepted for 3-component output color spaces.]
+
+JSAMPARRAY colormap
+        The color map, represented as a 2-D pixel array of out_color_components
+        rows and actual_number_of_colors columns.  Ignored if not quantizing.
+        CAUTION: if the JPEG library creates its own colormap, the storage
+        pointed to by this field is released by jpeg_finish_decompress().
+        Copy the colormap somewhere else first, if you want to save it.
+
+int actual_number_of_colors
+        The number of colors in the color map.
+
+Additional decompression parameters that the application may set include:
+
+J_DCT_METHOD dct_method
+        Selects the algorithm used for the DCT step.  Choices are:
+                JDCT_ISLOW: accurate integer method
+                JDCT_IFAST: less accurate integer method [legacy feature]
+                JDCT_FLOAT: floating-point method [legacy feature]
+                JDCT_DEFAULT: default method (normally JDCT_ISLOW)
+                JDCT_FASTEST: fastest method (normally JDCT_IFAST)
+        When the Independent JPEG Group's software was first released in 1991,
+        the decompression time for a 1-megapixel JPEG image on a mainstream PC
+        was measured in minutes.  Thus, JDCT_IFAST provided noticeable
+        performance benefits.  On modern CPUs running libjpeg-turbo, however,
+        the decompression time for a 1-megapixel JPEG image is measured in
+        milliseconds, and thus the performance benefits of JDCT_IFAST are much
+        less noticeable.  On modern x86/x86-64 CPUs that support AVX2
+        instructions, JDCT_IFAST and JDCT_ISLOW have similar performance.  On
+        other types of CPUs, JDCT_IFAST is generally about 5-15% faster than
+        JDCT_ISLOW.
+
+        If the JPEG image was compressed using a quality level of 85 or below,
+        then there should be little or no perceptible quality difference
+        between the two algorithms.  When decompressing images that were
+        compressed using quality levels above 85, however, the difference
+        between JDCT_IFAST and JDCT_ISLOW becomes more pronounced.  With images
+        compressed using quality=97, for instance, JDCT_IFAST generally incurs
+        about a 4-6 dB loss in PSNR relative to JDCT_ISLOW, but this can be
+        larger for some images.  If you can avoid it, do not use JDCT_IFAST
+        when decompressing images that were compressed using quality levels
+        above 97.  The algorithm often degenerates for such images and can
+        actually produce a more lossy output image than if the JPEG image had
+        been compressed using lower quality levels.
+
+        JDCT_FLOAT does not produce significantly more accurate results than
+        JDCT_ISLOW, and it is much slower.  JDCT_FLOAT may also give different
+        results on different machines due to varying roundoff behavior, whereas
+        the integer methods should give the same results on all machines.
+
+boolean do_fancy_upsampling
+        If TRUE, do careful upsampling of chroma components.  If FALSE,
+        a faster but sloppier method is used.  Default is TRUE.  The visual
+        impact of the sloppier method is often very small.
+
+boolean do_block_smoothing
+        If TRUE, interblock smoothing is applied in early stages of decoding
+        progressive JPEG files; if FALSE, not.  Default is TRUE.  Early
+        progression stages look "fuzzy" with smoothing, "blocky" without.
+        In any case, block smoothing ceases to be applied after the first few
+        AC coefficients are known to full accuracy, so it is relevant only
+        when using buffered-image mode for progressive images.
+
+boolean enable_1pass_quant
+boolean enable_external_quant
+boolean enable_2pass_quant
+        These are significant only in buffered-image mode, which is
+        described in its own section below.
+
+
+The output image dimensions are given by the following fields.  These are
+computed from the source image dimensions and the decompression parameters
+by jpeg_start_decompress().  You can also call jpeg_calc_output_dimensions()
+to obtain the values that will result from the current parameter settings.
+This can be useful if you are trying to pick a scaling ratio that will get
+close to a desired target size.  It's also important if you are using the
+JPEG library's memory manager to allocate output buffer space, because you
+are supposed to request such buffers *before* jpeg_start_decompress().
+
+JDIMENSION output_width         Actual dimensions of output image.
+JDIMENSION output_height
+int out_color_components        Number of color components in out_color_space.
+int output_components           Number of color components returned.
+int rec_outbuf_height           Recommended height of scanline buffer.
+
+When quantizing colors, output_components is 1, indicating a single color map
+index per pixel.  Otherwise it equals out_color_components.  The output arrays
+are required to be output_width * output_components JSAMPLEs wide.
+
+rec_outbuf_height is the recommended minimum height (in scanlines) of the
+buffer passed to jpeg_read_scanlines().  If the buffer is smaller, the
+library will still work, but time will be wasted due to unnecessary data
+copying.  In high-quality modes, rec_outbuf_height is always 1, but some
+faster, lower-quality modes set it to larger values (typically 2 to 4).
+If you are going to ask for a high-speed processing mode, you may as well
+go to the trouble of honoring rec_outbuf_height so as to avoid data copying.
+(An output buffer larger than rec_outbuf_height lines is OK, but won't
+provide any material speed improvement over that height.)
+
+
+Special color spaces
+--------------------
+
+The JPEG standard itself is "color blind" and doesn't specify any particular
+color space.  It is customary to convert color data to a luminance/chrominance
+color space before compressing, since this permits greater compression.  The
+existing de-facto JPEG file format standards specify YCbCr or grayscale data
+(JFIF), or grayscale, RGB, YCbCr, CMYK, or YCCK (Adobe).  For special
+applications such as multispectral images, other color spaces can be used,
+but it must be understood that such files will be unportable.
+
+The JPEG library can handle the most common colorspace conversions (namely
+RGB <=> YCbCr and CMYK <=> YCCK).  It can also deal with data of an unknown
+color space, passing it through without conversion.  If you deal extensively
+with an unusual color space, you can easily extend the library to understand
+additional color spaces and perform appropriate conversions.
+
+For compression, the source data's color space is specified by field
+in_color_space.  This is transformed to the JPEG file's color space given
+by jpeg_color_space.  jpeg_set_defaults() chooses a reasonable JPEG color
+space depending on in_color_space, but you can override this by calling
+jpeg_set_colorspace().  Of course you must select a supported transformation.
+jccolor.c currently supports the following transformations:
+        RGB => YCbCr
+        RGB => GRAYSCALE
+        YCbCr => GRAYSCALE
+        CMYK => YCCK
+plus the null transforms: GRAYSCALE => GRAYSCALE, RGB => RGB,
+YCbCr => YCbCr, CMYK => CMYK, YCCK => YCCK, and UNKNOWN => UNKNOWN.
+
+The de-facto file format standards (JFIF and Adobe) specify APPn markers that
+indicate the color space of the JPEG file.  It is important to ensure that
+these are written correctly, or omitted if the JPEG file's color space is not
+one of the ones supported by the de-facto standards.  jpeg_set_colorspace()
+will set the compression parameters to include or omit the APPn markers
+properly, so long as it is told the truth about the JPEG color space.
+For example, if you are writing some random 3-component color space without
+conversion, don't try to fake out the library by setting in_color_space and
+jpeg_color_space to JCS_YCbCr; use JCS_UNKNOWN.  You may want to write an
+APPn marker of your own devising to identify the colorspace --- see "Special
+markers", below.
+
+When told that the color space is UNKNOWN, the library will default to using
+luminance-quality compression parameters for all color components.  You may
+well want to change these parameters.  See the source code for
+jpeg_set_colorspace(), in jcparam.c, for details.
+
+For decompression, the JPEG file's color space is given in jpeg_color_space,
+and this is transformed to the output color space out_color_space.
+jpeg_read_header's setting of jpeg_color_space can be relied on if the file
+conforms to JFIF or Adobe conventions, but otherwise it is no better than a
+guess.  If you know the JPEG file's color space for certain, you can override
+jpeg_read_header's guess by setting jpeg_color_space.  jpeg_read_header also
+selects a default output color space based on (its guess of) jpeg_color_space;
+set out_color_space to override this.  Again, you must select a supported
+transformation.  jdcolor.c currently supports
+        YCbCr => RGB
+        YCbCr => GRAYSCALE
+        RGB => GRAYSCALE
+        GRAYSCALE => RGB
+        YCCK => CMYK
+as well as the null transforms.  (Since GRAYSCALE=>RGB is provided, an
+application can force grayscale JPEGs to look like color JPEGs if it only
+wants to handle one case.)
+
+The two-pass color quantizer, jquant2.c, is specialized to handle RGB data
+(it weights distances appropriately for RGB colors).  You'll need to modify
+the code if you want to use it for non-RGB output color spaces.  Note that
+jquant2.c is used to map to an application-supplied colormap as well as for
+the normal two-pass colormap selection process.
+
+CAUTION: it appears that Adobe Photoshop writes inverted data in CMYK JPEG
+files: 0 represents 100% ink coverage, rather than 0% ink as you'd expect.
+This is arguably a bug in Photoshop, but if you need to work with Photoshop
+CMYK files, you will have to deal with it in your application.  We cannot
+"fix" this in the library by inverting the data during the CMYK<=>YCCK
+transform, because that would break other applications, notably Ghostscript.
+Photoshop versions prior to 3.0 write EPS files containing JPEG-encoded CMYK
+data in the same inverted-YCCK representation used in bare JPEG files, but
+the surrounding PostScript code performs an inversion using the PS image
+operator.  I am told that Photoshop 3.0 will write uninverted YCCK in
+EPS/JPEG files, and will omit the PS-level inversion.  (But the data
+polarity used in bare JPEG files will not change in 3.0.)  In either case,
+the JPEG library must not invert the data itself, or else Ghostscript would
+read these EPS files incorrectly.
+
+
+Error handling
+--------------
+
+When the default error handler is used, any error detected inside the JPEG
+routines will cause a message to be printed on stderr, followed by exit().
+You can supply your own error handling routines to override this behavior
+and to control the treatment of nonfatal warnings and trace/debug messages.
+The file example.txt illustrates the most common case, which is to have the
+application regain control after an error rather than exiting.
+
+The JPEG library never writes any message directly; it always goes through
+the error handling routines.  Three classes of messages are recognized:
+  * Fatal errors: the library cannot continue.
+  * Warnings: the library can continue, but the data is corrupt, and a
+    damaged output image is likely to result.
+  * Trace/informational messages.  These come with a trace level indicating
+    the importance of the message; you can control the verbosity of the
+    program by adjusting the maximum trace level that will be displayed.
+
+You may, if you wish, simply replace the entire JPEG error handling module
+(jerror.c) with your own code.  However, you can avoid code duplication by
+only replacing some of the routines depending on the behavior you need.
+This is accomplished by calling jpeg_std_error() as usual, but then overriding
+some of the method pointers in the jpeg_error_mgr struct, as illustrated by
+example.txt.
+
+All of the error handling routines will receive a pointer to the JPEG object
+(a j_common_ptr which points to either a jpeg_compress_struct or a
+jpeg_decompress_struct; if you need to tell which, test the is_decompressor
+field).  This struct includes a pointer to the error manager struct in its
+"err" field.  Frequently, custom error handler routines will need to access
+additional data which is not known to the JPEG library or the standard error
+handler.  The most convenient way to do this is to embed either the JPEG
+object or the jpeg_error_mgr struct in a larger structure that contains
+additional fields; then casting the passed pointer provides access to the
+additional fields.  Again, see example.txt for one way to do it.  (Beginning
+with IJG version 6b, there is also a void pointer "client_data" in each
+JPEG object, which the application can use to find related data.
+The library does not touch client_data at all.)
+
+The individual methods that you might wish to override are:
+
+error_exit (j_common_ptr cinfo)
+        Receives control for a fatal error.  Information sufficient to
+        generate the error message has been stored in cinfo->err; call
+        output_message to display it.  Control must NOT return to the caller;
+        generally this routine will exit() or longjmp() somewhere.
+        Typically you would override this routine to get rid of the exit()
+        default behavior.  Note that if you continue processing, you should
+        clean up the JPEG object with jpeg_abort() or jpeg_destroy().
+
+output_message (j_common_ptr cinfo)
+        Actual output of any JPEG message.  Override this to send messages
+        somewhere other than stderr.  Note that this method does not know
+        how to generate a message, only where to send it.
+
+format_message (j_common_ptr cinfo, char *buffer)
+        Constructs a readable error message string based on the error info
+        stored in cinfo->err.  This method is called by output_message.  Few
+        applications should need to override this method.  One possible
+        reason for doing so is to implement dynamic switching of error message
+        language.
+
+emit_message (j_common_ptr cinfo, int msg_level)
+        Decide whether or not to emit a warning or trace message; if so,
+        calls output_message.  The main reason for overriding this method
+        would be to abort on warnings.  msg_level is -1 for warnings,
+        0 and up for trace messages.
+
+Only error_exit() and emit_message() are called from the rest of the JPEG
+library; the other two are internal to the error handler.
+
+The actual message texts are stored in an array of strings which is pointed to
+by the field err->jpeg_message_table.  The messages are numbered from 0 to
+err->last_jpeg_message, and it is these code numbers that are used in the
+JPEG library code.  You could replace the message texts (for instance, with
+messages in French or German) by changing the message table pointer.  See
+jerror.h for the default texts.  CAUTION: this table will almost certainly
+change or grow from one library version to the next.
+
+It may be useful for an application to add its own message texts that are
+handled by the same mechanism.  The error handler supports a second "add-on"
+message table for this purpose.  To define an addon table, set the pointer
+err->addon_message_table and the message numbers err->first_addon_message and
+err->last_addon_message.  If you number the addon messages beginning at 1000
+or so, you won't have to worry about conflicts with the library's built-in
+messages.  See the sample applications cjpeg/djpeg for an example of using
+addon messages (the addon messages are defined in cderror.h).
+
+Actual invocation of the error handler is done via macros defined in jerror.h:
+        ERREXITn(...)   for fatal errors
+        WARNMSn(...)    for corrupt-data warnings
+        TRACEMSn(...)   for trace and informational messages.
+These macros store the message code and any additional parameters into the
+error handler struct, then invoke the error_exit() or emit_message() method.
+The variants of each macro are for varying numbers of additional parameters.
+The additional parameters are inserted into the generated message using
+standard printf() format codes.
+
+See jerror.h and jerror.c for further details.
+
+
+Compressed data handling (source and destination managers)
+----------------------------------------------------------
+
+The JPEG compression library sends its compressed data to a "destination
+manager" module.  The default destination manager just writes the data to a
+memory buffer or to a stdio stream, but you can provide your own manager to
+do something else.  Similarly, the decompression library calls a "source
+manager" to obtain the compressed data; you can provide your own source
+manager if you want the data to come from somewhere other than a memory
+buffer or a stdio stream.
+
+In both cases, compressed data is processed a bufferload at a time: the
+destination or source manager provides a work buffer, and the library invokes
+the manager only when the buffer is filled or emptied.  (You could define a
+one-character buffer to force the manager to be invoked for each byte, but
+that would be rather inefficient.)  The buffer's size and location are
+controlled by the manager, not by the library.  For example, the memory
+source manager just makes the buffer pointer and length point to the original
+data in memory.  In this case the buffer-reload procedure will be invoked
+only if the decompressor ran off the end of the datastream, which would
+indicate an erroneous datastream.
+
+The work buffer is defined as an array of datatype JOCTET, which is generally
+"char" or "unsigned char".  On a machine where char is not exactly 8 bits
+wide, you must define JOCTET as a wider data type and then modify the data
+source and destination modules to transcribe the work arrays into 8-bit units
+on external storage.
+
+A data destination manager struct contains a pointer and count defining the
+next byte to write in the work buffer and the remaining free space:
+
+        JOCTET *next_output_byte;   /* => next byte to write in buffer */
+        size_t free_in_buffer;      /* # of byte spaces remaining in buffer */
+
+The library increments the pointer and decrements the count until the buffer
+is filled.  The manager's empty_output_buffer method must reset the pointer
+and count.  The manager is expected to remember the buffer's starting address
+and total size in private fields not visible to the library.
+
+A data destination manager provides three methods:
+
+init_destination (j_compress_ptr cinfo)
+        Initialize destination.  This is called by jpeg_start_compress()
+        before any data is actually written.  It must initialize
+        next_output_byte and free_in_buffer.  free_in_buffer must be
+        initialized to a positive value.
+
+empty_output_buffer (j_compress_ptr cinfo)
+        This is called whenever the buffer has filled (free_in_buffer
+        reaches zero).  In typical applications, it should write out the
+        *entire* buffer (use the saved start address and buffer length;
+        ignore the current state of next_output_byte and free_in_buffer).
+        Then reset the pointer & count to the start of the buffer, and
+        return TRUE indicating that the buffer has been dumped.
+        free_in_buffer must be set to a positive value when TRUE is
+        returned.  A FALSE return should only be used when I/O suspension is
+        desired (this operating mode is discussed in the next section).
+
+term_destination (j_compress_ptr cinfo)
+        Terminate destination --- called by jpeg_finish_compress() after all
+        data has been written.  In most applications, this must flush any
+        data remaining in the buffer.  Use either next_output_byte or
+        free_in_buffer to determine how much data is in the buffer.
+
+term_destination() is NOT called by jpeg_abort() or jpeg_destroy().  If you
+want the destination manager to be cleaned up during an abort, you must do it
+yourself.
+
+You will also need code to create a jpeg_destination_mgr struct, fill in its
+method pointers, and insert a pointer to the struct into the "dest" field of
+the JPEG compression object.  This can be done in-line in your setup code if
+you like, but it's probably cleaner to provide a separate routine similar to
+the jpeg_stdio_dest() or jpeg_mem_dest() routines of the supplied destination
+managers.
+
+Decompression source managers follow a parallel design, but with some
+additional features.  The source manager struct contains a pointer and count
+defining the next byte to read from the work buffer and the number of bytes
+remaining:
+
+        const JOCTET *next_input_byte;  /* => next byte to read from buffer */
+        size_t bytes_in_buffer;         /* # of bytes remaining in buffer */
+
+The library increments the pointer and decrements the count until the buffer
+is emptied.  The manager's fill_input_buffer method must reset the pointer and
+count.  In most applications, the manager must remember the buffer's starting
+address and total size in private fields not visible to the library.
+
+A data source manager provides five methods:
+
+init_source (j_decompress_ptr cinfo)
+        Initialize source.  This is called by jpeg_read_header() before any
+        data is actually read.  Unlike init_destination(), it may leave
+        bytes_in_buffer set to 0 (in which case a fill_input_buffer() call
+        will occur immediately).
+
+fill_input_buffer (j_decompress_ptr cinfo)
+        This is called whenever bytes_in_buffer has reached zero and more
+        data is wanted.  In typical applications, it should read fresh data
+        into the buffer (ignoring the current state of next_input_byte and
+        bytes_in_buffer), reset the pointer & count to the start of the
+        buffer, and return TRUE indicating that the buffer has been reloaded.
+        It is not necessary to fill the buffer entirely, only to obtain at
+        least one more byte.  bytes_in_buffer MUST be set to a positive value
+        if TRUE is returned.  A FALSE return should only be used when I/O
+        suspension is desired (this mode is discussed in the next section).
+
+skip_input_data (j_decompress_ptr cinfo, long num_bytes)
+        Skip num_bytes worth of data.  The buffer pointer and count should
+        be advanced over num_bytes input bytes, refilling the buffer as
+        needed.  This is used to skip over a potentially large amount of
+        uninteresting data (such as an APPn marker).  In some applications
+        it may be possible to optimize away the reading of the skipped data,
+        but it's not clear that being smart is worth much trouble; large
+        skips are uncommon.  bytes_in_buffer may be zero on return.
+        A zero or negative skip count should be treated as a no-op.
+
+resync_to_restart (j_decompress_ptr cinfo, int desired)
+        This routine is called only when the decompressor has failed to find
+        a restart (RSTn) marker where one is expected.  Its mission is to
+        find a suitable point for resuming decompression.  For most
+        applications, we recommend that you just use the default resync
+        procedure, jpeg_resync_to_restart().  However, if you are able to back
+        up in the input data stream, or if you have a priori knowledge about
+        the likely location of restart markers, you may be able to do better.
+        Read the read_restart_marker() and jpeg_resync_to_restart() routines
+        in jdmarker.c if you think you'd like to implement your own resync
+        procedure.
+
+term_source (j_decompress_ptr cinfo)
+        Terminate source --- called by jpeg_finish_decompress() after all
+        data has been read.  Often a no-op.
+
+For both fill_input_buffer() and skip_input_data(), there is no such thing
+as an EOF return.  If the end of the file has been reached, the routine has
+a choice of exiting via ERREXIT() or inserting fake data into the buffer.
+In most cases, generating a warning message and inserting a fake EOI marker
+is the best course of action --- this will allow the decompressor to output
+however much of the image is there.  In pathological cases, the decompressor
+may swallow the EOI and again demand data ... just keep feeding it fake EOIs.
+jdatasrc.c illustrates the recommended error recovery behavior.
+
+term_source() is NOT called by jpeg_abort() or jpeg_destroy().  If you want
+the source manager to be cleaned up during an abort, you must do it yourself.
+
+You will also need code to create a jpeg_source_mgr struct, fill in its method
+pointers, and insert a pointer to the struct into the "src" field of the JPEG
+decompression object.  This can be done in-line in your setup code if you
+like, but it's probably cleaner to provide a separate routine similar to the
+jpeg_stdio_src() or jpeg_mem_src() routines of the supplied source managers.
+
+For more information, consult the memory and stdio source and destination
+managers in jdatasrc.c and jdatadst.c.
+
+
+I/O suspension
+--------------
+
+Some applications need to use the JPEG library as an incremental memory-to-
+memory filter: when the compressed data buffer is filled or emptied, they want
+control to return to the outer loop, rather than expecting that the buffer can
+be emptied or reloaded within the data source/destination manager subroutine.
+The library supports this need by providing an "I/O suspension" mode, which we
+describe in this section.
+
+The I/O suspension mode is not a panacea: nothing is guaranteed about the
+maximum amount of time spent in any one call to the library, so it will not
+eliminate response-time problems in single-threaded applications.  If you
+need guaranteed response time, we suggest you "bite the bullet" and implement
+a real multi-tasking capability.
+
+To use I/O suspension, cooperation is needed between the calling application
+and the data source or destination manager; you will always need a custom
+source/destination manager.  (Please read the previous section if you haven't
+already.)  The basic idea is that the empty_output_buffer() or
+fill_input_buffer() routine is a no-op, merely returning FALSE to indicate
+that it has done nothing.  Upon seeing this, the JPEG library suspends
+operation and returns to its caller.  The surrounding application is
+responsible for emptying or refilling the work buffer before calling the
+JPEG library again.
+
+Compression suspension:
+
+For compression suspension, use an empty_output_buffer() routine that returns
+FALSE; typically it will not do anything else.  This will cause the
+compressor to return to the caller of jpeg_write_scanlines(), with the return
+value indicating that not all the supplied scanlines have been accepted.
+The application must make more room in the output buffer, adjust the output
+buffer pointer/count appropriately, and then call jpeg_write_scanlines()
+again, pointing to the first unconsumed scanline.
+
+When forced to suspend, the compressor will backtrack to a convenient stopping
+point (usually the start of the current MCU); it will regenerate some output
+data when restarted.  Therefore, although empty_output_buffer() is only
+called when the buffer is filled, you should NOT write out the entire buffer
+after a suspension.  Write only the data up to the current position of
+next_output_byte/free_in_buffer.  The data beyond that point will be
+regenerated after resumption.
+
+Because of the backtracking behavior, a good-size output buffer is essential
+for efficiency; you don't want the compressor to suspend often.  (In fact, an
+overly small buffer could lead to infinite looping, if a single MCU required
+more data than would fit in the buffer.)  We recommend a buffer of at least
+several Kbytes.  You may want to insert explicit code to ensure that you don't
+call jpeg_write_scanlines() unless there is a reasonable amount of space in
+the output buffer; in other words, flush the buffer before trying to compress
+more data.
+
+The compressor does not allow suspension while it is trying to write JPEG
+markers at the beginning and end of the file.  This means that:
+  * At the beginning of a compression operation, there must be enough free
+    space in the output buffer to hold the header markers (typically 600 or
+    so bytes).  The recommended buffer size is bigger than this anyway, so
+    this is not a problem as long as you start with an empty buffer.  However,
+    this restriction might catch you if you insert large special markers, such
+    as a JFIF thumbnail image, without flushing the buffer afterwards.
+  * When you call jpeg_finish_compress(), there must be enough space in the
+    output buffer to emit any buffered data and the final EOI marker.  In the
+    current implementation, half a dozen bytes should suffice for this, but
+    for safety's sake we recommend ensuring that at least 100 bytes are free
+    before calling jpeg_finish_compress().
+
+A more significant restriction is that jpeg_finish_compress() cannot suspend.
+This means you cannot use suspension with multi-pass operating modes, namely
+Huffman code optimization and multiple-scan output.  Those modes write the
+whole file during jpeg_finish_compress(), which will certainly result in
+buffer overrun.  (Note that this restriction applies only to compression,
+not decompression.  The decompressor supports input suspension in all of its
+operating modes.)
+
+Decompression suspension:
+
+For decompression suspension, use a fill_input_buffer() routine that simply
+returns FALSE (except perhaps during error recovery, as discussed below).
+This will cause the decompressor to return to its caller with an indication
+that suspension has occurred.  This can happen at four places:
+  * jpeg_read_header(): will return JPEG_SUSPENDED.
+  * jpeg_start_decompress(): will return FALSE, rather than its usual TRUE.
+  * jpeg_read_scanlines(): will return the number of scanlines already
+        completed (possibly 0).
+  * jpeg_finish_decompress(): will return FALSE, rather than its usual TRUE.
+The surrounding application must recognize these cases, load more data into
+the input buffer, and repeat the call.  In the case of jpeg_read_scanlines(),
+increment the passed pointers past any scanlines successfully read.
+
+Just as with compression, the decompressor will typically backtrack to a
+convenient restart point before suspending.  When fill_input_buffer() is
+called, next_input_byte/bytes_in_buffer point to the current restart point,
+which is where the decompressor will backtrack to if FALSE is returned.
+The data beyond that position must NOT be discarded if you suspend; it needs
+to be re-read upon resumption.  In most implementations, you'll need to shift
+this data down to the start of your work buffer and then load more data after
+it.  Again, this behavior means that a several-Kbyte work buffer is essential
+for decent performance; furthermore, you should load a reasonable amount of
+new data before resuming decompression.  (If you loaded, say, only one new
+byte each time around, you could waste a LOT of cycles.)
+
+The skip_input_data() source manager routine requires special care in a
+suspension scenario.  This routine is NOT granted the ability to suspend the
+decompressor; it can decrement bytes_in_buffer to zero, but no more.  If the
+requested skip distance exceeds the amount of data currently in the input
+buffer, then skip_input_data() must set bytes_in_buffer to zero and record the
+additional skip distance somewhere else.  The decompressor will immediately
+call fill_input_buffer(), which should return FALSE, which will cause a
+suspension return.  The surrounding application must then arrange to discard
+the recorded number of bytes before it resumes loading the input buffer.
+(Yes, this design is rather baroque, but it avoids complexity in the far more
+common case where a non-suspending source manager is used.)
+
+If the input data has been exhausted, we recommend that you emit a warning
+and insert dummy EOI markers just as a non-suspending data source manager
+would do.  This can be handled either in the surrounding application logic or
+within fill_input_buffer(); the latter is probably more efficient.  If
+fill_input_buffer() knows that no more data is available, it can set the
+pointer/count to point to a dummy EOI marker and then return TRUE just as
+though it had read more data in a non-suspending situation.
+
+The decompressor does not attempt to suspend within standard JPEG markers;
+instead it will backtrack to the start of the marker and reprocess the whole
+marker next time.  Hence the input buffer must be large enough to hold the
+longest standard marker in the file.  Standard JPEG markers should normally
+not exceed a few hundred bytes each (DHT tables are typically the longest).
+We recommend at least a 2K buffer for performance reasons, which is much
+larger than any correct marker is likely to be.  For robustness against
+damaged marker length counts, you may wish to insert a test in your
+application for the case that the input buffer is completely full and yet
+the decoder has suspended without consuming any data --- otherwise, if this
+situation did occur, it would lead to an endless loop.  (The library can't
+provide this test since it has no idea whether "the buffer is full", or
+even whether there is a fixed-size input buffer.)
+
+The input buffer would need to be 64K to allow for arbitrary COM or APPn
+markers, but these are handled specially: they are either saved into allocated
+memory, or skipped over by calling skip_input_data().  In the former case,
+suspension is handled correctly, and in the latter case, the problem of
+buffer overrun is placed on skip_input_data's shoulders, as explained above.
+Note that if you provide your own marker handling routine for large markers,
+you should consider how to deal with buffer overflow.
+
+Multiple-buffer management:
+
+In some applications it is desirable to store the compressed data in a linked
+list of buffer areas, so as to avoid data copying.  This can be handled by
+having empty_output_buffer() or fill_input_buffer() set the pointer and count
+to reference the next available buffer; FALSE is returned only if no more
+buffers are available.  Although seemingly straightforward, there is a
+pitfall in this approach: the backtrack that occurs when FALSE is returned
+could back up into an earlier buffer.  For example, when fill_input_buffer()
+is called, the current pointer & count indicate the backtrack restart point.
+Since fill_input_buffer() will set the pointer and count to refer to a new
+buffer, the restart position must be saved somewhere else.  Suppose a second
+call to fill_input_buffer() occurs in the same library call, and no
+additional input data is available, so fill_input_buffer must return FALSE.
+If the JPEG library has not moved the pointer/count forward in the current
+buffer, then *the correct restart point is the saved position in the prior
+buffer*.  Prior buffers may be discarded only after the library establishes
+a restart point within a later buffer.  Similar remarks apply for output into
+a chain of buffers.
+
+The library will never attempt to backtrack over a skip_input_data() call,
+so any skipped data can be permanently discarded.  You still have to deal
+with the case of skipping not-yet-received data, however.
+
+It's much simpler to use only a single buffer; when fill_input_buffer() is
+called, move any unconsumed data (beyond the current pointer/count) down to
+the beginning of this buffer and then load new data into the remaining buffer
+space.  This approach requires a little more data copying but is far easier
+to get right.
+
+
+Progressive JPEG support
+------------------------
+
+Progressive JPEG rearranges the stored data into a series of scans of
+increasing quality.  In situations where a JPEG file is transmitted across a
+slow communications link, a decoder can generate a low-quality image very
+quickly from the first scan, then gradually improve the displayed quality as
+more scans are received.  The final image after all scans are complete is
+identical to that of a regular (sequential) JPEG file of the same quality
+setting.  Progressive JPEG files are often slightly smaller than equivalent
+sequential JPEG files, but the possibility of incremental display is the main
+reason for using progressive JPEG.
+
+The IJG encoder library generates progressive JPEG files when given a
+suitable "scan script" defining how to divide the data into scans.
+Creation of progressive JPEG files is otherwise transparent to the encoder.
+Progressive JPEG files can also be read transparently by the decoder library.
+If the decoding application simply uses the library as defined above, it
+will receive a final decoded image without any indication that the file was
+progressive.  Of course, this approach does not allow incremental display.
+To perform incremental display, an application needs to use the decoder
+library's "buffered-image" mode, in which it receives a decoded image
+multiple times.
+
+Each displayed scan requires about as much work to decode as a full JPEG
+image of the same size, so the decoder must be fairly fast in relation to the
+data transmission rate in order to make incremental display useful.  However,
+it is possible to skip displaying the image and simply add the incoming bits
+to the decoder's coefficient buffer.  This is fast because only Huffman
+decoding need be done, not IDCT, upsampling, colorspace conversion, etc.
+The IJG decoder library allows the application to switch dynamically between
+displaying the image and simply absorbing the incoming bits.  A properly
+coded application can automatically adapt the number of display passes to
+suit the time available as the image is received.  Also, a final
+higher-quality display cycle can be performed from the buffered data after
+the end of the file is reached.
+
+Progressive compression:
+
+To create a progressive JPEG file (or a multiple-scan sequential JPEG file),
+set the scan_info cinfo field to point to an array of scan descriptors, and
+perform compression as usual.  Instead of constructing your own scan list,
+you can call the jpeg_simple_progression() helper routine to create a
+recommended progression sequence; this method should be used by all
+applications that don't want to get involved in the nitty-gritty of
+progressive scan sequence design.  (If you want to provide user control of
+scan sequences, you may wish to borrow the scan script reading code found
+in rdswitch.c, so that you can read scan script files just like cjpeg's.)
+When scan_info is not NULL, the compression library will store DCT'd data
+into a buffer array as jpeg_write_scanlines() is called, and will emit all
+the requested scans during jpeg_finish_compress().  This implies that
+multiple-scan output cannot be created with a suspending data destination
+manager, since jpeg_finish_compress() does not support suspension.  We
+should also note that the compressor currently forces Huffman optimization
+mode when creating a progressive JPEG file, because the default Huffman
+tables are unsuitable for progressive files.
+
+Progressive decompression:
+
+When buffered-image mode is not used, the decoder library will read all of
+a multi-scan file during jpeg_start_decompress(), so that it can provide a
+final decoded image.  (Here "multi-scan" means either progressive or
+multi-scan sequential.)  This makes multi-scan files transparent to the
+decoding application.  However, existing applications that used suspending
+input with version 5 of the IJG library will need to be modified to check
+for a suspension return from jpeg_start_decompress().
+
+To perform incremental display, an application must use the library's
+buffered-image mode.  This is described in the next section.
+
+
+Buffered-image mode
+-------------------
+
+In buffered-image mode, the library stores the partially decoded image in a
+coefficient buffer, from which it can be read out as many times as desired.
+This mode is typically used for incremental display of progressive JPEG files,
+but it can be used with any JPEG file.  Each scan of a progressive JPEG file
+adds more data (more detail) to the buffered image.  The application can
+display in lockstep with the source file (one display pass per input scan),
+or it can allow input processing to outrun display processing.  By making
+input and display processing run independently, it is possible for the
+application to adapt progressive display to a wide range of data transmission
+rates.
+
+The basic control flow for buffered-image decoding is
+
+        jpeg_create_decompress()
+        set data source
+        jpeg_read_header()
+        set overall decompression parameters
+        cinfo.buffered_image = TRUE;    /* select buffered-image mode */
+        jpeg_start_decompress()
+        for (each output pass) {
+            adjust output decompression parameters if required
+            jpeg_start_output()         /* start a new output pass */
+            for (all scanlines in image) {
+                jpeg_read_scanlines()
+                display scanlines
+            }
+            jpeg_finish_output()        /* terminate output pass */
+        }
+        jpeg_finish_decompress()
+        jpeg_destroy_decompress()
+
+This differs from ordinary unbuffered decoding in that there is an additional
+level of looping.  The application can choose how many output passes to make
+and how to display each pass.
+
+The simplest approach to displaying progressive images is to do one display
+pass for each scan appearing in the input file.  In this case the outer loop
+condition is typically
+        while (!jpeg_input_complete(&cinfo))
+and the start-output call should read
+        jpeg_start_output(&cinfo, cinfo.input_scan_number);
+The second parameter to jpeg_start_output() indicates which scan of the input
+file is to be displayed; the scans are numbered starting at 1 for this
+purpose.  (You can use a loop counter starting at 1 if you like, but using
+the library's input scan counter is easier.)  The library automatically reads
+data as necessary to complete each requested scan, and jpeg_finish_output()
+advances to the next scan or end-of-image marker (hence input_scan_number
+will be incremented by the time control arrives back at jpeg_start_output()).
+With this technique, data is read from the input file only as needed, and
+input and output processing run in lockstep.
+
+After reading the final scan and reaching the end of the input file, the
+buffered image remains available; it can be read additional times by
+repeating the jpeg_start_output()/jpeg_read_scanlines()/jpeg_finish_output()
+sequence.  For example, a useful technique is to use fast one-pass color
+quantization for display passes made while the image is arriving, followed by
+a final display pass using two-pass quantization for highest quality.  This
+is done by changing the library parameters before the final output pass.
+Changing parameters between passes is discussed in detail below.
+
+In general the last scan of a progressive file cannot be recognized as such
+until after it is read, so a post-input display pass is the best approach if
+you want special processing in the final pass.
+
+When done with the image, be sure to call jpeg_finish_decompress() to release
+the buffered image (or just use jpeg_destroy_decompress()).
+
+If input data arrives faster than it can be displayed, the application can
+cause the library to decode input data in advance of what's needed to produce
+output.  This is done by calling the routine jpeg_consume_input().
+The return value is one of the following:
+        JPEG_REACHED_SOS:    reached an SOS marker (the start of a new scan)
+        JPEG_REACHED_EOI:    reached the EOI marker (end of image)
+        JPEG_ROW_COMPLETED:  completed reading one MCU row of compressed data
+        JPEG_SCAN_COMPLETED: completed reading last MCU row of current scan
+        JPEG_SUSPENDED:      suspended before completing any of the above
+(JPEG_SUSPENDED can occur only if a suspending data source is used.)  This
+routine can be called at any time after initializing the JPEG object.  It
+reads some additional data and returns when one of the indicated significant
+events occurs.  (If called after the EOI marker is reached, it will
+immediately return JPEG_REACHED_EOI without attempting to read more data.)
+
+The library's output processing will automatically call jpeg_consume_input()
+whenever the output processing overtakes the input; thus, simple lockstep
+display requires no direct calls to jpeg_consume_input().  But by adding
+calls to jpeg_consume_input(), you can absorb data in advance of what is
+being displayed.  This has two benefits:
+  * You can limit buildup of unprocessed data in your input buffer.
+  * You can eliminate extra display passes by paying attention to the
+    state of the library's input processing.
+
+The first of these benefits only requires interspersing calls to
+jpeg_consume_input() with your display operations and any other processing
+you may be doing.  To avoid wasting cycles due to backtracking, it's best to
+call jpeg_consume_input() only after a hundred or so new bytes have arrived.
+This is discussed further under "I/O suspension", above.  (Note: the JPEG
+library currently is not thread-safe.  You must not call jpeg_consume_input()
+from one thread of control if a different library routine is working on the
+same JPEG object in another thread.)
+
+When input arrives fast enough that more than one new scan is available
+before you start a new output pass, you may as well skip the output pass
+corresponding to the completed scan.  This occurs for free if you pass
+cinfo.input_scan_number as the target scan number to jpeg_start_output().
+The input_scan_number field is simply the index of the scan currently being
+consumed by the input processor.  You can ensure that this is up-to-date by
+emptying the input buffer just before calling jpeg_start_output(): call
+jpeg_consume_input() repeatedly until it returns JPEG_SUSPENDED or
+JPEG_REACHED_EOI.
+
+The target scan number passed to jpeg_start_output() is saved in the
+cinfo.output_scan_number field.  The library's output processing calls
+jpeg_consume_input() whenever the current input position (scan number and row
+within that scan) is less than or equal to the current output position (scan
+number and row).
+Thus, input processing can "get ahead" of the output processing but is not
+allowed to "fall behind".  You can achieve several different effects by
+manipulating this interlock rule.  For example, if you pass a target scan
+number greater than the current input scan number, the output processor will
+wait until that scan starts to arrive before producing any output.  (To avoid
+an infinite loop, the target scan number is automatically reset to the last
+scan number when the end of image is reached.  Thus, if you specify a large
+target scan number, the library will just absorb the entire input file and
+then perform an output pass.  This is effectively the same as what
+jpeg_start_decompress() does when you don't select buffered-image mode.)
+When you pass a target scan number equal to the current input scan number,
+the image is displayed no faster than the current input scan arrives.  The
+final possibility is to pass a target scan number less than the current input
+scan number; this disables the input/output interlock and causes the output
+processor to simply display whatever it finds in the image buffer, without
+waiting for input.  (However, the library will not accept a target scan
+number less than one, so you can't avoid waiting for the first scan.)
+
+When data is arriving faster than the output display processing can advance
+through the image, jpeg_consume_input() will store data into the buffered
+image beyond the point at which the output processing is reading data out
+again.  If the input arrives fast enough, it may "wrap around" the buffer to
+the point where the input is more than one whole scan ahead of the output.
+If the output processing simply proceeds through its display pass without
+paying attention to the input, the effect seen on-screen is that the lower
+part of the image is one or more scans better in quality than the upper part.
+Then, when the next output scan is started, you have a choice of what target
+scan number to use.  The recommended choice is to use the current input scan
+number at that time, which implies that you've skipped the output scans
+corresponding to the input scans that were completed while you processed the
+previous output scan.  In this way, the decoder automatically adapts its
+speed to the arriving data, by skipping output scans as necessary to keep up
+with the arriving data.
+
+When using this strategy, you'll want to be sure that you perform a final
+output pass after receiving all the data; otherwise your last display may not
+be full quality across the whole screen.  So the right outer loop logic is
+something like this:
+        do {
+            absorb any waiting input by calling jpeg_consume_input()
+            final_pass = jpeg_input_complete(&cinfo);
+            adjust output decompression parameters if required
+            jpeg_start_output(&cinfo, cinfo.input_scan_number);
+            ...
+            jpeg_finish_output()
+        } while (!final_pass);
+rather than quitting as soon as jpeg_input_complete() returns TRUE.  This
+arrangement makes it simple to use higher-quality decoding parameters
+for the final pass.  But if you don't want to use special parameters for
+the final pass, the right loop logic is like this:
+        for (;;) {
+            absorb any waiting input by calling jpeg_consume_input()
+            jpeg_start_output(&cinfo, cinfo.input_scan_number);
+            ...
+            jpeg_finish_output()
+            if (jpeg_input_complete(&cinfo) &&
+                cinfo.input_scan_number == cinfo.output_scan_number)
+              break;
+        }
+In this case you don't need to know in advance whether an output pass is to
+be the last one, so it's not necessary to have reached EOF before starting
+the final output pass; rather, what you want to test is whether the output
+pass was performed in sync with the final input scan.  This form of the loop
+will avoid an extra output pass whenever the decoder is able (or nearly able)
+to keep up with the incoming data.
+
+When the data transmission speed is high, you might begin a display pass,
+then find that much or all of the file has arrived before you can complete
+the pass.  (You can detect this by noting the JPEG_REACHED_EOI return code
+from jpeg_consume_input(), or equivalently by testing jpeg_input_complete().)
+In this situation you may wish to abort the current display pass and start a
+new one using the newly arrived information.  To do so, just call
+jpeg_finish_output() and then start a new pass with jpeg_start_output().
+
+A variant strategy is to abort and restart display if more than one complete
+scan arrives during an output pass; this can be detected by noting
+JPEG_REACHED_SOS returns and/or examining cinfo.input_scan_number.  This
+idea should be employed with caution, however, since the display process
+might never get to the bottom of the image before being aborted, resulting
+in the lower part of the screen being several passes worse than the upper.
+In most cases it's probably best to abort an output pass only if the whole
+file has arrived and you want to begin the final output pass immediately.
+
+When receiving data across a communication link, we recommend always using
+the current input scan number for the output target scan number; if a
+higher-quality final pass is to be done, it should be started (aborting any
+incomplete output pass) as soon as the end of file is received.  However,
+many other strategies are possible.  For example, the application can examine
+the parameters of the current input scan and decide whether to display it or
+not.  If the scan contains only chroma data, one might choose not to use it
+as the target scan, expecting that the scan will be small and will arrive
+quickly.  To skip to the next scan, call jpeg_consume_input() until it
+returns JPEG_REACHED_SOS or JPEG_REACHED_EOI.  Or just use the next higher
+number as the target scan for jpeg_start_output(); but that method doesn't
+let you inspect the next scan's parameters before deciding to display it.
+
+
+In buffered-image mode, jpeg_start_decompress() never performs input and
+thus never suspends.  An application that uses input suspension with
+buffered-image mode must be prepared for suspension returns from these
+routines:
+* jpeg_start_output() performs input only if you request 2-pass quantization
+  and the target scan isn't fully read yet.  (This is discussed below.)
+* jpeg_read_scanlines(), as always, returns the number of scanlines that it
+  was able to produce before suspending.
+* jpeg_finish_output() will read any markers following the target scan,
+  up to the end of the file or the SOS marker that begins another scan.
+  (But it reads no input if jpeg_consume_input() has already reached the
+  end of the file or a SOS marker beyond the target output scan.)
+* jpeg_finish_decompress() will read until the end of file, and thus can
+  suspend if the end hasn't already been reached (as can be tested by
+  calling jpeg_input_complete()).
+jpeg_start_output(), jpeg_finish_output(), and jpeg_finish_decompress()
+all return TRUE if they completed their tasks, FALSE if they had to suspend.
+In the event of a FALSE return, the application must load more input data
+and repeat the call.  Applications that use non-suspending data sources need
+not check the return values of these three routines.
+
+
+It is possible to change decoding parameters between output passes in the
+buffered-image mode.  The decoder library currently supports only very
+limited changes of parameters.  ONLY THE FOLLOWING parameter changes are
+allowed after jpeg_start_decompress() is called:
+* dct_method can be changed before each call to jpeg_start_output().
+  For example, one could use a fast DCT method for early scans, changing
+  to a higher quality method for the final scan.
+* dither_mode can be changed before each call to jpeg_start_output();
+  of course this has no impact if not using color quantization.  Typically
+  one would use ordered dither for initial passes, then switch to
+  Floyd-Steinberg dither for the final pass.  Caution: changing dither mode
+  can cause more memory to be allocated by the library.  Although the amount
+  of memory involved is not large (a scanline or so), it may cause the
+  initial max_memory_to_use specification to be exceeded, which in the worst
+  case would result in an out-of-memory failure.
+* do_block_smoothing can be changed before each call to jpeg_start_output().
+  This setting is relevant only when decoding a progressive JPEG image.
+  During the first DC-only scan, block smoothing provides a very "fuzzy" look
+  instead of the very "blocky" look seen without it; which is better seems a
+  matter of personal taste.  But block smoothing is nearly always a win
+  during later stages, especially when decoding a successive-approximation
+  image: smoothing helps to hide the slight blockiness that otherwise shows
+  up on smooth gradients until the lowest coefficient bits are sent.
+* Color quantization mode can be changed under the rules described below.
+  You *cannot* change between full-color and quantized output (because that
+  would alter the required I/O buffer sizes), but you can change which
+  quantization method is used.
+
+When generating color-quantized output, changing quantization method is a
+very useful way of switching between high-speed and high-quality display.
+The library allows you to change among its three quantization methods:
+1. Single-pass quantization to a fixed color cube.
+   Selected by cinfo.two_pass_quantize = FALSE and cinfo.colormap = NULL.
+2. Single-pass quantization to an application-supplied colormap.
+   Selected by setting cinfo.colormap to point to the colormap (the value of
+   two_pass_quantize is ignored); also set cinfo.actual_number_of_colors.
+3. Two-pass quantization to a colormap chosen specifically for the image.
+   Selected by cinfo.two_pass_quantize = TRUE and cinfo.colormap = NULL.
+   (This is the default setting selected by jpeg_read_header, but it is
+   probably NOT what you want for the first pass of progressive display!)
+These methods offer successively better quality and lesser speed.  However,
+only the first method is available for quantizing in non-RGB color spaces.
+
+IMPORTANT: because the different quantizer methods have very different
+working-storage requirements, the library requires you to indicate which
+one(s) you intend to use before you call jpeg_start_decompress().  (If we did
+not require this, the max_memory_to_use setting would be a complete fiction.)
+You do this by setting one or more of these three cinfo fields to TRUE:
+        enable_1pass_quant              Fixed color cube colormap
+        enable_external_quant           Externally-supplied colormap
+        enable_2pass_quant              Two-pass custom colormap
+All three are initialized to FALSE by jpeg_read_header().  However,
+jpeg_start_decompress() automatically sets to TRUE the flag selected by the
+current two_pass_quantize and colormap settings, so you only need to set the
+enable flags for any other quantization methods you plan to change to later.
+
+After setting the enable flags correctly at jpeg_start_decompress() time, you
+can change to any enabled quantization method by setting two_pass_quantize
+and colormap properly just before calling jpeg_start_output().  The following
+special rules apply:
+1. You must explicitly set cinfo.colormap to NULL when switching to 1-pass
+   or 2-pass mode from a different mode, or when you want the 2-pass
+   quantizer to be re-run to generate a new colormap.
+2. To switch to an external colormap, or to change to a different external
+   colormap than was used on the prior pass, you must call
+   jpeg_new_colormap() after setting cinfo.colormap.
+NOTE: if you want to use the same colormap as was used in the prior pass,
+you should not do either of these things.  This will save some nontrivial
+switchover costs.
+(These requirements exist because cinfo.colormap will always be non-NULL
+after completing a prior output pass, since both the 1-pass and 2-pass
+quantizers set it to point to their output colormaps.  Thus you have to
+do one of these two things to notify the library that something has changed.
+Yup, it's a bit klugy, but it's necessary to do it this way for backwards
+compatibility.)
+
+Note that in buffered-image mode, the library generates any requested colormap
+during jpeg_start_output(), not during jpeg_start_decompress().
+
+When using two-pass quantization, jpeg_start_output() makes a pass over the
+buffered image to determine the optimum color map; it therefore may take a
+significant amount of time, whereas ordinarily it does little work.  The
+progress monitor hook is called during this pass, if defined.  It is also
+important to realize that if the specified target scan number is greater than
+or equal to the current input scan number, jpeg_start_output() will attempt
+to consume input as it makes this pass.  If you use a suspending data source,
+you need to check for a FALSE return from jpeg_start_output() under these
+conditions.  The combination of 2-pass quantization and a not-yet-fully-read
+target scan is the only case in which jpeg_start_output() will consume input.
+
+
+Application authors who support buffered-image mode may be tempted to use it
+for all JPEG images, even single-scan ones.  This will work, but it is
+inefficient: there is no need to create an image-sized coefficient buffer for
+single-scan images.  Requesting buffered-image mode for such an image wastes
+memory.  Worse, it can cost time on large images, since the buffered data has
+to be swapped out or written to a temporary file.  If you are concerned about
+maximum performance on baseline JPEG files, you should use buffered-image
+mode only when the incoming file actually has multiple scans.  This can be
+tested by calling jpeg_has_multiple_scans(), which will return a correct
+result at any time after jpeg_read_header() completes.
+
+It is also worth noting that when you use jpeg_consume_input() to let input
+processing get ahead of output processing, the resulting pattern of access to
+the coefficient buffer is quite nonsequential.  It's best to use the memory
+manager jmemnobs.c if you can (i.e., if you have enough real or virtual main
+memory).  If not, at least make sure that max_memory_to_use is set as high as
+possible.  If the JPEG memory manager has to use a temporary file, you will
+probably see a lot of disk traffic and poor performance.  (This could be
+improved with additional work on the memory manager, but we haven't gotten
+around to it yet.)
+
+In some applications it may be convenient to use jpeg_consume_input() for all
+input processing, including reading the initial markers; that is, you may
+wish to call jpeg_consume_input() instead of jpeg_read_header() during
+startup.  This works, but note that you must check for JPEG_REACHED_SOS and
+JPEG_REACHED_EOI return codes as the equivalent of jpeg_read_header's codes.
+Once the first SOS marker has been reached, you must call
+jpeg_start_decompress() before jpeg_consume_input() will consume more input;
+it'll just keep returning JPEG_REACHED_SOS until you do.  If you read a
+tables-only file this way, jpeg_consume_input() will return JPEG_REACHED_EOI
+without ever returning JPEG_REACHED_SOS; be sure to check for this case.
+If this happens, the decompressor will not read any more input until you call
+jpeg_abort() to reset it.  It is OK to call jpeg_consume_input() even when not
+using buffered-image mode, but in that case it's basically a no-op after the
+initial markers have been read: it will just return JPEG_SUSPENDED.
+
+
+Abbreviated datastreams and multiple images
+-------------------------------------------
+
+A JPEG compression or decompression object can be reused to process multiple
+images.  This saves a small amount of time per image by eliminating the
+"create" and "destroy" operations, but that isn't the real purpose of the
+feature.  Rather, reuse of an object provides support for abbreviated JPEG
+datastreams.  Object reuse can also simplify processing a series of images in
+a single input or output file.  This section explains these features.
+
+A JPEG file normally contains several hundred bytes worth of quantization
+and Huffman tables.  In a situation where many images will be stored or
+transmitted with identical tables, this may represent an annoying overhead.
+The JPEG standard therefore permits tables to be omitted.  The standard
+defines three classes of JPEG datastreams:
+  * "Interchange" datastreams contain an image and all tables needed to decode
+    the image.  These are the usual kind of JPEG file.
+  * "Abbreviated image" datastreams contain an image, but are missing some or
+    all of the tables needed to decode that image.
+  * "Abbreviated table specification" (henceforth "tables-only") datastreams
+    contain only table specifications.
+To decode an abbreviated image, it is necessary to load the missing table(s)
+into the decoder beforehand.  This can be accomplished by reading a separate
+tables-only file.  A variant scheme uses a series of images in which the first
+image is an interchange (complete) datastream, while subsequent ones are
+abbreviated and rely on the tables loaded by the first image.  It is assumed
+that once the decoder has read a table, it will remember that table until a
+new definition for the same table number is encountered.
+
+It is the application designer's responsibility to figure out how to associate
+the correct tables with an abbreviated image.  While abbreviated datastreams
+can be useful in a closed environment, their use is strongly discouraged in
+any situation where data exchange with other applications might be needed.
+Caveat designer.
+
+The JPEG library provides support for reading and writing any combination of
+tables-only datastreams and abbreviated images.  In both compression and
+decompression objects, a quantization or Huffman table will be retained for
+the lifetime of the object, unless it is overwritten by a new table definition.
+
+
+To create abbreviated image datastreams, it is only necessary to tell the
+compressor not to emit some or all of the tables it is using.  Each
+quantization and Huffman table struct contains a boolean field "sent_table",
+which normally is initialized to FALSE.  For each table used by the image, the
+header-writing process emits the table and sets sent_table = TRUE unless it is
+already TRUE.  (In normal usage, this prevents outputting the same table
+definition multiple times, as would otherwise occur because the chroma
+components typically share tables.)  Thus, setting this field to TRUE before
+calling jpeg_start_compress() will prevent the table from being written at
+all.
+
+If you want to create a "pure" abbreviated image file containing no tables,
+just call "jpeg_suppress_tables(&cinfo, TRUE)" after constructing all the
+tables.  If you want to emit some but not all tables, you'll need to set the
+individual sent_table fields directly.
+
+To create an abbreviated image, you must also call jpeg_start_compress()
+with a second parameter of FALSE, not TRUE.  Otherwise jpeg_start_compress()
+will force all the sent_table fields to FALSE.  (This is a safety feature to
+prevent abbreviated images from being created accidentally.)
+
+To create a tables-only file, perform the same parameter setup that you
+normally would, but instead of calling jpeg_start_compress() and so on, call
+jpeg_write_tables(&cinfo).  This will write an abbreviated datastream
+containing only SOI, DQT and/or DHT markers, and EOI.  All the quantization
+and Huffman tables that are currently defined in the compression object will
+be emitted unless their sent_table flag is already TRUE, and then all the
+sent_table flags will be set TRUE.
+
+A sure-fire way to create matching tables-only and abbreviated image files
+is to proceed as follows:
+
+        create JPEG compression object
+        set JPEG parameters
+        set destination to tables-only file
+        jpeg_write_tables(&cinfo);
+        set destination to image file
+        jpeg_start_compress(&cinfo, FALSE);
+        write data...
+        jpeg_finish_compress(&cinfo);
+
+Since the JPEG parameters are not altered between writing the table file and
+the abbreviated image file, the same tables are sure to be used.  Of course,
+you can repeat the jpeg_start_compress() ... jpeg_finish_compress() sequence
+many times to produce many abbreviated image files matching the table file.
+
+You cannot suppress output of the computed Huffman tables when Huffman
+optimization is selected.  (If you could, there'd be no way to decode the
+image...)  Generally, you don't want to set optimize_coding = TRUE when
+you are trying to produce abbreviated files.
+
+In some cases you might want to compress an image using tables which are
+not stored in the application, but are defined in an interchange or
+tables-only file readable by the application.  This can be done by setting up
+a JPEG decompression object to read the specification file, then copying the
+tables into your compression object.  See jpeg_copy_critical_parameters()
+for an example of copying quantization tables.
+
+
+To read abbreviated image files, you simply need to load the proper tables
+into the decompression object before trying to read the abbreviated image.
+If the proper tables are stored in the application program, you can just
+allocate the table structs and fill in their contents directly.  For example,
+to load a fixed quantization table into table slot "n":
+
+    if (cinfo.quant_tbl_ptrs[n] == NULL)
+      cinfo.quant_tbl_ptrs[n] = jpeg_alloc_quant_table((j_common_ptr) &cinfo);
+    quant_ptr = cinfo.quant_tbl_ptrs[n];        /* quant_ptr is JQUANT_TBL* */
+    for (i = 0; i < 64; i++) {
+      /* Qtable[] is desired quantization table, in natural array order */
+      quant_ptr->quantval[i] = Qtable[i];
+    }
+
+Code to load a fixed Huffman table is typically (for AC table "n"):
+
+    if (cinfo.ac_huff_tbl_ptrs[n] == NULL)
+      cinfo.ac_huff_tbl_ptrs[n] = jpeg_alloc_huff_table((j_common_ptr) &cinfo);
+    huff_ptr = cinfo.ac_huff_tbl_ptrs[n];       /* huff_ptr is JHUFF_TBL* */
+    for (i = 1; i <= 16; i++) {
+      /* counts[i] is number of Huffman codes of length i bits, i=1..16 */
+      huff_ptr->bits[i] = counts[i];
+    }
+    for (i = 0; i < 256; i++) {
+      /* symbols[] is the list of Huffman symbols, in code-length order */
+      huff_ptr->huffval[i] = symbols[i];
+    }
+
+(Note that trying to set cinfo.quant_tbl_ptrs[n] to point directly at a
+constant JQUANT_TBL object is not safe.  If the incoming file happened to
+contain a quantization table definition, your master table would get
+overwritten!  Instead allocate a working table copy and copy the master table
+into it, as illustrated above.  Ditto for Huffman tables, of course.)
+
+You might want to read the tables from a tables-only file, rather than
+hard-wiring them into your application.  The jpeg_read_header() call is
+sufficient to read a tables-only file.  You must pass a second parameter of
+FALSE to indicate that you do not require an image to be present.  Thus, the
+typical scenario is
+
+        create JPEG decompression object
+        set source to tables-only file
+        jpeg_read_header(&cinfo, FALSE);
+        set source to abbreviated image file
+        jpeg_read_header(&cinfo, TRUE);
+        set decompression parameters
+        jpeg_start_decompress(&cinfo);
+        read data...
+        jpeg_finish_decompress(&cinfo);
+
+In some cases, you may want to read a file without knowing whether it contains
+an image or just tables.  In that case, pass FALSE and check the return value
+from jpeg_read_header(): it will be JPEG_HEADER_OK if an image was found,
+JPEG_HEADER_TABLES_ONLY if only tables were found.  (A third return value,
+JPEG_SUSPENDED, is possible when using a suspending data source manager.)
+Note that jpeg_read_header() will not complain if you read an abbreviated
+image for which you haven't loaded the missing tables; the missing-table check
+occurs later, in jpeg_start_decompress().
+
+
+It is possible to read a series of images from a single source file by
+repeating the jpeg_read_header() ... jpeg_finish_decompress() sequence,
+without releasing/recreating the JPEG object or the data source module.
+(If you did reinitialize, any partial bufferload left in the data source
+buffer at the end of one image would be discarded, causing you to lose the
+start of the next image.)  When you use this method, stored tables are
+automatically carried forward, so some of the images can be abbreviated images
+that depend on tables from earlier images.
+
+If you intend to write a series of images into a single destination file,
+you might want to make a specialized data destination module that doesn't
+flush the output buffer at term_destination() time.  This would speed things
+up by some trifling amount.  Of course, you'd need to remember to flush the
+buffer after the last image.  You can make the later images be abbreviated
+ones by passing FALSE to jpeg_start_compress().
+
+
+Special markers
+---------------
+
+Some applications may need to insert or extract special data in the JPEG
+datastream.  The JPEG standard provides marker types "COM" (comment) and
+"APP0" through "APP15" (application) to hold application-specific data.
+Unfortunately, the use of these markers is not specified by the standard.
+COM markers are fairly widely used to hold user-supplied text.  The JFIF file
+format spec uses APP0 markers with specified initial strings to hold certain
+data.  Adobe applications use APP14 markers beginning with the string "Adobe"
+for miscellaneous data.  Other APPn markers are rarely seen, but might
+contain almost anything.
+
+If you wish to store user-supplied text, we recommend you use COM markers
+and place readable 7-bit ASCII text in them.  Newline conventions are not
+standardized --- expect to find LF (Unix style), CR/LF (DOS style), or CR
+(Mac style).  A robust COM reader should be able to cope with random binary
+garbage, including nulls, since some applications generate COM markers
+containing non-ASCII junk.  (But yours should not be one of them.)
+
+For program-supplied data, use an APPn marker, and be sure to begin it with an
+identifying string so that you can tell whether the marker is actually yours.
+It's probably best to avoid using APP0 or APP14 for any private markers.
+(NOTE: the upcoming SPIFF standard will use APP8 markers; we recommend you
+not use APP8 markers for any private purposes, either.)
+
+Keep in mind that at most 65533 bytes can be put into one marker, but you
+can have as many markers as you like.
+
+By default, the IJG compression library will write a JFIF APP0 marker if the
+selected JPEG colorspace is grayscale or YCbCr, or an Adobe APP14 marker if
+the selected colorspace is RGB, CMYK, or YCCK.  You can disable this, but
+we don't recommend it.  The decompression library will recognize JFIF and
+Adobe markers and will set the JPEG colorspace properly when one is found.
+
+
+You can write special markers immediately following the datastream header by
+calling jpeg_write_marker() after jpeg_start_compress() and before the first
+call to jpeg_write_scanlines().  When you do this, the markers appear after
+the SOI and the JFIF APP0 and Adobe APP14 markers (if written), but before
+all else.  Specify the marker type parameter as "JPEG_COM" for COM or
+"JPEG_APP0 + n" for APPn.  (Actually, jpeg_write_marker will let you write
+any marker type, but we don't recommend writing any other kinds of marker.)
+For example, to write a user comment string pointed to by comment_text:
+        jpeg_write_marker(cinfo, JPEG_COM, comment_text, strlen(comment_text));
+
+If it's not convenient to store all the marker data in memory at once,
+you can instead call jpeg_write_m_header() followed by multiple calls to
+jpeg_write_m_byte().  If you do it this way, it's your responsibility to
+call jpeg_write_m_byte() exactly the number of times given in the length
+parameter to jpeg_write_m_header().  (This method lets you empty the
+output buffer partway through a marker, which might be important when
+using a suspending data destination module.  In any case, if you are using
+a suspending destination, you should flush its buffer after inserting
+any special markers.  See "I/O suspension".)
+
+Or, if you prefer to synthesize the marker byte sequence yourself,
+you can just cram it straight into the data destination module.
+
+If you are writing JFIF 1.02 extension markers (thumbnail images), don't
+forget to set cinfo.JFIF_minor_version = 2 so that the encoder will write the
+correct JFIF version number in the JFIF header marker.  The library's default
+is to write version 1.01, but that's wrong if you insert any 1.02 extension
+markers.  (We could probably get away with just defaulting to 1.02, but there
+used to be broken decoders that would complain about unknown minor version
+numbers.  To reduce compatibility risks it's safest not to write 1.02 unless
+you are actually using 1.02 extensions.)
+
+
+When reading, two methods of handling special markers are available:
+1. You can ask the library to save the contents of COM and/or APPn markers
+into memory, and then examine them at your leisure afterwards.
+2. You can supply your own routine to process COM and/or APPn markers
+on-the-fly as they are read.
+The first method is simpler to use, especially if you are using a suspending
+data source; writing a marker processor that copes with input suspension is
+not easy (consider what happens if the marker is longer than your available
+input buffer).  However, the second method conserves memory since the marker
+data need not be kept around after it's been processed.
+
+For either method, you'd normally set up marker handling after creating a
+decompression object and before calling jpeg_read_header(), because the
+markers of interest will typically be near the head of the file and so will
+be scanned by jpeg_read_header.  Once you've established a marker handling
+method, it will be used for the life of that decompression object
+(potentially many datastreams), unless you change it.  Marker handling is
+determined separately for COM markers and for each APPn marker code.
+
+
+To save the contents of special markers in memory, call
+        jpeg_save_markers(cinfo, marker_code, length_limit)
+where marker_code is the marker type to save, JPEG_COM or JPEG_APP0+n.
+(To arrange to save all the special marker types, you need to call this
+routine 17 times, for COM and APP0-APP15.)  If the incoming marker is longer
+than length_limit data bytes, only length_limit bytes will be saved; this
+parameter allows you to avoid chewing up memory when you only need to see the
+first few bytes of a potentially large marker.  If you want to save all the
+data, set length_limit to 0xFFFF; that is enough since marker lengths are only
+16 bits.  As a special case, setting length_limit to 0 prevents that marker
+type from being saved at all.  (That is the default behavior, in fact.)
+
+After jpeg_read_header() completes, you can examine the special markers by
+following the cinfo->marker_list pointer chain.  All the special markers in
+the file appear in this list, in order of their occurrence in the file (but
+omitting any markers of types you didn't ask for).  Both the original data
+length and the saved data length are recorded for each list entry; the latter
+will not exceed length_limit for the particular marker type.  Note that these
+lengths exclude the marker length word, whereas the stored representation
+within the JPEG file includes it.  (Hence the maximum data length is really
+only 65533.)
+
+It is possible that additional special markers appear in the file beyond the
+SOS marker at which jpeg_read_header stops; if so, the marker list will be
+extended during reading of the rest of the file.  This is not expected to be
+common, however.  If you are short on memory you may want to reset the length
+limit to zero for all marker types after finishing jpeg_read_header, to
+ensure that the max_memory_to_use setting cannot be exceeded due to addition
+of later markers.
+
+The marker list remains stored until you call jpeg_finish_decompress or
+jpeg_abort, at which point the memory is freed and the list is set to empty.
+(jpeg_destroy also releases the storage, of course.)
+
+Note that the library is internally interested in APP0 and APP14 markers;
+if you try to set a small nonzero length limit on these types, the library
+will silently force the length up to the minimum it wants.  (But you can set
+a zero length limit to prevent them from being saved at all.)  Also, in a
+16-bit environment, the maximum length limit may be constrained to less than
+65533 by malloc() limitations.  It is therefore best not to assume that the
+effective length limit is exactly what you set it to be.
+
+
+If you want to supply your own marker-reading routine, you do it by calling
+jpeg_set_marker_processor().  A marker processor routine must have the
+signature
+        boolean jpeg_marker_parser_method (j_decompress_ptr cinfo)
+Although the marker code is not explicitly passed, the routine can find it
+in cinfo->unread_marker.  At the time of call, the marker proper has been
+read from the data source module.  The processor routine is responsible for
+reading the marker length word and the remaining parameter bytes, if any.
+Return TRUE to indicate success.  (FALSE should be returned only if you are
+using a suspending data source and it tells you to suspend.  See the standard
+marker processors in jdmarker.c for appropriate coding methods if you need to
+use a suspending data source.)
+
+If you override the default APP0 or APP14 processors, it is up to you to
+recognize JFIF and Adobe markers if you want colorspace recognition to occur
+properly.  We recommend copying and extending the default processors if you
+want to do that.  (A better idea is to save these marker types for later
+examination by calling jpeg_save_markers(); that method doesn't interfere
+with the library's own processing of these markers.)
+
+jpeg_set_marker_processor() and jpeg_save_markers() are mutually exclusive
+--- if you call one it overrides any previous call to the other, for the
+particular marker type specified.
+
+A simple example of an external COM processor can be found in djpeg.c.
+Also, see jpegtran.c for an example of using jpeg_save_markers.
+
+
+ICC profiles
+------------
+
+Two functions are provided for writing and reading International Color
+Consortium (ICC) device profiles embedded in JFIF JPEG image files:
+
+        void jpeg_write_icc_profile (j_compress_ptr cinfo,
+                                     const JOCTET *icc_data_ptr,
+                                     unsigned int icc_data_len);
+        boolean jpeg_read_icc_profile (j_decompress_ptr cinfo,
+                                       JOCTET **icc_data_ptr,
+                                       unsigned int *icc_data_len);
+
+The ICC has defined a standard for including such data in JPEG "APP2" markers.
+The aforementioned functions do not know anything about the internal structure
+of the ICC profile data; they just know how to embed the profile data into a
+JPEG file while writing it, or to extract the profile data from a JPEG file
+while reading it.
+
+jpeg_write_icc_profile() must be called after calling jpeg_start_compress() and
+before the first call to jpeg_write_scanlines() or jpeg_write_raw_data().  This
+ordering ensures that the APP2 marker(s) will appear after the SOI and JFIF or
+Adobe markers, but before all other data.
+
+jpeg_read_icc_profile() returns TRUE if an ICC profile was found and FALSE
+otherwise.  If an ICC profile was found, then the function will allocate a
+memory region containing the profile and will return a pointer to that memory
+region in *icc_data_ptr, as well as the length of the region in *icc_data_len.
+This memory region is allocated by the library using malloc() and must be freed
+by the caller using free() when the memory region is no longer needed.  Callers
+wishing to use jpeg_read_icc_profile() must call
+
+        jpeg_save_markers(cinfo, JPEG_APP0 + 2, 0xFFFF);
+
+prior to calling jpeg_read_header().  jpeg_read_icc_profile() can be called at
+any point between jpeg_read_header() and jpeg_finish_decompress().
+
+
+Raw (downsampled) image data
+----------------------------
+
+Some applications need to supply already-downsampled image data to the JPEG
+compressor, or to receive raw downsampled data from the decompressor.  The
+library supports this requirement by allowing the application to write or
+read raw data, bypassing the normal preprocessing or postprocessing steps.
+The interface is different from the standard one and is somewhat harder to
+use.  If your interest is merely in bypassing color conversion, we recommend
+that you use the standard interface and simply set jpeg_color_space =
+in_color_space (or jpeg_color_space = out_color_space for decompression).
+The mechanism described in this section is necessary only to supply or
+receive downsampled image data, in which not all components have the same
+dimensions.
+
+
+To compress raw data, you must supply the data in the colorspace to be used
+in the JPEG file (please read the earlier section on Special color spaces)
+and downsampled to the sampling factors specified in the JPEG parameters.
+You must supply the data in the format used internally by the JPEG library,
+namely a JSAMPIMAGE array.  This is an array of pointers to two-dimensional
+arrays, each of type JSAMPARRAY.  Each 2-D array holds the values for one
+color component.  This structure is necessary since the components are of
+different sizes.  If the image dimensions are not a multiple of the MCU size,
+you must also pad the data correctly (usually, this is done by replicating
+the last column and/or row).  The data must be padded to a multiple of a DCT
+block in each component: that is, each downsampled row must contain a
+multiple of 8 valid samples, and there must be a multiple of 8 sample rows
+for each component.  (For applications such as conversion of digital TV
+images, the standard image size is usually a multiple of the DCT block size,
+so that no padding need actually be done.)
+
+The procedure for compression of raw data is basically the same as normal
+compression, except that you call jpeg_write_raw_data() in place of
+jpeg_write_scanlines().  Before calling jpeg_start_compress(), you must do
+the following:
+  * Set cinfo->raw_data_in to TRUE.  (It is set FALSE by jpeg_set_defaults().)
+    This notifies the library that you will be supplying raw data.
+  * Ensure jpeg_color_space is correct --- an explicit jpeg_set_colorspace()
+    call is a good idea.  Note that since color conversion is bypassed,
+    in_color_space is ignored, except that jpeg_set_defaults() uses it to
+    choose the default jpeg_color_space setting.
+  * Ensure the sampling factors, cinfo->comp_info[i].h_samp_factor and
+    cinfo->comp_info[i].v_samp_factor, are correct.  Since these indicate the
+    dimensions of the data you are supplying, it's wise to set them
+    explicitly, rather than assuming the library's defaults are what you want.
+
+To pass raw data to the library, call jpeg_write_raw_data() in place of
+jpeg_write_scanlines().  The two routines work similarly except that
+jpeg_write_raw_data takes a JSAMPIMAGE data array rather than JSAMPARRAY.
+The scanlines count passed to and returned from jpeg_write_raw_data is
+measured in terms of the component with the largest v_samp_factor.
+
+jpeg_write_raw_data() processes one MCU row per call, which is to say
+v_samp_factor*DCTSIZE sample rows of each component.  The passed num_lines
+value must be at least max_v_samp_factor*DCTSIZE, and the return value will
+be exactly that amount (or possibly some multiple of that amount, in future
+library versions).  This is true even on the last call at the bottom of the
+image; don't forget to pad your data as necessary.
+
+The required dimensions of the supplied data can be computed for each
+component as
+        cinfo->comp_info[i].width_in_blocks*DCTSIZE  samples per row
+        cinfo->comp_info[i].height_in_blocks*DCTSIZE rows in image
+after jpeg_start_compress() has initialized those fields.  If the valid data
+is smaller than this, it must be padded appropriately.  For some sampling
+factors and image sizes, additional dummy DCT blocks are inserted to make
+the image a multiple of the MCU dimensions.  The library creates such dummy
+blocks itself; it does not read them from your supplied data.  Therefore you
+need never pad by more than DCTSIZE samples.  An example may help here.
+Assume 2h2v downsampling of YCbCr data, that is
+        cinfo->comp_info[0].h_samp_factor = 2           for Y
+        cinfo->comp_info[0].v_samp_factor = 2
+        cinfo->comp_info[1].h_samp_factor = 1           for Cb
+        cinfo->comp_info[1].v_samp_factor = 1
+        cinfo->comp_info[2].h_samp_factor = 1           for Cr
+        cinfo->comp_info[2].v_samp_factor = 1
+and suppose that the nominal image dimensions (cinfo->image_width and
+cinfo->image_height) are 101x101 pixels.  Then jpeg_start_compress() will
+compute downsampled_width = 101 and width_in_blocks = 13 for Y,
+downsampled_width = 51 and width_in_blocks = 7 for Cb and Cr (and the same
+for the height fields).  You must pad the Y data to at least 13*8 = 104
+columns and rows, the Cb/Cr data to at least 7*8 = 56 columns and rows.  The
+MCU height is max_v_samp_factor = 2 DCT rows so you must pass at least 16
+scanlines on each call to jpeg_write_raw_data(), which is to say 16 actual
+sample rows of Y and 8 each of Cb and Cr.  A total of 7 MCU rows are needed,
+so you must pass a total of 7*16 = 112 "scanlines".  The last DCT block row
+of Y data is dummy, so it doesn't matter what you pass for it in the data
+arrays, but the scanlines count must total up to 112 so that all of the Cb
+and Cr data gets passed.
+
+Output suspension is supported with raw-data compression: if the data
+destination module suspends, jpeg_write_raw_data() will return 0.
+In this case the same data rows must be passed again on the next call.
+
+
+Decompression with raw data output implies bypassing all postprocessing:
+you cannot ask for rescaling or color quantization, for instance.  More
+seriously, you must deal with the color space and sampling factors present in
+the incoming file.  If your application only handles, say, 2h1v YCbCr data,
+you must check for and fail on other color spaces or other sampling factors.
+The library will not convert to a different color space for you.
+
+To obtain raw data output, set cinfo->raw_data_out = TRUE before
+jpeg_start_decompress() (it is set FALSE by jpeg_read_header()).  Be sure to
+verify that the color space and sampling factors are ones you can handle.
+Then call jpeg_read_raw_data() in place of jpeg_read_scanlines().  The
+decompression process is otherwise the same as usual.
+
+jpeg_read_raw_data() returns one MCU row per call, and thus you must pass a
+buffer of at least max_v_samp_factor*DCTSIZE scanlines (scanline counting is
+the same as for raw-data compression).  The buffer you pass must be large
+enough to hold the actual data plus padding to DCT-block boundaries.  As with
+compression, any entirely dummy DCT blocks are not processed so you need not
+allocate space for them, but the total scanline count includes them.  The
+above example of computing buffer dimensions for raw-data compression is
+equally valid for decompression.
+
+Input suspension is supported with raw-data decompression: if the data source
+module suspends, jpeg_read_raw_data() will return 0.  You can also use
+buffered-image mode to read raw data in multiple passes.
+
+
+Really raw data: DCT coefficients
+---------------------------------
+
+It is possible to read or write the contents of a JPEG file as raw DCT
+coefficients.  This facility is mainly intended for use in lossless
+transcoding between different JPEG file formats.  Other possible applications
+include lossless cropping of a JPEG image, lossless reassembly of a
+multi-strip or multi-tile TIFF/JPEG file into a single JPEG datastream, etc.
+
+To read the contents of a JPEG file as DCT coefficients, open the file and do
+jpeg_read_header() as usual.  But instead of calling jpeg_start_decompress()
+and jpeg_read_scanlines(), call jpeg_read_coefficients().  This will read the
+entire image into a set of virtual coefficient-block arrays, one array per
+component.  The return value is a pointer to an array of virtual-array
+descriptors.  Each virtual array can be accessed directly using the JPEG
+memory manager's access_virt_barray method (see Memory management, below,
+and also read structure.txt's discussion of virtual array handling).  Or,
+for simple transcoding to a different JPEG file format, the array list can
+just be handed directly to jpeg_write_coefficients().
+
+Each block in the block arrays contains quantized coefficient values in
+normal array order (not JPEG zigzag order).  The block arrays contain only
+DCT blocks containing real data; any entirely-dummy blocks added to fill out
+interleaved MCUs at the right or bottom edges of the image are discarded
+during reading and are not stored in the block arrays.  (The size of each
+block array can be determined from the width_in_blocks and height_in_blocks
+fields of the component's comp_info entry.)  This is also the data format
+expected by jpeg_write_coefficients().
+
+When you are done using the virtual arrays, call jpeg_finish_decompress()
+to release the array storage and return the decompression object to an idle
+state; or just call jpeg_destroy() if you don't need to reuse the object.
+
+If you use a suspending data source, jpeg_read_coefficients() will return
+NULL if it is forced to suspend; a non-NULL return value indicates successful
+completion.  You need not test for a NULL return value when using a
+non-suspending data source.
+
+It is also possible to call jpeg_read_coefficients() to obtain access to the
+decoder's coefficient arrays during a normal decode cycle in buffered-image
+mode.  This feature might be useful for progressively displaying an incoming
+image and then re-encoding it without loss.  To do this, decode in buffered-
+image mode as discussed previously, then call jpeg_read_coefficients() after
+the last jpeg_finish_output() call.  The arrays will be available for your use
+until you call jpeg_finish_decompress().
+
+
+To write the contents of a JPEG file as DCT coefficients, you must provide
+the DCT coefficients stored in virtual block arrays.  You can either pass
+block arrays read from an input JPEG file by jpeg_read_coefficients(), or
+allocate virtual arrays from the JPEG compression object and fill them
+yourself.  In either case, jpeg_write_coefficients() is substituted for
+jpeg_start_compress() and jpeg_write_scanlines().  Thus the sequence is
+  * Create compression object
+  * Set all compression parameters as necessary
+  * Request virtual arrays if needed
+  * jpeg_write_coefficients()
+  * jpeg_finish_compress()
+  * Destroy or re-use compression object
+jpeg_write_coefficients() is passed a pointer to an array of virtual block
+array descriptors; the number of arrays is equal to cinfo.num_components.
+
+The virtual arrays need only have been requested, not realized, before
+jpeg_write_coefficients() is called.  A side-effect of
+jpeg_write_coefficients() is to realize any virtual arrays that have been
+requested from the compression object's memory manager.  Thus, when obtaining
+the virtual arrays from the compression object, you should fill the arrays
+after calling jpeg_write_coefficients().  The data is actually written out
+when you call jpeg_finish_compress(); jpeg_write_coefficients() only writes
+the file header.
+
+When writing raw DCT coefficients, it is crucial that the JPEG quantization
+tables and sampling factors match the way the data was encoded, or the
+resulting file will be invalid.  For transcoding from an existing JPEG file,
+we recommend using jpeg_copy_critical_parameters().  This routine initializes
+all the compression parameters to default values (like jpeg_set_defaults()),
+then copies the critical information from a source decompression object.
+The decompression object should have just been used to read the entire
+JPEG input file --- that is, it should be awaiting jpeg_finish_decompress().
+
+jpeg_write_coefficients() marks all tables stored in the compression object
+as needing to be written to the output file (thus, it acts like
+jpeg_start_compress(cinfo, TRUE)).  This is for safety's sake, to avoid
+emitting abbreviated JPEG files by accident.  If you really want to emit an
+abbreviated JPEG file, call jpeg_suppress_tables(), or set the tables'
+individual sent_table flags, between calling jpeg_write_coefficients() and
+jpeg_finish_compress().
+
+
+Progress monitoring
+-------------------
+
+Some applications may need to regain control from the JPEG library every so
+often.  The typical use of this feature is to produce a percent-done bar or
+other progress display.  (For a simple example, see cjpeg.c or djpeg.c.)
+Although you do get control back frequently during the data-transferring pass
+(the jpeg_read_scanlines or jpeg_write_scanlines loop), any additional passes
+will occur inside jpeg_finish_compress or jpeg_start_decompress; those
+routines may take a long time to execute, and you don't get control back
+until they are done.
+
+You can define a progress-monitor routine which will be called periodically
+by the library.  No guarantees are made about how often this call will occur,
+so we don't recommend you use it for mouse tracking or anything like that.
+At present, a call will occur once per MCU row, scanline, or sample row
+group, whichever unit is convenient for the current processing mode; so the
+wider the image, the longer the time between calls.  During the data
+transferring pass, only one call occurs per call of jpeg_read_scanlines or
+jpeg_write_scanlines, so don't pass a large number of scanlines at once if
+you want fine resolution in the progress count.  (If you really need to use
+the callback mechanism for time-critical tasks like mouse tracking, you could
+insert additional calls inside some of the library's inner loops.)
+
+To establish a progress-monitor callback, create a struct jpeg_progress_mgr,
+fill in its progress_monitor field with a pointer to your callback routine,
+and set cinfo->progress to point to the struct.  The callback will be called
+whenever cinfo->progress is non-NULL.  (This pointer is set to NULL by
+jpeg_create_compress or jpeg_create_decompress; the library will not change
+it thereafter.  So if you allocate dynamic storage for the progress struct,
+make sure it will live as long as the JPEG object does.  Allocating from the
+JPEG memory manager with lifetime JPOOL_PERMANENT will work nicely.)  You
+can use the same callback routine for both compression and decompression.
+
+The jpeg_progress_mgr struct contains four fields which are set by the library:
+        long pass_counter;      /* work units completed in this pass */
+        long pass_limit;        /* total number of work units in this pass */
+        int completed_passes;   /* passes completed so far */
+        int total_passes;       /* total number of passes expected */
+During any one pass, pass_counter increases from 0 up to (not including)
+pass_limit; the step size is usually but not necessarily 1.  The pass_limit
+value may change from one pass to another.  The expected total number of
+passes is in total_passes, and the number of passes already completed is in
+completed_passes.  Thus the fraction of work completed may be estimated as
+                completed_passes + (pass_counter/pass_limit)
+                --------------------------------------------
+                                total_passes
+ignoring the fact that the passes may not be equal amounts of work.
+
+When decompressing, pass_limit can even change within a pass, because it
+depends on the number of scans in the JPEG file, which isn't always known in
+advance.  The computed fraction-of-work-done may jump suddenly (if the library
+discovers it has overestimated the number of scans) or even decrease (in the
+opposite case).  It is not wise to put great faith in the work estimate.
+
+When using the decompressor's buffered-image mode, the progress monitor work
+estimate is likely to be completely unhelpful, because the library has no way
+to know how many output passes will be demanded of it.  Currently, the library
+sets total_passes based on the assumption that there will be one more output
+pass if the input file end hasn't yet been read (jpeg_input_complete() isn't
+TRUE), but no more output passes if the file end has been reached when the
+output pass is started.  This means that total_passes will rise as additional
+output passes are requested.  If you have a way of determining the input file
+size, estimating progress based on the fraction of the file that's been read
+will probably be more useful than using the library's value.
+
+
+Memory management
+-----------------
+
+This section covers some key facts about the JPEG library's built-in memory
+manager.  For more info, please read structure.txt's section about the memory
+manager, and consult the source code if necessary.
+
+All memory and temporary file allocation within the library is done via the
+memory manager.  If necessary, you can replace the "back end" of the memory
+manager to control allocation yourself (for example, if you don't want the
+library to use malloc() and free() for some reason).
+
+Some data is allocated "permanently" and will not be freed until the JPEG
+object is destroyed.  Most data is allocated "per image" and is freed by
+jpeg_finish_compress, jpeg_finish_decompress, or jpeg_abort.  You can call the
+memory manager yourself to allocate structures that will automatically be
+freed at these times.  Typical code for this is
+  ptr = (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, size);
+Use JPOOL_PERMANENT to get storage that lasts as long as the JPEG object.
+Use alloc_large instead of alloc_small for anything bigger than a few Kbytes.
+There are also alloc_sarray and alloc_barray routines that automatically
+build 2-D sample or block arrays.
+
+The library's minimum space requirements to process an image depend on the
+image's width, but not on its height, because the library ordinarily works
+with "strip" buffers that are as wide as the image but just a few rows high.
+Some operating modes (e.g., two-pass color quantization) require full-image
+buffers.  Such buffers are treated as "virtual arrays": only the current strip
+need be in memory, and the rest can be swapped out to a temporary file.
+
+When using temporary files, the library will make the in-memory buffers for
+its virtual arrays just big enough to stay within a "maximum memory" setting.
+Your application can set this limit by setting cinfo->mem->max_memory_to_use
+after creating the JPEG object.  (Of course, there is still a minimum size for
+the buffers, so the max-memory setting is effective only if it is bigger than
+the minimum space needed.)  If you allocate any large structures yourself, you
+must allocate them before jpeg_start_compress() or jpeg_start_decompress() in
+order to have them counted against the max memory limit.  Also keep in mind
+that space allocated with alloc_small() is ignored, on the assumption that
+it's too small to be worth worrying about; so a reasonable safety margin
+should be left when setting max_memory_to_use.
+
+NOTE: Unless you develop your own memory manager back end, temporary files
+will never be used.  The back end provided in libjpeg-turbo (jmemnobs.c) simply
+malloc()s and free()s virtual arrays, and an error occurs if the required
+memory exceeds the limit specified in cinfo->mem->max_memory_to_use.
+
+
+Memory usage
+------------
+
+Working memory requirements while performing compression or decompression
+depend on image dimensions, image characteristics (such as colorspace and
+JPEG process), and operating mode (application-selected options).
+
+As of v6b, the decompressor requires:
+ 1. About 24K in more-or-less-fixed-size data.  This varies a bit depending
+    on operating mode and image characteristics (particularly color vs.
+    grayscale), but it doesn't depend on image dimensions.
+ 2. Strip buffers (of size proportional to the image width) for IDCT and
+    upsampling results.  The worst case for commonly used sampling factors
+    is about 34 bytes * width in pixels for a color image.  A grayscale image
+    only needs about 8 bytes per pixel column.
+ 3. A full-image DCT coefficient buffer is needed to decode a multi-scan JPEG
+    file (including progressive JPEGs), or whenever you select buffered-image
+    mode.  This takes 2 bytes/coefficient.  At typical 2x2 sampling, that's
+    3 bytes per pixel for a color image.  Worst case (1x1 sampling) requires
+    6 bytes/pixel.  For grayscale, figure 2 bytes/pixel.
+ 4. To perform 2-pass color quantization, the decompressor also needs a
+    128K color lookup table and a full-image pixel buffer (3 bytes/pixel).
+This does not count any memory allocated by the application, such as a
+buffer to hold the final output image.
+
+The above figures are valid for 8-bit JPEG data precision and a machine with
+32-bit ints.  For 12-bit JPEG data, double the size of the strip buffers and
+quantization pixel buffer.  The "fixed-size" data will be somewhat smaller
+with 16-bit ints, larger with 64-bit ints.  Also, CMYK or other unusual
+color spaces will require different amounts of space.
+
+The full-image coefficient and pixel buffers, if needed at all, do not have
+to be fully RAM resident; with a custom memory manager back end, the library
+can use temporary files instead when the total memory usage would exceed a
+limit you set.  (But if your OS supports virtual memory, it's probably better
+to just use jmemnobs and let the OS do the swapping.)
+
+The compressor's memory requirements are similar, except that it has no need
+for color quantization.  Also, it needs a full-image DCT coefficient buffer
+if Huffman-table optimization is asked for, even if progressive mode is not
+requested.
+
+If you need more detailed information about memory usage in a particular
+situation, you can enable the MEM_STATS code in jmemmgr.c.
+
+
+Library compile-time options
+----------------------------
+
+A number of compile-time options are available by modifying jmorecfg.h.
+
+The JPEG standard provides for both the baseline 8-bit DCT process and
+a 12-bit DCT process.  The IJG code supports 12-bit lossy JPEG if you define
+BITS_IN_JSAMPLE as 12 rather than 8.  Note that this causes JSAMPLE to be
+larger than a char, so it affects the surrounding application's image data.
+The sample applications cjpeg and djpeg can support 12-bit mode only for PPM
+and GIF file formats; you must disable the other file formats to compile a
+12-bit cjpeg or djpeg.  (install.txt has more information about that.)
+At present, a 12-bit library can handle *only* 12-bit images, not both
+precisions.
+
+Note that a 12-bit library always compresses in Huffman optimization mode,
+in order to generate valid Huffman tables.  This is necessary because our
+default Huffman tables only cover 8-bit data.  If you need to output 12-bit
+files in one pass, you'll have to supply suitable default Huffman tables.
+You may also want to supply your own DCT quantization tables; the existing
+quality-scaling code has been developed for 8-bit use, and probably doesn't
+generate especially good tables for 12-bit.
+
+The maximum number of components (color channels) in the image is determined
+by MAX_COMPONENTS.  The JPEG standard allows up to 255 components, but we
+expect that few applications will need more than four or so.
+
+On machines with unusual data type sizes, you may be able to improve
+performance or reduce memory space by tweaking the various typedefs in
+jmorecfg.h.  In particular, on some RISC CPUs, access to arrays of "short"s
+is quite slow; consider trading memory for speed by making JCOEF, INT16, and
+UINT16 be "int" or "unsigned int".  UINT8 is also a candidate to become int.
+You probably don't want to make JSAMPLE be int unless you have lots of memory
+to burn.
+
+You can reduce the size of the library by compiling out various optional
+functions.  To do this, undefine xxx_SUPPORTED symbols as necessary.
+
+You can also save a few K by not having text error messages in the library;
+the standard error message table occupies about 5 Kbytes.  This is particularly
+reasonable for embedded applications where there's no good way to display
+a message anyway.  To do this, remove the creation of the message table
+(jpeg_std_message_table[]) from jerror.c, and alter format_message to do
+something reasonable without it.  You could output the numeric value of the
+message code number, for example.  If you do this, you can also save a couple
+more K by modifying the TRACEMSn() macros in jerror.h to expand to nothing;
+you don't need trace capability anyway, right?
+
+
+Portability considerations
+--------------------------
+
+The JPEG library has been written to be extremely portable; the sample
+applications cjpeg and djpeg are slightly less so.  This section summarizes
+the design goals in this area.  (If you encounter any bugs that cause the
+library to be less portable than is claimed here, we'd appreciate hearing
+about them.)
+
+The code works fine on ANSI C and C++ compilers, using any of the popular
+system include file setups, and some not-so-popular ones too.
+
+The code is not dependent on the exact sizes of the C data types.  As
+distributed, we make the assumptions that
+        char    is at least 8 bits wide
+        short   is at least 16 bits wide
+        int     is at least 16 bits wide
+        long    is at least 32 bits wide
+(These are the minimum requirements of the ANSI C standard.)  Wider types will
+work fine, although memory may be used inefficiently if char is much larger
+than 8 bits or short is much bigger than 16 bits.  The code should work
+equally well with 16- or 32-bit ints.
+
+In a system where these assumptions are not met, you may be able to make the
+code work by modifying the typedefs in jmorecfg.h.  However, you will probably
+have difficulty if int is less than 16 bits wide, since references to plain
+int abound in the code.
+
+char can be either signed or unsigned, although the code runs faster if an
+unsigned char type is available.  If char is wider than 8 bits, you will need
+to redefine JOCTET and/or provide custom data source/destination managers so
+that JOCTET represents exactly 8 bits of data on external storage.
+
+The JPEG library proper does not assume ASCII representation of characters.
+But some of the image file I/O modules in cjpeg/djpeg do have ASCII
+dependencies in file-header manipulation; so does cjpeg's select_file_type()
+routine.
+
+The JPEG library does not rely heavily on the C library.  In particular, C
+stdio is used only by the data source/destination modules and the error
+handler, all of which are application-replaceable.  (cjpeg/djpeg are more
+heavily dependent on stdio.)  malloc and free are called only from the memory
+manager "back end" module, so you can use a different memory allocator by
+replacing that one file.
+
+More info about porting the code may be gleaned by reading jconfig.txt,
+jmorecfg.h, and jinclude.h.
diff --git a/external/jpeg/rdbmp.c b/external/jpeg/rdbmp.c
index 51af23778f88..358a02676911 100644
--- a/external/jpeg/rdbmp.c
+++ b/external/jpeg/rdbmp.c
@@ -6,13 +6,13 @@
  * Modified 2009-2017 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Modified 2011 by Siarhei Siamashka.
- * Copyright (C) 2015, 2017-2018, D. R. Commander.
+ * Copyright (C) 2015, 2017-2018, 2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains routines to read input images in Microsoft "BMP"
  * format (MS Windows 3.x, OS/2 1.x, and OS/2 2.x flavors).
- * Currently, only 8-bit and 24-bit images are supported, not 1-bit or
+ * Currently, only 8-, 24-, and 32-bit images are supported, not 1-bit or
  * 4-bit (feeding such low-depth images into JPEG would be silly anyway).
  * Also, we don't support RLE-compressed files.
  *
@@ -34,18 +34,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
 
 
 #define ReadOK(file, buffer, len) \
@@ -71,7 +61,7 @@ typedef struct _bmp_source_struct {
   JDIMENSION source_row;        /* Current source row number */
   JDIMENSION row_width;         /* Physical width of scanlines in file */
 
-  int bits_per_pixel;           /* remembers 8- or 24-bit format */
+  int bits_per_pixel;           /* remembers 8-, 24-, or 32-bit format */
   int cmap_length;              /* colormap length */
 
   boolean use_inversion_array;  /* TRUE = preload the whole image, which is
@@ -179,14 +169,14 @@ get_8bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   outptr = source->pub.buffer[0];
   if (cinfo->in_color_space == JCS_GRAYSCALE) {
     for (col = cinfo->image_width; col > 0; col--) {
-      t = GETJSAMPLE(*inptr++);
+      t = *inptr++;
       if (t >= cmaplen)
         ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
       *outptr++ = colormap[0][t];
     }
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      t = GETJSAMPLE(*inptr++);
+      t = *inptr++;
       if (t >= cmaplen)
         ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
       rgb_to_cmyk(colormap[0][t], colormap[1][t], colormap[2][t], outptr,
@@ -202,7 +192,7 @@ get_8bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        t = GETJSAMPLE(*inptr++);
+        t = *inptr++;
         if (t >= cmaplen)
           ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
         outptr[rindex] = colormap[0][t];
@@ -213,7 +203,7 @@ get_8bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        t = GETJSAMPLE(*inptr++);
+        t = *inptr++;
         if (t >= cmaplen)
           ERREXIT(cinfo, JERR_BMP_OUTOFRANGE);
         outptr[rindex] = colormap[0][t];
@@ -258,7 +248,6 @@ get_24bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
     MEMCOPY(outptr, inptr, source->row_width);
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
       rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
       outptr += 4;
@@ -272,7 +261,7 @@ get_24bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr[aindex] = 0xFF;
@@ -280,7 +269,7 @@ get_24bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr += ps;
@@ -323,7 +312,6 @@ get_32bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
     MEMCOPY(outptr, inptr, source->row_width);
   } else if (cinfo->in_color_space == JCS_CMYK) {
     for (col = cinfo->image_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       JSAMPLE b = *inptr++, g = *inptr++, r = *inptr++;
       rgb_to_cmyk(r, g, b, outptr, outptr + 1, outptr + 2, outptr + 3);
       inptr++;                          /* skip the 4th byte (Alpha channel) */
@@ -338,7 +326,7 @@ get_32bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 
     if (aindex >= 0) {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         outptr[aindex] = *inptr++;
@@ -346,7 +334,7 @@ get_32bit_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       }
     } else {
       for (col = cinfo->image_width; col > 0; col--) {
-        outptr[bindex] = *inptr++;      /* can omit GETJSAMPLE() safely */
+        outptr[bindex] = *inptr++;
         outptr[gindex] = *inptr++;
         outptr[rindex] = *inptr++;
         inptr++;                        /* skip the 4th byte (Alpha channel) */
@@ -436,14 +424,14 @@ start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
    (((unsigned int)UCH(array[offset + 2])) << 16) + \
    (((unsigned int)UCH(array[offset + 3])) << 24))
 
-  unsigned int bfOffBits;
-  unsigned int headerSize;
+  int bfOffBits;
+  int headerSize;
   int biWidth;
   int biHeight;
   unsigned short biPlanes;
   unsigned int biCompression;
   int biXPelsPerMeter, biYPelsPerMeter;
-  unsigned int biClrUsed = 0;
+  int biClrUsed = 0;
   int mapentrysize = 0;         /* 0 indicates no colormap */
   int bPad;
   JDIMENSION row_width = 0;
@@ -462,7 +450,7 @@ start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   if (!ReadOK(source->pub.input_file, bmpinfoheader, 4))
     ERREXIT(cinfo, JERR_INPUT_EOF);
   headerSize = GET_4B(bmpinfoheader, 0);
-  if (headerSize < 12 || headerSize > 64)
+  if (headerSize < 12 || headerSize > 64 || (headerSize + 14) > bfOffBits)
     ERREXIT(cinfo, JERR_BMP_BADHEADER);
   if (!ReadOK(source->pub.input_file, bmpinfoheader + 4, headerSize - 4))
     ERREXIT(cinfo, JERR_INPUT_EOF);
@@ -481,7 +469,9 @@ start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       TRACEMS2(cinfo, 1, JTRC_BMP_OS2_MAPPED, biWidth, biHeight);
       break;
     case 24:                    /* RGB image */
-      TRACEMS2(cinfo, 1, JTRC_BMP_OS2, biWidth, biHeight);
+    case 32:                    /* RGB image + Alpha channel */
+      TRACEMS3(cinfo, 1, JTRC_BMP_OS2, biWidth, biHeight,
+               source->bits_per_pixel);
       break;
     default:
       ERREXIT(cinfo, JERR_BMP_BADDEPTH);
@@ -508,10 +498,8 @@ start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       TRACEMS2(cinfo, 1, JTRC_BMP_MAPPED, biWidth, biHeight);
       break;
     case 24:                    /* RGB image */
-      TRACEMS2(cinfo, 1, JTRC_BMP, biWidth, biHeight);
-      break;
     case 32:                    /* RGB image + Alpha channel */
-      TRACEMS2(cinfo, 1, JTRC_BMP, biWidth, biHeight);
+      TRACEMS3(cinfo, 1, JTRC_BMP, biWidth, biHeight, source->bits_per_pixel);
       break;
     default:
       ERREXIT(cinfo, JERR_BMP_BADDEPTH);
@@ -534,6 +522,11 @@ start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 
   if (biWidth <= 0 || biHeight <= 0)
     ERREXIT(cinfo, JERR_BMP_EMPTY);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  if (sinfo->max_pixels &&
+      (unsigned long long)biWidth * biHeight > sinfo->max_pixels)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+#endif
   if (biPlanes != 1)
     ERREXIT(cinfo, JERR_BMP_BADPLANES);
 
@@ -587,7 +580,9 @@ start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       cinfo->input_components = 4;
     else
       ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
-    row_width = (JDIMENSION)(biWidth * 3);
+    if ((unsigned long long)biWidth * 3ULL > 0xFFFFFFFFULL)
+      ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+    row_width = (JDIMENSION)biWidth * 3;
     break;
   case 32:
     if (cinfo->in_color_space == JCS_UNKNOWN)
@@ -598,7 +593,9 @@ start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       cinfo->input_components = 4;
     else
       ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
-    row_width = (JDIMENSION)(biWidth * 4);
+    if ((unsigned long long)biWidth * 4ULL > 0xFFFFFFFFULL)
+      ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+    row_width = (JDIMENSION)biWidth * 4;
     break;
   default:
     ERREXIT(cinfo, JERR_BMP_BADDEPTH);
@@ -643,7 +640,7 @@ start_input_bmp(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   /* Allocate one-row buffer for returned data */
   source->pub.buffer = (*cinfo->mem->alloc_sarray)
     ((j_common_ptr)cinfo, JPOOL_IMAGE,
-     (JDIMENSION)(biWidth * cinfo->input_components), (JDIMENSION)1);
+     (JDIMENSION)biWidth * (JDIMENSION)cinfo->input_components, (JDIMENSION)1);
   source->pub.buffer_height = 1;
 
   cinfo->data_precision = 8;
@@ -680,6 +677,9 @@ jinit_read_bmp(j_compress_ptr cinfo, boolean use_inversion_array)
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_bmp;
   source->pub.finish_input = finish_input_bmp;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  source->pub.max_pixels = 0;
+#endif
 
   source->use_inversion_array = use_inversion_array;
 
diff --git a/external/jpeg/rdcolmap.c b/external/jpeg/rdcolmap.c
index cbbef59d5f59..d2ed95cf8042 100644
--- a/external/jpeg/rdcolmap.c
+++ b/external/jpeg/rdcolmap.c
@@ -54,9 +54,8 @@ add_map_entry(j_decompress_ptr cinfo, int R, int G, int B)
 
   /* Check for duplicate color. */
   for (index = 0; index < ncolors; index++) {
-    if (GETJSAMPLE(colormap0[index]) == R &&
-        GETJSAMPLE(colormap1[index]) == G &&
-        GETJSAMPLE(colormap2[index]) == B)
+    if (colormap0[index] == R && colormap1[index] == G &&
+        colormap2[index] == B)
       return;                   /* color is already in map */
   }
 
diff --git a/external/jpeg/rdgif.c b/external/jpeg/rdgif.c
index ff9258d6eb18..c814c6b0f227 100644
--- a/external/jpeg/rdgif.c
+++ b/external/jpeg/rdgif.c
@@ -1,29 +1,673 @@
 /*
  * rdgif.c
  *
+ * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
- * This file is part of the Independent JPEG Group's software.
+ * Modified 2019 by Guido Vollbeding.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
  * This file contains routines to read input images in GIF format.
  *
- *****************************************************************************
- * NOTE: to avoid entanglements with Unisys' patent on LZW compression,      *
- * the ability to read GIF files has been removed from the IJG distribution. *
- * Sorry about that.                                                         *
- *****************************************************************************
- *
- * We are required to state that
- *    "The Graphics Interchange Format(c) is the Copyright property of
- *    CompuServe Incorporated. GIF(sm) is a Service Mark property of
- *    CompuServe Incorporated."
+ * These routines may need modification for non-Unix environments or
+ * specialized applications.  As they stand, they assume input from
+ * an ordinary stdio stream.  They further assume that reading begins
+ * at the start of the file; start_input may need work if the
+ * user interface has already read some data (e.g., to determine that
+ * the file is indeed GIF format).
+ */
+
+/*
+ * This code is loosely based on giftoppm from the PBMPLUS distribution
+ * of Feb. 1991.  That file contains the following copyright notice:
+ * +-------------------------------------------------------------------+
+ * | Copyright 1990, David Koblas.                                     |
+ * |   Permission to use, copy, modify, and distribute this software   |
+ * |   and its documentation for any purpose and without fee is hereby |
+ * |   granted, provided that the above copyright notice appear in all |
+ * |   copies and that both that copyright notice and this permission  |
+ * |   notice appear in supporting documentation.  This software is    |
+ * |   provided "as is" without express or implied warranty.           |
+ * +-------------------------------------------------------------------+
  */
 
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
 
 #ifdef GIF_SUPPORTED
 
+
+/* Macros to deal with unsigned chars as efficiently as compiler allows */
+
+typedef unsigned char U_CHAR;
+#define UCH(x)  ((int)(x))
+
+
+#define ReadOK(file, buffer, len) \
+  (JFREAD(file, buffer, len) == ((size_t)(len)))
+
+
+#define MAXCOLORMAPSIZE  256    /* max # of colors in a GIF colormap */
+#define NUMCOLORS        3      /* # of color components (R, G, B) */
+#define CM_RED           0      /* color component numbers */
+#define CM_GREEN         1
+#define CM_BLUE          2
+
+#define MAX_LZW_BITS     12     /* maximum LZW code size */
+#define LZW_TABLE_SIZE   (1 << MAX_LZW_BITS) /* # of possible LZW symbols */
+
+/* Macros for extracting little-endian header data (U_CHAR is unsigned) */
+
+#define LM_to_uint(array, offset) \
+  ((unsigned int)UCH(array[offset]) + \
+   (((unsigned int)UCH(array[offset + 1])) << 8))
+
+#define BitSet(byte, bit)       ((byte) & (bit))
+#define INTERLACE       0x40    /* mask for bit signifying interlaced image */
+#define COLORMAPFLAG    0x80    /* mask for bit signifying colormap presence */
+
+
+/*
+ * LZW decompression tables look like this:
+ *   symbol_head[K] = prefix symbol of any LZW symbol K (0..LZW_TABLE_SIZE-1)
+ *   symbol_tail[K] = suffix byte   of any LZW symbol K (0..LZW_TABLE_SIZE-1)
+ * Note that entries 0..end_code of the above tables are not used,
+ * since those symbols represent raw bytes or special codes.
+ *
+ * The stack represents the not-yet-used expansion of the last LZW symbol.
+ * In the worst case, a symbol could expand to as many bytes as there are
+ * LZW symbols, so we allocate LZW_TABLE_SIZE bytes for the stack.
+ * (This is conservative since that number includes the raw-byte symbols.)
+ */
+
+
+/* Private version of data source object */
+
+typedef struct {
+  struct cjpeg_source_struct pub; /* public fields */
+
+  j_compress_ptr cinfo;         /* back link saves passing separate parm */
+
+  JSAMPARRAY colormap;          /* GIF colormap (converted to my format) */
+
+  /* State for GetCode and LZWReadByte */
+  U_CHAR code_buf[256 + 4];     /* current input data block */
+  int last_byte;                /* # of bytes in code_buf */
+  int last_bit;                 /* # of bits in code_buf */
+  int cur_bit;                  /* next bit index to read */
+  boolean first_time;           /* flags first call to GetCode */
+  boolean out_of_blocks;        /* TRUE if hit terminator data block */
+
+  int input_code_size;          /* codesize given in GIF file */
+  int clear_code, end_code;     /* values for Clear and End codes */
+
+  int code_size;                /* current actual code size */
+  int limit_code;               /* 2^code_size */
+  int max_code;                 /* first unused code value */
+
+  /* Private state for LZWReadByte */
+  int oldcode;                  /* previous LZW symbol */
+  int firstcode;                /* first byte of oldcode's expansion */
+
+  /* LZW symbol table and expansion stack */
+  UINT16 *symbol_head;          /* => table of prefix symbols */
+  UINT8  *symbol_tail;          /* => table of suffix bytes */
+  UINT8  *symbol_stack;         /* => stack for symbol expansions */
+  UINT8  *sp;                   /* stack pointer */
+
+  /* State for interlaced image processing */
+  boolean is_interlaced;        /* TRUE if have interlaced image */
+  jvirt_sarray_ptr interlaced_image; /* full image in interlaced order */
+  JDIMENSION cur_row_number;    /* need to know actual row number */
+  JDIMENSION pass2_offset;      /* # of pixel rows in pass 1 */
+  JDIMENSION pass3_offset;      /* # of pixel rows in passes 1&2 */
+  JDIMENSION pass4_offset;      /* # of pixel rows in passes 1,2,3 */
+} gif_source_struct;
+
+typedef gif_source_struct *gif_source_ptr;
+
+
+/* Forward declarations */
+METHODDEF(JDIMENSION) get_pixel_rows(j_compress_ptr cinfo,
+                                     cjpeg_source_ptr sinfo);
+METHODDEF(JDIMENSION) load_interlaced_image(j_compress_ptr cinfo,
+                                            cjpeg_source_ptr sinfo);
+METHODDEF(JDIMENSION) get_interlaced_row(j_compress_ptr cinfo,
+                                         cjpeg_source_ptr sinfo);
+
+
+LOCAL(int)
+ReadByte(gif_source_ptr sinfo)
+/* Read next byte from GIF file */
+{
+  register FILE *infile = sinfo->pub.input_file;
+  register int c;
+
+  if ((c = getc(infile)) == EOF)
+    ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  return c;
+}
+
+
+LOCAL(int)
+GetDataBlock(gif_source_ptr sinfo, U_CHAR *buf)
+/* Read a GIF data block, which has a leading count byte */
+/* A zero-length block marks the end of a data block sequence */
+{
+  int count;
+
+  count = ReadByte(sinfo);
+  if (count > 0) {
+    if (!ReadOK(sinfo->pub.input_file, buf, count))
+      ERREXIT(sinfo->cinfo, JERR_INPUT_EOF);
+  }
+  return count;
+}
+
+
+LOCAL(void)
+SkipDataBlocks(gif_source_ptr sinfo)
+/* Skip a series of data blocks, until a block terminator is found */
+{
+  U_CHAR buf[256];
+
+  while (GetDataBlock(sinfo, buf) > 0)
+    /* skip */;
+}
+
+
+LOCAL(void)
+ReInitLZW(gif_source_ptr sinfo)
+/* (Re)initialize LZW state; shared code for startup and Clear processing */
+{
+  sinfo->code_size = sinfo->input_code_size + 1;
+  sinfo->limit_code = sinfo->clear_code << 1;   /* 2^code_size */
+  sinfo->max_code = sinfo->clear_code + 2;      /* first unused code value */
+  sinfo->sp = sinfo->symbol_stack;              /* init stack to empty */
+}
+
+
+LOCAL(void)
+InitLZWCode(gif_source_ptr sinfo)
+/* Initialize for a series of LZWReadByte (and hence GetCode) calls */
+{
+  /* GetCode initialization */
+  sinfo->last_byte = 2;         /* make safe to "recopy last two bytes" */
+  sinfo->code_buf[0] = 0;
+  sinfo->code_buf[1] = 0;
+  sinfo->last_bit = 0;          /* nothing in the buffer */
+  sinfo->cur_bit = 0;           /* force buffer load on first call */
+  sinfo->first_time = TRUE;
+  sinfo->out_of_blocks = FALSE;
+
+  /* LZWReadByte initialization: */
+  /* compute special code values (note that these do not change later) */
+  sinfo->clear_code = 1 << sinfo->input_code_size;
+  sinfo->end_code = sinfo->clear_code + 1;
+  ReInitLZW(sinfo);
+}
+
+
+LOCAL(int)
+GetCode(gif_source_ptr sinfo)
+/* Fetch the next code_size bits from the GIF data */
+/* We assume code_size <= 16 */
+{
+  register int accum;
+  int offs, count;
+
+  while (sinfo->cur_bit + sinfo->code_size > sinfo->last_bit) {
+    /* Time to reload the buffer */
+    /* First time, share code with Clear case */
+    if (sinfo->first_time) {
+      sinfo->first_time = FALSE;
+      return sinfo->clear_code;
+    }
+    if (sinfo->out_of_blocks) {
+      WARNMS(sinfo->cinfo, JWRN_GIF_NOMOREDATA);
+      return sinfo->end_code;   /* fake something useful */
+    }
+    /* preserve last two bytes of what we have -- assume code_size <= 16 */
+    sinfo->code_buf[0] = sinfo->code_buf[sinfo->last_byte-2];
+    sinfo->code_buf[1] = sinfo->code_buf[sinfo->last_byte-1];
+    /* Load more bytes; set flag if we reach the terminator block */
+    if ((count = GetDataBlock(sinfo, &sinfo->code_buf[2])) == 0) {
+      sinfo->out_of_blocks = TRUE;
+      WARNMS(sinfo->cinfo, JWRN_GIF_NOMOREDATA);
+      return sinfo->end_code;   /* fake something useful */
+    }
+    /* Reset counters */
+    sinfo->cur_bit = (sinfo->cur_bit - sinfo->last_bit) + 16;
+    sinfo->last_byte = 2 + count;
+    sinfo->last_bit = sinfo->last_byte * 8;
+  }
+
+  /* Form up next 24 bits in accum */
+  offs = sinfo->cur_bit >> 3;   /* byte containing cur_bit */
+  accum = UCH(sinfo->code_buf[offs + 2]);
+  accum <<= 8;
+  accum |= UCH(sinfo->code_buf[offs + 1]);
+  accum <<= 8;
+  accum |= UCH(sinfo->code_buf[offs]);
+
+  /* Right-align cur_bit in accum, then mask off desired number of bits */
+  accum >>= (sinfo->cur_bit & 7);
+  sinfo->cur_bit += sinfo->code_size;
+  return accum & ((1 << sinfo->code_size) - 1);
+}
+
+
+LOCAL(int)
+LZWReadByte(gif_source_ptr sinfo)
+/* Read an LZW-compressed byte */
+{
+  register int code;            /* current working code */
+  int incode;                   /* saves actual input code */
+
+  /* If any codes are stacked from a previously read symbol, return them */
+  if (sinfo->sp > sinfo->symbol_stack)
+    return (int)(*(--sinfo->sp));
+
+  /* Time to read a new symbol */
+  code = GetCode(sinfo);
+
+  if (code == sinfo->clear_code) {
+    /* Reinit state, swallow any extra Clear codes, and */
+    /* return next code, which is expected to be a raw byte. */
+    ReInitLZW(sinfo);
+    do {
+      code = GetCode(sinfo);
+    } while (code == sinfo->clear_code);
+    if (code > sinfo->clear_code) { /* make sure it is a raw byte */
+      WARNMS(sinfo->cinfo, JWRN_GIF_BADDATA);
+      code = 0;                 /* use something valid */
+    }
+    /* make firstcode, oldcode valid! */
+    sinfo->firstcode = sinfo->oldcode = code;
+    return code;
+  }
+
+  if (code == sinfo->end_code) {
+    /* Skip the rest of the image, unless GetCode already read terminator */
+    if (!sinfo->out_of_blocks) {
+      SkipDataBlocks(sinfo);
+      sinfo->out_of_blocks = TRUE;
+    }
+    /* Complain that there's not enough data */
+    WARNMS(sinfo->cinfo, JWRN_GIF_ENDCODE);
+    /* Pad data with 0's */
+    return 0;                   /* fake something usable */
+  }
+
+  /* Got normal raw byte or LZW symbol */
+  incode = code;                /* save for a moment */
+
+  if (code >= sinfo->max_code) { /* special case for not-yet-defined symbol */
+    /* code == max_code is OK; anything bigger is bad data */
+    if (code > sinfo->max_code) {
+      WARNMS(sinfo->cinfo, JWRN_GIF_BADDATA);
+      incode = 0;               /* prevent creation of loops in symbol table */
+    }
+    /* this symbol will be defined as oldcode/firstcode */
+    *(sinfo->sp++) = (UINT8)sinfo->firstcode;
+    code = sinfo->oldcode;
+  }
+
+  /* If it's a symbol, expand it into the stack */
+  while (code >= sinfo->clear_code) {
+    *(sinfo->sp++) = sinfo->symbol_tail[code]; /* tail is a byte value */
+    code = sinfo->symbol_head[code]; /* head is another LZW symbol */
+  }
+  /* At this point code just represents a raw byte */
+  sinfo->firstcode = code;      /* save for possible future use */
+
+  /* If there's room in table... */
+  if ((code = sinfo->max_code) < LZW_TABLE_SIZE) {
+    /* Define a new symbol = prev sym + head of this sym's expansion */
+    sinfo->symbol_head[code] = (UINT16)sinfo->oldcode;
+    sinfo->symbol_tail[code] = (UINT8)sinfo->firstcode;
+    sinfo->max_code++;
+    /* Is it time to increase code_size? */
+    if (sinfo->max_code >= sinfo->limit_code &&
+        sinfo->code_size < MAX_LZW_BITS) {
+      sinfo->code_size++;
+      sinfo->limit_code <<= 1;  /* keep equal to 2^code_size */
+    }
+  }
+
+  sinfo->oldcode = incode;      /* save last input symbol for future use */
+  return sinfo->firstcode;      /* return first byte of symbol's expansion */
+}
+
+
+LOCAL(void)
+ReadColorMap(gif_source_ptr sinfo, int cmaplen, JSAMPARRAY cmap)
+/* Read a GIF colormap */
+{
+  int i;
+
+  for (i = 0; i < cmaplen; i++) {
+#if BITS_IN_JSAMPLE == 8
+#define UPSCALE(x)  (x)
+#else
+#define UPSCALE(x)  ((x) << (BITS_IN_JSAMPLE - 8))
+#endif
+    cmap[CM_RED][i]   = (JSAMPLE)UPSCALE(ReadByte(sinfo));
+    cmap[CM_GREEN][i] = (JSAMPLE)UPSCALE(ReadByte(sinfo));
+    cmap[CM_BLUE][i]  = (JSAMPLE)UPSCALE(ReadByte(sinfo));
+  }
+}
+
+
+LOCAL(void)
+DoExtension(gif_source_ptr sinfo)
+/* Process an extension block */
+/* Currently we ignore 'em all */
+{
+  int extlabel;
+
+  /* Read extension label byte */
+  extlabel = ReadByte(sinfo);
+  TRACEMS1(sinfo->cinfo, 1, JTRC_GIF_EXTENSION, extlabel);
+  /* Skip the data block(s) associated with the extension */
+  SkipDataBlocks(sinfo);
+}
+
+
+/*
+ * Read the file header; return image size and component count.
+ */
+
+METHODDEF(void)
+start_input_gif(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  U_CHAR hdrbuf[10];            /* workspace for reading control blocks */
+  unsigned int width, height;   /* image dimensions */
+  int colormaplen, aspectRatio;
+  int c;
+
+  /* Read and verify GIF Header */
+  if (!ReadOK(source->pub.input_file, hdrbuf, 6))
+    ERREXIT(cinfo, JERR_GIF_NOT);
+  if (hdrbuf[0] != 'G' || hdrbuf[1] != 'I' || hdrbuf[2] != 'F')
+    ERREXIT(cinfo, JERR_GIF_NOT);
+  /* Check for expected version numbers.
+   * If unknown version, give warning and try to process anyway;
+   * this is per recommendation in GIF89a standard.
+   */
+  if ((hdrbuf[3] != '8' || hdrbuf[4] != '7' || hdrbuf[5] != 'a') &&
+      (hdrbuf[3] != '8' || hdrbuf[4] != '9' || hdrbuf[5] != 'a'))
+    TRACEMS3(cinfo, 1, JTRC_GIF_BADVERSION, hdrbuf[3], hdrbuf[4], hdrbuf[5]);
+
+  /* Read and decipher Logical Screen Descriptor */
+  if (!ReadOK(source->pub.input_file, hdrbuf, 7))
+    ERREXIT(cinfo, JERR_INPUT_EOF);
+  width = LM_to_uint(hdrbuf, 0);
+  height = LM_to_uint(hdrbuf, 2);
+  if (width == 0 || height == 0)
+    ERREXIT(cinfo, JERR_GIF_EMPTY);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  if (sinfo->max_pixels &&
+      (unsigned long long)width * height > sinfo->max_pixels)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+#endif
+  /* we ignore the color resolution, sort flag, and background color index */
+  aspectRatio = UCH(hdrbuf[6]);
+  if (aspectRatio != 0 && aspectRatio != 49)
+    TRACEMS(cinfo, 1, JTRC_GIF_NONSQUARE);
+
+  /* Allocate space to store the colormap */
+  source->colormap = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)MAXCOLORMAPSIZE,
+     (JDIMENSION)NUMCOLORS);
+  colormaplen = 0;              /* indicate initialization */
+
+  /* Read global colormap if header indicates it is present */
+  if (BitSet(hdrbuf[4], COLORMAPFLAG)) {
+    colormaplen = 2 << (hdrbuf[4] & 0x07);
+    ReadColorMap(source, colormaplen, source->colormap);
+  }
+
+  /* Scan until we reach start of desired image.
+   * We don't currently support skipping images, but could add it easily.
+   */
+  for (;;) {
+    c = ReadByte(source);
+
+    if (c == ';')               /* GIF terminator?? */
+      ERREXIT(cinfo, JERR_GIF_IMAGENOTFOUND);
+
+    if (c == '!') {             /* Extension */
+      DoExtension(source);
+      continue;
+    }
+
+    if (c != ',') {             /* Not an image separator? */
+      WARNMS1(cinfo, JWRN_GIF_CHAR, c);
+      continue;
+    }
+
+    /* Read and decipher Local Image Descriptor */
+    if (!ReadOK(source->pub.input_file, hdrbuf, 9))
+      ERREXIT(cinfo, JERR_INPUT_EOF);
+    /* we ignore top/left position info, also sort flag */
+    width = LM_to_uint(hdrbuf, 4);
+    height = LM_to_uint(hdrbuf, 6);
+    if (width == 0 || height == 0)
+      ERREXIT(cinfo, JERR_GIF_EMPTY);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+    if (sinfo->max_pixels &&
+        (unsigned long long)width * height > sinfo->max_pixels)
+      ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+#endif
+    source->is_interlaced = (BitSet(hdrbuf[8], INTERLACE) != 0);
+
+    /* Read local colormap if header indicates it is present */
+    /* Note: if we wanted to support skipping images, */
+    /* we'd need to skip rather than read colormap for ignored images */
+    if (BitSet(hdrbuf[8], COLORMAPFLAG)) {
+      colormaplen = 2 << (hdrbuf[8] & 0x07);
+      ReadColorMap(source, colormaplen, source->colormap);
+    }
+
+    source->input_code_size = ReadByte(source); /* get min-code-size byte */
+    if (source->input_code_size < 2 || source->input_code_size > 8)
+      ERREXIT1(cinfo, JERR_GIF_CODESIZE, source->input_code_size);
+
+    /* Reached desired image, so break out of loop */
+    /* If we wanted to skip this image, */
+    /* we'd call SkipDataBlocks and then continue the loop */
+    break;
+  }
+
+  /* Prepare to read selected image: first initialize LZW decompressor */
+  source->symbol_head = (UINT16 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT16));
+  source->symbol_tail = (UINT8 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT8));
+  source->symbol_stack = (UINT8 *)
+    (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                LZW_TABLE_SIZE * sizeof(UINT8));
+  InitLZWCode(source);
+
+  /*
+   * If image is interlaced, we read it into a full-size sample array,
+   * decompressing as we go; then get_interlaced_row selects rows from the
+   * sample array in the proper order.
+   */
+  if (source->is_interlaced) {
+    /* We request the virtual array now, but can't access it until virtual
+     * arrays have been allocated.  Hence, the actual work of reading the
+     * image is postponed until the first call to get_pixel_rows.
+     */
+    source->interlaced_image = (*cinfo->mem->request_virt_sarray)
+      ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
+       (JDIMENSION)width, (JDIMENSION)height, (JDIMENSION)1);
+    if (cinfo->progress != NULL) {
+      cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+      progress->total_extra_passes++; /* count file input as separate pass */
+    }
+    source->pub.get_pixel_rows = load_interlaced_image;
+  } else {
+    source->pub.get_pixel_rows = get_pixel_rows;
+  }
+
+  /* Create compressor input buffer. */
+  source->pub.buffer = (*cinfo->mem->alloc_sarray)
+    ((j_common_ptr)cinfo, JPOOL_IMAGE, (JDIMENSION)width * NUMCOLORS,
+     (JDIMENSION)1);
+  source->pub.buffer_height = 1;
+
+  /* Pad colormap for safety. */
+  for (c = colormaplen; c < source->clear_code; c++) {
+    source->colormap[CM_RED][c]   =
+    source->colormap[CM_GREEN][c] =
+    source->colormap[CM_BLUE][c]  = CENTERJSAMPLE;
+  }
+
+  /* Return info about the image. */
+  cinfo->in_color_space = JCS_RGB;
+  cinfo->input_components = NUMCOLORS;
+  cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
+  cinfo->image_width = width;
+  cinfo->image_height = height;
+
+  TRACEMS3(cinfo, 1, JTRC_GIF, width, height, colormaplen);
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for noninterlaced GIF images:
+ * we read directly from the GIF file.
+ */
+
+METHODDEF(JDIMENSION)
+get_pixel_rows(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register int c;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+  register JSAMPARRAY colormap = source->colormap;
+
+  ptr = source->pub.buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    c = LZWReadByte(source);
+    *ptr++ = colormap[CM_RED][c];
+    *ptr++ = colormap[CM_GREEN][c];
+    *ptr++ = colormap[CM_BLUE][c];
+  }
+  return 1;
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for the first call on get_pixel_rows when
+ * reading an interlaced GIF file: we read the whole image into memory.
+ */
+
+METHODDEF(JDIMENSION)
+load_interlaced_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register JSAMPROW sptr;
+  register JDIMENSION col;
+  JDIMENSION row;
+  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
+
+  /* Read the interlaced image into the virtual array we've created. */
+  for (row = 0; row < cinfo->image_height; row++) {
+    if (progress != NULL) {
+      progress->pub.pass_counter = (long)row;
+      progress->pub.pass_limit = (long)cinfo->image_height;
+      (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
+    }
+    sptr = *(*cinfo->mem->access_virt_sarray)
+      ((j_common_ptr)cinfo, source->interlaced_image, row, (JDIMENSION)1,
+       TRUE);
+    for (col = cinfo->image_width; col > 0; col--) {
+      *sptr++ = (JSAMPLE)LZWReadByte(source);
+    }
+  }
+  if (progress != NULL)
+    progress->completed_extra_passes++;
+
+  /* Replace method pointer so subsequent calls don't come here. */
+  source->pub.get_pixel_rows = get_interlaced_row;
+  /* Initialize for get_interlaced_row, and perform first call on it. */
+  source->cur_row_number = 0;
+  source->pass2_offset = (cinfo->image_height + 7) / 8;
+  source->pass3_offset = source->pass2_offset + (cinfo->image_height + 3) / 8;
+  source->pass4_offset = source->pass3_offset + (cinfo->image_height + 1) / 4;
+
+  return get_interlaced_row(cinfo, sinfo);
+}
+
+
+/*
+ * Read one row of pixels.
+ * This version is used for interlaced GIF images:
+ * we read from the virtual array.
+ */
+
+METHODDEF(JDIMENSION)
+get_interlaced_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  gif_source_ptr source = (gif_source_ptr)sinfo;
+  register int c;
+  register JSAMPROW sptr, ptr;
+  register JDIMENSION col;
+  register JSAMPARRAY colormap = source->colormap;
+  JDIMENSION irow;
+
+  /* Figure out which row of interlaced image is needed, and access it. */
+  switch ((int)(source->cur_row_number & 7)) {
+  case 0:                       /* first-pass row */
+    irow = source->cur_row_number >> 3;
+    break;
+  case 4:                       /* second-pass row */
+    irow = (source->cur_row_number >> 3) + source->pass2_offset;
+    break;
+  case 2:                       /* third-pass row */
+  case 6:
+    irow = (source->cur_row_number >> 2) + source->pass3_offset;
+    break;
+  default:                      /* fourth-pass row */
+    irow = (source->cur_row_number >> 1) + source->pass4_offset;
+  }
+  sptr = *(*cinfo->mem->access_virt_sarray)
+    ((j_common_ptr)cinfo, source->interlaced_image, irow, (JDIMENSION)1,
+     FALSE);
+  /* Scan the row, expand colormap, and output */
+  ptr = source->pub.buffer[0];
+  for (col = cinfo->image_width; col > 0; col--) {
+    c = *sptr++;
+    *ptr++ = colormap[CM_RED][c];
+    *ptr++ = colormap[CM_GREEN][c];
+    *ptr++ = colormap[CM_BLUE][c];
+  }
+  source->cur_row_number++;     /* for next time */
+  return 1;
+}
+
+
+/*
+ * Finish up at the end of the file.
+ */
+
+METHODDEF(void)
+finish_input_gif(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
+{
+  /* no work */
+}
+
+
 /*
  * The module selection routine for GIF format input.
  */
@@ -31,9 +675,21 @@
 GLOBAL(cjpeg_source_ptr)
 jinit_read_gif(j_compress_ptr cinfo)
 {
-  fprintf(stderr, "GIF input is unsupported for legal reasons.  Sorry.\n");
-  exit(EXIT_FAILURE);
-  return NULL;                  /* keep compiler happy */
+  gif_source_ptr source;
+
+  /* Create module interface object */
+  source = (gif_source_ptr)
+    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                sizeof(gif_source_struct));
+  source->cinfo = cinfo;        /* make back link for subroutines */
+  /* Fill in method ptrs, except get_pixel_rows which start_input sets */
+  source->pub.start_input = start_input_gif;
+  source->pub.finish_input = finish_input_gif;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  source->pub.max_pixels = 0;
+#endif
+
+  return (cjpeg_source_ptr)source;
 }
 
 #endif /* GIF_SUPPORTED */
diff --git a/external/jpeg/rdjpgcom.1 b/external/jpeg/rdjpgcom.1
new file mode 100644
index 000000000000..97611df813c3
--- /dev/null
+++ b/external/jpeg/rdjpgcom.1
@@ -0,0 +1,63 @@
+.TH RDJPGCOM 1 "02 April 2009"
+.SH NAME
+rdjpgcom \- display text comments from a JPEG file
+.SH SYNOPSIS
+.B rdjpgcom
+[
+.B \-raw
+]
+[
+.B \-verbose
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B rdjpgcom
+reads the named JPEG/JFIF file, or the standard input if no file is named,
+and prints any text comments found in the file on the standard output.
+.PP
+The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
+Although the standard doesn't actually define what COM blocks are for, they
+are widely used to hold user-supplied text strings.  This lets you add
+annotations, titles, index terms, etc. to your JPEG files, and later retrieve
+them as text.  COM blocks do not interfere with the image stored in the JPEG
+file.  The maximum size of a COM block is 64K, but you can have as many of
+them as you like in one JPEG file.
+.SH OPTIONS
+.TP
+.B \-raw
+Normally
+.B rdjpgcom
+escapes non-printable characters in comments, for security reasons.
+This option disables that escaping.
+.TP
+.B \-verbose
+Causes
+.B rdjpgcom
+to also display the JPEG image dimensions.
+.PP
+Switch names may be abbreviated, and are not case sensitive.
+.SH HINTS
+.B rdjpgcom
+does not depend on the IJG JPEG library.  Its source code is intended as an
+illustration of the minimum amount of code required to parse a JPEG file
+header correctly.
+.PP
+In
+.B \-verbose
+mode,
+.B rdjpgcom
+will also attempt to print the contents of any "APP12" markers as text.
+Some digital cameras produce APP12 markers containing useful textual
+information.  If you like, you can modify the source code to print
+other APPn marker types as well.
+.SH SEE ALSO
+.BR cjpeg (1),
+.BR djpeg (1),
+.BR jpegtran (1),
+.BR wrjpgcom (1)
+.SH AUTHOR
+Independent JPEG Group
diff --git a/external/jpeg/rdppm.c b/external/jpeg/rdppm.c
index 2a58e796643c..9699ca5ee850 100644
--- a/external/jpeg/rdppm.c
+++ b/external/jpeg/rdppm.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1997, Thomas G. Lane.
  * Modified 2009 by Bill Allombert, Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2015-2017, 2020, D. R. Commander.
+ * Copyright (C) 2015-2017, 2020-2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -43,18 +43,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
 
 
 #define ReadOK(file, buffer, len) \
@@ -122,11 +112,10 @@ read_pbm_integer(j_compress_ptr cinfo, FILE *infile, unsigned int maxval)
   while ((ch = pbm_getc(infile)) >= '0' && ch <= '9') {
     val *= 10;
     val += ch - '0';
+    if (val > maxval)
+      ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
   }
 
-  if (val > maxval)
-    ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
-
   return val;
 }
 
@@ -526,6 +515,11 @@ get_word_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
   register JSAMPLE *rescale = source->rescale;
   JDIMENSION col;
   unsigned int maxval = source->maxval;
+  register int rindex = rgb_red[cinfo->in_color_space];
+  register int gindex = rgb_green[cinfo->in_color_space];
+  register int bindex = rgb_blue[cinfo->in_color_space];
+  register int aindex = alpha_index[cinfo->in_color_space];
+  register int ps = rgb_pixelsize[cinfo->in_color_space];
 
   if (!ReadOK(source->pub.input_file, source->iobuffer, source->buffer_width))
     ERREXIT(cinfo, JERR_INPUT_EOF);
@@ -537,17 +531,20 @@ get_word_rgb_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
     temp |= UCH(*bufferptr++);
     if (temp > maxval)
       ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
-    *ptr++ = rescale[temp];
+    ptr[rindex] = rescale[temp];
     temp  = UCH(*bufferptr++) << 8;
     temp |= UCH(*bufferptr++);
     if (temp > maxval)
       ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
-    *ptr++ = rescale[temp];
+    ptr[gindex] = rescale[temp];
     temp  = UCH(*bufferptr++) << 8;
     temp |= UCH(*bufferptr++);
     if (temp > maxval)
       ERREXIT(cinfo, JERR_PPM_OUTOFRANGE);
-    *ptr++ = rescale[temp];
+    ptr[bindex] = rescale[temp];
+    if (aindex >= 0)
+      ptr[aindex] = 0xFF;
+    ptr += ps;
   }
   return 1;
 }
@@ -589,6 +586,10 @@ start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
 
   if (w <= 0 || h <= 0 || maxval <= 0) /* error check */
     ERREXIT(cinfo, JERR_PPM_NOT);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  if (sinfo->max_pixels && (unsigned long long)w * h > sinfo->max_pixels)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+#endif
 
   cinfo->data_precision = BITS_IN_JSAMPLE; /* we always rescale data to this */
   cinfo->image_width = (JDIMENSION)w;
@@ -634,7 +635,10 @@ start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       cinfo->in_color_space = JCS_GRAYSCALE;
     TRACEMS2(cinfo, 1, JTRC_PGM, w, h);
     if (maxval > 255) {
-      source->pub.get_pixel_rows = get_word_gray_row;
+      if (cinfo->in_color_space == JCS_GRAYSCALE)
+        source->pub.get_pixel_rows = get_word_gray_row;
+      else
+        ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR) &&
                cinfo->in_color_space == JCS_GRAYSCALE) {
       source->pub.get_pixel_rows = get_raw_row;
@@ -657,7 +661,10 @@ start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       cinfo->in_color_space = JCS_EXT_RGB;
     TRACEMS2(cinfo, 1, JTRC_PPM, w, h);
     if (maxval > 255) {
-      source->pub.get_pixel_rows = get_word_rgb_row;
+      if (IsExtRGB(cinfo->in_color_space))
+        source->pub.get_pixel_rows = get_word_rgb_row;
+      else
+        ERREXIT(cinfo, JERR_BAD_IN_COLORSPACE);
     } else if (maxval == MAXJSAMPLE && sizeof(JSAMPLE) == sizeof(U_CHAR) &&
 #if RGB_RED == 0 && RGB_GREEN == 1 && RGB_BLUE == 2 && RGB_PIXELSIZE == 3
                (cinfo->in_color_space == JCS_EXT_RGB ||
@@ -723,6 +730,8 @@ start_input_ppm(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
                                   (size_t)(((long)MAX(maxval, 255) + 1L) *
                                            sizeof(JSAMPLE)));
+    MEMZERO(source->rescale, (size_t)(((long)MAX(maxval, 255) + 1L) *
+                                      sizeof(JSAMPLE)));
     half_maxval = maxval / 2;
     for (val = 0; val <= (long)maxval; val++) {
       /* The multiplication here must be done in 32 bits to avoid overflow */
@@ -760,6 +769,9 @@ jinit_read_ppm(j_compress_ptr cinfo)
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_ppm;
   source->pub.finish_input = finish_input_ppm;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  source->pub.max_pixels = 0;
+#endif
 
   return (cjpeg_source_ptr)source;
 }
diff --git a/external/jpeg/rdrle.c b/external/jpeg/rdrle.c
deleted file mode 100644
index b6945146a029..000000000000
--- a/external/jpeg/rdrle.c
+++ /dev/null
@@ -1,389 +0,0 @@
-/*
- * rdrle.c
- *
- * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-1996, Thomas G. Lane.
- * It was modified by The libjpeg-turbo Project to include only code and
- * information relevant to libjpeg-turbo.
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- *
- * This file contains routines to read input images in Utah RLE format.
- * The Utah Raster Toolkit library is required (version 3.1 or later).
- *
- * These routines may need modification for non-Unix environments or
- * specialized applications.  As they stand, they assume input from
- * an ordinary stdio stream.  They further assume that reading begins
- * at the start of the file; start_input may need work if the
- * user interface has already read some data (e.g., to determine that
- * the file is indeed RLE format).
- *
- * Based on code contributed by Mike Lijewski,
- * with updates from Robert Hutchinson.
- */
-
-#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
-
-#ifdef RLE_SUPPORTED
-
-/* rle.h is provided by the Utah Raster Toolkit. */
-
-#include <rle.h>
-
-/*
- * We assume that JSAMPLE has the same representation as rle_pixel,
- * to wit, "unsigned char".  Hence we can't cope with 12- or 16-bit samples.
- */
-
-#if BITS_IN_JSAMPLE != 8
-  Sorry, this code only copes with 8-bit JSAMPLEs. /* deliberate syntax err */
-#endif
-
-/*
- * We support the following types of RLE files:
- *
- *   GRAYSCALE   - 8 bits, no colormap
- *   MAPPEDGRAY  - 8 bits, 1 channel colomap
- *   PSEUDOCOLOR - 8 bits, 3 channel colormap
- *   TRUECOLOR   - 24 bits, 3 channel colormap
- *   DIRECTCOLOR - 24 bits, no colormap
- *
- * For now, we ignore any alpha channel in the image.
- */
-
-typedef enum
-  { GRAYSCALE, MAPPEDGRAY, PSEUDOCOLOR, TRUECOLOR, DIRECTCOLOR } rle_kind;
-
-
-/*
- * Since RLE stores scanlines bottom-to-top, we have to invert the image
- * to conform to JPEG's top-to-bottom order.  To do this, we read the
- * incoming image into a virtual array on the first get_pixel_rows call,
- * then fetch the required row from the virtual array on subsequent calls.
- */
-
-typedef struct _rle_source_struct *rle_source_ptr;
-
-typedef struct _rle_source_struct {
-  struct cjpeg_source_struct pub; /* public fields */
-
-  rle_kind visual;              /* actual type of input file */
-  jvirt_sarray_ptr image;       /* virtual array to hold the image */
-  JDIMENSION row;               /* current row # in the virtual array */
-  rle_hdr header;               /* Input file information */
-  rle_pixel **rle_row;          /* holds a row returned by rle_getrow() */
-
-} rle_source_struct;
-
-
-/*
- * Read the file header; return image size and component count.
- */
-
-METHODDEF(void)
-start_input_rle(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JDIMENSION width, height;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  /* Use RLE library routine to get the header info */
-  source->header = *rle_hdr_init(NULL);
-  source->header.rle_file = source->pub.input_file;
-  switch (rle_get_setup(&(source->header))) {
-  case RLE_SUCCESS:
-    /* A-OK */
-    break;
-  case RLE_NOT_RLE:
-    ERREXIT(cinfo, JERR_RLE_NOT);
-    break;
-  case RLE_NO_SPACE:
-    ERREXIT(cinfo, JERR_RLE_MEM);
-    break;
-  case RLE_EMPTY:
-    ERREXIT(cinfo, JERR_RLE_EMPTY);
-    break;
-  case RLE_EOF:
-    ERREXIT(cinfo, JERR_RLE_EOF);
-    break;
-  default:
-    ERREXIT(cinfo, JERR_RLE_BADERROR);
-    break;
-  }
-
-  /* Figure out what we have, set private vars and return values accordingly */
-
-  width  = source->header.xmax - source->header.xmin + 1;
-  height = source->header.ymax - source->header.ymin + 1;
-  source->header.xmin = 0;              /* realign horizontally */
-  source->header.xmax = width - 1;
-
-  cinfo->image_width      = width;
-  cinfo->image_height     = height;
-  cinfo->data_precision   = 8;  /* we can only handle 8 bit data */
-
-  if (source->header.ncolors == 1 && source->header.ncmap == 0) {
-    source->visual     = GRAYSCALE;
-    TRACEMS2(cinfo, 1, JTRC_RLE_GRAY, width, height);
-  } else if (source->header.ncolors == 1 && source->header.ncmap == 1) {
-    source->visual     = MAPPEDGRAY;
-    TRACEMS3(cinfo, 1, JTRC_RLE_MAPGRAY, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 1 && source->header.ncmap == 3) {
-    source->visual     = PSEUDOCOLOR;
-    TRACEMS3(cinfo, 1, JTRC_RLE_MAPPED, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 3 && source->header.ncmap == 3) {
-    source->visual     = TRUECOLOR;
-    TRACEMS3(cinfo, 1, JTRC_RLE_FULLMAP, width, height,
-             1 << source->header.cmaplen);
-  } else if (source->header.ncolors == 3 && source->header.ncmap == 0) {
-    source->visual     = DIRECTCOLOR;
-    TRACEMS2(cinfo, 1, JTRC_RLE, width, height);
-  } else
-    ERREXIT(cinfo, JERR_RLE_UNSUPPORTED);
-
-  if (source->visual == GRAYSCALE || source->visual == MAPPEDGRAY) {
-    cinfo->in_color_space   = JCS_GRAYSCALE;
-    cinfo->input_components = 1;
-  } else {
-    cinfo->in_color_space   = JCS_RGB;
-    cinfo->input_components = 3;
-  }
-
-  /*
-   * A place to hold each scanline while it's converted.
-   * (GRAYSCALE scanlines don't need converting)
-   */
-  if (source->visual != GRAYSCALE) {
-    source->rle_row = (rle_pixel **)(*cinfo->mem->alloc_sarray)
-      ((j_common_ptr)cinfo, JPOOL_IMAGE,
-       (JDIMENSION)width, (JDIMENSION)cinfo->input_components);
-  }
-
-  /* request a virtual array to hold the image */
-  source->image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
-     (JDIMENSION)(width * source->header.ncolors),
-     (JDIMENSION)height, (JDIMENSION)1);
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    /* count file input as separate pass */
-    progress->total_extra_passes++;
-  }
-#endif
-
-  source->pub.buffer_height = 1;
-}
-
-
-/*
- * Read one row of pixels.
- * Called only after load_image has read the image into the virtual array.
- * Used for GRAYSCALE, MAPPEDGRAY, TRUECOLOR, and DIRECTCOLOR images.
- */
-
-METHODDEF(JDIMENSION)
-get_rle_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-
-  source->row--;
-  source->pub.buffer = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr)cinfo, source->image, source->row, (JDIMENSION)1, FALSE);
-
-  return 1;
-}
-
-/*
- * Read one row of pixels.
- * Called only after load_image has read the image into the virtual array.
- * Used for PSEUDOCOLOR images.
- */
-
-METHODDEF(JDIMENSION)
-get_pseudocolor_row(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JSAMPROW src_row, dest_row;
-  JDIMENSION col;
-  rle_map *colormap;
-  int val;
-
-  colormap = source->header.cmap;
-  dest_row = source->pub.buffer[0];
-  source->row--;
-  src_row = *(*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr)cinfo, source->image, source->row, (JDIMENSION)1, FALSE);
-
-  for (col = cinfo->image_width; col > 0; col--) {
-    val = GETJSAMPLE(*src_row++);
-    *dest_row++ = (JSAMPLE)(colormap[val      ] >> 8);
-    *dest_row++ = (JSAMPLE)(colormap[val + 256] >> 8);
-    *dest_row++ = (JSAMPLE)(colormap[val + 512] >> 8);
-  }
-
-  return 1;
-}
-
-
-/*
- * Load the image into a virtual array.  We have to do this because RLE
- * files start at the lower left while the JPEG standard has them starting
- * in the upper left.  This is called the first time we want to get a row
- * of input.  What we do is load the RLE data into the array and then call
- * the appropriate routine to read one row from the array.  Before returning,
- * we set source->pub.get_pixel_rows so that subsequent calls go straight to
- * the appropriate row-reading routine.
- */
-
-METHODDEF(JDIMENSION)
-load_image(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  rle_source_ptr source = (rle_source_ptr)sinfo;
-  JDIMENSION row, col;
-  JSAMPROW scanline, red_ptr, green_ptr, blue_ptr;
-  rle_pixel **rle_row;
-  rle_map *colormap;
-  char channel;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  colormap = source->header.cmap;
-  rle_row = source->rle_row;
-
-  /* Read the RLE data into our virtual array.
-   * We assume here that rle_pixel is represented the same as JSAMPLE.
-   */
-  RLE_CLR_BIT(source->header, RLE_ALPHA); /* don't read the alpha channel */
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    progress->pub.pass_limit = cinfo->image_height;
-    progress->pub.pass_counter = 0;
-    (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-  }
-#endif
-
-  switch (source->visual) {
-
-  case GRAYSCALE:
-  case PSEUDOCOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      rle_row = (rle_pixel **)(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_getrow(&source->header, rle_row);
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-    break;
-
-  case MAPPEDGRAY:
-  case TRUECOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      scanline = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_row = source->rle_row;
-      rle_getrow(&source->header, rle_row);
-
-      for (col = 0; col < cinfo->image_width; col++) {
-        for (channel = 0; channel < source->header.ncolors; channel++) {
-          *scanline++ = (JSAMPLE)
-            (colormap[GETJSAMPLE(rle_row[channel][col]) + 256 * channel] >> 8);
-        }
-      }
-
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-    break;
-
-  case DIRECTCOLOR:
-    for (row = 0; row < cinfo->image_height; row++) {
-      scanline = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, source->image, row, (JDIMENSION)1, TRUE);
-      rle_getrow(&source->header, rle_row);
-
-      red_ptr   = rle_row[0];
-      green_ptr = rle_row[1];
-      blue_ptr  = rle_row[2];
-
-      for (col = cinfo->image_width; col > 0; col--) {
-        *scanline++ = *red_ptr++;
-        *scanline++ = *green_ptr++;
-        *scanline++ = *blue_ptr++;
-      }
-
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-  }
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL)
-    progress->completed_extra_passes++;
-#endif
-
-  /* Set up to call proper row-extraction routine in future */
-  if (source->visual == PSEUDOCOLOR) {
-    source->pub.buffer = source->rle_row;
-    source->pub.get_pixel_rows = get_pseudocolor_row;
-  } else {
-    source->pub.get_pixel_rows = get_rle_row;
-  }
-  source->row = cinfo->image_height;
-
-  /* And fetch the topmost (bottommost) row */
-  return (*source->pub.get_pixel_rows) (cinfo, sinfo);
-}
-
-
-/*
- * Finish up at the end of the file.
- */
-
-METHODDEF(void)
-finish_input_rle(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
-{
-  /* no work */
-}
-
-
-/*
- * The module selection routine for RLE format input.
- */
-
-GLOBAL(cjpeg_source_ptr)
-jinit_read_rle(j_compress_ptr cinfo)
-{
-  rle_source_ptr source;
-
-  /* Create module interface object */
-  source = (rle_source_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                sizeof(rle_source_struct));
-  /* Fill in method ptrs */
-  source->pub.start_input = start_input_rle;
-  source->pub.finish_input = finish_input_rle;
-  source->pub.get_pixel_rows = load_image;
-
-  return (cjpeg_source_ptr)source;
-}
-
-#endif /* RLE_SUPPORTED */
diff --git a/external/jpeg/rdtarga.c b/external/jpeg/rdtarga.c
index cd8a363e1c78..8f2d03162d40 100644
--- a/external/jpeg/rdtarga.c
+++ b/external/jpeg/rdtarga.c
@@ -5,7 +5,7 @@
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * Modified 2017 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2018, D. R. Commander.
+ * Copyright (C) 2018, 2021, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -28,18 +28,8 @@
 
 /* Macros to deal with unsigned chars as efficiently as compiler allows */
 
-#ifdef HAVE_UNSIGNED_CHAR
 typedef unsigned char U_CHAR;
 #define UCH(x)  ((int)(x))
-#else /* !HAVE_UNSIGNED_CHAR */
-#ifdef __CHAR_UNSIGNED__
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x))
-#else
-typedef char U_CHAR;
-#define UCH(x)  ((int)(x) & 0xFF)
-#endif
-#endif /* HAVE_UNSIGNED_CHAR */
 
 
 #define ReadOK(file, buffer, len) \
@@ -373,6 +363,11 @@ start_input_tga(j_compress_ptr cinfo, cjpeg_source_ptr sinfo)
       interlace_type != 0 ||      /* currently don't allow interlaced image */
       width == 0 || height == 0)  /* image width/height must be non-zero */
     ERREXIT(cinfo, JERR_TGA_BADPARMS);
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  if (sinfo->max_pixels &&
+      (unsigned long long)width * height > sinfo->max_pixels)
+    ERREXIT(cinfo, JERR_WIDTH_OVERFLOW);
+#endif
 
   if (subtype > 8) {
     /* It's an RLE-coded file */
@@ -503,6 +498,9 @@ jinit_read_targa(j_compress_ptr cinfo)
   /* Fill in method ptrs, except get_pixel_rows which start_input sets */
   source->pub.start_input = start_input_tga;
   source->pub.finish_input = finish_input_tga;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  source->pub.max_pixels = 0;
+#endif
 
   return (cjpeg_source_ptr)source;
 }
diff --git a/external/jpeg/sharedlib/CMakeLists.txt b/external/jpeg/sharedlib/CMakeLists.txt
deleted file mode 100644
index 8d65e589dde4..000000000000
--- a/external/jpeg/sharedlib/CMakeLists.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-# Anything that must be linked against the shared C library on Windows must
-# be built in this subdirectory, because CMake doesn't allow us to override
-# the compiler flags for each build type except at directory scope.  Note
-# to CMake developers:  Add a COMPILE_FLAGS_<CONFIG> target property, or
-# better yet, provide a friendly way of configuring a Windows target to use the
-# static C library.
-
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/..)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/..)
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/..)
-
-if(MSVC)
-  # Build all configurations against shared C library
-  foreach(var CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
-    CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
-    if(${var} MATCHES "/MT")
-      string(REGEX REPLACE "/MT" "/MD" ${var} "${${var}}")
-    endif()
-  endforeach()
-endif()
-
-foreach(src ${JPEG_SOURCES})
-  set(JPEG_SRCS ${JPEG_SRCS} ../${src})
-endforeach()
-
-if(WITH_SIMD AND (MSVC_IDE OR XCODE))
-  # This tells CMake that the "source" files haven't been generated yet
-  set_source_files_properties(${SIMD_OBJS} PROPERTIES GENERATED 1)
-endif()
-
-if(WIN32)
-  if(WITH_MEM_SRCDST)
-    set(DEFFILE ../win/jpeg${SO_MAJOR_VERSION}-memsrcdst.def)
-  else()
-    set(DEFFILE ../win/jpeg${SO_MAJOR_VERSION}.def)
-  endif()
-endif()
-add_library(jpeg SHARED ${JPEG_SRCS} ${DEFFILE} $<TARGET_OBJECTS:simd>
-  ${SIMD_OBJS})
-
-set_target_properties(jpeg PROPERTIES SOVERSION ${SO_MAJOR_VERSION}
-  VERSION ${SO_MAJOR_VERSION}.${SO_AGE}.${SO_MINOR_VERSION})
-if(APPLE AND (NOT CMAKE_OSX_DEPLOYMENT_TARGET OR
-              CMAKE_OSX_DEPLOYMENT_TARGET VERSION_GREATER 10.4))
-  if(NOT CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG)
-    set(CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG "-Wl,-rpath,")
-  endif()
-  set_target_properties(jpeg PROPERTIES MACOSX_RPATH 1)
-endif()
-if(MAPFLAG)
-  set_target_properties(jpeg PROPERTIES
-    LINK_FLAGS "${MAPFLAG}${CMAKE_CURRENT_BINARY_DIR}/../libjpeg.map")
-endif()
-if(MSVC)
-  set_target_properties(jpeg PROPERTIES
-    RUNTIME_OUTPUT_NAME jpeg${SO_MAJOR_VERSION})
-  # The jsimd_*.c file is built using /MT, so this prevents a linker warning.
-  set_target_properties(jpeg PROPERTIES LINK_FLAGS "/NODEFAULTLIB:LIBCMT /NODEFAULTLIB:LIBCMTD")
-elseif(MINGW)
-  set_target_properties(jpeg PROPERTIES SUFFIX -${SO_MAJOR_VERSION}.dll)
-endif()
-
-if(WIN32)
-  set(USE_SETMODE "-DUSE_SETMODE")
-endif()
-if(WITH_12BIT)
-  set(COMPILE_FLAGS "-DGIF_SUPPORTED -DPPM_SUPPORTED ${USE_SETMODE}")
-else()
-  set(COMPILE_FLAGS "-DBMP_SUPPORTED -DGIF_SUPPORTED -DPPM_SUPPORTED -DTARGA_SUPPORTED ${USE_SETMODE}")
-  set(CJPEG_BMP_SOURCES ../rdbmp.c ../rdtarga.c)
-  set(DJPEG_BMP_SOURCES ../wrbmp.c ../wrtarga.c)
-endif()
-
-add_executable(cjpeg ../cjpeg.c ../cdjpeg.c ../rdgif.c ../rdppm.c
-  ../rdswitch.c ${CJPEG_BMP_SOURCES})
-set_property(TARGET cjpeg PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
-target_link_libraries(cjpeg jpeg)
-
-add_executable(djpeg ../djpeg.c ../cdjpeg.c ../rdcolmap.c ../rdswitch.c
-  ../wrgif.c ../wrppm.c ${DJPEG_BMP_SOURCES})
-set_property(TARGET djpeg PROPERTY COMPILE_FLAGS ${COMPILE_FLAGS})
-target_link_libraries(djpeg jpeg)
-
-add_executable(jpegtran ../jpegtran.c ../cdjpeg.c ../rdswitch.c ../transupp.c)
-target_link_libraries(jpegtran jpeg)
-set_property(TARGET jpegtran PROPERTY COMPILE_FLAGS "${USE_SETMODE}")
-
-add_executable(jcstest ../jcstest.c)
-target_link_libraries(jcstest jpeg)
-
-install(TARGETS jpeg cjpeg djpeg jpegtran
-  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
-if(NOT CMAKE_VERSION VERSION_LESS "3.1" AND MSVC AND
-  CMAKE_C_LINKER_SUPPORTS_PDB)
-  install(FILES "$<TARGET_PDB_FILE:jpeg>"
-    DESTINATION ${CMAKE_INSTALL_BINDIR} OPTIONAL)
-endif()
diff --git a/external/jpeg/simd/CMakeLists.txt b/external/jpeg/simd/CMakeLists.txt
index a3c1498a8b2d..7f380dcefb09 100644
--- a/external/jpeg/simd/CMakeLists.txt
+++ b/external/jpeg/simd/CMakeLists.txt
@@ -30,6 +30,9 @@ if(CPU_TYPE STREQUAL "x86_64")
   if(CYGWIN)
     set(CMAKE_ASM_NASM_OBJECT_FORMAT win64)
   endif()
+  if(CMAKE_C_COMPILER_ABI MATCHES "ELF X32")
+    set(CMAKE_ASM_NASM_OBJECT_FORMAT elfx32)
+  endif()
 elseif(CPU_TYPE STREQUAL "i386")
   if(BORLAND)
     set(CMAKE_ASM_NASM_OBJECT_FORMAT obj)
@@ -49,9 +52,9 @@ endif()
 enable_language(ASM_NASM)
 message(STATUS "CMAKE_ASM_NASM_COMPILER = ${CMAKE_ASM_NASM_COMPILER}")
 
-if(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "macho*")
+if(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^macho")
   set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DMACHO")
-elseif(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "elf*")
+elseif(CMAKE_ASM_NASM_OBJECT_FORMAT MATCHES "^elf")
   set(CMAKE_ASM_NASM_FLAGS "${CMAKE_ASM_NASM_FLAGS} -DELF")
   set(CMAKE_ASM_NASM_DEBUG_FORMAT "dwarf2")
 endif()
@@ -205,25 +208,76 @@ endif()
 
 
 ###############################################################################
-# Arm (GAS)
+# Arm (Intrinsics or GAS)
 ###############################################################################
 
 elseif(CPU_TYPE STREQUAL "arm64" OR CPU_TYPE STREQUAL "arm")
 
-enable_language(ASM)
+include(CheckSymbolExists)
+if(BITS EQUAL 32)
+  set(CMAKE_REQUIRED_FLAGS -mfpu=neon)
+endif()
+check_symbol_exists(vld1_s16_x3 arm_neon.h HAVE_VLD1_S16_X3)
+check_symbol_exists(vld1_u16_x2 arm_neon.h HAVE_VLD1_U16_X2)
+check_symbol_exists(vld1q_u8_x4 arm_neon.h HAVE_VLD1Q_U8_X4)
+if(BITS EQUAL 32)
+  unset(CMAKE_REQUIRED_FLAGS)
+endif()
+configure_file(arm/neon-compat.h.in arm/neon-compat.h @ONLY)
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/arm)
+
+# GCC (as of this writing) and some older versions of Clang do not have a full
+# or optimal set of Neon intrinsics, so for performance reasons, when using
+# those compilers, we default to using the older GAS implementation of the Neon
+# SIMD extensions for certain algorithms.  The presence or absence of the three
+# intrinsics we tested above is a reasonable proxy for this.  We always default
+# to using the full Neon intrinsics implementation when building for macOS or
+# iOS, to avoid the need for gas-preprocessor.
+if((HAVE_VLD1_S16_X3 AND HAVE_VLD1_U16_X2 AND HAVE_VLD1Q_U8_X4) OR APPLE)
+  set(DEFAULT_NEON_INTRINSICS 1)
+else()
+  set(DEFAULT_NEON_INTRINSICS 0)
+endif()
+option(NEON_INTRINSICS
+  "Because GCC (as of this writing) and some older versions of Clang do not have a full or optimal set of Neon intrinsics, for performance reasons, the default when building libjpeg-turbo with those compilers is to continue using the older GAS implementation of the Neon SIMD extensions for certain algorithms.  Setting this option forces the full Neon intrinsics implementation to be used with all compilers.  Unsetting this option forces the hybrid GAS/intrinsics implementation to be used with all compilers."
+  ${DEFAULT_NEON_INTRINSICS})
+boolean_number(NEON_INTRINSICS PARENT_SCOPE)
+if(NEON_INTRINSICS)
+  add_definitions(-DNEON_INTRINSICS)
+  message(STATUS "Use full Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+else()
+  message(STATUS "Use partial Neon SIMD intrinsics implementation (NEON_INTRINSICS = ${NEON_INTRINSICS})")
+endif()
 
-set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
+set(SIMD_SOURCES arm/jcgray-neon.c arm/jcphuff-neon.c arm/jcsample-neon.c
+  arm/jdmerge-neon.c arm/jdsample-neon.c arm/jfdctfst-neon.c
+  arm/jidctred-neon.c arm/jquanti-neon.c)
+if(NEON_INTRINSICS)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/jccolor-neon.c arm/jidctint-neon.c)
+endif()
+if(NEON_INTRINSICS OR BITS EQUAL 64)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/jidctfst-neon.c)
+endif()
+if(NEON_INTRINSICS OR BITS EQUAL 32)
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jchuff-neon.c
+    arm/jdcolor-neon.c arm/jfdctint-neon.c)
+endif()
+if(BITS EQUAL 32)
+  set_source_files_properties(${SIMD_SOURCES} COMPILE_FLAGS -mfpu=neon)
+endif()
+if(NOT NEON_INTRINSICS)
+  enable_language(ASM)
 
-string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
-set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
-message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
+  set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_ASM_FLAGS}")
 
-# Since engine-x apps arm/arm64 cpu always support gas properly,
-# so we remove TurboJPEG official complex check logic which not properly works with
-# Apple xcode11+ clang
-message(STATUS "GAS is working properly")
-# add_library(simd OBJECT ${CPU_TYPE}/jsimd_neon.S ${CPU_TYPE}/jsimd.c)
-add_library(simd OBJECT jsimd.c jsimd_neon.S)
+  string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UC)
+  set(EFFECTIVE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CMAKE_ASM_FLAGS_${CMAKE_BUILD_TYPE_UC}}")
+  message(STATUS "CMAKE_ASM_FLAGS = ${EFFECTIVE_ASM_FLAGS}")
+
+  set(SIMD_SOURCES ${SIMD_SOURCES} arm/aarch${BITS}/jsimd_neon.S)
+endif()
+
+add_library(simd OBJECT ${SIMD_SOURCES} arm/aarch${BITS}/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
   set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
@@ -272,14 +326,35 @@ if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
 endif()
 
 ###############################################################################
-# Loongson (Intrinsics)
+# MIPS64 (Intrinsics)
 ###############################################################################
 
-elseif(CPU_TYPE STREQUAL "loongson")
+elseif(CPU_TYPE STREQUAL "loongson" OR CPU_TYPE MATCHES "^mips64")
+
+set(CMAKE_REQUIRED_FLAGS -Wa,-mloongson-mmi,-mloongson-ext)
+
+check_c_source_compiles("
+  int main(void) {
+    int c = 0, a = 0, b = 0;
+    asm (
+      \"paddb %0, %1, %2\"
+      : \"=f\" (c)
+      : \"f\" (a), \"f\" (b)
+    );
+    return c;
+  }" HAVE_MMI)
+
+unset(CMAKE_REQUIRED_FLAGS)
+
+if(NOT HAVE_MMI)
+  simd_fail("SIMD extensions not available for this CPU")
+  return()
+endif()
 
-set(SIMD_SOURCES loongson/jccolor-mmi.c loongson/jcsample-mmi.c
-  loongson/jdcolor-mmi.c loongson/jdsample-mmi.c loongson/jfdctint-mmi.c
-  loongson/jidctint-mmi.c loongson/jquanti-mmi.c)
+set(SIMD_SOURCES mips64/jccolor-mmi.c mips64/jcgray-mmi.c mips64/jcsample-mmi.c
+  mips64/jdcolor-mmi.c mips64/jdmerge-mmi.c mips64/jdsample-mmi.c
+  mips64/jfdctfst-mmi.c mips64/jfdctint-mmi.c mips64/jidctfst-mmi.c
+  mips64/jidctint-mmi.c mips64/jquanti-mmi.c)
 
 if(CMAKE_COMPILER_IS_GNUCC)
   foreach(file ${SIMD_SOURCES})
@@ -287,8 +362,12 @@ if(CMAKE_COMPILER_IS_GNUCC)
       " -fno-strict-aliasing")
   endforeach()
 endif()
+foreach(file ${SIMD_SOURCES})
+  set_property(SOURCE ${file} APPEND_STRING PROPERTY COMPILE_FLAGS
+    " -Wa,-mloongson-mmi,-mloongson-ext")
+endforeach()
 
-add_library(simd OBJECT ${SIMD_SOURCES} loongson/jsimd.c)
+add_library(simd OBJECT ${SIMD_SOURCES} mips64/jsimd.c)
 
 if(CMAKE_POSITION_INDEPENDENT_CODE OR ENABLE_SHARED)
   set_target_properties(simd PROPERTIES POSITION_INDEPENDENT_CODE 1)
diff --git a/external/jpeg/simd/arm/aarch32/jccolext-neon.c b/external/jpeg/simd/arm/aarch32/jccolext-neon.c
new file mode 100644
index 000000000000..362102d2b2d8
--- /dev/null
+++ b/external/jpeg/simd/arm/aarch32/jccolext-neon.c
@@ -0,0 +1,148 @@
+/*
+ * jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Scratch buffer of 8 pixels for the final (image_width % 8) columns. */
+  ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];
+
+  /* Load the fixed-point conversion coefficients into two 4-lane vectors. */
+#ifdef HAVE_VLD1_U16_X2
+  const uint16x4x2_t consts = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
+  const uint16x4_t consts1 = vld1_u16(jsimd_rgb_ycc_neon_consts);
+  const uint16x4_t consts2 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
+  const uint16x4x2_t consts = { { consts1, consts2 } };
+#endif
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 8) {
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 8) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.
+       */
+      if (cols_remaining < 8) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_low = vmull_lane_u16(vget_low_u16(r), consts.val[0], 0);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(g), consts.val[0], 1);
+      y_low = vmlal_lane_u16(y_low, vget_low_u16(b), consts.val[0], 2);
+      uint32x4_t y_high = vmull_lane_u16(vget_high_u16(r), consts.val[0], 0);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(g), consts.val[0], 1);
+      y_high = vmlal_lane_u16(y_high, vget_high_u16(b), consts.val[0], 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_low = scaled_128_5;
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(r), consts.val[0], 3);
+      cb_low = vmlsl_lane_u16(cb_low, vget_low_u16(g), consts.val[1], 0);
+      cb_low = vmlal_lane_u16(cb_low, vget_low_u16(b), consts.val[1], 1);
+      uint32x4_t cb_high = scaled_128_5;
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(r), consts.val[0], 3);
+      cb_high = vmlsl_lane_u16(cb_high, vget_high_u16(g), consts.val[1], 0);
+      cb_high = vmlal_lane_u16(cb_high, vget_high_u16(b), consts.val[1], 1);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_low = scaled_128_5;
+      cr_low = vmlal_lane_u16(cr_low, vget_low_u16(r), consts.val[1], 1);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(g), consts.val[1], 2);
+      cr_low = vmlsl_lane_u16(cr_low, vget_low_u16(b), consts.val[1], 3);
+      uint32x4_t cr_high = scaled_128_5;
+      cr_high = vmlal_lane_u16(cr_high, vget_high_u16(r), consts.val[1], 1);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(g), consts.val[1], 2);
+      cr_high = vmlsl_lane_u16(cr_high, vget_high_u16(b), consts.val[1], 3);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_low, 16),
+                                      vrshrn_n_u32(y_high, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_low, 16),
+                                       vshrn_n_u32(cb_high, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_low, 16),
+                                       vshrn_n_u32(cr_high, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+
+      /* Advance all pointers past the 8 pixels just converted. */
+      inptr += (8 * RGB_PIXELSIZE);
+      outptr0 += 8;
+      outptr1 += 8;
+      outptr2 += 8;
+    }
+  }
+}
diff --git a/external/jpeg/simd/arm/aarch32/jchuff-neon.c b/external/jpeg/simd/arm/aarch32/jchuff-neon.c
new file mode 100644
index 000000000000..19d94f720da0
--- /dev/null
+++ b/external/jpeg/simd/arm/aarch32/jchuff-neon.c
@@ -0,0 +1,334 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint8_t block_nbits[DCTSIZE2];
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load first four rows of coefficients from DCT block in zig-zag order. */
+
+  /* Compute DC coefficient difference value. (F.1.1.5.1) */
+  int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
+  row0 = vld1q_lane_s16(block +  1, row0, 1);
+  row0 = vld1q_lane_s16(block +  8, row0, 2);
+  row0 = vld1q_lane_s16(block + 16, row0, 3);
+  row0 = vld1q_lane_s16(block +  9, row0, 4);
+  row0 = vld1q_lane_s16(block +  2, row0, 5);
+  row0 = vld1q_lane_s16(block +  3, row0, 6);
+  row0 = vld1q_lane_s16(block + 10, row0, 7);
+
+  int16x8_t row1 = vld1q_dup_s16(block + 17);
+  row1 = vld1q_lane_s16(block + 24, row1, 1);
+  row1 = vld1q_lane_s16(block + 32, row1, 2);
+  row1 = vld1q_lane_s16(block + 25, row1, 3);
+  row1 = vld1q_lane_s16(block + 18, row1, 4);
+  row1 = vld1q_lane_s16(block + 11, row1, 5);
+  row1 = vld1q_lane_s16(block +  4, row1, 6);
+  row1 = vld1q_lane_s16(block +  5, row1, 7);
+
+  int16x8_t row2 = vld1q_dup_s16(block + 12);
+  row2 = vld1q_lane_s16(block + 19, row2, 1);
+  row2 = vld1q_lane_s16(block + 26, row2, 2);
+  row2 = vld1q_lane_s16(block + 33, row2, 3);
+  row2 = vld1q_lane_s16(block + 40, row2, 4);
+  row2 = vld1q_lane_s16(block + 48, row2, 5);
+  row2 = vld1q_lane_s16(block + 41, row2, 6);
+  row2 = vld1q_lane_s16(block + 34, row2, 7);
+
+  int16x8_t row3 = vld1q_dup_s16(block + 27);
+  row3 = vld1q_lane_s16(block + 20, row3, 1);
+  row3 = vld1q_lane_s16(block + 13, row3, 2);
+  row3 = vld1q_lane_s16(block +  6, row3, 3);
+  row3 = vld1q_lane_s16(block +  7, row3, 4);
+  row3 = vld1q_lane_s16(block + 14, row3, 5);
+  row3 = vld1q_lane_s16(block + 21, row3, 6);
+  row3 = vld1q_lane_s16(block + 28, row3, 7);
+
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t abs_row1 = vabsq_s16(row1);
+  int16x8_t abs_row2 = vabsq_s16(row2);
+  int16x8_t abs_row3 = vabsq_s16(row3);
+
+  int16x8_t row0_lz = vclzq_s16(abs_row0);
+  int16x8_t row1_lz = vclzq_s16(abs_row1);
+  int16x8_t row2_lz = vclzq_s16(abs_row2);
+  int16x8_t row3_lz = vclzq_s16(abs_row3);
+
+  /* Bits needed per coefficient = 16 - leading zeros of its absolute value. */
+  uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+  uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+  uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+  uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+
+  vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+  vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+  vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+  vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+
+  uint16x8_t row0_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
+              vnegq_s16(row0_lz));
+  uint16x8_t row1_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
+              vnegq_s16(row1_lz));
+  uint16x8_t row2_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
+              vnegq_s16(row2_lz));
+  uint16x8_t row3_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
+              vnegq_s16(row3_lz));
+
+  uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
+  uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
+  uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
+  uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
+
+  /* Store diff values for rows 0, 1, 2, and 3. */
+  vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+  vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+  vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+  vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+
+  /* Load last four rows of coefficients from DCT block in zig-zag order. */
+  int16x8_t row4 = vld1q_dup_s16(block + 35);
+  row4 = vld1q_lane_s16(block + 42, row4, 1);
+  row4 = vld1q_lane_s16(block + 49, row4, 2);
+  row4 = vld1q_lane_s16(block + 56, row4, 3);
+  row4 = vld1q_lane_s16(block + 57, row4, 4);
+  row4 = vld1q_lane_s16(block + 50, row4, 5);
+  row4 = vld1q_lane_s16(block + 43, row4, 6);
+  row4 = vld1q_lane_s16(block + 36, row4, 7);
+
+  int16x8_t row5 = vld1q_dup_s16(block + 29);
+  row5 = vld1q_lane_s16(block + 22, row5, 1);
+  row5 = vld1q_lane_s16(block + 15, row5, 2);
+  row5 = vld1q_lane_s16(block + 23, row5, 3);
+  row5 = vld1q_lane_s16(block + 30, row5, 4);
+  row5 = vld1q_lane_s16(block + 37, row5, 5);
+  row5 = vld1q_lane_s16(block + 44, row5, 6);
+  row5 = vld1q_lane_s16(block + 51, row5, 7);
+
+  int16x8_t row6 = vld1q_dup_s16(block + 58);
+  row6 = vld1q_lane_s16(block + 59, row6, 1);
+  row6 = vld1q_lane_s16(block + 52, row6, 2);
+  row6 = vld1q_lane_s16(block + 45, row6, 3);
+  row6 = vld1q_lane_s16(block + 38, row6, 4);
+  row6 = vld1q_lane_s16(block + 31, row6, 5);
+  row6 = vld1q_lane_s16(block + 39, row6, 6);
+  row6 = vld1q_lane_s16(block + 46, row6, 7);
+
+  int16x8_t row7 = vld1q_dup_s16(block + 53);
+  row7 = vld1q_lane_s16(block + 60, row7, 1);
+  row7 = vld1q_lane_s16(block + 61, row7, 2);
+  row7 = vld1q_lane_s16(block + 54, row7, 3);
+  row7 = vld1q_lane_s16(block + 47, row7, 4);
+  row7 = vld1q_lane_s16(block + 55, row7, 5);
+  row7 = vld1q_lane_s16(block + 62, row7, 6);
+  row7 = vld1q_lane_s16(block + 63, row7, 7);
+
+  int16x8_t abs_row4 = vabsq_s16(row4);
+  int16x8_t abs_row5 = vabsq_s16(row5);
+  int16x8_t abs_row6 = vabsq_s16(row6);
+  int16x8_t abs_row7 = vabsq_s16(row7);
+
+  int16x8_t row4_lz = vclzq_s16(abs_row4);
+  int16x8_t row5_lz = vclzq_s16(abs_row5);
+  int16x8_t row6_lz = vclzq_s16(abs_row6);
+  int16x8_t row7_lz = vclzq_s16(abs_row7);
+
+  /* Bits needed per coefficient = 16 - leading zeros of its absolute value. */
+  uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+  uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+  uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+  uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+                                 vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+
+  vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+  vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+  vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+  vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+
+  uint16x8_t row4_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
+              vnegq_s16(row4_lz));
+  uint16x8_t row5_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
+              vnegq_s16(row5_lz));
+  uint16x8_t row6_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
+              vnegq_s16(row6_lz));
+  uint16x8_t row7_mask =
+    vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
+              vnegq_s16(row7_lz));
+
+  uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
+  uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
+  uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
+  uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
+
+  /* Store diff values for rows 4, 5, 6, and 7. */
+  vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+  vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+  vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+  vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
+  uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
+  uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
+  uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
+  uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
+  uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
+  uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
+  uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+  row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
+  row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
+  row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
+  row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
+  row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
+  row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
+  row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
+  row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
+  uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
+  uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
+  uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
+  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+  /* Shift left to remove DC bit. */
+  bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
+  /* Move bitmap to two 32-bit scalars (coefficients 1-32 and 33-63). */
+  uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
+  uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  unsigned int nbits = block_nbits[0];
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = block_diff[0];
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  while (bitmap_1_32 != 0) {
+    r = BUILTIN_CLZ(bitmap_1_32);
+    i += r;
+    bitmap_1_32 <<= r;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    i++;
+    bitmap_1_32 <<= 1;
+  }
+
+  r = 33 - i;  /* zero run carried over from the first bitmap word */
+  i = 33;
+
+  while (bitmap_33_63 != 0) {
+    unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
+    r += leading_zeros;
+    i += leading_zeros;
+    bitmap_33_63 <<= leading_zeros;
+    nbits = block_nbits[i];
+    diff = block_diff[i];
+    while (r > 15) {
+      /* If run length > 15, emit special run-length-16 codes. */
+      PUT_BITS(code_0xf0, size_0xf0)
+      r -= 16;
+    }
+    /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+    unsigned int rs = (r << 4) + nbits;
+    PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+    r = 0;
+    i++;
+    bitmap_33_63 <<= 1;
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/external/jpeg/simd/arm/jsimd.c b/external/jpeg/simd/arm/aarch32/jsimd.c
similarity index 68%
rename from external/jpeg/simd/arm/jsimd.c
rename to external/jpeg/simd/arm/aarch32/jsimd.c
index 709656ce1b2f..fac55dfb28f0 100644
--- a/external/jpeg/simd/arm/jsimd.c
+++ b/external/jpeg/simd/arm/aarch32/jsimd.c
@@ -6,6 +6,7 @@
  * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
  * Copyright (C) 2019, Google LLC.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -17,12 +18,12 @@
  */
 
 #define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
 #include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -164,6 +165,19 @@ jsimd_can_rgb_ycc(void)
 GLOBAL(int)
 jsimd_can_rgb_gray(void)
 {
+  init_simd();
+
+  /* Require 8-bit samples, 4-byte JDIMENSION, and RGB_PIXELSIZE 3 or 4. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -246,6 +260,37 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
                        JSAMPIMAGE output_buf, JDIMENSION output_row,
                        int num_rows)
 {
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -298,12 +343,38 @@ jsimd_ycc_rgb565_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 GLOBAL(int)
 jsimd_can_h2v2_downsample(void)
 {
+  init_simd();
+
+  /* Require 8-bit samples, DCTSIZE of 8, and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_downsample(void)
 {
+  init_simd();
+
+  /* Require 8-bit samples, DCTSIZE of 8, and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -311,23 +382,50 @@ GLOBAL(void)
 jsimd_h2v2_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v2_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);  /* no Neon check here */
 }
 
 GLOBAL(void)
 jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
                       JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
+  jsimd_h2v1_downsample_neon(cinfo->image_width, cinfo->max_v_samp_factor,
+                             compptr->v_samp_factor, compptr->width_in_blocks,
+                             input_data, output_data);  /* no Neon check here */
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_upsample(void)
 {
+  init_simd();
+
+  /* Require 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_upsample(void)
 {
+  init_simd();
+
+  /* Require 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -335,17 +433,32 @@ GLOBAL(void)
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);  /* compptr unused */
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);  /* compptr unused */
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_fancy_upsample(void)
 {
+  init_simd();
+
+  /* Require 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -366,10 +479,30 @@ jsimd_can_h2v1_fancy_upsample(void)
   return 0;
 }
 
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* Require 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
 GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);  /* no Neon check here */
 }
 
 GLOBAL(void)
@@ -381,15 +514,46 @@ jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                                  output_data_ptr);
 }
 
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);  /* no Neon check here */
+}
+
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
+  init_simd();
+
+  /* Require 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample(void)
 {
+  init_simd();
+
+  /* Require 8-bit samples and a 4-byte JDIMENSION. */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -397,12 +561,74 @@ GLOBAL(void)
 jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:  /* any other color space uses the extrgb routine */
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(void)
 jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:  /* any other color space uses the extrgb routine */
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(int)
@@ -448,6 +674,17 @@ jsimd_convsamp_float(JSAMPARRAY sample_data, JDIMENSION start_col,
 GLOBAL(int)
 jsimd_can_fdct_islow(void)
 {
+  init_simd();
+
+  /* Require DCTSIZE of 8 and a 2-byte DCTELEM. */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -477,6 +714,7 @@ jsimd_can_fdct_float(void)
 GLOBAL(void)
 jsimd_fdct_islow(DCTELEM *data)
 {
+  jsimd_fdct_islow_neon(data);  /* no Neon check here */
 }
 
 GLOBAL(void)
@@ -696,6 +934,16 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_first_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)  /* Neon path needs DCTSIZE 8 and a 2-byte JCOEF */
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -704,11 +952,23 @@ jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *values, size_t *zerobits)
 {
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
 }
 
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_refine_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)  /* Neon path needs DCTSIZE 8 and a 2-byte JCOEF */
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -717,5 +977,7 @@ jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
                                    int Al, JCOEF *absvalues, size_t *bits)
 {
-  return 0;
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start, Sl,
+                                                 Al, absvalues, bits);
 }
diff --git a/external/jpeg/simd/arm/aarch32/jsimd_neon.S b/external/jpeg/simd/arm/aarch32/jsimd_neon.S
new file mode 100644
index 000000000000..7e1e2b145122
--- /dev/null
+++ b/external/jpeg/simd/arm/aarch32/jsimd_neon.S
@@ -0,0 +1,1200 @@
+/*
+ * Armv7 Neon optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
+ *                          All Rights Reserved.
+ * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
+ * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
+ * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
+ * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
+#endif
+
+.text               /* everything below is assembled into the text section */
+.fpu neon           /* accept Neon (Advanced SIMD) instructions */
+.arch armv7a        /* assemble for the Armv7-A instruction set */
+.object_arch armv4  /* but record armv4 as the architecture of the object */
+.arm                /* Arm (A32) encoding rather than Thumb */
+.syntax unified     /* unified Arm/Thumb assembler syntax */
+
+
+/*****************************************************************************/
+
+/* asm_function fname: open function fname (entry label + symbol attributes) */
+.macro asm_function fname
+#ifdef __APPLE__
+    .private_extern _\fname  /* Apple: underscore-prefixed, private extern */
+    .globl _\fname
+_\fname:
+#else
+    .global \fname
+#ifdef __ELF__
+    .hidden \fname           /* ELF: hidden visibility */
+    .type \fname, %function  /* ELF: symbol type is function */
+#endif
+\fname:
+#endif
+.endm
+
+
+#define CENTERJSAMPLE  128  /* bias added in the IDCT epilogue (signed -> unsigned samples) */
+
+/*****************************************************************************/
+
+/*
+ * Perform dequantization and inverse DCT on one block of coefficients.
+ *
+ * GLOBAL(void)
+ * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+ *                       JSAMPARRAY output_buf, JDIMENSION output_col)
+ */
+
+#define FIX_0_298631336  (2446)  /* FIX_x = round(x * 2^13) for every entry */
+#define FIX_0_390180644  (3196)
+#define FIX_0_541196100  (4433)
+#define FIX_0_765366865  (6270)
+#define FIX_0_899976223  (7373)
+#define FIX_1_175875602  (9633)
+#define FIX_1_501321110  (12299)
+#define FIX_1_847759065  (15137)
+#define FIX_1_961570560  (16069)
+#define FIX_2_053119869  (16819)
+#define FIX_2_562915447  (20995)
+#define FIX_3_072711026  (25172)
+/* Pre-combined constants; each is stored as one .short in the table below */
+#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
+#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
+#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
+#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
+#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
+#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
+#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
+#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
+
+/*
+ * Reference SIMD-friendly 1-D ISLOW iDCT C implementation; its q1-q7 steps are
+ * mirrored by the Neon code below. Uses ideas from 'simd/jiss2int-64.asm' comments.
+ */
+#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
+  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
+  JLONG   q1, q2, q3, q4, q5, q6, q7; \
+  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
+  \
+  /* 1-D iDCT input data */ \
+  row0 = xrow0; \
+  row1 = xrow1; \
+  row2 = xrow2; \
+  row3 = xrow3; \
+  row4 = xrow4; \
+  row5 = xrow5; \
+  row6 = xrow6; \
+  row7 = xrow7; \
+  \
+  q5 = row7 + row3; \
+  q4 = row5 + row1; \
+  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
+       MULTIPLY(q4, FIX_1_175875602); \
+  q7 = MULTIPLY(q5, FIX_1_175875602) + \
+       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
+  q2 = MULTIPLY(row2, FIX_0_541196100) + \
+       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
+  q4 = q6; \
+  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
+  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
+        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
+  /* now we can use q1 (reloadable constants have been used up) */ \
+  q1 = q3 + q2; \
+  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
+        MULTIPLY(row1, -FIX_0_899976223); \
+  q5 = q7; \
+  q1 = q1 + q6; \
+  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
+        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
+  \
+  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
+  tmp11_plus_tmp2 = q1; \
+  row1 = 0; \
+  \
+  q1 = q1 - q6; \
+  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
+        MULTIPLY(row3, -FIX_2_562915447); \
+  q1 = q1 - q6; \
+  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
+       MULTIPLY(row6, FIX_0_541196100); \
+  q3 = q3 - q2; \
+  \
+  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
+  tmp11_minus_tmp2 = q1; \
+  \
+  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
+  q2 = q1 + q6; \
+  q1 = q1 - q6; \
+  \
+  /* pick up the results */ \
+  tmp0  = q4; \
+  tmp1  = q5; \
+  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
+  tmp3  = q7; \
+  tmp10 = q2; \
+  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
+  tmp12 = q3; \
+  tmp13 = q1; \
+}
+
+#define XFIX_0_899976223                    d0[0]
+#define XFIX_0_541196100                    d0[1]
+#define XFIX_2_562915447                    d0[2]
+#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
+#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
+#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
+#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
+#define XFIX_1_175875602                    d1[3]
+#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
+#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
+#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
+#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
+/* The scalar lanes named above are filled from the table below, in order */
+.balign 16  /* the table is loaded with a :128 alignment qualifier */
+jsimd_idct_islow_neon_consts:
+  .short FIX_0_899976223                    /* d0[0] */
+  .short FIX_0_541196100                    /* d0[1] */
+  .short FIX_2_562915447                    /* d0[2] */
+  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
+  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
+  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
+  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
+  .short FIX_1_175875602                    /* d1[3] */
+  /* reloadable constants (byte offset 16): d2 is overwritten and reloaded */
+  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
+  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
+  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
+  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
+
+asm_function jsimd_idct_islow_neon
+    /* Dequantize + two-pass slow-integer IDCT of one 8x8 block (args r0-r3) */
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0  /* TMPn names: used only by the final stores */
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    ROW0L           .req d16
+    ROW0R           .req d17
+    ROW1L           .req d18
+    ROW1R           .req d19
+    ROW2L           .req d20
+    ROW2R           .req d21
+    ROW3L           .req d22
+    ROW3R           .req d23
+    ROW4L           .req d24
+    ROW4R           .req d25
+    ROW5L           .req d26
+    ROW5R           .req d27
+    ROW6L           .req d28
+    ROW6R           .req d29
+    ROW7L           .req d30
+    ROW7R           .req d31
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_islow_neon_consts  /* ip -> constants */
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0  /* dequantize: row 0 * first table row */
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
+    add             ip, ip, #16  /* ip -> reloadable d2 constants */
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d15}                    /* save Neon registers */
+    /* 1-D IDCT, pass 1, left 4x8 half */
+    vadd.s16        d4, ROW7L, ROW3L  /* q5 of REF_1D_IDCT: row7 + row3 */
+    vadd.s16        d5, ROW5L, ROW1L  /* q4 of REF_1D_IDCT: row5 + row1 */
+    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d5, XFIX_1_175875602
+    vmull.s16       q7, d4, XFIX_1_175875602
+      /* Check for the zero coefficients in the right 4x8 half */
+      push            {r4, r5}  /* r4, r5 are the ldrd destination below */
+    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW4L
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
+      orr             r0, r4, r5  /* r0 collects rows 1-7, cols 4-7 (raw) */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13  /* (row0 - row4) << 13 */
+      orr             r0, r0, r4
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q2
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      orr             r0, r0, r4
+    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
+      orr             r0, r0, r5
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1L, q1, #11  /* rounding descale by 13 - PASS1_BITS */
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
+      orr             r0, r0, r4
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+      orr             r0, r0, r5
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
+    vmlal.s16       q6, ROW6L, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+      orr             r0, r0, r4
+    vrshrn.s32      ROW6L, q1, #11
+      orr             r0, r0, r5
+    vadd.s32        q1, q3, q5
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW4L
+      orr             r0, r0, r4
+    vrshrn.s32      ROW2L, q1, #11
+      orr             r0, r0, r5
+    vrshrn.s32      ROW5L, q3, #11
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
+      orr             r0, r0, r4
+    vadd.s32        q2, q5, q6
+      orrs            r0, r0, r5  /* Z flag set iff rows 1-7 are all zero */
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+      orr             r0, r4, r5  /* r0 = row 0, cols 4-7 (flags kept) */
+    vsub.s32        q3, q1, q4
+      pop             {r4, r5}
+    vrshrn.s32      ROW7L, q2, #11
+    vrshrn.s32      ROW3L, q5, #11
+    vrshrn.s32      ROW0L, q6, #11
+    vrshrn.s32      ROW4L, q3, #11
+
+      beq             3f  /* Go to do some special handling for the sparse
+                             right 4x8 half */
+
+    /* 1-D IDCT, pass 1, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]  /* reload constants */
+    vadd.s16        d10, ROW7R, ROW3R
+    vadd.s16        d8, ROW5R, ROW1R
+      /* Transpose left 4x8 half */
+      vtrn.16         ROW6L, ROW7L
+    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, d8, XFIX_1_175875602
+      vtrn.16         ROW2L, ROW3L
+    vmull.s16       q7, d10, XFIX_1_175875602
+    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
+      vtrn.16         ROW0L, ROW1L
+    vsubl.s16       q3, ROW0R, ROW4R
+    vmull.s16       q2, ROW2R, XFIX_0_541196100
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+      vtrn.16         ROW4L, ROW5L
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
+      vtrn.32         ROW1L, ROW3L
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
+      vtrn.32         ROW4L, ROW6L
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+      vtrn.32         ROW0L, ROW2L
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
+    vrshrn.s32      ROW1R, q1, #11
+      vtrn.32         ROW5L, ROW7L
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vrshrn.s32      ROW6R, q1, #11
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0R, ROW4R
+    vrshrn.s32      ROW2R, q1, #11
+    vrshrn.s32      ROW5R, q3, #11
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vrshrn.s32      ROW7R, q2, #11
+    vrshrn.s32      ROW3R, q5, #11
+    vrshrn.s32      ROW0R, q6, #11
+    vrshrn.s32      ROW4R, q3, #11
+    /* Transpose right 4x8 half */
+    vtrn.16         ROW6R, ROW7R
+    vtrn.16         ROW2R, ROW3R
+    vtrn.16         ROW0R, ROW1R
+    vtrn.16         ROW4R, ROW5R
+    vtrn.32         ROW1R, ROW3R
+    vtrn.32         ROW4R, ROW6R
+    vtrn.32         ROW0R, ROW2R
+    vtrn.32         ROW5R, ROW7R
+    /* 4x4 quadrants were transposed in place: "X <-> Y" marks swapped roles */
+1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vshrn.s32       ROW1L, q1, #16  /* >> 16 here, >> 2 more in the epilogue */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2, right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5R, XFIX_1_175875602
+    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
+    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
+    vmull.s16       q7, ROW7R, XFIX_1_175875602
+    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
+    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
+    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
+    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
+    vmov            q4, q6
+    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
+    vshl.s32        q3, q3, #13
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vadd.s32        q1, q1, q6
+    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
+    vmlal.s16       q6, ROW6R, XFIX_0_541196100
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vshl.s32        q5, q5, #13
+    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+
+2:  /* Descale to 8-bit and range limit */
+    vqrshrn.s16     d16, q8, #2
+    vqrshrn.s16     d17, q9, #2
+    vqrshrn.s16     d18, q10, #2
+    vqrshrn.s16     d19, q11, #2
+    vpop            {d8 - d15}                    /* restore Neon registers */
+    vqrshrn.s16     d20, q12, #2
+      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
+      vtrn.16         q8, q9
+    vqrshrn.s16     d21, q13, #2
+    vqrshrn.s16     d22, q14, #2
+      vmov.u8         q0, #(CENTERJSAMPLE)  /* bias for signed->unsigned */
+    vqrshrn.s16     d23, q15, #2
+      vtrn.8          d16, d17
+      vtrn.8          d18, d19
+      vadd.u8         q8, q8, q0
+      vadd.u8         q9, q9, q0
+      vtrn.16         q10, q11
+        /* Store results to the output buffer */
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}  /* two output row pointers */
+        add             TMP1, TMP1, OUTPUT_COL  /* row pointer + output_col */
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d16}, [TMP1]
+      vtrn.8          d20, d21
+        vst1.8          {d17}, [TMP2]
+        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        vst1.8          {d18}, [TMP1]
+      vadd.u8         q10, q10, q0
+        vst1.8          {d19}, [TMP2]
+        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}  /* last four row pointers */
+        add             TMP1, TMP1, OUTPUT_COL
+        add             TMP2, TMP2, OUTPUT_COL
+        add             TMP3, TMP3, OUTPUT_COL
+        add             TMP4, TMP4, OUTPUT_COL
+      vtrn.8          d22, d23
+        vst1.8          {d20}, [TMP1]
+      vadd.u8         q11, q11, q0
+        vst1.8          {d21}, [TMP2]
+        vst1.8          {d22}, [TMP3]
+        vst1.8          {d23}, [TMP4]
+    bx              lr
+
+3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
+
+    /* Transpose left 4x8 half */
+    vtrn.16         ROW6L, ROW7L
+    vtrn.16         ROW2L, ROW3L
+    vtrn.16         ROW0L, ROW1L
+    vtrn.16         ROW4L, ROW5L
+    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
+    vtrn.32         ROW1L, ROW3L
+    vtrn.32         ROW4L, ROW6L
+    vtrn.32         ROW0L, ROW2L
+    vtrn.32         ROW5L, ROW7L
+
+    cmp             r0, #0  /* r0 = OR of row 0, cols 4-7 (set in pass 1) */
+    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
+                           pass */
+
+    /* Only row 0 is non-zero for the right 4x8 half  */
+    vdup.s16        ROW1R, ROW0R[1]  /* broadcast = transposed pass-1 result */
+    vdup.s16        ROW2R, ROW0R[2]
+    vdup.s16        ROW3R, ROW0R[3]
+    vdup.s16        ROW4R, ROW0R[0]
+    vdup.s16        ROW5R, ROW0R[1]
+    vdup.s16        ROW6R, ROW0R[2]
+    vdup.s16        ROW7R, ROW0R[3]
+    vdup.s16        ROW0R, ROW0R[0]
+    b               1b  /* Go to 'normal' second pass */
+
+4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW1L, XFIX_1_175875602
+    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW3L, XFIX_1_175875602
+    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW2L, XFIX_0_541196100
+    vshll.s16       q3, ROW0L, #13  /* row 4 is zero: just row0 << 13 */
+    vmov            q4, q6
+    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6  /* 2 * q6: one vsub below instead of two */
+    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
+    vshrn.s32       ROW1L, q1, #16
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW0L, #13
+    vshrn.s32       ROW2L, q1, #16
+    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW3L, q5, #16
+    vshrn.s32       ROW0L, q6, #16
+    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
+    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
+    vld1.s16        {d2}, [ip, :64]               /* reload constants */
+    vmull.s16       q6, ROW5L, XFIX_1_175875602
+    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
+    vmull.s16       q7, ROW7L, XFIX_1_175875602
+    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
+    vmull.s16       q2, ROW6L, XFIX_0_541196100
+    vshll.s16       q3, ROW4L, #13
+    vmov            q4, q6
+    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
+    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
+    vadd.s32        q1, q3, q2
+    vmov            q5, q7
+    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
+    vadd.s32        q1, q1, q6
+    vadd.s32        q6, q6, q6
+    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
+    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
+    vsub.s32        q1, q1, q6
+    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
+    vsub.s32        q3, q3, q2
+    vshrn.s32       ROW6R, q1, #16
+    vadd.s32        q1, q3, q5
+    vsub.s32        q3, q3, q5
+    vshll.s16       q5, ROW4L, #13
+    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
+    vshrn.s32       ROW5R, q3, #16
+    vadd.s32        q2, q5, q6
+    vsub.s32        q1, q5, q6
+    vadd.s32        q6, q2, q7
+    vsub.s32        q2, q2, q7
+    vadd.s32        q5, q1, q4
+    vsub.s32        q3, q1, q4
+    vshrn.s32       ROW7R, q2, #16
+    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
+    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
+    vshrn.s32       ROW4R, q3, #16
+    b               2b                            /* Go to epilogue */
+
+    .unreq          DCT_TABLE  /* drop aliases; the next function redefines them */
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+    .unreq          ROW0L
+    .unreq          ROW0R
+    .unreq          ROW1L
+    .unreq          ROW1R
+    .unreq          ROW2L
+    .unreq          ROW2R
+    .unreq          ROW3L
+    .unreq          ROW3R
+    .unreq          ROW4L
+    .unreq          ROW4R
+    .unreq          ROW5L
+    .unreq          ROW5R
+    .unreq          ROW6L
+    .unreq          ROW6R
+    .unreq          ROW7L
+    .unreq          ROW7R
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_idct_ifast_neon
+ *
+ * This function contains a fast, not so accurate integer implementation of
+ * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
+ * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
+ * function from jidctfst.c
+ *
+ * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
+ * But in Arm Neon case some extra additions are required because VQDMULH
+ * instruction can't handle the constants larger than 1. So the expressions
+ * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
+ * which introduces an extra addition. Overall, there are 6 extra additions
+ * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
+ */
+
+#define XFIX_1_082392200  d0[0]
+#define XFIX_1_414213562  d0[1]
+#define XFIX_1_847759065  d0[2]
+#define XFIX_2_613125930  d0[3]
+/* Only the part above 1 (or 2) is stored; see the VQDMULH note above */
+.balign 16
+jsimd_idct_ifast_neon_consts:
+  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200: (277/256 - 1) * 2^15 */
+  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562: (362/256 - 1) * 2^15 */
+  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065: (473/256 - 1) * 2^15 */
+  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930: (669/256 - 2) * 2^15 */
+
+asm_function jsimd_idct_ifast_neon
+
+    DCT_TABLE       .req r0
+    COEF_BLOCK      .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_COL      .req r3
+    TMP1            .req r0
+    TMP2            .req r1
+    TMP3            .req r2
+    TMP4            .req ip
+
+    /* Load and dequantize coefficients into Neon registers
+     * with the following allocation:
+     *       0 1 2 3 | 4 5 6 7
+     *      ---------+--------
+     *   0 | d16     | d17     ( q8  )
+     *   1 | d18     | d19     ( q9  )
+     *   2 | d20     | d21     ( q10 )
+     *   3 | d22     | d23     ( q11 )
+     *   4 | d24     | d25     ( q12 )
+     *   5 | d26     | d27     ( q13 )
+     *   6 | d28     | d29     ( q14 )
+     *   7 | d30     | d31     ( q15 )
+     */
+    adr             ip, jsimd_idct_ifast_neon_consts
+    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
+    vmul.s16        q8, q8, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q9, q9, q1
+    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
+    vmul.s16        q10, q10, q2
+    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
+    vmul.s16        q11, q11, q3
+    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
+    vmul.s16        q12, q12, q0
+    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
+    vmul.s16        q14, q14, q2
+    vmul.s16        q13, q13, q1
+    vld1.16         {d0}, [ip, :64]  /* load constants */
+    vmul.s16        q15, q15, q3
+    vpush           {d8 - d13}       /* save Neon registers */
+    /* 1-D IDCT, pass 1 */
+    vsub.s16        q2, q10, q14
+    vadd.s16        q14, q10, q14
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vadd.s16        q10, q10, q2
+      /* Transpose */
+      vtrn.16         q8, q9
+    vsub.s16        q11, q12, q1
+      vtrn.16         q14, q15
+    vadd.s16        q12, q12, q1
+      vtrn.16         q10, q11
+      vtrn.16         q12, q13
+      vtrn.32         q9, q11
+      vtrn.32         q12, q14
+      vtrn.32         q8, q10
+      vtrn.32         q13, q15
+      vswp            d28, d21
+      vswp            d26, d19
+    /* 1-D IDCT, pass 2 */
+    vsub.s16        q2, q10, q14
+      vswp            d30, d23
+    vadd.s16        q14, q10, q14
+      vswp            d24, d17
+    vsub.s16        q1, q11, q13
+    vadd.s16        q13, q11, q13
+    vsub.s16        q5, q9, q15
+    vadd.s16        q15, q9, q15
+    vqdmulh.s16     q4, q2, XFIX_1_414213562
+    vqdmulh.s16     q6, q1, XFIX_2_613125930
+    vadd.s16        q3, q1, q1
+    vsub.s16        q1, q5, q1
+    vadd.s16        q10, q2, q4
+    vqdmulh.s16     q4, q1, XFIX_1_847759065
+    vsub.s16        q2, q15, q13
+    vadd.s16        q3, q3, q6
+    vqdmulh.s16     q6, q2, XFIX_1_414213562
+    vadd.s16        q1, q1, q4
+    vqdmulh.s16     q4, q5, XFIX_1_082392200
+    vsub.s16        q10, q10, q14
+    vadd.s16        q2, q2, q6
+    vsub.s16        q6, q8, q12
+    vadd.s16        q12, q8, q12
+    vadd.s16        q9, q5, q4
+    vadd.s16        q5, q6, q10
+    vsub.s16        q10, q6, q10
+    vadd.s16        q6, q15, q13
+    vadd.s16        q8, q12, q14
+    vsub.s16        q3, q6, q3
+    vsub.s16        q12, q12, q14
+    vsub.s16        q3, q3, q1
+    vsub.s16        q1, q9, q1
+    vadd.s16        q2, q3, q2
+    vsub.s16        q15, q8, q6
+    vadd.s16        q1, q1, q2
+    vadd.s16        q8, q8, q6
+    vadd.s16        q14, q5, q3
+    vsub.s16        q9, q5, q3
+    vsub.s16        q13, q10, q2
+    vpop            {d8 - d13}    /* restore Neon registers */
+    vadd.s16        q10, q10, q2
+    vsub.s16        q11, q12, q1
+    vadd.s16        q12, q12, q1
+    /* Descale to 8-bit and range limit */
+    vmov.u8         q0, #0x80
+    vqshrn.s16      d16, q8, #5
+    vqshrn.s16      d17, q9, #5
+    vqshrn.s16      d18, q10, #5
+    vqshrn.s16      d19, q11, #5
+    vqshrn.s16      d20, q12, #5
+    vqshrn.s16      d21, q13, #5
+    vqshrn.s16      d22, q14, #5
+    vqshrn.s16      d23, q15, #5
+    vadd.u8         q8, q8, q0
+    vadd.u8         q9, q9, q0
+    vadd.u8         q10, q10, q0
+    vadd.u8         q11, q11, q0
+    /* Transpose the final 8-bit samples */
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.8          d16, d17
+    vtrn.8          d18, d19
+      /* Store results to the output buffer */
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d16}, [TMP1]
+      vst1.8          {d17}, [TMP2]
+      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      vst1.8          {d18}, [TMP1]
+    vtrn.8          d20, d21
+      vst1.8          {d19}, [TMP2]
+      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
+      add             TMP1, TMP1, OUTPUT_COL
+      add             TMP2, TMP2, OUTPUT_COL
+      add             TMP3, TMP3, OUTPUT_COL
+      add             TMP4, TMP4, OUTPUT_COL
+      vst1.8          {d20}, [TMP1]
+    vtrn.8          d22, d23
+      vst1.8          {d21}, [TMP2]
+      vst1.8          {d22}, [TMP3]
+      vst1.8          {d23}, [TMP4]
+    bx              lr
+
+    .unreq          DCT_TABLE
+    .unreq          COEF_BLOCK
+    .unreq          OUTPUT_BUF
+    .unreq          OUTPUT_COL
+    .unreq          TMP1
+    .unreq          TMP2
+    .unreq          TMP3
+    .unreq          TMP4
+
+
+/*****************************************************************************/
+
+/*
+ * jsimd_extrgb_ycc_convert_neon
+ * jsimd_extbgr_ycc_convert_neon
+ * jsimd_extrgbx_ycc_convert_neon
+ * jsimd_extbgrx_ycc_convert_neon
+ * jsimd_extxbgr_ycc_convert_neon
+ * jsimd_extxrgb_ycc_convert_neon
+ *
+ * Colorspace conversion RGB -> YCbCr
+ */
+
+/*
+ * do_store size
+ *
+ * Write converted samples from d20 (Y), d21 (Cb) and d22 (Cr) to the byte
+ * pointers Y, U and V, advancing each pointer past what was written.
+ *
+ *   size == 8  stores all eight bytes of each register
+ *   size == 4  stores lanes 0-3
+ *   size == 2  stores lanes 4-5
+ *   size == 1  stores lane 6
+ *
+ * The lane numbers mirror the ones filled by do_load for the same size, so
+ * a row tail of up to 7 pixels can be emitted by combining the 4, 2 and 1
+ * variants without any lane being used twice.  Any other size is rejected
+ * at assembly time.
+ */
+.macro do_store size
+  .if \size == 8
+    vst1.8          {d20}, [Y]!
+    vst1.8          {d21}, [U]!
+    vst1.8          {d22}, [V]!
+  .elseif \size == 4
+    vst1.8          {d20[0]}, [Y]!
+    vst1.8          {d20[1]}, [Y]!
+    vst1.8          {d20[2]}, [Y]!
+    vst1.8          {d20[3]}, [Y]!
+    vst1.8          {d21[0]}, [U]!
+    vst1.8          {d21[1]}, [U]!
+    vst1.8          {d21[2]}, [U]!
+    vst1.8          {d21[3]}, [U]!
+    vst1.8          {d22[0]}, [V]!
+    vst1.8          {d22[1]}, [V]!
+    vst1.8          {d22[2]}, [V]!
+    vst1.8          {d22[3]}, [V]!
+  .elseif \size == 2
+    vst1.8          {d20[4]}, [Y]!
+    vst1.8          {d20[5]}, [Y]!
+    vst1.8          {d21[4]}, [U]!
+    vst1.8          {d21[5]}, [U]!
+    vst1.8          {d22[4]}, [V]!
+    vst1.8          {d22[5]}, [V]!
+  .elseif \size == 1
+    vst1.8          {d20[6]}, [Y]!
+    vst1.8          {d21[6]}, [U]!
+    vst1.8          {d22[6]}, [V]!
+  .else
+    .error unsupported macroblock size
+  .endif
+.endm
+
+/*
+ * do_load bpp, size
+ *
+ * Read pixels from the pointer RGB with de-interleaving loads, so that each
+ * byte position within a pixel lands in its own register, and advance RGB
+ * past the pixels read.
+ *
+ *   bpp == 24  three bytes per pixel, de-interleaved into d10, d11, d12
+ *   bpp == 32  four bytes per pixel, de-interleaved into d10, d11, d12, d13
+ *
+ *   size == 8  fills all eight lanes and issues a preload hint at RGB + 128
+ *   size == 4  fills lanes 0-3
+ *   size == 2  fills lanes 4-5
+ *   size == 1  fills lane 6
+ *
+ * The partial sizes touch disjoint lanes, which lets the caller assemble a
+ * row tail of up to 7 pixels in one register set.  Lanes that are not loaded
+ * keep whatever they held before.  Any other bpp or size is rejected at
+ * assembly time.
+ */
+.macro do_load bpp, size
+  .if \bpp == 24
+    .if \size == 8
+      vld3.8        {d10, d11, d12}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
+      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
+      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
+      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
+    .elseif \size == 2
+      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
+      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
+    .elseif \size == 1
+      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .elseif \bpp == 32
+    .if \size == 8
+      vld4.8        {d10, d11, d12, d13}, [RGB]!
+      pld           [RGB, #128]
+    .elseif \size == 4
+      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
+      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
+      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
+      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
+    .elseif \size == 2
+      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
+      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
+    .elseif \size == 1
+      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
+    .else
+      .error unsupported macroblock size
+    .endif
+  .else
+    .error unsupported bpp
+  .endif
+.endm
+
+/*
+ * generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+ *
+ * Each invocation emits one constant table and one conversion function whose
+ * names are built from colorid.
+ *
+ *   colorid  name fragment pasted into the generated symbols
+ *   bpp      bits per input pixel, forwarded to do_load (24 or 32)
+ *   r_offs   single digit appended to "d1" to name the register (d10-d13, as
+ *   g_offs   filled by do_load) that holds the red, green and blue bytes
+ *   b_offs   respectively
+ *
+ * The helper macros defined inside capture these arguments, so they are
+ * purged at the end of the body to let the next invocation define them anew.
+ */
+.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
+
+/*
+ * 2-stage pipelined RGB->YCbCr conversion
+ */
+
+/*
+ * Stage 1: widen the eight loaded pixels to 16 bits and build the 32-bit
+ * weighted sums.  Lane operands index the constant table that the function
+ * prologue loads into d0-d3:
+ *
+ *   q7,  q8    Y  =       d0[0] * r + d0[1] * g + d0[2] * b
+ *   q9,  q13   Cb = q1  - d0[3] * r - d1[0] * g + d1[1] * b
+ *   q14, q15   Cr = q1  + d1[1] * r - d1[2] * g - d1[3] * b
+ *
+ * The first register of each pair covers pixels 0-3, the second pixels 4-7.
+ * Every 32-bit lane of q1 holds the same bit pattern, so vrev64.32 from q1
+ * simply seeds an accumulator with that bias value.
+ */
+.macro do_rgb_to_yuv_stage1
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vrev64.32       q9, q1
+    vrev64.32       q13, q1
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+/*
+ * Stage 2: shift the 32-bit sums from stage 1 right by 16 and narrow them to
+ * bytes.  Y uses the rounding narrowing shift (vrshrn); Cb and Cr use the
+ * truncating one (vshrn).
+ */
+.macro do_rgb_to_yuv_stage2
+    vrshrn.u32      d20, q7, #16
+    vrshrn.u32      d21, q8, #16
+    vshrn.u32       d22, q9, #16
+    vshrn.u32       d23, q13, #16
+    vshrn.u32       d24, q14, #16
+    vshrn.u32       d25, q15, #16
+    vmovn.u16       d20, q10       /* d20 = y */
+    vmovn.u16       d21, q11       /* d21 = u */
+    vmovn.u16       d22, q12       /* d22 = v */
+.endm
+
+/* Unpipelined conversion of one register set: stage 1 followed by stage 2. */
+.macro do_rgb_to_yuv
+    do_rgb_to_yuv_stage1
+    do_rgb_to_yuv_stage2
+.endm
+
+/*
+ * Steady-state loop body.  It contains the same instructions as
+ *   do_rgb_to_yuv_stage2 + do_store 8 + do_load bpp, 8 + do_rgb_to_yuv_stage1
+ * but interleaved: the lines with the deeper indent finish and store the
+ * previous eight pixels while the other lines load and start the next eight.
+ * The relative order matters, e.g. each accumulator is narrowed before it is
+ * re-seeded from q1, so do not reorder without re-checking every dependency.
+ */
+.macro do_rgb_to_yuv_stage2_store_load_stage1
+      vrshrn.u32      d20, q7, #16
+      vrshrn.u32      d21, q8, #16
+      vshrn.u32       d22, q9, #16
+    vrev64.32       q9, q1
+      vshrn.u32       d23, q13, #16
+    vrev64.32       q13, q1
+      vshrn.u32       d24, q14, #16
+      vshrn.u32       d25, q15, #16
+    do_load         \bpp, 8
+      vmovn.u16       d20, q10     /* d20 = y */
+    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
+      vmovn.u16       d21, q11     /* d21 = u */
+    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
+      vmovn.u16       d22, q12     /* d22 = v */
+    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
+    vmull.u16       q7, d4, d0[0]
+    vmlal.u16       q7, d6, d0[1]
+    vmlal.u16       q7, d8, d0[2]
+      vst1.8          {d20}, [Y]!
+    vmull.u16       q8, d5, d0[0]
+    vmlal.u16       q8, d7, d0[1]
+    vmlal.u16       q8, d9, d0[2]
+    vmlsl.u16       q9, d4, d0[3]
+    vmlsl.u16       q9, d6, d1[0]
+    vmlal.u16       q9, d8, d1[1]
+      vst1.8          {d21}, [U]!
+    vmlsl.u16       q13, d5, d0[3]
+    vmlsl.u16       q13, d7, d1[0]
+    vmlal.u16       q13, d9, d1[1]
+    vrev64.32       q14, q1
+    vrev64.32       q15, q1
+    vmlal.u16       q14, d4, d1[1]
+    vmlsl.u16       q14, d6, d1[2]
+    vmlsl.u16       q14, d8, d1[3]
+      vst1.8          {d22}, [V]!
+    vmlal.u16       q15, d5, d1[1]
+    vmlsl.u16       q15, d7, d1[2]
+    vmlsl.u16       q15, d9, d1[3]
+.endm
+
+/*
+ * Constant table, 16-byte aligned because it is loaded with a :128 hint.
+ *   row 0 -> d0: Y multipliers for r, g, b, then the Cb multiplier for r
+ *   row 1 -> d1: Cb multiplier for g, the shared 32768 term, Cr multipliers
+ *                for g and b
+ *   rows 2-3 -> d2, d3 (q1): bias pattern used to seed the Cb/Cr accumulators
+ * The multipliers are applied before a right shift by 16 in stage 2.
+ */
+.balign 16
+jsimd_\colorid\()_ycc_neon_consts:
+  .short 19595, 38470, 7471,  11059
+  .short 21709, 32768, 27439, 5329
+  .short 32767, 128,   32767, 128
+  .short 32767, 128,   32767, 128
+
+/*
+ * Generated entry point.  Register use on entry:
+ *   r0       OUTPUT_WIDTH  pixels per row
+ *   r1       INPUT_BUF     array of input row pointers, consumed in order
+ *   r2       OUTPUT_BUF    array of three plane pointers (Y, Cb, Cr)
+ *   r3       OUTPUT_ROW    index of the first output row within each plane
+ *   [sp]     NUM_ROWS      number of rows to convert (fifth argument)
+ * r4-r10, lr and d8-d15 are saved on entry and restored before returning.
+ */
+asm_function jsimd_\colorid\()_ycc_convert_neon
+    OUTPUT_WIDTH    .req r0
+    INPUT_BUF       .req r1
+    OUTPUT_BUF      .req r2
+    OUTPUT_ROW      .req r3
+    NUM_ROWS        .req r4
+
+    OUTPUT_BUF0     .req r5
+    OUTPUT_BUF1     .req r6
+    OUTPUT_BUF2     .req OUTPUT_BUF
+
+    RGB             .req r7
+    Y               .req r8
+    U               .req r9
+    V               .req r10
+    N               .req ip
+
+    /* Load constants to d0, d1, d2, d3 */
+    adr             ip, jsimd_\colorid\()_ycc_neon_consts
+    vld1.16         {d0, d1, d2, d3}, [ip, :128]
+
+    /* Save Arm registers and handle input arguments */
+    push            {r4, r5, r6, r7, r8, r9, r10, lr}
+    /* Eight words were just pushed, so the stack argument is at sp + 32 */
+    ldr             NUM_ROWS, [sp, #(4 * 8)]
+    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
+    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
+    /* OUTPUT_BUF2 aliases OUTPUT_BUF, so this load must come last */
+    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
+    .unreq          OUTPUT_BUF
+
+    /* Save Neon registers */
+    vpush           {d8 - d15}
+
+    /* Outer loop over scanlines */
+    cmp             NUM_ROWS, #1
+    blt             9f
+0:
+    /* Fetch this row's three output pointers and the next input row */
+    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
+    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
+    mov             N, OUTPUT_WIDTH
+    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
+    add             OUTPUT_ROW, OUTPUT_ROW, #1
+    ldr             RGB, [INPUT_BUF], #4
+
+    /* Inner loop over pixels */
+    subs            N, N, #8
+    blt             3f
+    /* Prime the pipeline with the first group of 8 pixels */
+    do_load         \bpp, 8
+    do_rgb_to_yuv_stage1
+    subs            N, N, #8
+    blt             2f
+1:
+    do_rgb_to_yuv_stage2_store_load_stage1
+    subs            N, N, #8
+    bge             1b
+2:
+    /* Drain the pipeline: finish and store the last full group of 8 */
+    do_rgb_to_yuv_stage2
+    do_store        8
+    /*
+     * N is negative here, but its low three bits still equal the number of
+     * leftover pixels (0-7), which is what the tst instructions below test.
+     */
+    tst             N, #7
+    beq             8f
+3:
+    /* Gather the 1-7 leftover pixels into disjoint lanes of d10-d13 */
+    tst             N, #4
+    beq             3f
+    do_load         \bpp, 4
+3:
+    tst             N, #2
+    beq             4f
+    do_load         \bpp, 2
+4:
+    tst             N, #1
+    beq             5f
+    do_load         \bpp, 1
+5:
+    /* Convert once, then store only the lanes that were loaded */
+    do_rgb_to_yuv
+    tst             N, #4
+    beq             6f
+    do_store        4
+6:
+    tst             N, #2
+    beq             7f
+    do_store        2
+7:
+    tst             N, #1
+    beq             8f
+    do_store        1
+8:
+    subs            NUM_ROWS, NUM_ROWS, #1
+    bgt             0b
+9:
+    /* Restore all registers and return */
+    vpop            {d8 - d15}
+    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
+
+    .unreq          OUTPUT_WIDTH
+    .unreq          OUTPUT_ROW
+    .unreq          INPUT_BUF
+    .unreq          NUM_ROWS
+    .unreq          OUTPUT_BUF0
+    .unreq          OUTPUT_BUF1
+    .unreq          OUTPUT_BUF2
+    .unreq          RGB
+    .unreq          Y
+    .unreq          U
+    .unreq          V
+    .unreq          N
+
+/* Drop the per-instantiation helpers so the next invocation can redefine them */
+.purgem do_rgb_to_yuv
+.purgem do_rgb_to_yuv_stage1
+.purgem do_rgb_to_yuv_stage2
+.purgem do_rgb_to_yuv_stage2_store_load_stage1
+
+.endm
+
+/*
+ * Instantiate one converter per supported input pixel layout.  The R, G and
+ * B columns give the byte position of each channel within a pixel, which is
+ * also the index of the register (d10 + position) that do_load fills with it.
+ */
+/*--------------------------------- id ----- bpp R  G  B */
+generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
+generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
+generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
+generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
+
+/* The shared load/store helpers are no longer needed past this point */
+.purgem do_load
+.purgem do_store
diff --git a/external/jpeg/simd/arm/aarch64/jccolext-neon.c b/external/jpeg/simd/arm/aarch64/jccolext-neon.c
new file mode 100644
index 000000000000..37130c225ed0
--- /dev/null
+++ b/external/jpeg/simd/arm/aarch64/jccolext-neon.c
@@ -0,0 +1,316 @@
+/*
+ * jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-neon.c */
+
+
+/* RGB -> YCbCr conversion is defined by the following equations:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *    Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128
+ *    Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B  + 128
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ *    0.16874695 = 11059 * 2^-16
+ *    0.33125305 = 21709 * 2^-16
+ *    0.50000000 = 32768 * 2^-16
+ *    0.41868592 = 27439 * 2^-16
+ *    0.08131409 =  5329 * 2^-16
+ * These constants are defined in jccolor-neon.c
+ *
+ * We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
+ * rounds up or down the result via integer truncation.
+ */
+
+/* Convert num_rows rows of interleaved RGB pixels to planar Y, Cb and Cr.
+ *
+ * image_width  number of pixels in each row
+ * input_buf    array of input row pointers; one is consumed per row
+ * output_buf   output planes: output_buf[0] receives Y, output_buf[1]
+ *              receives Cb, and output_buf[2] receives Cr
+ * output_row   index of the first row to write within each output plane
+ * num_rows     number of rows to convert; nothing is done if it is <= 0
+ *
+ * RGB_PIXELSIZE, RGB_RED, RGB_GREEN, and RGB_BLUE are not defined in this
+ * file; they are expected to be set by the file that includes it and select
+ * the input pixel layout.
+ *
+ * Each row is processed 16 pixels at a time.  A remainder of 9-15 pixels is
+ * handled by one more 16-pixel pass and a remainder of 1-8 pixels by one
+ * 8-pixel pass, in both cases reading from a zero-risk temporary copy of the
+ * input but storing a full vector to the output rows.
+ */
+void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  /* Pointer to RGB(X/A) input data */
+  JSAMPROW inptr;
+  /* Pointers to Y, Cb, and Cr output data */
+  JSAMPROW outptr0, outptr1, outptr2;
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  /* Set up conversion constants.  Lanes 0-2 of consts are used for Y, lanes
+   * 3-5 for Cb, and lanes 5-7 for Cr (lane 5 is shared).  scaled_128_5 is the
+   * starting value of every Cb/Cr accumulator: 128 in the upper 16 bits plus
+   * 32767 in the lower 16 bits.
+   */
+  const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
+  const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    int cols_remaining = image_width;
+    /* Main loop: full groups of 16 pixels, read directly from the input. */
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+
+      /* De-interleave so that each channel ends up in its own vector. */
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      /* Widen to 16-bit; _l holds pixels 0-7 and _h holds pixels 8-15. */
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+      /* Increment pointers. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr0 += 16;
+      outptr1 += 16;
+      outptr2 += 16;
+    }
+
+    if (cols_remaining > 8) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data are first memcopied to a temporary
+       * buffer large enough to accommodate the vector load.  The bytes of
+       * tmp_buf beyond the copied pixels are whatever the buffer held before;
+       * the output lanes computed from them fall in the permitted overwrite
+       * region mentioned at the store below.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
+      y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
+      uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
+      y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
+      uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
+      y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
+      uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
+      y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_ll = scaled_128_5;
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
+      cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
+      cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
+      uint32x4_t cb_lh = scaled_128_5;
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
+      cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
+      cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
+      uint32x4_t cb_hl = scaled_128_5;
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
+      cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
+      cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
+      uint32x4_t cb_hh = scaled_128_5;
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
+      cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
+      cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_ll = scaled_128_5;
+      cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
+      cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
+      uint32x4_t cr_lh = scaled_128_5;
+      cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
+      cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
+      uint32x4_t cr_hl = scaled_128_5;
+      cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
+      cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
+      uint32x4_t cr_hh = scaled_128_5;
+      cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
+      cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
+                                     vshrn_n_u32(cb_lh, 16));
+      uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
+                                     vshrn_n_u32(cb_hh, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
+                                     vshrn_n_u32(cr_lh, 16));
+      uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
+                                     vshrn_n_u32(cr_hh, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+      vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
+      vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
+
+    } else if (cols_remaining > 0) {
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns of data, of which there are at most 8 in
+       * this branch, are first memcopied to a temporary buffer large enough
+       * to accommodate the vector load.
+       */
+      memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+      inptr = tmp_buf;
+
+      /* Same computation as above on a single 8-pixel (64-bit) vector. */
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t input_pixels = vld4_u8(inptr);
+#else
+      uint8x8x3_t input_pixels = vld3_u8(inptr);
+#endif
+      uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
+      uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
+      uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
+      uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
+      y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
+      uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
+      y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
+
+      /* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B  + 128 */
+      uint32x4_t cb_l = scaled_128_5;
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
+      cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
+      cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
+      uint32x4_t cb_h = scaled_128_5;
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
+      cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
+      cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
+
+      /* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B  + 128 */
+      uint32x4_t cr_l = scaled_128_5;
+      cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
+      cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
+      uint32x4_t cr_h = scaled_128_5;
+      cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
+      cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
+
+      /* Descale Y values (rounding right shift) and narrow to 16-bit. */
+      uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
+                                      vrshrn_n_u32(y_h, 16));
+      /* Descale Cb values (right shift) and narrow to 16-bit. */
+      uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
+                                       vshrn_n_u32(cb_h, 16));
+      /* Descale Cr values (right shift) and narrow to 16-bit. */
+      uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
+                                       vshrn_n_u32(cr_h, 16));
+      /* Narrow Y, Cb, and Cr values to 8-bit and store to memory.  Buffer
+       * overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
+       */
+      vst1_u8(outptr0, vmovn_u16(y_u16));
+      vst1_u8(outptr1, vmovn_u16(cb_u16));
+      vst1_u8(outptr2, vmovn_u16(cr_u16));
+    }
+  }
+}
diff --git a/external/jpeg/simd/arm/aarch64/jchuff-neon.c b/external/jpeg/simd/arm/aarch64/jchuff-neon.c
new file mode 100644
index 000000000000..f13fd1b573c1
--- /dev/null
+++ b/external/jpeg/simd/arm/aarch64/jchuff-neon.c
@@ -0,0 +1,403 @@
+/*
+ * jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * NOTE: All referenced figures are from
+ * Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+ */
+
+#define JPEG_INTERNALS
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
+#include "../../jsimd.h"
+#include "../align.h"
+#include "../jchuff.h"
+#include "neon-compat.h"
+
+#include <limits.h>
+
+#include <arm_neon.h>
+
+
+ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
+    0,   1,   2,   3,  16,  17,  32,  33,
+   18,  19,   4,   5,   6,   7,  20,  21,
+   34,  35,  48,  49, 255, 255,  50,  51,
+   36,  37,  22,  23,   8,   9,  10,  11,
+  255, 255,   6,   7,  20,  21,  34,  35,
+   48,  49, 255, 255,  50,  51,  36,  37,
+   54,  55,  40,  41,  26,  27,  12,  13,
+   14,  15,  28,  29,  42,  43,  56,  57,
+    6,   7,  20,  21,  34,  35,  48,  49,
+   50,  51,  36,  37,  22,  23,   8,   9,
+   26,  27,  12,  13, 255, 255,  14,  15,
+   28,  29,  42,  43,  56,  57, 255, 255,
+   52,  53,  54,  55,  40,  41,  26,  27,
+   12,  13, 255, 255,  14,  15,  28,  29,
+   26,  27,  40,  41,  42,  43,  28,  29,
+   14,  15,  30,  31,  44,  45,  46,  47
+};
+
+JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
+                                         JCOEFPTR block, int last_dc_val,
+                                         c_derived_tbl *dctbl,
+                                         c_derived_tbl *actbl)
+{
+  uint16_t block_diff[DCTSIZE2];
+
+  /* Load lookup table indices for rows of zig-zag ordering. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const uint8x16x4_t idx_rows_0123 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
+  const uint8x16x4_t idx_rows_4567 =
+    vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
+#else
+  /* GCC does not currently support intrinsics vld1q_<type>_x4(). */
+  const uint8x16x4_t idx_rows_0123 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
+  } };
+  const uint8x16x4_t idx_rows_4567 = { {
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
+    vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
+  } };
+#endif
+
+  /* Load 8x8 block of DCT coefficients. */
+#ifdef HAVE_VLD1Q_U8_X4
+  const int8x16x4_t tbl_rows_0123 =
+    vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
+  const int8x16x4_t tbl_rows_4567 =
+    vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
+#else
+  const int8x16x4_t tbl_rows_0123 = { {
+    vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
+  } };
+  const int8x16x4_t tbl_rows_4567 = { {
+    vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
+    vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
+  } };
+#endif
+
+  /* Initialise extra lookup tables. */
+  const int8x16x4_t tbl_rows_2345 = { {
+    tbl_rows_0123.val[2], tbl_rows_0123.val[3],
+    tbl_rows_4567.val[0], tbl_rows_4567.val[1]
+  } };
+  const int8x16x3_t tbl_rows_567 =
+    { { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
+
+  /* Shuffle coefficients into zig-zag order. */
+  int16x8_t row0 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
+  int16x8_t row1 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
+  int16x8_t row2 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
+  int16x8_t row3 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
+  int16x8_t row4 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
+  int16x8_t row5 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
+  int16x8_t row6 =
+    vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
+  int16x8_t row7 =
+    vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
+
+  /* Compute DC coefficient difference value (F.1.1.5.1). */
+  row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
+  /* Initialize AC coefficient lanes not reachable by lookup tables. */
+  row1 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
+                                  0), row1, 2);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  4), row2, 0);
+  row2 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  0), row2, 5);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
+                                  7), row5, 2);
+  row5 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
+                                  3), row5, 7);
+  row6 =
+    vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
+                                  7), row6, 5);
+
+  /* DCT block is now in zig-zag order; start Huffman encoding process. */
+  int16x8_t abs_row0 = vabsq_s16(row0);
+  int16x8_t abs_row1 = vabsq_s16(row1);
+  int16x8_t abs_row2 = vabsq_s16(row2);
+  int16x8_t abs_row3 = vabsq_s16(row3);
+  int16x8_t abs_row4 = vabsq_s16(row4);
+  int16x8_t abs_row5 = vabsq_s16(row5);
+  int16x8_t abs_row6 = vabsq_s16(row6);
+  int16x8_t abs_row7 = vabsq_s16(row7);
+
+  /* For negative coeffs: diff = coeff - 1 = ~abs(coeff) */
+  uint16x8_t row0_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row0, vshrq_n_s16(row0, 15)));
+  uint16x8_t row1_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row1, vshrq_n_s16(row1, 15)));
+  uint16x8_t row2_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row2, vshrq_n_s16(row2, 15)));
+  uint16x8_t row3_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row3, vshrq_n_s16(row3, 15)));
+  uint16x8_t row4_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row4, vshrq_n_s16(row4, 15)));
+  uint16x8_t row5_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row5, vshrq_n_s16(row5, 15)));
+  uint16x8_t row6_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row6, vshrq_n_s16(row6, 15)));
+  uint16x8_t row7_diff =
+    vreinterpretq_u16_s16(veorq_s16(abs_row7, vshrq_n_s16(row7, 15)));
+
+  /* Construct bitmap to accelerate encoding of AC coefficients.  A set bit
+   * means that the corresponding coefficient != 0.
+   */
+  uint8x8_t abs_row0_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row0),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row1_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row1),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row2_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row2),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row3_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row3),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row4_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row4),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row5_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row5),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row6_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row6),
+                                               vdupq_n_u16(0)));
+  uint8x8_t abs_row7_gt0 = vmovn_u16(vcgtq_u16(vreinterpretq_u16_s16(abs_row7),
+                                               vdupq_n_u16(0)));
+
+  /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
+
+  abs_row0_gt0 = vand_u8(abs_row0_gt0, bitmap_mask);
+  abs_row1_gt0 = vand_u8(abs_row1_gt0, bitmap_mask);
+  abs_row2_gt0 = vand_u8(abs_row2_gt0, bitmap_mask);
+  abs_row3_gt0 = vand_u8(abs_row3_gt0, bitmap_mask);
+  abs_row4_gt0 = vand_u8(abs_row4_gt0, bitmap_mask);
+  abs_row5_gt0 = vand_u8(abs_row5_gt0, bitmap_mask);
+  abs_row6_gt0 = vand_u8(abs_row6_gt0, bitmap_mask);
+  abs_row7_gt0 = vand_u8(abs_row7_gt0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_10 = vpadd_u8(abs_row1_gt0, abs_row0_gt0);
+  uint8x8_t bitmap_rows_32 = vpadd_u8(abs_row3_gt0, abs_row2_gt0);
+  uint8x8_t bitmap_rows_54 = vpadd_u8(abs_row5_gt0, abs_row4_gt0);
+  uint8x8_t bitmap_rows_76 = vpadd_u8(abs_row7_gt0, abs_row6_gt0);
+  uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
+  uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
+
+  /* Shift left to remove DC bit. */
+  bitmap_all =
+    vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
+  /* Count bits set (number of non-zero coefficients) in bitmap. */
+  unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
+  /* Move bitmap to 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Set up state and bit buffer for output bitstream. */
+  working_state *state_ptr = (working_state *)state;
+  int free_bits = state_ptr->cur.free_bits;
+  size_t put_buffer = state_ptr->cur.put_buffer;
+
+  /* Encode DC coefficient. */
+
+  /* Find nbits required to specify sign and amplitude of coefficient. */
+#if defined(_MSC_VER) && !defined(__clang__)
+  unsigned int lz = BUILTIN_CLZ(vgetq_lane_s16(abs_row0, 0));
+#else
+  unsigned int lz;
+  __asm__("clz %w0, %w1" : "=r"(lz) : "r"(vgetq_lane_s16(abs_row0, 0)));
+#endif
+  unsigned int nbits = 32 - lz;
+  /* Emit Huffman-coded symbol and additional diff bits. */
+  unsigned int diff = (unsigned int)(vgetq_lane_u16(row0_diff, 0) << lz) >> lz;
+  PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
+
+  /* Encode AC coefficients. */
+
+  unsigned int r = 0;  /* r = run length of zeros */
+  unsigned int i = 1;  /* i = number of coefficients encoded */
+  /* Code and size information for a run length of 16 zero coefficients */
+  const unsigned int code_0xf0 = actbl->ehufco[0xf0];
+  const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
+
+  /* The most efficient method of computing nbits and diff depends on the
+   * number of non-zero coefficients.  If the bitmap is not too sparse (> 8
+   * non-zero AC coefficients), it is beneficial to use Neon; else we compute
+   * nbits and diff on demand using scalar code.
+   */
+  if (non_zero_coefficients > 8) {
+    uint8_t block_nbits[DCTSIZE2];
+
+    int16x8_t row0_lz = vclzq_s16(abs_row0);
+    int16x8_t row1_lz = vclzq_s16(abs_row1);
+    int16x8_t row2_lz = vclzq_s16(abs_row2);
+    int16x8_t row3_lz = vclzq_s16(abs_row3);
+    int16x8_t row4_lz = vclzq_s16(abs_row4);
+    int16x8_t row5_lz = vclzq_s16(abs_row5);
+    int16x8_t row6_lz = vclzq_s16(abs_row6);
+    int16x8_t row7_lz = vclzq_s16(abs_row7);
+    /* Compute nbits needed to specify magnitude of each coefficient. */
+    uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
+    uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
+    uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
+    uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
+    uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
+    uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
+    uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
+    uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
+                                   vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
+    /* Store nbits. */
+    vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
+    vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
+    vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
+    vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
+    vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
+    vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
+    vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
+    vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
+    /* Mask bits not required to specify sign and amplitude of diff. */
+    row0_diff = vshlq_u16(row0_diff, row0_lz);
+    row1_diff = vshlq_u16(row1_diff, row1_lz);
+    row2_diff = vshlq_u16(row2_diff, row2_lz);
+    row3_diff = vshlq_u16(row3_diff, row3_lz);
+    row4_diff = vshlq_u16(row4_diff, row4_lz);
+    row5_diff = vshlq_u16(row5_diff, row5_lz);
+    row6_diff = vshlq_u16(row6_diff, row6_lz);
+    row7_diff = vshlq_u16(row7_diff, row7_lz);
+    row0_diff = vshlq_u16(row0_diff, vnegq_s16(row0_lz));
+    row1_diff = vshlq_u16(row1_diff, vnegq_s16(row1_lz));
+    row2_diff = vshlq_u16(row2_diff, vnegq_s16(row2_lz));
+    row3_diff = vshlq_u16(row3_diff, vnegq_s16(row3_lz));
+    row4_diff = vshlq_u16(row4_diff, vnegq_s16(row4_lz));
+    row5_diff = vshlq_u16(row5_diff, vnegq_s16(row5_lz));
+    row6_diff = vshlq_u16(row6_diff, vnegq_s16(row6_lz));
+    row7_diff = vshlq_u16(row7_diff, vnegq_s16(row7_lz));
+    /* Store diff bits. */
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      nbits = block_nbits[i];
+      diff = block_diff[i];
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  } else if (bitmap != 0) {
+    uint16_t block_abs[DCTSIZE2];
+    /* Store absolute value of coefficients. */
+    vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
+    vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
+    vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
+    vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
+    vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
+    vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
+    vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
+    vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
+    /* Store diff bits. */
+    vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
+    vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
+    vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
+    vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
+    vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
+    vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
+    vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
+    vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
+
+    /* Same as above but must mask diff bits and compute nbits on demand. */
+    while (bitmap != 0) {
+      r = BUILTIN_CLZLL(bitmap);
+      i += r;
+      bitmap <<= r;
+      lz = BUILTIN_CLZ(block_abs[i]);
+      nbits = 32 - lz;
+      diff = (unsigned int)(block_diff[i] << lz) >> lz;
+      while (r > 15) {
+        /* If run length > 15, emit special run-length-16 codes. */
+        PUT_BITS(code_0xf0, size_0xf0)
+        r -= 16;
+      }
+      /* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
+      unsigned int rs = (r << 4) + nbits;
+      PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
+      i++;
+      bitmap <<= 1;
+    }
+  }
+
+  /* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
+   * The value of RS for the EOB code is 0.
+   */
+  if (i != 64) {
+    PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
+  }
+
+  state_ptr->cur.put_buffer = put_buffer;
+  state_ptr->cur.free_bits = free_bits;
+
+  return buffer;
+}
diff --git a/external/jpeg/simd/arm64/jsimd.c b/external/jpeg/simd/arm/aarch64/jsimd.c
similarity index 72%
rename from external/jpeg/simd/arm64/jsimd.c
rename to external/jpeg/simd/arm/aarch64/jsimd.c
index 808c0e3e27d7..8570b82c7599 100644
--- a/external/jpeg/simd/arm64/jsimd.c
+++ b/external/jpeg/simd/arm/aarch64/jsimd.c
@@ -3,8 +3,9 @@
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2011, Nokia Corporation and/or its subsidiary(-ies).
- * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, D. R. Commander.
+ * Copyright (C) 2009-2011, 2013-2014, 2016, 2018, 2020, D. R. Commander.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -16,12 +17,13 @@
  */
 
 #define JPEG_INTERNALS
-#include "../../jinclude.h"
-#include "../../jpeglib.h"
+#include "../../../jinclude.h"
+#include "../../../jpeglib.h"
+#include "../../../jsimd.h"
+#include "../../../jdct.h"
+#include "../../../jsimddct.h"
 #include "../../jsimd.h"
-#include "../../jdct.h"
-#include "../../jsimddct.h"
-#include "../jsimd.h"
+#include "jconfigint.h"
 
 #include <stdio.h>
 #include <string.h>
@@ -189,6 +191,19 @@ jsimd_can_rgb_ycc(void)
 GLOBAL(int)
 jsimd_can_rgb_gray(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -237,20 +252,28 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
 
   switch (cinfo->in_color_space) {
   case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTLD3)
+#endif
       neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
     break;
   case JCS_EXT_RGBX:
   case JCS_EXT_RGBA:
     neonfct = jsimd_extrgbx_ycc_convert_neon;
     break;
   case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTLD3)
+#endif
       neonfct = jsimd_extbgr_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_extbgr_ycc_convert_neon_slowld3;
+#endif
     break;
   case JCS_EXT_BGRX:
   case JCS_EXT_BGRA:
@@ -265,10 +288,14 @@ jsimd_rgb_ycc_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
     neonfct = jsimd_extxrgb_ycc_convert_neon;
     break;
   default:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTLD3)
+#endif
       neonfct = jsimd_extrgb_ycc_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_extrgb_ycc_convert_neon_slowld3;
+#endif
     break;
   }
 
@@ -280,6 +307,37 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
                        JSAMPIMAGE output_buf, JDIMENSION output_row,
                        int num_rows)
 {
+  void (*neonfct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    neonfct = jsimd_extrgbx_gray_convert_neon;
+    break;
+  case JCS_EXT_BGR:
+    neonfct = jsimd_extbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    neonfct = jsimd_extbgrx_gray_convert_neon;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    neonfct = jsimd_extxbgr_gray_convert_neon;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    neonfct = jsimd_extxrgb_gray_convert_neon;
+    break;
+  default:
+    neonfct = jsimd_extrgb_gray_convert_neon;
+    break;
+  }
+
+  neonfct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -291,20 +349,28 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
 
   switch (cinfo->out_color_space) {
   case JCS_EXT_RGB:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTST3)
+#endif
       neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
     break;
   case JCS_EXT_RGBX:
   case JCS_EXT_RGBA:
     neonfct = jsimd_ycc_extrgbx_convert_neon;
     break;
   case JCS_EXT_BGR:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTST3)
+#endif
       neonfct = jsimd_ycc_extbgr_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_ycc_extbgr_convert_neon_slowst3;
+#endif
     break;
   case JCS_EXT_BGRX:
   case JCS_EXT_BGRA:
@@ -319,10 +385,14 @@ jsimd_ycc_rgb_convert(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
     neonfct = jsimd_ycc_extxrgb_convert_neon;
     break;
   default:
+#ifndef NEON_INTRINSICS
     if (simd_features & JSIMD_FASTST3)
+#endif
       neonfct = jsimd_ycc_extrgb_convert_neon;
+#ifndef NEON_INTRINSICS
     else
       neonfct = jsimd_ycc_extrgb_convert_neon_slowst3;
+#endif
     break;
   }
 
@@ -397,12 +467,33 @@ jsimd_h2v1_downsample(j_compress_ptr cinfo, jpeg_component_info *compptr,
 GLOBAL(int)
 jsimd_can_h2v2_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -410,23 +501,66 @@ GLOBAL(void)
 jsimd_h2v2_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                     JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_upsample_neon(cinfo->max_v_samp_factor, cinfo->output_width,
+                           input_data, output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
+  return 0;
+}
+
+GLOBAL(int)
+jsimd_can_h1v2_fancy_upsample(void)
+{
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -434,23 +568,60 @@ GLOBAL(void)
 jsimd_h2v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(void)
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
+}
+
+GLOBAL(void)
+jsimd_h1v2_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+                          JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
+{
+  jsimd_h1v2_fancy_upsample_neon(cinfo->max_v_samp_factor,
+                                 compptr->downsampled_width, input_data,
+                                 output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -458,12 +629,74 @@ GLOBAL(void)
 jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v2_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v2_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v2_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v2_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v2_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v2_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(void)
 jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*neonfct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+    case JCS_EXT_RGB:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+    case JCS_EXT_RGBX:
+    case JCS_EXT_RGBA:
+      neonfct = jsimd_h2v1_extrgbx_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGR:
+      neonfct = jsimd_h2v1_extbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_BGRX:
+    case JCS_EXT_BGRA:
+      neonfct = jsimd_h2v1_extbgrx_merged_upsample_neon;
+      break;
+    case JCS_EXT_XBGR:
+    case JCS_EXT_ABGR:
+      neonfct = jsimd_h2v1_extxbgr_merged_upsample_neon;
+      break;
+    case JCS_EXT_XRGB:
+    case JCS_EXT_ARGB:
+      neonfct = jsimd_h2v1_extxrgb_merged_upsample_neon;
+      break;
+    default:
+      neonfct = jsimd_h2v1_extrgb_merged_upsample_neon;
+      break;
+  }
+
+  neonfct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(int)
@@ -762,17 +995,33 @@ jsimd_huff_encode_one_block(void *state, JOCTET *buffer, JCOEFPTR block,
                             int last_dc_val, c_derived_tbl *dctbl,
                             c_derived_tbl *actbl)
 {
+#ifndef NEON_INTRINSICS
   if (simd_features & JSIMD_FASTTBL)
+#endif
     return jsimd_huff_encode_one_block_neon(state, buffer, block, last_dc_val,
                                             dctbl, actbl);
+#ifndef NEON_INTRINSICS
   else
     return jsimd_huff_encode_one_block_neon_slowtbl(state, buffer, block,
                                                     last_dc_val, dctbl, actbl);
+#endif
 }
 
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_first_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -781,11 +1030,25 @@ jsimd_encode_mcu_AC_first_prepare(const JCOEF *block,
                                   const int *jpeg_natural_order_start, int Sl,
                                   int Al, JCOEF *values, size_t *zerobits)
 {
+  jsimd_encode_mcu_AC_first_prepare_neon(block, jpeg_natural_order_start,
+                                         Sl, Al, values, zerobits);
 }
 
 GLOBAL(int)
 jsimd_can_encode_mcu_AC_refine_prepare(void)
 {
+  init_simd();
+
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (SIZEOF_SIZE_T != 8)
+    return 0;
+
+  if (simd_support & JSIMD_NEON)
+    return 1;
+
   return 0;
 }
 
@@ -794,5 +1057,7 @@ jsimd_encode_mcu_AC_refine_prepare(const JCOEF *block,
                                    const int *jpeg_natural_order_start, int Sl,
                                    int Al, JCOEF *absvalues, size_t *bits)
 {
-  return 0;
+  return jsimd_encode_mcu_AC_refine_prepare_neon(block,
+                                                 jpeg_natural_order_start,
+                                                 Sl, Al, absvalues, bits);
 }
diff --git a/external/jpeg/simd/arm64/jsimd_neon.S b/external/jpeg/simd/arm/aarch64/jsimd_neon.S
similarity index 69%
rename from external/jpeg/simd/arm64/jsimd_neon.S
rename to external/jpeg/simd/arm/aarch64/jsimd_neon.S
index c13d0d37c015..31aa8e258efb 100644
--- a/external/jpeg/simd/arm64/jsimd_neon.S
+++ b/external/jpeg/simd/arm/aarch64/jsimd_neon.S
@@ -86,56 +86,6 @@ Ljsimd_idct_islow_neon_consts:
 #undef F_2_562
 #undef F_3_072
 
-/* Constants for jsimd_idct_ifast_neon() */
-
-.balign 16
-Ljsimd_idct_ifast_neon_consts:
-  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200 */
-  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562 */
-  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065 */
-  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930 */
-
-/* Constants for jsimd_idct_4x4_neon() and jsimd_idct_2x2_neon() */
-
-#define CONST_BITS  13
-
-#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
-
-.balign 16
-Ljsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065        /* v0.h[0] */
-  .short -FIX_0_765366865       /* v0.h[1] */
-  .short -FIX_0_211164243       /* v0.h[2] */
-  .short FIX_1_451774981        /* v0.h[3] */
-  .short -FIX_2_172734803       /* d1[0] */
-  .short FIX_1_061594337        /* d1[1] */
-  .short -FIX_0_509795579       /* d1[2] */
-  .short -FIX_0_601344887       /* d1[3] */
-  .short FIX_0_899976223        /* v2.h[0] */
-  .short FIX_2_562915447        /* v2.h[1] */
-  .short 1 << (CONST_BITS + 1)  /* v2.h[2] */
-  .short 0                      /* v2.h[3] */
-
-.balign 8
-Ljsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* v14[0] */
-  .short FIX_0_850430095   /* v14[1] */
-  .short -FIX_1_272758580  /* v14[2] */
-  .short FIX_3_624509785   /* v14[3] */
-
 /* Constants for jsimd_ycc_*_neon() */
 
 .balign 16
@@ -201,52 +151,6 @@ Ljsimd_fdct_islow_neon_consts:
 #undef F_2_562
 #undef F_3_072
 
-/* Constants for jsimd_fdct_ifast_neon() */
-
-.balign 16
-Ljsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
-/* Constants for jsimd_h2*_downsample_neon() */
-
-.balign 16
-Ljsimd_h2_downsample_neon_consts:
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F  /* diff 0 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E  /* diff 1 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D  /* diff 2 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C  /* diff 3 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B  /* diff 4 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A  /* diff 5 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09  /* diff 6 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08  /* diff 7 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \
-        0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07  /* diff 8 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, \
-        0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06  /* diff 9 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05, \
-        0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05  /* diff 10 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04, \
-        0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04  /* diff 11 */
-  .byte 0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, \
-        0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03  /* diff 12 */
-  .byte 0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, \
-        0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02  /* diff 13 */
-  .byte 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, \
-        0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01  /* diff 14 */
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \
-        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00  /* diff 15 */
-
 /* Constants for jsimd_huff_encode_one_block_neon() */
 
 .balign 16
@@ -278,11 +182,6 @@ Ljsimd_huff_encode_one_block_neon_consts:
     .byte    4,   5,   6,   7, 255, 255, 255, 255, \
            255, 255, 255, 255, 255, 255, 255, 255  /* L7 : 1 line OK */
 
-.text
-
-
-#define RESPECT_STRICT_ALIGNMENT  1
-
 
 /*****************************************************************************/
 
@@ -313,45 +212,6 @@ _\fname:
 #endif
 .endm
 
-/* Transpose elements of single 128 bit registers */
-.macro transpose_single x0, x1, xi, xilen, literal
-    ins             \xi\xilen[0], \x0\xilen[0]
-    ins             \x1\xilen[0], \x0\xilen[1]
-    trn1            \x0\literal, \x0\literal, \x1\literal
-    trn2            \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose elements of 2 different registers */
-.macro transpose x0, x1, xi, xilen, literal
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\literal, \x0\literal, \x1\literal
-    trn2            \x1\literal, \xi\literal, \x1\literal
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4_32 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\x0len, \x0\x0len, \x2\x2len
-    trn2            \x2\x2len, \xi\x0len, \x2\x2len
-    mov             \xi\xilen, \x1\xilen
-    trn1            \x1\x1len, \x1\x1len, \x3\x3len
-    trn2            \x3\x3len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4_16 x0, x0len, x1, x1len, x2, x2len, x3, x3len, xi, xilen
-    mov             \xi\xilen, \x0\xilen
-    trn1            \x0\x0len, \x0\x0len, \x1\x1len
-    trn2            \x1\x2len, \xi\x0len, \x1\x2len
-    mov             \xi\xilen, \x2\xilen
-    trn1            \x2\x2len, \x2\x2len, \x3\x3len
-    trn2            \x3\x2len, \xi\x1len, \x3\x3len
-.endm
-
-.macro transpose_4x4 x0, x1, x2, x3, x5
-    transpose_4x4_16 \x0, .4h, \x1, .4h, \x2, .4h, \x3, .4h, \x5, .16b
-    transpose_4x4_32 \x0, .2s, \x1, .2s, \x2, .2s, \x3, .2s, \x5, .16b
-.endm
-
 .macro transpose_8x8 l0, l1, l2, l3, l4, l5, l6, l7, t0, t1, t2, t3
     trn1            \t0\().8h, \l0\().8h, \l1\().8h
     trn1            \t1\().8h, \l2\().8h, \l3\().8h
@@ -981,619 +841,6 @@ asm_function jsimd_idct_islow_neon
 #undef XFIX_P_3_072
 
 
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
- * function from jidctfst.c
- *
- * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in Arm Neon case some extra additions are required because VQDMULH
- * instruction can't handle the constants larger than 1. So the expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
- */
-
-#define XFIX_1_082392200  v0.h[0]
-#define XFIX_1_414213562  v0.h[1]
-#define XFIX_1_847759065  v0.h[2]
-#define XFIX_2_613125930  v0.h[3]
-
-asm_function jsimd_idct_ifast_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x1
-    TMP3            .req x9
-    TMP4            .req x10
-    TMP5            .req x11
-    TMP6            .req x12
-    TMP7            .req x13
-    TMP8            .req x14
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* Load and dequantize coefficients into Neon registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( v16.8h )
-     *   1 | d18     | d19     ( v17.8h )
-     *   2 | d20     | d21     ( v18.8h )
-     *   3 | d22     | d23     ( v19.8h )
-     *   4 | d24     | d25     ( v20.8h )
-     *   5 | d26     | d27     ( v21.8h )
-     *   6 | d28     | d29     ( v22.8h )
-     *   7 | d30     | d31     ( v23.8h )
-     */
-    /* Save Neon registers used in fast IDCT */
-    get_symbol_loc  TMP5, Ljsimd_idct_ifast_neon_consts
-    ld1             {v16.8h, v17.8h}, [COEF_BLOCK], 32
-    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    ld1             {v18.8h, v19.8h}, [COEF_BLOCK], 32
-    mul             v16.8h, v16.8h, v0.8h
-    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v17.8h, v17.8h, v1.8h
-    ld1             {v20.8h, v21.8h}, [COEF_BLOCK], 32
-    mul             v18.8h, v18.8h, v2.8h
-    ld1             {v0.8h, v1.8h}, [DCT_TABLE], 32
-    mul             v19.8h, v19.8h, v3.8h
-    ld1             {v22.8h, v23.8h}, [COEF_BLOCK], 32
-    mul             v20.8h, v20.8h, v0.8h
-    ld1             {v2.8h, v3.8h}, [DCT_TABLE], 32
-    mul             v22.8h, v22.8h, v2.8h
-    mul             v21.8h, v21.8h, v1.8h
-    ld1             {v0.4h}, [TMP5]        /* load constants */
-    mul             v23.8h, v23.8h, v3.8h
-
-    /* 1-D IDCT, pass 1 */
-    sub             v2.8h, v18.8h, v22.8h
-    add             v22.8h, v18.8h, v22.8h
-    sub             v1.8h, v19.8h, v21.8h
-    add             v21.8h, v19.8h, v21.8h
-    sub             v5.8h, v17.8h, v23.8h
-    add             v23.8h, v17.8h, v23.8h
-    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
-    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
-    add             v3.8h, v1.8h, v1.8h
-    sub             v1.8h, v5.8h, v1.8h
-    add             v18.8h, v2.8h, v4.8h
-    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
-    sub             v2.8h, v23.8h, v21.8h
-    add             v3.8h, v3.8h, v6.8h
-    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
-    add             v1.8h, v1.8h, v4.8h
-    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
-    sub             v18.8h, v18.8h, v22.8h
-    add             v2.8h, v2.8h, v6.8h
-    sub             v6.8h, v16.8h, v20.8h
-    add             v20.8h, v16.8h, v20.8h
-    add             v17.8h, v5.8h, v4.8h
-    add             v5.8h, v6.8h, v18.8h
-    sub             v18.8h, v6.8h, v18.8h
-    add             v6.8h, v23.8h, v21.8h
-    add             v16.8h, v20.8h, v22.8h
-    sub             v3.8h, v6.8h, v3.8h
-    sub             v20.8h, v20.8h, v22.8h
-    sub             v3.8h, v3.8h, v1.8h
-    sub             v1.8h, v17.8h, v1.8h
-    add             v2.8h, v3.8h, v2.8h
-    sub             v23.8h, v16.8h, v6.8h
-    add             v1.8h, v1.8h, v2.8h
-    add             v16.8h, v16.8h, v6.8h
-    add             v22.8h, v5.8h, v3.8h
-    sub             v17.8h, v5.8h, v3.8h
-    sub             v21.8h, v18.8h, v2.8h
-    add             v18.8h, v18.8h, v2.8h
-    sub             v19.8h, v20.8h, v1.8h
-    add             v20.8h, v20.8h, v1.8h
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v28, v29, v30, v31
-    /* 1-D IDCT, pass 2 */
-    sub             v2.8h, v18.8h, v22.8h
-    add             v22.8h, v18.8h, v22.8h
-    sub             v1.8h, v19.8h, v21.8h
-    add             v21.8h, v19.8h, v21.8h
-    sub             v5.8h, v17.8h, v23.8h
-    add             v23.8h, v17.8h, v23.8h
-    sqdmulh         v4.8h, v2.8h, XFIX_1_414213562
-    sqdmulh         v6.8h, v1.8h, XFIX_2_613125930
-    add             v3.8h, v1.8h, v1.8h
-    sub             v1.8h, v5.8h, v1.8h
-    add             v18.8h, v2.8h, v4.8h
-    sqdmulh         v4.8h, v1.8h, XFIX_1_847759065
-    sub             v2.8h, v23.8h, v21.8h
-    add             v3.8h, v3.8h, v6.8h
-    sqdmulh         v6.8h, v2.8h, XFIX_1_414213562
-    add             v1.8h, v1.8h, v4.8h
-    sqdmulh         v4.8h, v5.8h, XFIX_1_082392200
-    sub             v18.8h, v18.8h, v22.8h
-    add             v2.8h, v2.8h, v6.8h
-    sub             v6.8h, v16.8h, v20.8h
-    add             v20.8h, v16.8h, v20.8h
-    add             v17.8h, v5.8h, v4.8h
-    add             v5.8h, v6.8h, v18.8h
-    sub             v18.8h, v6.8h, v18.8h
-    add             v6.8h, v23.8h, v21.8h
-    add             v16.8h, v20.8h, v22.8h
-    sub             v3.8h, v6.8h, v3.8h
-    sub             v20.8h, v20.8h, v22.8h
-    sub             v3.8h, v3.8h, v1.8h
-    sub             v1.8h, v17.8h, v1.8h
-    add             v2.8h, v3.8h, v2.8h
-    sub             v23.8h, v16.8h, v6.8h
-    add             v1.8h, v1.8h, v2.8h
-    add             v16.8h, v16.8h, v6.8h
-    add             v22.8h, v5.8h, v3.8h
-    sub             v17.8h, v5.8h, v3.8h
-    sub             v21.8h, v18.8h, v2.8h
-    add             v18.8h, v18.8h, v2.8h
-    sub             v19.8h, v20.8h, v1.8h
-    add             v20.8h, v20.8h, v1.8h
-    /* Descale to 8-bit and range limit */
-    movi            v0.16b, #0x80
-      /* Prepare pointers (dual-issue with Neon instructions) */
-      ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    sqshrn          v28.8b, v16.8h, #5
-      ldp             TMP3, TMP4, [OUTPUT_BUF], 16
-    sqshrn          v29.8b, v17.8h, #5
-      add             TMP1, TMP1, OUTPUT_COL
-    sqshrn          v30.8b, v18.8h, #5
-      add             TMP2, TMP2, OUTPUT_COL
-    sqshrn          v31.8b, v19.8h, #5
-      add             TMP3, TMP3, OUTPUT_COL
-    sqshrn2         v28.16b, v20.8h, #5
-      add             TMP4, TMP4, OUTPUT_COL
-    sqshrn2         v29.16b, v21.8h, #5
-      ldp             TMP5, TMP6, [OUTPUT_BUF], 16
-    sqshrn2         v30.16b, v22.8h, #5
-      ldp             TMP7, TMP8, [OUTPUT_BUF], 16
-    sqshrn2         v31.16b, v23.8h, #5
-      add             TMP5, TMP5, OUTPUT_COL
-    add             v16.16b, v28.16b, v0.16b
-      add             TMP6, TMP6, OUTPUT_COL
-    add             v18.16b, v29.16b, v0.16b
-      add             TMP7, TMP7, OUTPUT_COL
-    add             v20.16b, v30.16b, v0.16b
-      add             TMP8, TMP8, OUTPUT_COL
-    add             v22.16b, v31.16b, v0.16b
-
-    /* Transpose the final 8-bit samples */
-    trn1            v28.16b, v16.16b, v18.16b
-    trn1            v30.16b, v20.16b, v22.16b
-    trn2            v29.16b, v16.16b, v18.16b
-    trn2            v31.16b, v20.16b, v22.16b
-
-    trn1            v16.8h, v28.8h, v30.8h
-    trn2            v18.8h, v28.8h, v30.8h
-    trn1            v20.8h, v29.8h, v31.8h
-    trn2            v22.8h, v29.8h, v31.8h
-
-    uzp1            v28.4s, v16.4s, v18.4s
-    uzp2            v30.4s, v16.4s, v18.4s
-    uzp1            v29.4s, v20.4s, v22.4s
-    uzp2            v31.4s, v20.4s, v22.4s
-
-    /* Store results to the output buffer */
-    st1             {v28.d}[0], [TMP1]
-    st1             {v29.d}[0], [TMP2]
-    st1             {v28.d}[1], [TMP3]
-    st1             {v29.d}[1], [TMP4]
-    st1             {v30.d}[0], [TMP5]
-    st1             {v31.d}[0], [TMP6]
-    st1             {v30.d}[1], [TMP7]
-    st1             {v31.d}[1], [TMP8]
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-    .unreq          TMP5
-    .unreq          TMP6
-    .unreq          TMP7
-    .unreq          TMP8
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon optimized function is
- *       bit exact compatibility with jpeg-6b.
- *
- * TODO: a bit better instructions scheduling can be achieved by expanding
- *       idct_helper/transpose_4x4 macros and reordering instructions,
- *       but readability will suffer somewhat.
- */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    smull           v28.4s, \x4, v2.h[2]
-    smlal           v28.4s, \x8, v0.h[0]
-    smlal           v28.4s, \x14, v0.h[1]
-
-    smull           v26.4s, \x16, v1.h[2]
-    smlal           v26.4s, \x12, v1.h[3]
-    smlal           v26.4s, \x10, v2.h[0]
-    smlal           v26.4s, \x6, v2.h[1]
-
-    smull           v30.4s, \x4, v2.h[2]
-    smlsl           v30.4s, \x8, v0.h[0]
-    smlsl           v30.4s, \x14, v0.h[1]
-
-    smull           v24.4s, \x16, v0.h[2]
-    smlal           v24.4s, \x12, v0.h[3]
-    smlal           v24.4s, \x10, v1.h[0]
-    smlal           v24.4s, \x6, v1.h[1]
-
-    add             v20.4s, v28.4s, v26.4s
-    sub             v28.4s, v28.4s, v26.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v28.4s, v28.4s, #\shift
-    xtn             \y26, v20.4s
-    xtn             \y29, v28.4s
-  .else
-    rshrn           \y26, v20.4s, #\shift
-    rshrn           \y29, v28.4s, #\shift
-  .endif
-
-    add             v20.4s, v30.4s, v24.4s
-    sub             v30.4s, v30.4s, v24.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v30.4s, v30.4s, #\shift
-    xtn             \y27, v20.4s
-    xtn             \y28, v30.4s
-  .else
-    rshrn           \y27, v20.4s, #\shift
-    rshrn           \y28, v30.4s, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x1
-    TMP3            .req x2
-    TMP4            .req x15
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* Save all used Neon registers */
-    sub             sp, sp, 64
-    mov             x9, sp
-    /* Load constants (v3.4h is just used for padding) */
-    get_symbol_loc  TMP4, Ljsimd_idct_4x4_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v0.4h, v1.4h, v2.4h, v3.4h}, [TMP4]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | v4.4h   | v5.4h
-     *   1 | v6.4h   | v7.4h
-     *   2 | v8.4h   | v9.4h
-     *   3 | v10.4h  | v11.4h
-     *   4 | -       | -
-     *   5 | v12.4h  | v13.4h
-     *   6 | v14.4h  | v15.4h
-     *   7 | v16.4h  | v17.4h
-     */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
-    ld1             {v8.4h, v9.4h, v10.4h, v11.4h}, [COEF_BLOCK], 32
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v12.4h, v13.4h, v14.4h, v15.4h}, [COEF_BLOCK], 32
-    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
-    /* dequantize */
-    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
-    mul             v4.4h, v4.4h, v18.4h
-    mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]              /* 128 bit q4 */
-    ld1             {v22.4h, v23.4h, v24.4h, v25.4h}, [DCT_TABLE], 32
-    mul             v6.4h, v6.4h, v20.4h
-    mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]              /* 128 bit q6 */
-    mul             v8.4h, v8.4h, v22.4h
-    mul             v9.4h, v9.4h, v23.4h
-    ins             v8.d[1], v9.d[0]              /* 128 bit q8 */
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v26.4h, v27.4h, v28.4h, v29.4h}, [DCT_TABLE], 32
-    mul             v10.4h, v10.4h, v24.4h
-    mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]            /* 128 bit q10 */
-    mul             v12.4h, v12.4h, v26.4h
-    mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]            /* 128 bit q12 */
-    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
-    mul             v14.4h, v14.4h, v28.4h
-    mul             v15.4h, v15.4h, v29.4h
-    ins             v14.d[1], v15.d[0]            /* 128 bit q14 */
-    mul             v16.4h, v16.4h, v30.4h
-    mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]            /* 128 bit q16 */
-
-    /* Pass 1 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v12.4h, v14.4h, v16.4h, 12, \
-                    v4.4h, v6.4h, v8.4h, v10.4h
-    transpose_4x4   v4, v6, v8, v10, v3
-    ins             v10.d[1], v11.d[0]
-    idct_helper     v5.4h, v7.4h, v9.4h, v11.4h, v13.4h, v15.4h, v17.4h, 12, \
-                    v5.4h, v7.4h, v9.4h, v11.4h
-    transpose_4x4   v5, v7, v9, v11, v3
-    ins             v10.d[1], v11.d[0]
-
-    /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v8.4h, v10.4h, v7.4h, v9.4h, v11.4h, 19, \
-                    v26.4h, v27.4h, v28.4h, v29.4h
-    transpose_4x4   v26, v27, v28, v29, v3
-
-    /* Range limit */
-    movi            v30.8h, #0x80
-    ins             v26.d[1], v27.d[0]
-    ins             v28.d[1], v29.d[0]
-    add             v26.8h, v26.8h, v30.8h
-    add             v28.8h, v28.8h, v30.8h
-    sqxtun          v26.8b, v26.8h
-    sqxtun          v27.8b, v28.8h
-
-    /* Store results to the output buffer */
-    ldp             TMP1, TMP2, [OUTPUT_BUF], 16
-    ldp             TMP3, TMP4, [OUTPUT_BUF]
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-    add             TMP3, TMP3, OUTPUT_COL
-    add             TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-    /* We can use much less instructions on little endian systems if the
-     * OS kernel is not configured to trap unaligned memory accesses
-     */
-    st1             {v26.s}[0], [TMP1], 4
-    st1             {v27.s}[0], [TMP3], 4
-    st1             {v26.s}[1], [TMP2], 4
-    st1             {v27.s}[1], [TMP4], 4
-#else
-    st1             {v26.b}[0], [TMP1], 1
-    st1             {v27.b}[0], [TMP3], 1
-    st1             {v26.b}[1], [TMP1], 1
-    st1             {v27.b}[1], [TMP3], 1
-    st1             {v26.b}[2], [TMP1], 1
-    st1             {v27.b}[2], [TMP3], 1
-    st1             {v26.b}[3], [TMP1], 1
-    st1             {v27.b}[3], [TMP3], 1
-
-    st1             {v26.b}[4], [TMP2], 1
-    st1             {v27.b}[4], [TMP4], 1
-    st1             {v26.b}[5], [TMP2], 1
-    st1             {v27.b}[5], [TMP4], 1
-    st1             {v26.b}[6], [TMP2], 1
-    st1             {v27.b}[6], [TMP4], 1
-    st1             {v26.b}[7], [TMP2], 1
-    st1             {v27.b}[7], [TMP4], 1
-#endif
-
-    /* vpop            {v8.4h - v15.4h}    (not available) */
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixels output from an 8x8 DCT block. It uses the same  calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- *       requires much less arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon optimized function is
- *       bit exact compatibility with jpeg-6b.
- */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    sshll           v15.4s, \x4, #15
-    smull           v26.4s, \x6, v14.h[3]
-    smlal           v26.4s, \x10, v14.h[2]
-    smlal           v26.4s, \x12, v14.h[1]
-    smlal           v26.4s, \x16, v14.h[0]
-
-    add             v20.4s, v15.4s, v26.4s
-    sub             v15.4s, v15.4s, v26.4s
-
-  .if \shift > 16
-    srshr           v20.4s, v20.4s, #\shift
-    srshr           v15.4s, v15.4s, #\shift
-    xtn             \y26, v20.4s
-    xtn             \y27, v15.4s
-  .else
-    rshrn           \y26, v20.4s, #\shift
-    rshrn           \y27, v15.4s, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
-    DCT_TABLE       .req x0
-    COEF_BLOCK      .req x1
-    OUTPUT_BUF      .req x2
-    OUTPUT_COL      .req x3
-    TMP1            .req x0
-    TMP2            .req x15
-
-    /* OUTPUT_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x3 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x3, w3
-
-    /* vpush           {v8.4h - v15.4h}    (not available) */
-    sub             sp, sp, 64
-    mov             x9, sp
-
-    /* Load constants */
-    get_symbol_loc  TMP2, Ljsimd_idct_2x2_neon_consts
-    st1             {v8.8b, v9.8b, v10.8b, v11.8b}, [x9], 32
-    st1             {v12.8b, v13.8b, v14.8b, v15.8b}, [x9], 32
-    ld1             {v14.4h}, [TMP2]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | v4.4h   | v5.4h
-     *   1 | v6.4h   | v7.4h
-     *   2 | -       | -
-     *   3 | v10.4h  | v11.4h
-     *   4 | -       | -
-     *   5 | v12.4h  | v13.4h
-     *   6 | -       | -
-     *   7 | v16.4h  | v17.4h
-     */
-    ld1             {v4.4h, v5.4h, v6.4h, v7.4h}, [COEF_BLOCK], 32
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v10.4h, v11.4h}, [COEF_BLOCK], 16
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v12.4h, v13.4h}, [COEF_BLOCK], 16
-    add             COEF_BLOCK, COEF_BLOCK, #16
-    ld1             {v16.4h, v17.4h}, [COEF_BLOCK], 16
-    /* Dequantize */
-    ld1             {v18.4h, v19.4h, v20.4h, v21.4h}, [DCT_TABLE], 32
-    mul             v4.4h, v4.4h, v18.4h
-    mul             v5.4h, v5.4h, v19.4h
-    ins             v4.d[1], v5.d[0]
-    mul             v6.4h, v6.4h, v20.4h
-    mul             v7.4h, v7.4h, v21.4h
-    ins             v6.d[1], v7.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v24.4h, v25.4h}, [DCT_TABLE], 16
-    mul             v10.4h, v10.4h, v24.4h
-    mul             v11.4h, v11.4h, v25.4h
-    ins             v10.d[1], v11.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v26.4h, v27.4h}, [DCT_TABLE], 16
-    mul             v12.4h, v12.4h, v26.4h
-    mul             v13.4h, v13.4h, v27.4h
-    ins             v12.d[1], v13.d[0]
-    add             DCT_TABLE, DCT_TABLE, #16
-    ld1             {v30.4h, v31.4h}, [DCT_TABLE], 16
-    mul             v16.4h, v16.4h, v30.4h
-    mul             v17.4h, v17.4h, v31.4h
-    ins             v16.d[1], v17.d[0]
-
-    /* Pass 1 */
-#if 0
-    idct_helper     v4.4h, v6.4h, v10.4h, v12.4h, v16.4h, 13, v4.4h, v6.4h
-    transpose_4x4   v4.4h, v6.4h, v8.4h, v10.4h
-    idct_helper     v5.4h, v7.4h, v11.4h, v13.4h, v17.4h, 13, v5.4h, v7.4h
-    transpose_4x4   v5.4h, v7.4h, v9.4h, v11.4h
-#else
-    smull           v26.4s, v6.4h, v14.h[3]
-    smlal           v26.4s, v10.4h, v14.h[2]
-    smlal           v26.4s, v12.4h, v14.h[1]
-    smlal           v26.4s, v16.4h, v14.h[0]
-    smull           v24.4s, v7.4h, v14.h[3]
-    smlal           v24.4s, v11.4h, v14.h[2]
-    smlal           v24.4s, v13.4h, v14.h[1]
-    smlal           v24.4s, v17.4h, v14.h[0]
-    sshll           v15.4s, v4.4h, #15
-    sshll           v30.4s, v5.4h, #15
-    add             v20.4s, v15.4s, v26.4s
-    sub             v15.4s, v15.4s, v26.4s
-    rshrn           v4.4h, v20.4s, #13
-    rshrn           v6.4h, v15.4s, #13
-    add             v20.4s, v30.4s, v24.4s
-    sub             v15.4s, v30.4s, v24.4s
-    rshrn           v5.4h, v20.4s, #13
-    rshrn           v7.4h, v15.4s, #13
-    ins             v4.d[1], v5.d[0]
-    ins             v6.d[1], v7.d[0]
-    transpose       v4, v6, v3, .16b, .8h
-    transpose       v6, v10, v3, .16b, .4s
-    ins             v11.d[0], v10.d[1]
-    ins             v7.d[0], v6.d[1]
-#endif
-
-    /* Pass 2 */
-    idct_helper     v4.4h, v6.4h, v10.4h, v7.4h, v11.4h, 20, v26.4h, v27.4h
-
-    /* Range limit */
-    movi            v30.8h, #0x80
-    ins             v26.d[1], v27.d[0]
-    add             v26.8h, v26.8h, v30.8h
-    sqxtun          v30.8b, v26.8h
-    ins             v26.d[0], v30.d[0]
-    sqxtun          v27.8b, v26.8h
-
-    /* Store results to the output buffer */
-    ldp             TMP1, TMP2, [OUTPUT_BUF]
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-
-    st1             {v26.b}[0], [TMP1], 1
-    st1             {v27.b}[4], [TMP1], 1
-    st1             {v26.b}[1], [TMP2], 1
-    st1             {v27.b}[5], [TMP2], 1
-
-    ld1             {v8.8b, v9.8b, v10.8b, v11.8b}, [sp], 32
-    ld1             {v12.8b, v13.8b, v14.8b, v15.8b}, [sp], 32
-    blr             x30
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-
-.purgem idct_helper
-
-
 /*****************************************************************************/
 
 /*
@@ -2278,82 +1525,6 @@ generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0, 0
 .purgem do_store
 
 
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- *       rid of VST1.16 instructions
- */
-
-asm_function jsimd_convsamp_neon
-    SAMPLE_DATA     .req x0
-    START_COL       .req x1
-    WORKSPACE       .req x2
-    TMP1            .req x9
-    TMP2            .req x10
-    TMP3            .req x11
-    TMP4            .req x12
-    TMP5            .req x13
-    TMP6            .req x14
-    TMP7            .req x15
-    TMP8            .req x4
-    TMPDUP          .req w3
-
-    /* START_COL is a JDIMENSION (unsigned int) argument, so the ABI doesn't
-       guarantee that the upper (unused) 32 bits of x1 are valid.  This
-       instruction ensures that those bits are set to zero. */
-    uxtw x1, w1
-
-    mov             TMPDUP, #128
-    ldp             TMP1, TMP2, [SAMPLE_DATA], 16
-    ldp             TMP3, TMP4, [SAMPLE_DATA], 16
-    dup             v0.8b, TMPDUP
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    ldp             TMP5, TMP6, [SAMPLE_DATA], 16
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    ldp             TMP7, TMP8, [SAMPLE_DATA], 16
-    add             TMP5, TMP5, START_COL
-    add             TMP6, TMP6, START_COL
-    ld1             {v16.8b}, [TMP1]
-    add             TMP7, TMP7, START_COL
-    add             TMP8, TMP8, START_COL
-    ld1             {v17.8b}, [TMP2]
-    usubl           v16.8h, v16.8b, v0.8b
-    ld1             {v18.8b}, [TMP3]
-    usubl           v17.8h, v17.8b, v0.8b
-    ld1             {v19.8b}, [TMP4]
-    usubl           v18.8h, v18.8b, v0.8b
-    ld1             {v20.8b}, [TMP5]
-    usubl           v19.8h, v19.8b, v0.8b
-    ld1             {v21.8b}, [TMP6]
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [WORKSPACE], 64
-    usubl           v20.8h, v20.8b, v0.8b
-    ld1             {v22.8b}, [TMP7]
-    usubl           v21.8h, v21.8b, v0.8b
-    ld1             {v23.8b}, [TMP8]
-    usubl           v22.8h, v22.8b, v0.8b
-    usubl           v23.8h, v23.8b, v0.8b
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [WORKSPACE], 64
-
-    br              x30
-
-    .unreq          SAMPLE_DATA
-    .unreq          START_COL
-    .unreq          WORKSPACE
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-    .unreq          TMP5
-    .unreq          TMP6
-    .unreq          TMP7
-    .unreq          TMP8
-    .unreq          TMPDUP
-
 /*****************************************************************************/
 
 /*
@@ -2652,360 +1823,6 @@ asm_function jsimd_fdct_islow_neon
 #undef XFIX_P_3_072
 
 
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- *       rid of a bunch of VLD1.16 instructions
- */
-
-#undef XFIX_0_541196100
-#define XFIX_0_382683433  v0.h[0]
-#define XFIX_0_541196100  v0.h[1]
-#define XFIX_0_707106781  v0.h[2]
-#define XFIX_1_306562965  v0.h[3]
-
-asm_function jsimd_fdct_ifast_neon
-
-    DATA            .req x0
-    TMP             .req x9
-
-    /* Load constants */
-    get_symbol_loc  TMP, Ljsimd_fdct_ifast_neon_consts
-    ld1             {v0.4h}, [TMP]
-
-    /* Load all DATA into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17    | v0.8h
-     *   1 | d18     | d19    | q9
-     *   2 | d20     | d21    | q10
-     *   3 | d22     | d23    | q11
-     *   4 | d24     | d25    | q12
-     *   5 | d26     | d27    | q13
-     *   6 | d28     | d29    | q14
-     *   7 | d30     | d31    | q15
-     */
-
-    ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-    mov             TMP, #2
-    sub             DATA, DATA, #64
-1:
-    /* Transpose */
-    transpose_8x8   v16, v17, v18, v19, v20, v21, v22, v23, v1, v2, v3, v4
-    subs            TMP, TMP, #1
-    /* 1-D FDCT */
-    add             v4.8h, v19.8h, v20.8h
-    sub             v20.8h, v19.8h, v20.8h
-    sub             v28.8h, v18.8h, v21.8h
-    add             v18.8h, v18.8h, v21.8h
-    sub             v29.8h, v17.8h, v22.8h
-    add             v17.8h, v17.8h, v22.8h
-    sub             v21.8h, v16.8h, v23.8h
-    add             v16.8h, v16.8h, v23.8h
-    sub             v6.8h, v17.8h, v18.8h
-    sub             v7.8h, v16.8h, v4.8h
-    add             v5.8h, v17.8h, v18.8h
-    add             v6.8h, v6.8h, v7.8h
-    add             v4.8h, v16.8h, v4.8h
-    sqdmulh         v6.8h, v6.8h, XFIX_0_707106781
-    add             v19.8h, v20.8h, v28.8h
-    add             v16.8h, v4.8h, v5.8h
-    sub             v20.8h, v4.8h, v5.8h
-    add             v5.8h, v28.8h, v29.8h
-    add             v29.8h, v29.8h, v21.8h
-    sqdmulh         v5.8h, v5.8h, XFIX_0_707106781
-    sub             v28.8h, v19.8h, v29.8h
-    add             v18.8h, v7.8h, v6.8h
-    sqdmulh         v28.8h, v28.8h, XFIX_0_382683433
-    sub             v22.8h, v7.8h, v6.8h
-    sqdmulh         v19.8h, v19.8h, XFIX_0_541196100
-    sqdmulh         v7.8h, v29.8h, XFIX_1_306562965
-    add             v6.8h, v21.8h, v5.8h
-    sub             v5.8h, v21.8h, v5.8h
-    add             v29.8h, v29.8h, v28.8h
-    add             v19.8h, v19.8h, v28.8h
-    add             v29.8h, v29.8h, v7.8h
-    add             v21.8h, v5.8h, v19.8h
-    sub             v19.8h, v5.8h, v19.8h
-    add             v17.8h, v6.8h, v29.8h
-    sub             v23.8h, v6.8h, v29.8h
-
-    b.ne            1b
-
-    /* store results */
-    st1             {v16.8h, v17.8h, v18.8h, v19.8h}, [DATA], 64
-    st1             {v20.8h, v21.8h, v22.8h, v23.8h}, [DATA]
-
-    br              x30
-
-    .unreq          DATA
-    .unreq          TMP
-#undef XFIX_0_382683433
-#undef XFIX_0_541196100
-#undef XFIX_0_707106781
-#undef XFIX_1_306562965
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- *                     DCTELEM *workspace);
- *
- */
-asm_function jsimd_quantize_neon
-
-    COEF_BLOCK      .req x0
-    DIVISORS        .req x1
-    WORKSPACE       .req x2
-
-    RECIPROCAL      .req DIVISORS
-    CORRECTION      .req x9
-    SHIFT           .req x10
-    LOOP_COUNT      .req x11
-
-    mov             LOOP_COUNT, #2
-    add             CORRECTION, DIVISORS, #(64 * 2)
-    add             SHIFT, DIVISORS, #(64 * 6)
-1:
-    subs            LOOP_COUNT, LOOP_COUNT, #1
-    ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [WORKSPACE], 64
-    ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [CORRECTION], 64
-    abs             v20.8h, v0.8h
-    abs             v21.8h, v1.8h
-    abs             v22.8h, v2.8h
-    abs             v23.8h, v3.8h
-    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [RECIPROCAL], 64
-    add             v20.8h, v20.8h, v4.8h  /* add correction */
-    add             v21.8h, v21.8h, v5.8h
-    add             v22.8h, v22.8h, v6.8h
-    add             v23.8h, v23.8h, v7.8h
-    umull           v4.4s, v20.4h, v28.4h  /* multiply by reciprocal */
-    umull2          v16.4s, v20.8h, v28.8h
-    umull           v5.4s, v21.4h, v29.4h
-    umull2          v17.4s, v21.8h, v29.8h
-    umull           v6.4s, v22.4h, v30.4h  /* multiply by reciprocal */
-    umull2          v18.4s, v22.8h, v30.8h
-    umull           v7.4s, v23.4h, v31.4h
-    umull2          v19.4s, v23.8h, v31.8h
-    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [SHIFT], 64
-    shrn            v4.4h, v4.4s, #16
-    shrn            v5.4h, v5.4s, #16
-    shrn            v6.4h, v6.4s, #16
-    shrn            v7.4h, v7.4s, #16
-    shrn2           v4.8h, v16.4s, #16
-    shrn2           v5.8h, v17.4s, #16
-    shrn2           v6.8h, v18.4s, #16
-    shrn2           v7.8h, v19.4s, #16
-    neg             v24.8h, v24.8h
-    neg             v25.8h, v25.8h
-    neg             v26.8h, v26.8h
-    neg             v27.8h, v27.8h
-    sshr            v0.8h, v0.8h, #15  /* extract sign */
-    sshr            v1.8h, v1.8h, #15
-    sshr            v2.8h, v2.8h, #15
-    sshr            v3.8h, v3.8h, #15
-    ushl            v4.8h, v4.8h, v24.8h  /* shift */
-    ushl            v5.8h, v5.8h, v25.8h
-    ushl            v6.8h, v6.8h, v26.8h
-    ushl            v7.8h, v7.8h, v27.8h
-
-    eor             v4.16b, v4.16b, v0.16b  /* restore sign */
-    eor             v5.16b, v5.16b, v1.16b
-    eor             v6.16b, v6.16b, v2.16b
-    eor             v7.16b, v7.16b, v3.16b
-    sub             v4.8h, v4.8h, v0.8h
-    sub             v5.8h, v5.8h, v1.8h
-    sub             v6.8h, v6.8h, v2.8h
-    sub             v7.8h, v7.8h, v3.8h
-    st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [COEF_BLOCK], 64
-
-    b.ne            1b
-
-    br              x30  /* return */
-
-    .unreq          COEF_BLOCK
-    .unreq          DIVISORS
-    .unreq          WORKSPACE
-    .unreq          RECIPROCAL
-    .unreq          CORRECTION
-    .unreq          SHIFT
-    .unreq          LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 1:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
- *                            JDIMENSION v_samp_factor,
- *                            JDIMENSION width_in_blocks,
- *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
-
-asm_function jsimd_h2v1_downsample_neon
-    IMAGE_WIDTH     .req x0
-    MAX_V_SAMP      .req x1
-    V_SAMP          .req x2
-    BLOCK_WIDTH     .req x3
-    INPUT_DATA      .req x4
-    OUTPUT_DATA     .req x5
-    OUTPTR          .req x9
-    INPTR           .req x10
-    TMP1            .req x11
-    TMP2            .req x12
-    TMP3            .req x13
-    TMPDUP          .req w15
-
-    mov             TMPDUP, #0x10000
-    lsl             TMP2, BLOCK_WIDTH, #4
-    sub             TMP2, TMP2, IMAGE_WIDTH
-    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
-    add             TMP3, TMP3, TMP2, lsl #4
-    dup             v16.4s, TMPDUP
-    ld1             {v18.16b}, [TMP3]
-
-1:  /* row loop */
-    ldr             INPTR, [INPUT_DATA], #8
-    ldr             OUTPTR, [OUTPUT_DATA], #8
-    subs            TMP1, BLOCK_WIDTH, #1
-    b.eq            3f
-2:  /* columns */
-    ld1             {v0.16b}, [INPTR], #16
-    mov             v4.16b, v16.16b
-    subs            TMP1, TMP1, #1
-    uadalp          v4.8h, v0.16b
-    shrn            v6.8b, v4.8h, #1
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            2b
-3:  /* last columns */
-    ld1             {v0.16b}, [INPTR]
-    mov             v4.16b, v16.16b
-    subs            V_SAMP, V_SAMP, #1
-    /* expand right */
-    tbl             v2.16b, {v0.16b}, v18.16b
-    uadalp          v4.8h, v2.16b
-    shrn            v6.8b, v4.8h, #1
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            1b
-
-    br              x30
-
-    .unreq          IMAGE_WIDTH
-    .unreq          MAX_V_SAMP
-    .unreq          V_SAMP
-    .unreq          BLOCK_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA
-    .unreq          OUTPTR
-    .unreq          INPTR
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMPDUP
-
-
-/*****************************************************************************/
-
-/*
- * Downsample pixel values of a single component.
- * This version handles the common case of 2:1 horizontal and 2:1 vertical,
- * without smoothing.
- *
- * GLOBAL(void)
- * jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
- *                            JDIMENSION v_samp_factor,
- *                            JDIMENSION width_in_blocks,
- *                            JSAMPARRAY input_data, JSAMPARRAY output_data);
- */
-
-.balign 16
-asm_function jsimd_h2v2_downsample_neon
-    IMAGE_WIDTH     .req x0
-    MAX_V_SAMP      .req x1
-    V_SAMP          .req x2
-    BLOCK_WIDTH     .req x3
-    INPUT_DATA      .req x4
-    OUTPUT_DATA     .req x5
-    OUTPTR          .req x9
-    INPTR0          .req x10
-    INPTR1          .req x14
-    TMP1            .req x11
-    TMP2            .req x12
-    TMP3            .req x13
-    TMPDUP          .req w15
-
-    mov             TMPDUP, #1
-    lsl             TMP2, BLOCK_WIDTH, #4
-    lsl             TMPDUP, TMPDUP, #17
-    sub             TMP2, TMP2, IMAGE_WIDTH
-    get_symbol_loc  TMP3, Ljsimd_h2_downsample_neon_consts
-    orr             TMPDUP, TMPDUP, #1
-    add             TMP3, TMP3, TMP2, lsl #4
-    dup             v16.4s, TMPDUP
-    ld1             {v18.16b}, [TMP3]
-
-1:  /* row loop */
-    ldr             INPTR0, [INPUT_DATA], #8
-    ldr             OUTPTR, [OUTPUT_DATA], #8
-    ldr             INPTR1, [INPUT_DATA], #8
-    subs            TMP1, BLOCK_WIDTH, #1
-    b.eq            3f
-2:  /* columns */
-    ld1             {v0.16b}, [INPTR0], #16
-    ld1             {v1.16b}, [INPTR1], #16
-    mov             v4.16b, v16.16b
-    subs            TMP1, TMP1, #1
-    uadalp          v4.8h, v0.16b
-    uadalp          v4.8h, v1.16b
-    shrn            v6.8b, v4.8h, #2
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            2b
-3:  /* last columns */
-    ld1             {v0.16b}, [INPTR0], #16
-    ld1             {v1.16b}, [INPTR1], #16
-    mov             v4.16b, v16.16b
-    subs            V_SAMP, V_SAMP, #1
-    /* expand right */
-    tbl             v2.16b, {v0.16b}, v18.16b
-    tbl             v3.16b, {v1.16b}, v18.16b
-    uadalp          v4.8h, v2.16b
-    uadalp          v4.8h, v3.16b
-    shrn            v6.8b, v4.8h, #2
-    st1             {v6.8b}, [OUTPTR], #8
-    b.ne            1b
-
-    br              x30
-
-    .unreq          IMAGE_WIDTH
-    .unreq          MAX_V_SAMP
-    .unreq          V_SAMP
-    .unreq          BLOCK_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA
-    .unreq          OUTPTR
-    .unreq          INPTR0
-    .unreq          INPTR1
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMPDUP
-
-
 /*****************************************************************************/
 
 /*
diff --git a/external/jpeg/simd/arm/align.h b/external/jpeg/simd/arm/align.h
new file mode 100644
index 000000000000..cff4241e8439
--- /dev/null
+++ b/external/jpeg/simd/arm/align.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* How to align a variable/struct: prefix the declaration with ALIGN(bytes) */
+#if defined(_MSC_VER)
+#define ALIGN(alignment)  __declspec(align(alignment))
+#elif defined(__clang__) || defined(__GNUC__)
+#define ALIGN(alignment)  __attribute__((aligned(alignment)))
+#else
+#error "Unknown compiler"
+#endif
diff --git a/external/jpeg/simd/arm/jccolor-neon.c b/external/jpeg/simd/arm/jccolor-neon.c
new file mode 100644
index 000000000000..9fcc62dd25ca
--- /dev/null
+++ b/external/jpeg/simd/arm/jccolor-neon.c
@@ -0,0 +1,160 @@
+/*
+ * jccolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> YCbCr conversion constants */
+
+#define F_0_298  19595  /* = 0.29899597 * 2^16 */
+#define F_0_587  38470  /* = 0.58700562 * 2^16 */
+#define F_0_113  7471   /* = 0.11399841 * 2^16 */
+#define F_0_168  11059  /* = 0.16874695 * 2^16 */
+#define F_0_331  21709  /* = 0.33125305 * 2^16 */
+#define F_0_500  32768  /* = 0.50000000 * 2^16 (exact) */
+#define F_0_418  27439  /* = 0.41868591 * 2^16 */
+#define F_0_081  5329   /* = 0.08131409 * 2^16 */
+
+ALIGN(16) static const uint16_t jsimd_rgb_ycc_neon_consts[] = {
+  F_0_298, F_0_587, F_0_113, F_0_168,
+  F_0_331, F_0_500, F_0_418, F_0_081
+};
+
+
+/* Include inline routines for colorspace extensions. */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extrgbx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extbgrx_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxbgr_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_ycc_convert_neon  jsimd_extxrgb_ycc_convert_neon
+#if defined(__aarch64__) || defined(_M_ARM64)
+#include "aarch64/jccolext-neon.c"
+#else
+#include "aarch32/jccolext-neon.c"
+#endif
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_ycc_convert_neon
diff --git a/external/jpeg/simd/arm/jcgray-neon.c b/external/jpeg/simd/arm/jcgray-neon.c
new file mode 100644
index 000000000000..71c7b2de2183
--- /dev/null
+++ b/external/jpeg/simd/arm/jcgray-neon.c
@@ -0,0 +1,120 @@
+/*
+ * jcgray-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* RGB -> Grayscale conversion constants */
+
+#define F_0_298  19595  /* = 0.29899597 * 2^16 */
+#define F_0_587  38470  /* = 0.58700562 * 2^16 */
+#define F_0_113  7471   /* = 0.11399841 * 2^16 */
+
+
+/* Include inline routines for colorspace extensions. */
+
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extrgbx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extbgrx_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxbgr_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_neon  jsimd_extxrgb_gray_convert_neon
+#include "jcgryext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_neon
diff --git a/external/jpeg/simd/arm/jcgryext-neon.c b/external/jpeg/simd/arm/jcgryext-neon.c
new file mode 100644
index 000000000000..416a7385df85
--- /dev/null
+++ b/external/jpeg/simd/arm/jcgryext-neon.c
@@ -0,0 +1,106 @@
+/*
+ * jcgryext-neon.c - grayscale colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-neon.c */
+
+
+/* RGB -> Grayscale conversion is defined by the following equation:
+ *    Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+ *
+ * Avoid floating point arithmetic by using shifted integer constants:
+ *    0.29899597 = 19595 * 2^-16
+ *    0.58700561 = 38470 * 2^-16
+ *    0.11399841 =  7471 * 2^-16
+ * These constants are defined in jcgray-neon.c
+ *
+ * This is the same computation as the RGB -> Y portion of RGB -> YCbCr.
+ */
+
+void jsimd_rgb_gray_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                 JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                 int num_rows)
+{
+  JSAMPROW inptr;               /* current input row (interleaved pixels) */
+  JSAMPROW outptr;              /* current output row (one Y byte per pixel) */
+  /* Allocate temporary buffer for final (image_width % 16) pixels in row. */
+  ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
+
+  while (--num_rows >= 0) {     /* one iteration per row */
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];  /* Y is written to component 0 */
+    output_row++;
+
+    int cols_remaining = image_width;
+    for (; cols_remaining > 0; cols_remaining -= 16) {  /* 16 pixels/pass */
+
+      /* To prevent buffer overread by the vector load instructions, the last
+       * (image_width % 16) columns are first copied to tmp_buf (16 pixels).
+       * Lanes beyond cols_remaining hold unspecified bytes but are processed.
+       */
+      if (cols_remaining < 16) {
+        memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
+        inptr = tmp_buf;
+      }
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t input_pixels = vld4q_u8(inptr);  /* de-interleaving load */
+#else
+      uint8x16x3_t input_pixels = vld3q_u8(inptr);  /* de-interleaving load */
+#endif
+      uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
+      uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
+      uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
+      uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
+
+      /* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B, scaled by 2^16 */
+      uint32x4_t y_ll = vmull_n_u16(vget_low_u16(r_l), F_0_298);
+      uint32x4_t y_lh = vmull_n_u16(vget_high_u16(r_l), F_0_298);
+      uint32x4_t y_hl = vmull_n_u16(vget_low_u16(r_h), F_0_298);
+      uint32x4_t y_hh = vmull_n_u16(vget_high_u16(r_h), F_0_298);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(g_l), F_0_587);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(g_l), F_0_587);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(g_h), F_0_587);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(g_h), F_0_587);
+      y_ll = vmlal_n_u16(y_ll, vget_low_u16(b_l), F_0_113);
+      y_lh = vmlal_n_u16(y_lh, vget_high_u16(b_l), F_0_113);
+      y_hl = vmlal_n_u16(y_hl, vget_low_u16(b_h), F_0_113);
+      y_hh = vmlal_n_u16(y_hh, vget_high_u16(b_h), F_0_113);
+
+      /* Descale Y values (rounding right shift by 16) and narrow to 16-bit. */
+      uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
+                                    vrshrn_n_u32(y_lh, 16));
+      uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
+                                    vrshrn_n_u32(y_hh, 16));
+
+      /* Narrow Y values to 8-bit and store all 16, even for a partial final
+       * group.  Overwrite is permitted up to the next multiple of ALIGN_SIZE.
+       */
+      vst1q_u8(outptr, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
+
+      /* Advance both pointers to the next group of 16 pixels. */
+      inptr += (16 * RGB_PIXELSIZE);
+      outptr += 16;
+    }
+  }
+}
diff --git a/external/jpeg/simd/arm/jchuff.h b/external/jpeg/simd/arm/jchuff.h
new file mode 100644
index 000000000000..d4edd5ebc7b3
--- /dev/null
+++ b/external/jpeg/simd/arm/jchuff.h
@@ -0,0 +1,117 @@
+/*
+ * jchuff.h
+ *
+ * This file was part of the Independent JPEG Group's software:
+ * Copyright (C) 1991-1997, Thomas G. Lane.
+ * libjpeg-turbo Modifications:
+ * Copyright (C) 2009, 2018, D. R. Commander.
+ * Copyright (C) 2018, Matthias Räncker.
+ * Copyright (C) 2020-2021, Arm Limited.
+ * For conditions of distribution and use, see the accompanying README.ijg
+ * file.
+ */
+
+/* Expanded entropy encoder object for Huffman encoding.
+ *
+ * The savable_state subrecord contains fields that change within an MCU,
+ * but must not be updated permanently until we complete the MCU.
+ */
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define BIT_BUF_SIZE  64        /* bits added back to free_bits per FLUSH() */
+#else
+#define BIT_BUF_SIZE  32        /* bits added back to free_bits per FLUSH() */
+#endif
+
+typedef struct {
+  size_t put_buffer;                    /* bits accumulated but not yet output */
+  int free_bits;                        /* # of unused bits left in put_buffer */
+  int last_dc_val[MAX_COMPS_IN_SCAN];   /* last DC coef, one per component */
+} savable_state;
+
+typedef struct {
+  JOCTET *next_output_byte;     /* => next byte to write in buffer */
+  size_t free_in_buffer;        /* # of byte spaces remaining in buffer */
+  savable_state cur;            /* Current bit buffer & DC state */
+  j_compress_ptr cinfo;         /* dump_buffer needs access to this */
+  int simd;                     /* NOTE(review): presumably a SIMD-in-use flag */
+} working_state;
+
+/* Outputting bits to the file */
+
+/* Output byte b and, speculatively, an additional 0 byte. 0xFF must be encoded
+ * as 0xFF 0x00, so the output buffer pointer is advanced by 2 if the byte is
+ * 0xFF.  Otherwise, the output buffer pointer is advanced by 1, and the
+ * speculative 0 byte will be overwritten by the next byte.
+ */
+#define EMIT_BYTE(b) { \
+  buffer[0] = (JOCTET)(b); \
+  buffer[1] = 0;                        /* stuffing byte, kept only after 0xFF */ \
+  buffer -= -2 + ((JOCTET)(b) < 0xFF);  /* advance by 1, or by 2 if b is 0xFF */ \
+}
+
+/* Output the entire bit buffer.  The mask test is nonzero whenever some byte of
+ * put_buffer is 0xFF; in that case emit byte by byte, most significant first,
+ * via EMIT_BYTE() so 0xFF becomes 0xFF 0x00.  Otherwise store it byte-swapped.
+ */
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#define FLUSH() { \
+  if (put_buffer & 0x8080808080808080 & ~(put_buffer + 0x0101010101010101)) { \
+    EMIT_BYTE(put_buffer >> 56) \
+    EMIT_BYTE(put_buffer >> 48) \
+    EMIT_BYTE(put_buffer >> 40) \
+    EMIT_BYTE(put_buffer >> 32) \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    *((uint64_t *)buffer) = BUILTIN_BSWAP64(put_buffer); \
+    buffer += 8; \
+  } \
+}
+
+#else
+
+#define FLUSH() { \
+  if (put_buffer & 0x80808080 & ~(put_buffer + 0x01010101)) { \
+    EMIT_BYTE(put_buffer >> 24) \
+    EMIT_BYTE(put_buffer >> 16) \
+    EMIT_BYTE(put_buffer >>  8) \
+    EMIT_BYTE(put_buffer      ) \
+  } else { \
+    *((uint32_t *)buffer) = BUILTIN_BSWAP32(put_buffer); \
+    buffer += 4; \
+  } \
+}
+
+#endif
+
+/* Fill the bit buffer to capacity with the leading bits from code, then output
+ * the bit buffer and put the remaining bits from code into the bit buffer.
+ */
+#define PUT_AND_FLUSH(code, size) { \
+  put_buffer = (put_buffer << (size + free_bits)) | (code >> -free_bits); \
+  FLUSH() \
+  free_bits += BIT_BUF_SIZE;  /* was negative: -free_bits bits did not fit */ \
+  put_buffer = code;          /* its low bits are the part still pending */ \
+}
+
+/* Insert code into the bit buffer and output the bit buffer if needed.
+ * NOTE: We can't flush with free_bits == 0, since the left shift in
+ * PUT_AND_FLUSH() would have undefined behavior.
+ */
+#define PUT_BITS(code, size) { \
+  free_bits -= size;            /* claim room for the new bits up front */ \
+  if (free_bits < 0)            /* code does not fit in the space left */ \
+    PUT_AND_FLUSH(code, size) \
+  else \
+    put_buffer = (put_buffer << size) | code; \
+}
+
+#define PUT_CODE(code, size, diff) { \
+  diff |= code << nbits;        /* put code above the low nbits bits of diff */ \
+  nbits += size;                /* nbits comes from the expansion site */ \
+  PUT_BITS(diff, nbits)         /* emit code and diff bits in one call */ \
+}
diff --git a/external/jpeg/simd/arm/jcphuff-neon.c b/external/jpeg/simd/arm/jcphuff-neon.c
new file mode 100644
index 000000000000..86a263faf756
--- /dev/null
+++ b/external/jpeg/simd/arm/jcphuff-neon.c
@@ -0,0 +1,591 @@
+/*
+ * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
+ *
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* Data preparation for encode_mcu_AC_first(): for the first Sl coefficients
+ * (picked through jpeg_natural_order_start) store |coef| >> Al in values and the
+ * sign-XORed form DCTSIZE2 entries later, zero the rest, and fill zerobits.
+ * The scalar equivalent, encode_mcu_AC_first_prepare(), is in jcphuff.c.
+ */
+
+void jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits)
+{
+  JCOEF *values_ptr = values;
+  JCOEF *diff_values_ptr = values + DCTSIZE2;
+
+  /* Count of 8-entry rows not yet written; whatever remains is zeroed below. */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Isolate sign of coefficients (all-ones lane when negative, else 0). */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Take |coef|, then shift right by Al (vshlq with a negative count). */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* diff = value ^ sign mask (bitwise complement when coef was negative). */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Handle the last Sl % 16 coefficients; lanes that are not loaded stay 0. */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {  /* cases fall through to fill the lower lanes */
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients (all-ones lane when negative, else 0). */
+    int16x8_t sign_coefs1 = vshrq_n_s16(coefs1, 15);
+    int16x8_t sign_coefs2 = vshrq_n_s16(coefs2, 15);
+    /* Take |coef|, then shift right by Al (vshlq with a negative count). */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+
+    /* diff = value ^ sign mask (bitwise complement when coef was negative). */
+    int16x8_t diff1 = veorq_s16(coefs1, sign_coefs1);
+    int16x8_t diff2 = veorq_s16(coefs2, sign_coefs2);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs1);
+    vst1q_s16(values_ptr + DCTSIZE, coefs2);
+    vst1q_s16(diff_values_ptr, diff1);
+    vst1q_s16(diff_values_ptr + DCTSIZE, diff2);
+    values_ptr += 16;
+    diff_values_ptr += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {  /* cases fall through to fill the lower lanes */
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+    default:
+      break;
+    }
+
+    /* Isolate sign of coefficients (all-ones lane when negative, else 0). */
+    int16x8_t sign_coefs = vshrq_n_s16(coefs, 15);
+    /* Take |coef|, then shift right by Al (vshlq with a negative count). */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+
+    /* diff = value ^ sign mask (bitwise complement when coef was negative). */
+    int16x8_t diff = veorq_s16(coefs, sign_coefs);
+
+    /* Store transformed coefficients and diff values. */
+    vst1q_s16(values_ptr, coefs);
+    vst1q_s16(diff_values_ptr, diff);
+    values_ptr += 8;
+    diff_values_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero remaining memory in the values and diff_values blocks. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(values_ptr, vdupq_n_s16(0));
+    vst1q_s16(diff_values_ptr, vdupq_n_s16(0));
+    values_ptr += 8;
+    diff_values_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap.  Each "== 0" lane is masked to a single bit and
+   * pairwise adds collapse every row into one byte; the result is inverted on
+   * store, so a set bit means that the corresponding coefficient != 0. */
+  int16x8_t row0 = vld1q_s16(values + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(values + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(values + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(values + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(values + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(values + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(values + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(values + 7 * DCTSIZE);
+
+  uint8x8_t row0_eq0 = vmovn_u16(vceqq_s16(row0, vdupq_n_s16(0)));
+  uint8x8_t row1_eq0 = vmovn_u16(vceqq_s16(row1, vdupq_n_s16(0)));
+  uint8x8_t row2_eq0 = vmovn_u16(vceqq_s16(row2, vdupq_n_s16(0)));
+  uint8x8_t row3_eq0 = vmovn_u16(vceqq_s16(row3, vdupq_n_s16(0)));
+  uint8x8_t row4_eq0 = vmovn_u16(vceqq_s16(row4, vdupq_n_s16(0)));
+  uint8x8_t row5_eq0 = vmovn_u16(vceqq_s16(row5, vdupq_n_s16(0)));
+  uint8x8_t row6_eq0 = vmovn_u16(vceqq_s16(row6, vdupq_n_s16(0)));
+  uint8x8_t row7_eq0 = vmovn_u16(vceqq_s16(row7, vdupq_n_s16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
+  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
+  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
+  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
+  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
+  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
+  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
+  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap (one 64-bit word). */
+  *zerobits = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap (two 32-bit words, lane 0 first). */
+  zerobits[0] = ~bitmap0;
+  zerobits[1] = ~bitmap1;
+#endif
+}
+
+
+/* Data preparation for encode_mcu_AC_refine(): store |coef| >> Al for the first
+ * Sl coefficients in absvalues, write the nonzero bitmap and then the inverted
+ * sign bitmap to bits, and return the index of the last value equal to 1 (or 0).
+ * The scalar equivalent, encode_mcu_AC_refine_prepare(), is in jcphuff.c.
+ */
+
+int jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits)
+{
+  /* Temporary storage buffers for data used to compute the signbits bitmap and
+   * the end-of-block (EOB) position
+   */
+  uint8_t coef_sign_bits[64];
+  uint8_t coef_eq1_bits[64];
+
+  JCOEF *absvalues_ptr = absvalues;
+  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
+  uint8_t *eq1_bits_ptr = coef_eq1_bits;
+
+  /* Count of 8-entry rows not yet written; whatever remains is zeroed below. */
+  int i, rows_to_zero = 8;
+
+  for (i = 0; i < Sl / 16; i++) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);
+
+    /* Compute and store data for signbits bitmap (0xFF byte when negative). */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Take |coef|, then shift right by Al (vshlq with a negative count). */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+  }
+
+  /* Handle the last Sl % 16 coefficients; lanes that are not loaded stay 0. */
+  int remaining_coefs = Sl % 16;
+  if (remaining_coefs > 8) {
+    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
+    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
+    int16x8_t coefs2 = vdupq_n_s16(0);
+    switch (remaining_coefs) {  /* cases fall through to fill the lower lanes */
+    case 15:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
+    case 14:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
+    case 13:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
+    case 12:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
+    case 11:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
+    case 10:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
+    case 9:
+      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap (0xFF byte when negative). */
+    uint8x8_t sign_coefs1 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
+    uint8x8_t sign_coefs2 =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
+    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);
+
+    /* Take |coef|, then shift right by Al (vshlq with a negative count). */
+    int16x8_t abs_coefs1 = vabsq_s16(coefs1);
+    int16x8_t abs_coefs2 = vabsq_s16(coefs2);
+    coefs1 = vshlq_s16(abs_coefs1, vdupq_n_s16(-Al));
+    coefs2 = vshlq_s16(abs_coefs2, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs1);
+    vst1q_s16(absvalues_ptr + DCTSIZE, coefs2);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_s16(coefs1, vdupq_n_s16(1)));
+    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_s16(coefs2, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq11);
+    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);
+
+    absvalues_ptr += 16;
+    coef_sign_bits_ptr += 16;
+    eq1_bits_ptr += 16;
+    jpeg_natural_order_start += 16;
+    rows_to_zero -= 2;
+
+  } else if (remaining_coefs > 0) {
+    int16x8_t coefs = vdupq_n_s16(0);
+
+    switch (remaining_coefs) {  /* cases fall through to fill the lower lanes */
+    case 8:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
+    case 7:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
+    case 6:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
+    case 5:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
+    case 4:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
+    case 3:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
+    case 2:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
+    case 1:
+      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
+    default:
+      break;
+    }
+
+    /* Compute and store data for signbits bitmap (0xFF byte when negative). */
+    uint8x8_t sign_coefs =
+      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
+    vst1_u8(coef_sign_bits_ptr, sign_coefs);
+
+    /* Take |coef|, then shift right by Al (vshlq with a negative count). */
+    int16x8_t abs_coefs = vabsq_s16(coefs);
+    coefs = vshlq_s16(abs_coefs, vdupq_n_s16(-Al));
+    vst1q_s16(absvalues_ptr, coefs);
+
+    /* Test whether transformed coefficient values == 1 (used to find EOB
+     * position.)
+     */
+    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_s16(coefs, vdupq_n_s16(1)));
+    vst1_u8(eq1_bits_ptr, coefs_eq1);
+
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+    rows_to_zero--;
+  }
+
+  /* Zero the unwritten rows of absvalues and of both temporary buffers. */
+  for (i = 0; i < rows_to_zero; i++) {
+    vst1q_s16(absvalues_ptr, vdupq_n_s16(0));
+    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
+    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
+    absvalues_ptr += 8;
+    coef_sign_bits_ptr += 8;
+    eq1_bits_ptr += 8;
+  }
+
+  /* Construct zerobits bitmap (inverted on store: set bit means value != 0). */
+  int16x8_t abs_row0 = vld1q_s16(absvalues + 0 * DCTSIZE);
+  int16x8_t abs_row1 = vld1q_s16(absvalues + 1 * DCTSIZE);
+  int16x8_t abs_row2 = vld1q_s16(absvalues + 2 * DCTSIZE);
+  int16x8_t abs_row3 = vld1q_s16(absvalues + 3 * DCTSIZE);
+  int16x8_t abs_row4 = vld1q_s16(absvalues + 4 * DCTSIZE);
+  int16x8_t abs_row5 = vld1q_s16(absvalues + 5 * DCTSIZE);
+  int16x8_t abs_row6 = vld1q_s16(absvalues + 6 * DCTSIZE);
+  int16x8_t abs_row7 = vld1q_s16(absvalues + 7 * DCTSIZE);
+
+  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_s16(abs_row0, vdupq_n_s16(0)));
+  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_s16(abs_row1, vdupq_n_s16(0)));
+  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_s16(abs_row2, vdupq_n_s16(0)));
+  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_s16(abs_row3, vdupq_n_s16(0)));
+  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_s16(abs_row4, vdupq_n_s16(0)));
+  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_s16(abs_row5, vdupq_n_s16(0)));
+  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_s16(abs_row6, vdupq_n_s16(0)));
+  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_s16(abs_row7, vdupq_n_s16(0)));
+
+  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
+  const uint8x8_t bitmap_mask =
+    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));
+
+  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
+  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
+  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
+  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
+  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
+  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
+  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
+  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);
+
+  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
+  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
+  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
+  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
+  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store zerobits bitmap. */
+  bits[0] = ~bitmap0;
+  bits[1] = ~bitmap1;
+#endif
+
+  /* Construct signbits bitmap from the per-coefficient sign bytes. */
+  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
+  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
+  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
+  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
+  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
+  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
+  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
+  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);
+
+  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
+  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
+  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
+  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
+  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
+  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
+  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
+  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
+  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
+  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
+  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+  /* Store signbits bitmap, inverted: a clear bit marks a negative coef. */
+  bits[1] = ~bitmap;
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+  /* Store signbits bitmap, inverted: a clear bit marks a negative coef. */
+  bits[2] = ~bitmap0;
+  bits[3] = ~bitmap1;
+#endif
+
+  /* Construct bitmap to find EOB position (the index of the last coefficient
+   * equal to 1.)
+   */
+  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
+  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
+  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
+  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
+  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
+  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
+  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
+  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);
+
+  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
+  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
+  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
+  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
+  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
+  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
+  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
+  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);
+
+  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
+  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
+  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
+  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
+  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
+  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
+  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+  /* Move bitmap to a 64-bit scalar register. */
+  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
+
+  /* Return EOB position, i.e. the index of the highest set bit. */
+  if (bitmap == 0) {
+    /* EOB position is defined to be 0 if all coefficients != 1. */
+    return 0;
+  } else {
+    return 63 - BUILTIN_CLZLL(bitmap);
+  }
+#else
+  /* Move bitmap to two 32-bit scalar registers. */
+  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
+  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
+
+  /* Return EOB position; bitmap1 covers entries 32-63, so it is tested first. */
+  if (bitmap0 == 0 && bitmap1 == 0) {
+    return 0;
+  } else if (bitmap1 != 0) {
+    return 63 - BUILTIN_CLZ(bitmap1);
+  } else {
+    return 31 - BUILTIN_CLZ(bitmap0);
+  }
+#endif
+}
diff --git a/external/jpeg/simd/arm/jcsample-neon.c b/external/jpeg/simd/arm/jcsample-neon.c
new file mode 100644
index 000000000000..8a3e237838e9
--- /dev/null
+++ b/external/jpeg/simd/arm/jcsample-neon.c
@@ -0,0 +1,192 @@
+/*
+ * jcsample-neon.c - downsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+/* Lookup index masks: "Pad N" copies pixel 15-N into the last N lanes. */
+ALIGN(16) static const uint8_t jsimd_h2_downsample_consts[] = {
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 0 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 1 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0E,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 2 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0D, 0x0D,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 3 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0C, 0x0C, 0x0C,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 4 */
+  0x08, 0x09, 0x0A, 0x0B, 0x0B, 0x0B, 0x0B, 0x0B,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 5 */
+  0x08, 0x09, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A, 0x0A,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 6 */
+  0x08, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 7 */
+  0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,   /* Pad 8 */
+  0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06,   /* Pad 9 */
+  0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x05, 0x05,   /* Pad 10 */
+  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+  0x00, 0x01, 0x02, 0x03, 0x04, 0x04, 0x04, 0x04,   /* Pad 11 */
+  0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+  0x00, 0x01, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03,   /* Pad 12 */
+  0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
+  0x00, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,   /* Pad 13 */
+  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
+  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,   /* Pad 14 */
+  0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,   /* Pad 15 */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+
+/* Downsample pixel values of a single component (2:1 horizontal, 1:1 vertical,
+ * no smoothing).  Each output sample is (a + b + bias) >> 1 for an adjacent
+ * input pair; the final block of a row is padded by repeating its last pixel.
+ */
+
+void jsimd_h2v1_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 0, 1, 0, 1, 0, 1, 0, 1 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00010000));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr = input_data[outrow];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels = vld1q_u8(inptr + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+      /* Divide total by 2 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load the 16 input pixels that feed the last output DCT block. */
+    uint8x16_t pixels = vld1q_u8(inptr + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels = vqtbl1q_u8(pixels, expand_mask);
+#else
+    uint8x8x2_t table = { { vget_low_u8(pixels), vget_high_u8(pixels) } };
+    pixels = vcombine_u8(vtbl2_u8(table, vget_low_u8(expand_mask)),
+                         vtbl2_u8(table, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels);
+    /* Divide total by 2, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 1);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
+
+
+/* Downsample pixel values of a single component (2:1 horizontal, 2:1 vertical,
+ * no smoothing).  Output row r holds the biased average of each 2x2 pixel
+ * group taken from input rows 2 * r and 2 * r + 1.
+ */
+
+void jsimd_h2v2_downsample_neon(JDIMENSION image_width, int max_v_samp_factor,
+                                JDIMENSION v_samp_factor,
+                                JDIMENSION width_in_blocks,
+                                JSAMPARRAY input_data, JSAMPARRAY output_data)
+{
+  JSAMPROW inptr0, inptr1, outptr;
+  /* Load expansion mask to pad remaining elements of last DCT block. */
+  const int mask_offset = 16 * ((width_in_blocks * 2 * DCTSIZE) - image_width);
+  const uint8x16_t expand_mask =
+    vld1q_u8(&jsimd_h2_downsample_consts[mask_offset]);
+  /* Load bias pattern (alternating every pixel.) */
+  /* { 1, 2, 1, 2, 1, 2, 1, 2 } */
+  const uint16x8_t bias = vreinterpretq_u16_u32(vdupq_n_u32(0x00020001));
+  unsigned i, outrow;
+
+  for (outrow = 0; outrow < v_samp_factor; outrow++) {
+    outptr = output_data[outrow];
+    inptr0 = input_data[2 * outrow];      /* two input rows per output row */
+    inptr1 = input_data[2 * outrow + 1];
+
+    /* Downsample all but the last DCT block of pixels. */
+    for (i = 0; i < width_in_blocks - 1; i++) {
+      uint8x16_t pixels_r0 = vld1q_u8(inptr0 + i * 2 * DCTSIZE);
+      uint8x16_t pixels_r1 = vld1q_u8(inptr1 + i * 2 * DCTSIZE);
+      /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+      uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+      /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate.
+       */
+      samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+      /* Divide total by 4 and narrow to 8-bit. */
+      uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+      /* Store samples to memory. */
+      vst1_u8(outptr + i * DCTSIZE, samples_u8);
+    }
+
+    /* Load the 16 pixels per row that feed the last output DCT block. */
+    uint8x16_t pixels_r0 =
+      vld1q_u8(inptr0 + (width_in_blocks - 1) * 2 * DCTSIZE);
+    uint8x16_t pixels_r1 =
+      vld1q_u8(inptr1 + (width_in_blocks - 1) * 2 * DCTSIZE);
+#if defined(__aarch64__) || defined(_M_ARM64)
+    /* Pad the empty elements with the value of the last pixel. */
+    pixels_r0 = vqtbl1q_u8(pixels_r0, expand_mask);
+    pixels_r1 = vqtbl1q_u8(pixels_r1, expand_mask);
+#else
+    uint8x8x2_t table_r0 =
+      { { vget_low_u8(pixels_r0), vget_high_u8(pixels_r0) } };
+    uint8x8x2_t table_r1 =
+      { { vget_low_u8(pixels_r1), vget_high_u8(pixels_r1) } };
+    pixels_r0 = vcombine_u8(vtbl2_u8(table_r0, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r0, vget_high_u8(expand_mask)));
+    pixels_r1 = vcombine_u8(vtbl2_u8(table_r1, vget_low_u8(expand_mask)),
+                            vtbl2_u8(table_r1, vget_high_u8(expand_mask)));
+#endif
+    /* Add adjacent pixel values in row 0, widen to 16-bit, and add bias. */
+    uint16x8_t samples_u16 = vpadalq_u8(bias, pixels_r0);
+    /* Add adjacent pixel values in row 1, widen to 16-bit, and accumulate. */
+    samples_u16 = vpadalq_u8(samples_u16, pixels_r1);
+    /* Divide total by 4, narrow to 8-bit, and store. */
+    uint8x8_t samples_u8 = vshrn_n_u16(samples_u16, 2);
+    vst1_u8(outptr + (width_in_blocks - 1) * DCTSIZE, samples_u8);
+  }
+}
diff --git a/external/jpeg/simd/arm/jdcolext-neon.c b/external/jpeg/simd/arm/jdcolext-neon.c
new file mode 100644
index 000000000000..ae440f45ac5e
--- /dev/null
+++ b/external/jpeg/simd/arm/jdcolext-neon.c
@@ -0,0 +1,353 @@
+/*
+ * jdcolext-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-neon.c. */
+
+
+/* YCbCr -> RGB conversion is defined by the following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdcolor-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for YCbCr -> RGB conversion routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
+                                JDIMENSION input_row, JSAMPARRAY output_buf,
+                                int num_rows)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+    int cols_remaining = output_width;
+    for (; cols_remaining >= 16; cols_remaining -= 16) {
+      uint8x16_t y  = vld1q_u8(inptr0);
+      uint8x16_t cb = vld1q_u8(inptr1);
+      uint8x16_t cr = vld1q_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cr)));
+      int16x8_t cr_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cr)));
+      int16x8_t cb_128_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_low_u8(cb)));
+      int16x8_t cb_128_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
+                                       vget_high_u8(cb)));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
+      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
+                                            consts, 0);
+      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
+      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
+                                            consts, 0);
+      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
+                                  consts, 1);
+      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
+                                  consts, 1);
+      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
+                                  consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
+                                         vrshrn_n_s32(g_sub_y_lh, 15));
+      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
+                                         vrshrn_n_s32(g_sub_y_hh, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
+                                               consts, 2);
+      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
+                                               consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
+                                               consts, 3);
+      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
+                                               consts, 3);
+      /* Add Y. */
+      int16x8_t r_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t r_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t b_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t b_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
+                                       vget_high_u8(y)));
+      int16x8_t g_l =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
+                                       vget_low_u8(y)));
+      int16x8_t g_h =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
+                                       vget_high_u8(y)));
+
+#if RGB_PIXELSIZE == 4
+      uint8x16x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4q_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x16x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
+      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
+      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
+      /* Store RGB pixel data to memory. */
+      vst3q_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
+      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
+      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
+      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
+      /* Store RGB565 pixel data to memory (two vectors of 8 pixels). */
+      vst1q_u16((uint16_t *)outptr, rgb565_l);
+      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 16;
+      inptr1 += 16;
+      inptr2 += 16;
+      outptr += (RGB_PIXELSIZE * 16);
+    }
+
+    if (cols_remaining >= 8) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store RGBA pixel data to memory. */
+      vst4_u8(outptr, rgba);
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store RGB pixel data to memory. */
+      vst3_u8(outptr, rgb);
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store RGB565 pixel data to memory. */
+      vst1q_u16((uint16_t *)outptr, rgb565);
+#endif
+
+      /* Increment pointers. */
+      inptr0 += 8;
+      inptr1 += 8;
+      inptr2 += 8;
+      outptr += (RGB_PIXELSIZE * 8);
+      cols_remaining -= 8;
+    }
+
+    /* Handle the tail elements (at most 7 columns remain at this point). */
+    if (cols_remaining > 0) {
+      uint8x8_t y  = vld1_u8(inptr0);
+      uint8x8_t cb = vld1_u8(inptr1);
+      uint8x8_t cr = vld1_u8(inptr2);
+      /* Subtract 128 from Cb and Cr. */
+      int16x8_t cr_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+      int16x8_t cb_128 =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                       vrshrn_n_s32(g_sub_y_h, 15));
+      /* Compute R-Y: 1.40200 * (Cr - 128) */
+      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
+                                             consts, 2);
+      /* Compute B-Y: 1.77200 * (Cb - 128) */
+      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
+                                             consts, 3);
+      /* Add Y. */
+      int16x8_t r =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
+      int16x8_t b =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
+      int16x8_t g =
+        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));
+
+#if RGB_PIXELSIZE == 4
+      uint8x8x4_t rgba;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgba.val[RGB_RED] = vqmovun_s16(r);
+      rgba.val[RGB_GREEN] = vqmovun_s16(g);
+      rgba.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Set alpha channel to opaque (0xFF). */
+      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+      /* Store valid RGBA pixels lane by lane (cases fall through). */
+      switch (cols_remaining) {
+      case 7:
+        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
+      case 6:
+        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
+      case 5:
+        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
+      case 4:
+        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
+      case 3:
+        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
+      case 2:
+        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
+      case 1:
+        vst4_lane_u8(outptr, rgba, 0);
+      default:
+        break;
+      }
+#elif RGB_PIXELSIZE == 3
+      uint8x8x3_t rgb;
+      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
+      rgb.val[RGB_RED] = vqmovun_s16(r);
+      rgb.val[RGB_GREEN] = vqmovun_s16(g);
+      rgb.val[RGB_BLUE] = vqmovun_s16(b);
+      /* Store valid RGB pixels lane by lane (cases fall through). */
+      switch (cols_remaining) {
+      case 7:
+        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
+      case 6:
+        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
+      case 5:
+        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
+      case 4:
+        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
+      case 3:
+        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
+      case 2:
+        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
+      case 1:
+        vst3_lane_u8(outptr, rgb, 0);
+      default:
+        break;
+      }
+#else
+      /* Pack R, G, and B values in ratio 5:6:5. */
+      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
+      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
+      /* Store valid RGB565 pixels lane by lane (cases fall through). */
+      switch (cols_remaining) {
+      case 7:
+        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
+      case 6:
+        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
+      case 5:
+        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
+      case 4:
+        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
+      case 3:
+        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
+      case 2:
+        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
+      case 1:
+        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
+      default:
+        break;
+      }
+#endif
+    }
+  }
+}
diff --git a/external/jpeg/simd/arm/jdcolor-neon.c b/external/jpeg/simd/arm/jdcolor-neon.c
new file mode 100644
index 000000000000..28dbc57243ce
--- /dev/null
+++ b/external/jpeg/simd/arm/jdcolor-neon.c
@@ -0,0 +1,141 @@
+/*
+ * jdcolor-neon.c - colorspace conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants (scaled integers; scale given per line) */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772   /* read as lanes 0..3 of one vector */
+};
+
+
+/* Instantiate jdcolext-neon.c once per output colorspace extension. */
+
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extrgbx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extbgrx_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxbgr_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_extxrgb_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
+
+/* YCbCr -> RGB565 conversion (selects the 5:6:5 branch via RGB_PIXELSIZE) */
+
+#define RGB_PIXELSIZE  2
+#define jsimd_ycc_rgb_convert_neon  jsimd_ycc_rgb565_convert_neon
+#include "jdcolext-neon.c"
+#undef RGB_PIXELSIZE
+#undef jsimd_ycc_rgb_convert_neon
diff --git a/external/jpeg/simd/arm/jdmerge-neon.c b/external/jpeg/simd/arm/jdmerge-neon.c
new file mode 100644
index 000000000000..18fb9d8a55ab
--- /dev/null
+++ b/external/jpeg/simd/arm/jdmerge-neon.c
@@ -0,0 +1,144 @@
+/*
+ * jdmerge-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* YCbCr -> RGB conversion constants (fixed-point; scale noted per constant) */
+
+#define F_0_344  11277  /* 0.3441467 = 11277 * 2^-15 */
+#define F_0_714  23401  /* 0.7141418 = 23401 * 2^-15 */
+#define F_1_402  22971  /* 1.4020386 = 22971 * 2^-14 */
+#define F_1_772  29033  /* 1.7720337 = 29033 * 2^-14 */
+
+ALIGN(16) static const int16_t jsimd_ycc_rgb_convert_neon_consts[] = {
+  -F_0_344, F_0_714, F_1_402, F_1_772  /* lanes 0-3; lane 0 is pre-negated, lane 1 is applied with a multiply-subtract */
+};
+
+
+/* Instantiate the jdmrgext-neon.c routines once per colorspace extension. */
+
+#include "jdmrgext-neon.c"  /* uses the RGB_* macros as currently defined */
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extrgbx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extrgbx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_ALPHA  3
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extbgrx_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extbgrx_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxbgr_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxbgr_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon
+#undef jsimd_h2v2_merged_upsample_neon
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_ALPHA  0
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_neon  jsimd_h2v1_extxrgb_merged_upsample_neon
+#define jsimd_h2v2_merged_upsample_neon  jsimd_h2v2_extxrgb_merged_upsample_neon
+#include "jdmrgext-neon.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_ALPHA
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_neon  /* NOTE(review): the h2v2 name is not undefined after this last group */
diff --git a/external/jpeg/simd/arm/jdmrgext-neon.c b/external/jpeg/simd/arm/jdmrgext-neon.c
new file mode 100644
index 000000000000..fa2ec056af80
--- /dev/null
+++ b/external/jpeg/simd/arm/jdmrgext-neon.c
@@ -0,0 +1,667 @@
+/*
+ * jdmrgext-neon.c - merged upsampling/color conversion (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-neon.c. */
+
+
+/* These routines combine simple (non-fancy, i.e. non-smooth) h2v1 or h2v2
+ * chroma upsampling and YCbCr -> RGB color conversion into a single function.
+ *
+ * As with the standalone functions, YCbCr -> RGB conversion is defined by the
+ * following equations:
+ *    R = Y                        + 1.40200 * (Cr - 128)
+ *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
+ *    B = Y + 1.77200 * (Cb - 128)
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.3441467 = 11277 * 2^-15
+ *    0.7141418 = 23401 * 2^-15
+ *    1.4020386 = 22971 * 2^-14
+ *    1.7720337 = 29033 * 2^-14
+ * These constants are defined in jdmerge-neon.c.
+ *
+ * To ensure correct results, rounding is used when descaling.
+ */
+
+/* Notes on safe memory access for merged upsampling/YCbCr -> RGB conversion
+ * routines:
+ *
+ * Input memory buffers can be safely overread up to the next multiple of
+ * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
+ * jmemmgr.c.
+ *
+ * The output buffer cannot safely be written beyond output_width, since
+ * output_buf points to a possibly unpadded row in the decompressed image
+ * buffer allocated by the calling program.
+ */
+
+/* Upsample and color convert for the case of 2:1 horizontal and 1:1 vertical.
+ * One Y/Cb/Cr row at in_row_group_ctr is read; output_buf[0] is written. */
+
+void jsimd_h2v1_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr;
+  /* Pointers to Y, Cb, and Cr data */
+  JSAMPROW inptr0, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0 = input_buf[0][in_row_group_ctr];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr = output_buf[0];
+
+  int cols_remaining = output_width;  /* output pixels left to write */
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* Main loop: 16 output pixels (16 Y, 8 Cb, 8 Cr) per iteration.  Y values
+     * are de-interleaved into two vectors: val[0] holds the even-numbered
+     * indices and val[1] holds the odd-numbered indices.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128); the << 1 matches the 2^-14 scale. */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128), scaled the same way as R-Y. */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba;
+    rgba.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgba.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgba.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr, rgba);
+#else
+    uint8x16x3_t rgb;
+    rgb.val[RGB_RED] = vcombine_u8(r.val[0], r.val[1]);
+    rgb.val[RGB_GREEN] = vcombine_u8(g.val[0], g.val[1]);
+    rgb.val[RGB_BLUE] = vcombine_u8(b.val[0], b.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr, rgb);
+#endif
+
+    /* Increment pointers. */
+    inptr0 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* Tail of 1-15 columns: full vectors are still loaded (input rows may be
+     * overread; see the memory access notes above).  Y component values are
+     * de-interleaved into even- and odd-indexed vectors.
+     */
+    uint8x8x2_t y = vld2_u8(inptr0);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128); the << 1 matches the 2^-14 scale. */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128), scaled the same way as R-Y. */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* Add the chroma-derived values (G-Y, R-Y, and B-Y) to both the "even" and
+     * "odd" Y component values.  This effectively upsamples the chroma
+     * components horizontally.
+     */
+    int16x8_t g_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[0]));
+    int16x8_t r_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[0]));
+    int16x8_t b_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[0]));
+    int16x8_t g_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y.val[1]));
+    int16x8_t r_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y.val[1]));
+    int16x8_t b_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r = vzip_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
+    uint8x8x2_t g = vzip_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
+    uint8x8x2_t b = vzip_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba_h;
+    rgba_h.val[RGB_RED] = r.val[1];
+    rgba_h.val[RGB_GREEN] = g.val[1];
+    rgba_h.val[RGB_BLUE] = b.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    uint8x8x4_t rgba_l;
+    rgba_l.val[RGB_RED] = r.val[0];
+    rgba_l.val[RGB_GREEN] = g.val[0];
+    rgba_l.val[RGB_BLUE] = b.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixels; the switch cases below fall through on purpose. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgba_h, 6);
+    case 14:
+      vst4_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgba_h, 5);
+    case 13:
+      vst4_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgba_h, 4);
+    case 12:
+      vst4_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgba_h, 3);
+    case 11:
+      vst4_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgba_h, 2);
+    case 10:
+      vst4_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgba_h, 1);
+    case 9:
+      vst4_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgba_h, 0);
+    case 8:
+      vst4_u8(outptr, rgba_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba_l, 6);
+    case 6:
+      vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba_l, 5);
+    case 5:
+      vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba_l, 4);
+    case 4:
+      vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba_l, 3);
+    case 3:
+      vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba_l, 2);
+    case 2:
+      vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba_l, 1);
+    case 1:
+      vst4_lane_u8(outptr, rgba_l, 0);
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb_h;
+    rgb_h.val[RGB_RED] = r.val[1];
+    rgb_h.val[RGB_GREEN] = g.val[1];
+    rgb_h.val[RGB_BLUE] = b.val[1];
+    uint8x8x3_t rgb_l;
+    rgb_l.val[RGB_RED] = r.val[0];
+    rgb_l.val[RGB_GREEN] = g.val[0];
+    rgb_l.val[RGB_BLUE] = b.val[0];
+    /* Store RGB pixels; the switch cases below fall through on purpose. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr + 14 * RGB_PIXELSIZE, rgb_h, 6);
+    case 14:
+      vst3_lane_u8(outptr + 13 * RGB_PIXELSIZE, rgb_h, 5);
+    case 13:
+      vst3_lane_u8(outptr + 12 * RGB_PIXELSIZE, rgb_h, 4);
+    case 12:
+      vst3_lane_u8(outptr + 11 * RGB_PIXELSIZE, rgb_h, 3);
+    case 11:
+      vst3_lane_u8(outptr + 10 * RGB_PIXELSIZE, rgb_h, 2);
+    case 10:
+      vst3_lane_u8(outptr + 9 * RGB_PIXELSIZE, rgb_h, 1);
+    case 9:
+      vst3_lane_u8(outptr + 8 * RGB_PIXELSIZE, rgb_h, 0);
+    case 8:
+      vst3_u8(outptr, rgb_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb_l, 6);
+    case 6:
+      vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb_l, 5);
+    case 5:
+      vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb_l, 4);
+    case 4:
+      vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb_l, 3);
+    case 3:
+      vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb_l, 2);
+    case 2:
+      vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb_l, 1);
+    case 1:
+      vst3_lane_u8(outptr, rgb_l, 0);
+    default:
+      break;
+    }
+#endif
+  }
+}
+
+
+/* Upsample and color convert for the case of 2:1 horizontal and 2:1 vertical.
+ * Two Y rows share one Cb/Cr row and produce output_buf[0] and output_buf[1].
+ * See comments above for details regarding color conversion and safe memory
+ * access.
+ */
+
+void jsimd_h2v2_merged_upsample_neon(JDIMENSION output_width,
+                                     JSAMPIMAGE input_buf,
+                                     JDIMENSION in_row_group_ctr,
+                                     JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr0, outptr1;
+  /* Pointers to Y (both rows), Cb, and Cr data */
+  JSAMPROW inptr0_0, inptr0_1, inptr1, inptr2;
+
+  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
+  const int16x8_t neg_128 = vdupq_n_s16(-128);
+
+  inptr0_0 = input_buf[0][in_row_group_ctr * 2];
+  inptr0_1 = input_buf[0][in_row_group_ctr * 2 + 1];
+  inptr1 = input_buf[1][in_row_group_ctr];
+  inptr2 = input_buf[2][in_row_group_ctr];
+  outptr0 = output_buf[0];
+  outptr1 = output_buf[1];
+
+  int cols_remaining = output_width;  /* output pixels left to write per row */
+  for (; cols_remaining >= 16; cols_remaining -= 16) {
+    /* Main loop: 16 output pixels per row (2 x 16 Y, 8 Cb, 8 Cr) per pass.
+     * For each row, Y values are de-interleaved into two vectors: val[0] holds
+     * the even-numbered indices and val[1] holds the odd-numbered indices.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128); the << 1 matches the 2^-14 scale. */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128), scaled the same way as R-Y. */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x16x4_t rgba0, rgba1;
+    rgba0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgba1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgba0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgba1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgba0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgba1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    rgba1.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
+    /* Store RGBA pixel data to memory. */
+    vst4q_u8(outptr0, rgba0);
+    vst4q_u8(outptr1, rgba1);
+#else
+    uint8x16x3_t rgb0, rgb1;
+    rgb0.val[RGB_RED] = vcombine_u8(r0.val[0], r0.val[1]);
+    rgb1.val[RGB_RED] = vcombine_u8(r1.val[0], r1.val[1]);
+    rgb0.val[RGB_GREEN] = vcombine_u8(g0.val[0], g0.val[1]);
+    rgb1.val[RGB_GREEN] = vcombine_u8(g1.val[0], g1.val[1]);
+    rgb0.val[RGB_BLUE] = vcombine_u8(b0.val[0], b0.val[1]);
+    rgb1.val[RGB_BLUE] = vcombine_u8(b1.val[0], b1.val[1]);
+    /* Store RGB pixel data to memory. */
+    vst3q_u8(outptr0, rgb0);
+    vst3q_u8(outptr1, rgb1);
+#endif
+
+    /* Increment pointers. */
+    inptr0_0 += 16;
+    inptr0_1 += 16;
+    inptr1 += 8;
+    inptr2 += 8;
+    outptr0 += (RGB_PIXELSIZE * 16);
+    outptr1 += (RGB_PIXELSIZE * 16);
+  }
+
+  if (cols_remaining > 0) {
+    /* Tail of 1-15 columns: full vectors are still loaded for both Y rows
+     * (input rows may be overread; see the memory access notes above), with Y
+     * values de-interleaved into even- and odd-indexed vectors.
+     */
+    uint8x8x2_t y0 = vld2_u8(inptr0_0);
+    uint8x8x2_t y1 = vld2_u8(inptr0_1);
+    uint8x8_t cb = vld1_u8(inptr1);
+    uint8x8_t cr = vld1_u8(inptr2);
+    /* Subtract 128 from Cb and Cr. */
+    int16x8_t cr_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
+    int16x8_t cb_128 =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
+    /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
+    int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
+    int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
+    g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
+    g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
+    /* Descale G components: shift right 15, round, and narrow to 16-bit. */
+    int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
+                                     vrshrn_n_s32(g_sub_y_h, 15));
+    /* Compute R-Y: 1.40200 * (Cr - 128); the << 1 matches the 2^-14 scale. */
+    int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1), consts, 2);
+    /* Compute B-Y: 1.77200 * (Cb - 128), scaled the same way as R-Y. */
+    int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1), consts, 3);
+    /* For each row, add the chroma-derived values (G-Y, R-Y, and B-Y) to both
+     * the "even" and "odd" Y component values.  This effectively upsamples the
+     * chroma components both horizontally and vertically.
+     */
+    int16x8_t g0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[0]));
+    int16x8_t r0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[0]));
+    int16x8_t b0_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[0]));
+    int16x8_t g0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y0.val[1]));
+    int16x8_t r0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y0.val[1]));
+    int16x8_t b0_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y0.val[1]));
+    int16x8_t g1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[0]));
+    int16x8_t r1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[0]));
+    int16x8_t b1_even =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[0]));
+    int16x8_t g1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y),
+                                     y1.val[1]));
+    int16x8_t r1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y),
+                                     y1.val[1]));
+    int16x8_t b1_odd =
+      vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y),
+                                     y1.val[1]));
+    /* Convert each component to unsigned and narrow, clamping to [0-255].
+     * Re-interleave the "even" and "odd" component values.
+     */
+    uint8x8x2_t r0 = vzip_u8(vqmovun_s16(r0_even), vqmovun_s16(r0_odd));
+    uint8x8x2_t r1 = vzip_u8(vqmovun_s16(r1_even), vqmovun_s16(r1_odd));
+    uint8x8x2_t g0 = vzip_u8(vqmovun_s16(g0_even), vqmovun_s16(g0_odd));
+    uint8x8x2_t g1 = vzip_u8(vqmovun_s16(g1_even), vqmovun_s16(g1_odd));
+    uint8x8x2_t b0 = vzip_u8(vqmovun_s16(b0_even), vqmovun_s16(b0_odd));
+    uint8x8x2_t b1 = vzip_u8(vqmovun_s16(b1_even), vqmovun_s16(b1_odd));
+
+#ifdef RGB_ALPHA
+    uint8x8x4_t rgba0_h, rgba1_h;
+    rgba0_h.val[RGB_RED] = r0.val[1];
+    rgba1_h.val[RGB_RED] = r1.val[1];
+    rgba0_h.val[RGB_GREEN] = g0.val[1];
+    rgba1_h.val[RGB_GREEN] = g1.val[1];
+    rgba0_h.val[RGB_BLUE] = b0.val[1];
+    rgba1_h.val[RGB_BLUE] = b1.val[1];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_h.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+
+    uint8x8x4_t rgba0_l, rgba1_l;
+    rgba0_l.val[RGB_RED] = r0.val[0];
+    rgba1_l.val[RGB_RED] = r1.val[0];
+    rgba0_l.val[RGB_GREEN] = g0.val[0];
+    rgba1_l.val[RGB_GREEN] = g1.val[0];
+    rgba0_l.val[RGB_BLUE] = b0.val[0];
+    rgba1_l.val[RGB_BLUE] = b1.val[0];
+    /* Set alpha channel to opaque (0xFF). */
+    rgba0_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    rgba1_l.val[RGB_ALPHA] = vdup_n_u8(0xFF);
+    /* Store RGBA pixels; the switch cases below fall through on purpose. */
+    switch (cols_remaining) {
+    case 15:
+      vst4_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgba0_h, 6);
+      vst4_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgba1_h, 6);
+    case 14:
+      vst4_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgba0_h, 5);
+      vst4_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgba1_h, 5);
+    case 13:
+      vst4_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgba0_h, 4);
+      vst4_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgba1_h, 4);
+    case 12:
+      vst4_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgba0_h, 3);
+      vst4_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgba1_h, 3);
+    case 11:
+      vst4_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgba0_h, 2);
+      vst4_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgba1_h, 2);
+    case 10:
+      vst4_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgba0_h, 1);
+      vst4_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgba1_h, 1);
+    case 9:
+      vst4_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgba0_h, 0);
+      vst4_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgba1_h, 0);
+    case 8:
+      vst4_u8(outptr0, rgba0_l);
+      vst4_u8(outptr1, rgba1_l);
+      break;
+    case 7:
+      vst4_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgba0_l, 6);
+      vst4_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgba1_l, 6);
+    case 6:
+      vst4_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgba0_l, 5);
+      vst4_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgba1_l, 5);
+    case 5:
+      vst4_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgba0_l, 4);
+      vst4_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgba1_l, 4);
+    case 4:
+      vst4_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgba0_l, 3);
+      vst4_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgba1_l, 3);
+    case 3:
+      vst4_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgba0_l, 2);
+      vst4_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgba1_l, 2);
+    case 2:
+      vst4_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgba0_l, 1);
+      vst4_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgba1_l, 1);
+    case 1:
+      vst4_lane_u8(outptr0, rgba0_l, 0);
+      vst4_lane_u8(outptr1, rgba1_l, 0);
+    default:
+      break;
+    }
+#else
+    uint8x8x3_t rgb0_h, rgb1_h;
+    rgb0_h.val[RGB_RED] = r0.val[1];
+    rgb1_h.val[RGB_RED] = r1.val[1];
+    rgb0_h.val[RGB_GREEN] = g0.val[1];
+    rgb1_h.val[RGB_GREEN] = g1.val[1];
+    rgb0_h.val[RGB_BLUE] = b0.val[1];
+    rgb1_h.val[RGB_BLUE] = b1.val[1];
+
+    uint8x8x3_t rgb0_l, rgb1_l;
+    rgb0_l.val[RGB_RED] = r0.val[0];
+    rgb1_l.val[RGB_RED] = r1.val[0];
+    rgb0_l.val[RGB_GREEN] = g0.val[0];
+    rgb1_l.val[RGB_GREEN] = g1.val[0];
+    rgb0_l.val[RGB_BLUE] = b0.val[0];
+    rgb1_l.val[RGB_BLUE] = b1.val[0];
+    /* Store RGB pixels; the switch cases below fall through on purpose. */
+    switch (cols_remaining) {
+    case 15:
+      vst3_lane_u8(outptr0 + 14 * RGB_PIXELSIZE, rgb0_h, 6);
+      vst3_lane_u8(outptr1 + 14 * RGB_PIXELSIZE, rgb1_h, 6);
+    case 14:
+      vst3_lane_u8(outptr0 + 13 * RGB_PIXELSIZE, rgb0_h, 5);
+      vst3_lane_u8(outptr1 + 13 * RGB_PIXELSIZE, rgb1_h, 5);
+    case 13:
+      vst3_lane_u8(outptr0 + 12 * RGB_PIXELSIZE, rgb0_h, 4);
+      vst3_lane_u8(outptr1 + 12 * RGB_PIXELSIZE, rgb1_h, 4);
+    case 12:
+      vst3_lane_u8(outptr0 + 11 * RGB_PIXELSIZE, rgb0_h, 3);
+      vst3_lane_u8(outptr1 + 11 * RGB_PIXELSIZE, rgb1_h, 3);
+    case 11:
+      vst3_lane_u8(outptr0 + 10 * RGB_PIXELSIZE, rgb0_h, 2);
+      vst3_lane_u8(outptr1 + 10 * RGB_PIXELSIZE, rgb1_h, 2);
+    case 10:
+      vst3_lane_u8(outptr0 + 9 * RGB_PIXELSIZE, rgb0_h, 1);
+      vst3_lane_u8(outptr1 + 9 * RGB_PIXELSIZE, rgb1_h, 1);
+    case 9:
+      vst3_lane_u8(outptr0 + 8 * RGB_PIXELSIZE, rgb0_h, 0);
+      vst3_lane_u8(outptr1 + 8 * RGB_PIXELSIZE, rgb1_h, 0);
+    case 8:
+      vst3_u8(outptr0, rgb0_l);
+      vst3_u8(outptr1, rgb1_l);
+      break;
+    case 7:
+      vst3_lane_u8(outptr0 + 6 * RGB_PIXELSIZE, rgb0_l, 6);
+      vst3_lane_u8(outptr1 + 6 * RGB_PIXELSIZE, rgb1_l, 6);
+    case 6:
+      vst3_lane_u8(outptr0 + 5 * RGB_PIXELSIZE, rgb0_l, 5);
+      vst3_lane_u8(outptr1 + 5 * RGB_PIXELSIZE, rgb1_l, 5);
+    case 5:
+      vst3_lane_u8(outptr0 + 4 * RGB_PIXELSIZE, rgb0_l, 4);
+      vst3_lane_u8(outptr1 + 4 * RGB_PIXELSIZE, rgb1_l, 4);
+    case 4:
+      vst3_lane_u8(outptr0 + 3 * RGB_PIXELSIZE, rgb0_l, 3);
+      vst3_lane_u8(outptr1 + 3 * RGB_PIXELSIZE, rgb1_l, 3);
+    case 3:
+      vst3_lane_u8(outptr0 + 2 * RGB_PIXELSIZE, rgb0_l, 2);
+      vst3_lane_u8(outptr1 + 2 * RGB_PIXELSIZE, rgb1_l, 2);
+    case 2:
+      vst3_lane_u8(outptr0 + 1 * RGB_PIXELSIZE, rgb0_l, 1);
+      vst3_lane_u8(outptr1 + 1 * RGB_PIXELSIZE, rgb1_l, 1);
+    case 1:
+      vst3_lane_u8(outptr0, rgb0_l, 0);
+      vst3_lane_u8(outptr1, rgb1_l, 0);
+    default:
+      break;
+    }
+#endif
+  }
+}
diff --git a/external/jpeg/simd/arm/jdsample-neon.c b/external/jpeg/simd/arm/jdsample-neon.c
new file mode 100644
index 000000000000..90ec6782c47f
--- /dev/null
+++ b/external/jpeg/simd/arm/jdsample-neon.c
@@ -0,0 +1,569 @@
+/*
+ * jdsample-neon.c - upsampling (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            |         |         |         |
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *            |         |         |         |
+ *            +---------+---------+---------+
+ *
+ * Samples s0-s2 were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each row.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * s0 + 1/4 * s1
+ *     p2(upsampled) = 3/4 * s1 + 1/4 * s0
+ * When computing the first and last pixel component values in the row, there
+ * is no adjacent sample to blend, so:
+ *     p0(upsampled) = s0
+ *     p5(upsampled) = s2
+ */
+
+void jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{ /* Upsample max_v_samp_factor rows 2x horizontally with a 3:1 blend. */
+  JSAMPARRAY output_data = *output_data_ptr;  /* rows that receive output */
+  JSAMPROW inptr, outptr;       /* current input row / current output row */
+  int inrow;                    /* row index, same for input and output */
+  unsigned colctr;              /* input column, advanced 16 samples a step */
+  /* Set up constants: dithering bias of 1 and blend weight of 3. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    /* First pixel component value in this row of the original image */
+    *outptr = (JSAMPLE)GETJSAMPLE(*inptr);
+
+    /*    3/4 * containing sample + 1/4 * nearest neighboring sample
+     * For p1: containing sample = s0, nearest neighboring sample = s1
+     * For p2: containing sample = s1, nearest neighboring sample = s0
+     */
+    uint8x16_t s0 = vld1q_u8(inptr);      /* input samples 0-15 */
+    uint8x16_t s1 = vld1q_u8(inptr + 1);  /* input samples 1-16 */
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s1_add_3s0_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+    uint16x8_t s1_add_3s0_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+    uint16x8_t s0_add_3s1_l =
+      vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+    uint16x8_t s0_add_3s1_h =
+      vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+    /* Add ordered dithering bias to odd pixel values. */
+    s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+    s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+    /* The offset is initially 1, because the first pixel component has already
+     * been stored.  However, in subsequent iterations of the SIMD loop, this
+     * offset is (2 * colctr - 1) to stay within the bounds of the sample
+     * buffers without having to resort to a slow scalar tail case for the last
+     * (downsampled_width % 16) samples.  See "Creation of 2-D sample arrays"
+     * in jmemmgr.c for more details.
+     */
+    unsigned outptr_offset = 1;
+    uint8x16x2_t output_pixels;
+
+    /* We use software pipelining to maximise performance.  The code indented
+     * an extra two spaces begins the next iteration of the loop.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+
+        s0 = vld1q_u8(inptr + colctr - 1);
+        s1 = vld1q_u8(inptr + colctr);
+
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                         vrshrn_n_u16(s1_add_3s0_h, 2));
+      output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                         vshrn_n_u16(s0_add_3s1_h, 2));
+
+        /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+         * denote low half and high half respectively.
+         */
+        s1_add_3s0_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s1)), vget_low_u8(s0), three_u8);
+        s1_add_3s0_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s1)), vget_high_u8(s0), three_u8);
+        s0_add_3s1_l =
+          vmlal_u8(vmovl_u8(vget_low_u8(s0)), vget_low_u8(s1), three_u8);
+        s0_add_3s1_h =
+          vmlal_u8(vmovl_u8(vget_high_u8(s0)), vget_high_u8(s1), three_u8);
+        /* Add ordered dithering bias to odd pixel values. */
+        s0_add_3s1_l = vaddq_u16(s0_add_3s1_l, one_u16);
+        s0_add_3s1_h = vaddq_u16(s0_add_3s1_h, one_u16);
+
+      /* Store 32 interleaved values: val[0], val[1], val[0], val[1], ... */
+      vst2q_u8(outptr + outptr_offset, output_pixels);
+      outptr_offset = 2 * colctr - 1;  /* where the pending sums will go */
+    }
+
+    /* Complete the last iteration: store the sums computed but not written. */
+
+    /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+    output_pixels.val[0] = vcombine_u8(vrshrn_n_u16(s1_add_3s0_l, 2),
+                                       vrshrn_n_u16(s1_add_3s0_h, 2));
+    output_pixels.val[1] = vcombine_u8(vshrn_n_u16(s0_add_3s1_l, 2),
+                                       vshrn_n_u16(s0_add_3s1_h, 2));
+    /* Store pixel component values to memory. */
+    vst2q_u8(outptr + outptr_offset, output_pixels);
+
+    /* Last pixel component value in this row of the original image */
+    outptr[2 * downsampled_width - 1] =
+      GETJSAMPLE(inptr[downsampled_width - 1]);
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1        s2
+ *            +---------+---------+---------+
+ *            | p0   p1 | p2   p3 | p4   p5 |
+ *       sA   |         |         |         |
+ *            | p6   p7 | p8   p9 | p10  p11|
+ *            +---------+---------+---------+
+ *            | p12  p13| p14  p15| p16  p17|
+ *       sB   |         |         |         |
+ *            | p18  p19| p20  p21| p22  p23|
+ *            +---------+---------+---------+
+ *            | p24  p25| p26  p27| p28  p29|
+ *       sC   |         |         |         |
+ *            | p30  p31| p32  p33| p34  p35|
+ *            +---------+---------+---------+
+ *
+ * Samples s0A-s2C were created by averaging the original pixel component
+ * values centered at positions p0-p35 above.  To approximate one of those
+ * original pixel component values, we proportionally blend the sample
+ * containing the pixel center with the nearest neighboring samples in each
+ * row, column, and diagonal.
+ *
+ * An upsampled pixel component value is computed by first blending the sample
+ * containing the pixel center with the nearest neighboring samples in the
+ * same column, in the ratio 3:1, and then blending each column sum with the
+ * nearest neighboring column sum, in the ratio 3:1.  For example:
+ *     p14(upsampled) = 3/4 * (3/4 * s1B + 1/4 * s1A) +
+ *                      1/4 * (3/4 * s0B + 1/4 * s0A)
+ *                    = 9/16 * s1B + 3/16 * s1A + 3/16 * s0B + 1/16 * s0A
+ * When computing the first and last pixel component values in the row, there
+ * is no horizontally adjacent sample to blend, so:
+ *     p12(upsampled) = 3/4 * s0B + 1/4 * s0A
+ *     p23(upsampled) = 3/4 * s2B + 1/4 * s2C
+ * When computing the first and last pixel component values in the column,
+ * there is no vertically adjacent sample to blend, so:
+ *     p2(upsampled) = 3/4 * s1A + 1/4 * s0A
+ *     p33(upsampled) = 3/4 * s1C + 1/4 * s2C
+ * When computing the corner pixel component values, there is no adjacent
+ * sample to blend, so:
+ *     p0(upsampled) = s0A
+ *     p35(upsampled) = s2C
+ */
+
+void jsimd_h2v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{ /* Upsample 2x both ways: each input row yields two output rows. */
+  JSAMPARRAY output_data = *output_data_ptr;  /* rows that receive output */
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;            /* input row index / next output row index */
+  unsigned colctr;              /* input column, advanced 16 samples a step */
+  /* Set up constants: bias of 7, and weight 3 as 8-bit and 16-bit lanes. */
+  const uint16x8_t seven_u16 = vdupq_n_u16(7);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+  const uint16x8_t three_u16 = vdupq_n_u16(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];  /* row above (index -1 when inrow == 0) */
+    inptr1 = input_data[inrow];      /* current input row */
+    inptr2 = input_data[inrow + 1];  /* row below */
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    /* First pixel component value in this row of the original image */
+    int s0colsum0 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr0);
+    *outptr0 = (JSAMPLE)((s0colsum0 * 4 + 8) >> 4);
+    int s0colsum1 = GETJSAMPLE(*inptr1) * 3 + GETJSAMPLE(*inptr2);
+    *outptr1 = (JSAMPLE)((s0colsum1 * 4 + 8) >> 4);
+
+    /* Step 1: Blend samples vertically in columns s0 and s1.
+     * Leave the divide by 4 until the end, when it can be done for both
+     * dimensions at once, right-shifting by 4.
+     */
+
+    /* Load and compute s0colsum0 and s0colsum1. */
+    uint8x16_t s0A = vld1q_u8(inptr0);
+    uint8x16_t s0B = vld1q_u8(inptr1);
+    uint8x16_t s0C = vld1q_u8(inptr2);
+    /* Multiplication makes vectors twice as wide.  '_l' and '_h' suffixes
+     * denote low half and high half respectively.
+     */
+    uint16x8_t s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)),
+                                      vget_high_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)),
+                                      vget_low_u8(s0B), three_u8);
+    uint16x8_t s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)),
+                                      vget_high_u8(s0B), three_u8);
+    /* Load and compute s1colsum0 and s1colsum1. */
+    uint8x16_t s1A = vld1q_u8(inptr0 + 1);
+    uint8x16_t s1B = vld1q_u8(inptr1 + 1);
+    uint8x16_t s1C = vld1q_u8(inptr2 + 1);
+    uint16x8_t s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)),
+                                      vget_high_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)),
+                                      vget_low_u8(s1B), three_u8);
+    uint16x8_t s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)),
+                                      vget_high_u8(s1B), three_u8);
+
+    /* Step 2: Blend the already-blended columns. */
+
+    uint16x8_t output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+    uint16x8_t output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+    uint16x8_t output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+    uint16x8_t output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+    uint16x8_t output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+    uint16x8_t output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+    uint16x8_t output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+    uint16x8_t output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+    /* Add ordered dithering bias to odd pixel values. */
+    output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+    output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+    output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+    output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+    /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+    uint8x16x2_t output_pixels0 = { {
+      vcombine_u8(vshrn_n_u16(output0_p1_l, 4), vshrn_n_u16(output0_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output0_p2_l, 4), vrshrn_n_u16(output0_p2_h, 4))
+    } };
+    uint8x16x2_t output_pixels1 = { {
+      vcombine_u8(vshrn_n_u16(output1_p1_l, 4), vshrn_n_u16(output1_p1_h, 4)),
+      vcombine_u8(vrshrn_n_u16(output1_p2_l, 4), vrshrn_n_u16(output1_p2_h, 4))
+    } };
+
+    /* Store pixel component values to memory.
+     * The minimum size of the output buffer for each row is 64 bytes => no
+     * need to worry about buffer overflow here.  See "Creation of 2-D sample
+     * arrays" in jmemmgr.c for more details.
+     */
+    vst2q_u8(outptr0 + 1, output_pixels0);
+    vst2q_u8(outptr1 + 1, output_pixels1);
+
+    /* The first pixel of the image shifted our loads and stores by one byte.
+     * We have to re-align on a 32-byte boundary at some point before the end
+     * of the row (we do it now on the 32/33 pixel boundary) to stay within the
+     * bounds of the sample buffers without having to resort to a slow scalar
+     * tail case for the last (downsampled_width % 16) samples.  See "Creation
+     * of 2-D sample arrays" in jmemmgr.c for more details.
+     */
+    for (colctr = 16; colctr < downsampled_width; colctr += 16) {
+      /* Step 1: Blend samples vertically in columns s0 and s1. */
+
+      /* Load and compute s0colsum0 and s0colsum1. */
+      s0A = vld1q_u8(inptr0 + colctr - 1);
+      s0B = vld1q_u8(inptr1 + colctr - 1);
+      s0C = vld1q_u8(inptr2 + colctr - 1);
+      s0colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s0A)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s0A)), vget_high_u8(s0B),
+                             three_u8);
+      s0colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s0C)), vget_low_u8(s0B),
+                             three_u8);
+      s0colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s0C)), vget_high_u8(s0B),
+                             three_u8);
+      /* Load and compute s1colsum0 and s1colsum1. */
+      s1A = vld1q_u8(inptr0 + colctr);
+      s1B = vld1q_u8(inptr1 + colctr);
+      s1C = vld1q_u8(inptr2 + colctr);
+      s1colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(s1A)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(s1A)), vget_high_u8(s1B),
+                             three_u8);
+      s1colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(s1C)), vget_low_u8(s1B),
+                             three_u8);
+      s1colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(s1C)), vget_high_u8(s1B),
+                             three_u8);
+
+      /* Step 2: Blend the already-blended columns. */
+
+      output0_p1_l = vmlaq_u16(s1colsum0_l, s0colsum0_l, three_u16);
+      output0_p1_h = vmlaq_u16(s1colsum0_h, s0colsum0_h, three_u16);
+      output0_p2_l = vmlaq_u16(s0colsum0_l, s1colsum0_l, three_u16);
+      output0_p2_h = vmlaq_u16(s0colsum0_h, s1colsum0_h, three_u16);
+      output1_p1_l = vmlaq_u16(s1colsum1_l, s0colsum1_l, three_u16);
+      output1_p1_h = vmlaq_u16(s1colsum1_h, s0colsum1_h, three_u16);
+      output1_p2_l = vmlaq_u16(s0colsum1_l, s1colsum1_l, three_u16);
+      output1_p2_h = vmlaq_u16(s0colsum1_h, s1colsum1_h, three_u16);
+      /* Add ordered dithering bias to odd pixel values. */
+      output0_p1_l = vaddq_u16(output0_p1_l, seven_u16);
+      output0_p1_h = vaddq_u16(output0_p1_h, seven_u16);
+      output1_p1_l = vaddq_u16(output1_p1_l, seven_u16);
+      output1_p1_h = vaddq_u16(output1_p1_h, seven_u16);
+      /* Right-shift by 4 (divide by 16), narrow to 8-bit, and combine. */
+      output_pixels0.val[0] = vcombine_u8(vshrn_n_u16(output0_p1_l, 4),
+                                          vshrn_n_u16(output0_p1_h, 4));
+      output_pixels0.val[1] = vcombine_u8(vrshrn_n_u16(output0_p2_l, 4),
+                                          vrshrn_n_u16(output0_p2_h, 4));
+      output_pixels1.val[0] = vcombine_u8(vshrn_n_u16(output1_p1_l, 4),
+                                          vshrn_n_u16(output1_p1_h, 4));
+      output_pixels1.val[1] = vcombine_u8(vrshrn_n_u16(output1_p2_l, 4),
+                                          vrshrn_n_u16(output1_p2_h, 4));
+      /* Store 32 interleaved values per output row, starting one byte back. */
+      vst2q_u8(outptr0 + 2 * colctr - 1, output_pixels0);
+      vst2q_u8(outptr1 + 2 * colctr - 1, output_pixels1);
+    }
+
+    /* Last pixel component value in this row of the original image */
+    int s1colsum0 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr0[downsampled_width - 1]);
+    outptr0[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum0 * 4 + 7) >> 4);
+    int s1colsum1 = GETJSAMPLE(inptr1[downsampled_width - 1]) * 3 +
+                    GETJSAMPLE(inptr2[downsampled_width - 1]);
+    outptr1[2 * downsampled_width - 1] = (JSAMPLE)((s1colsum1 * 4 + 7) >> 4);
+    inrow++;  /* one input row consumed per pair of output rows */
+  }
+}
+
+
+/* The diagram below shows a column of samples produced by h1v2 downsampling
+ * (or by losslessly rotating or transposing an h2v1-downsampled image.)
+ *
+ *            +---------+
+ *            |   p0    |
+ *     sA     |         |
+ *            |   p1    |
+ *            +---------+
+ *            |   p2    |
+ *     sB     |         |
+ *            |   p3    |
+ *            +---------+
+ *            |   p4    |
+ *     sC     |         |
+ *            |   p5    |
+ *            +---------+
+ *
+ * Samples sA-sC were created by averaging the original pixel component values
+ * centered at positions p0-p5 above.  To approximate those original pixel
+ * component values, we proportionally blend the adjacent samples in each
+ * column.
+ *
+ * An upsampled pixel component value is computed by blending the sample
+ * containing the pixel center with the nearest neighboring sample, in the
+ * ratio 3:1.  For example:
+ *     p1(upsampled) = 3/4 * sA + 1/4 * sB
+ *     p2(upsampled) = 3/4 * sB + 1/4 * sA
+ * When computing the first and last pixel component values in the column,
+ * there is no adjacent sample to blend, so:
+ *     p0(upsampled) = sA
+ *     p5(upsampled) = sC
+ */
+
+void jsimd_h1v2_fancy_upsample_neon(int max_v_samp_factor,
+                                    JDIMENSION downsampled_width,
+                                    JSAMPARRAY input_data,
+                                    JSAMPARRAY *output_data_ptr)
+{ /* Upsample 2x vertically: each input row yields two blended output rows. */
+  JSAMPARRAY output_data = *output_data_ptr;  /* rows that receive output */
+  JSAMPROW inptr0, inptr1, inptr2, outptr0, outptr1;
+  int inrow, outrow;            /* input row index / next output row index */
+  unsigned colctr;              /* column offset, advanced 16 samples a step */
+  /* Set up constants: dithering bias of 1 and blend weight of 3. */
+  const uint16x8_t one_u16 = vdupq_n_u16(1);
+  const uint8x8_t three_u8 = vdup_n_u8(3);
+
+  inrow = outrow = 0;
+  while (outrow < max_v_samp_factor) {
+    inptr0 = input_data[inrow - 1];  /* row above (index -1 when inrow == 0) */
+    inptr1 = input_data[inrow];      /* current input row */
+    inptr2 = input_data[inrow + 1];  /* row below */
+    /* Suffixes 0 and 1 denote the upper and lower rows of output pixels,
+     * respectively.
+     */
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+    inrow++;  /* one input row consumed per pair of output rows */
+
+    /* The size of the input and output buffers is always a multiple of 32
+     * bytes => no need to worry about buffer overflow when reading/writing
+     * memory.  See "Creation of 2-D sample arrays" in jmemmgr.c for more
+     * details.
+     */
+    for (colctr = 0; colctr < downsampled_width; colctr += 16) {
+      /* Load samples. */
+      uint8x16_t sA = vld1q_u8(inptr0 + colctr);
+      uint8x16_t sB = vld1q_u8(inptr1 + colctr);
+      uint8x16_t sC = vld1q_u8(inptr2 + colctr);
+      /* Blend samples vertically: colsum0 = sA + 3 * sB, colsum1 = sC + 3 * sB. */
+      uint16x8_t colsum0_l = vmlal_u8(vmovl_u8(vget_low_u8(sA)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum0_h = vmlal_u8(vmovl_u8(vget_high_u8(sA)),
+                                      vget_high_u8(sB), three_u8);
+      uint16x8_t colsum1_l = vmlal_u8(vmovl_u8(vget_low_u8(sC)),
+                                      vget_low_u8(sB), three_u8);
+      uint16x8_t colsum1_h = vmlal_u8(vmovl_u8(vget_high_u8(sC)),
+                                      vget_high_u8(sB), three_u8);
+      /* Add ordered dithering bias to pixel values in even output rows. */
+      colsum0_l = vaddq_u16(colsum0_l, one_u16);
+      colsum0_h = vaddq_u16(colsum0_h, one_u16);
+      /* Right-shift by 2 (divide by 4), narrow to 8-bit, and combine. */
+      uint8x16_t output_pixels0 = vcombine_u8(vshrn_n_u16(colsum0_l, 2),
+                                              vshrn_n_u16(colsum0_h, 2));
+      uint8x16_t output_pixels1 = vcombine_u8(vrshrn_n_u16(colsum1_l, 2),
+                                              vrshrn_n_u16(colsum1_h, 2));
+      /* Store 16 pixel component values to each of the two output rows. */
+      vst1q_u8(outptr0 + colctr, output_pixels0);
+      vst1q_u8(outptr1 + colctr, output_pixels1);
+    }
+  }
+}
+
+
+/* The diagram below shows a row of samples produced by h2v1 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            |         |         |
+ *            | p0   p1 | p2   p3 |
+ *            |         |         |
+ *            +---------+---------+
+ *
+ * Samples s0 and s1 were created by averaging the original pixel component
+ * values centered at positions p0-p3 above.  To approximate those original
+ * pixel component values, we duplicate the samples horizontally:
+ *     p0(upsampled) = p1(upsampled) = s0
+ *     p2(upsampled) = p3(upsampled) = s1
+ */
+
+void jsimd_h2v1_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{ /* Double each of max_v_samp_factor rows by duplicating every sample. */
+  JSAMPARRAY output_data = *output_data_ptr;  /* rows that receive output */
+  JSAMPROW inptr, outptr;       /* current input row / current output row */
+  int inrow;                    /* row index, same for input and output */
+  unsigned colctr;              /* input column, advanced 16 samples a step */
+
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr = output_data[inrow];
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);  /* 16 input samples */
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr + 2 * colctr, output_pixels);  /* 32 output values */
+    }
+  }
+}
+
+
+/* The diagram below shows an array of samples produced by h2v2 downsampling.
+ *
+ *                s0        s1
+ *            +---------+---------+
+ *            | p0   p1 | p2   p3 |
+ *       sA   |         |         |
+ *            | p4   p5 | p6   p7 |
+ *            +---------+---------+
+ *            | p8   p9 | p10  p11|
+ *       sB   |         |         |
+ *            | p12  p13| p14  p15|
+ *            +---------+---------+
+ *
+ * Samples s0A-s1B were created by averaging the original pixel component
+ * values centered at positions p0-p15 above.  To approximate those original
+ * pixel component values, we duplicate the samples both horizontally and
+ * vertically:
+ *     p0(upsampled) = p1(upsampled) = p4(upsampled) = p5(upsampled) = s0A
+ *     p2(upsampled) = p3(upsampled) = p6(upsampled) = p7(upsampled) = s1A
+ *     p8(upsampled) = p9(upsampled) = p12(upsampled) = p13(upsampled) = s0B
+ *     p10(upsampled) = p11(upsampled) = p14(upsampled) = p15(upsampled) = s1B
+ */
+
+void jsimd_h2v2_upsample_neon(int max_v_samp_factor, JDIMENSION output_width,
+                              JSAMPARRAY input_data,
+                              JSAMPARRAY *output_data_ptr)
+{ /* Duplicate every sample 2x horizontally and write it to two rows. */
+  JSAMPARRAY output_data = *output_data_ptr;  /* rows that receive output */
+  JSAMPROW inptr, outptr0, outptr1;  /* input row / upper and lower output */
+  int inrow, outrow;            /* input row index / next output row index */
+  unsigned colctr;              /* input column, advanced 16 samples a step */
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+    inptr = input_data[inrow];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    for (colctr = 0; 2 * colctr < output_width; colctr += 16) {
+      uint8x16_t samples = vld1q_u8(inptr + colctr);  /* 16 input samples */
+      /* Duplicate the samples.  The store operation below interleaves them so
+       * that adjacent pixel component values take on the same sample value,
+       * per above.
+       */
+      uint8x16x2_t output_pixels = { { samples, samples } };
+      /* Store pixel component values for both output rows to memory.
+       * Due to the way sample buffers are allocated, we don't need to worry
+       * about tail cases when output_width is not a multiple of 32.  See
+       * "Creation of 2-D sample arrays" in jmemmgr.c for details.
+       */
+      vst2q_u8(outptr0 + 2 * colctr, output_pixels);
+      vst2q_u8(outptr1 + 2 * colctr, output_pixels);  /* same data, row below */
+    }
+  }
+}
diff --git a/external/jpeg/simd/arm/jfdctfst-neon.c b/external/jpeg/simd/arm/jfdctfst-neon.c
new file mode 100644
index 000000000000..bb371be39992
--- /dev/null
+++ b/external/jpeg/simd/arm/jfdctfst-neon.c
@@ -0,0 +1,214 @@
+/*
+ * jfdctfst-neon.c - fast integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_ifast_neon() performs a fast, not so accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_ifast() function, which can be found in jfdctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.382683433 = 12544 * 2^-15
+ *    0.541196100 = 17792 * 2^-15
+ *    0.707106781 = 23168 * 2^-15
+ *    0.306562965 =  9984 * 2^-15
+ *
+ * See jfdctfst.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_ifast_neon() match up
+ * with those in jpeg_fdct_ifast().
+ */
+
+#define F_0_382  12544  /* ~0.382683433 * 2^15 */
+#define F_0_541  17792  /* ~0.541196100 * 2^15 */
+#define F_0_707  23168  /* ~0.707106781 * 2^15 */
+#define F_0_306  9984   /* ~0.306562965 * 2^15 */
+
+
+ALIGN(16) static const int16_t jsimd_fdct_ifast_neon_consts[] = {
+  F_0_382, F_0_541, F_0_707, F_0_306  /* become lanes 0, 1, 2, 3 when loaded */
+};
+
+void jsimd_fdct_ifast_neon(DCTELEM *data)
+{ /* Fast forward DCT of one 8x8 block; results overwrite data in place. */
+  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
+   * are used, followed by vuzp to transpose the block such that we have a
+   * column of samples per vector - allowing all rows to be processed at once.
+   */
+  int16x8x4_t data1 = vld4q_s16(data);
+  int16x8x4_t data2 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(data1.val[0], data2.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(data1.val[1], data2.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(data1.val[2], data2.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(data1.val[3], data2.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows. */
+
+  /* Load DCT constants: lanes 0-3 = F_0_382, F_0_541, F_0_707, F_0_306. */
+  const int16x4_t consts = vld1_s16(jsimd_fdct_ifast_neon_consts);
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);    /* phase 2 */
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  col4 = vsubq_s16(tmp10, tmp11);
+
+  int16x8_t z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  col2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  col6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  int16x8_t z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  int16x8_t z2 = vqdmulhq_lane_s16(tmp10, consts, 1);  /* tmp10 * F_0_541 */
+  z2 = vaddq_s16(z2, z5);
+  int16x8_t z4 = vqdmulhq_lane_s16(tmp12, consts, 3);  /* tmp12 * F_0_306 */
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);  /* = tmp12 * F_0_306 + tmp12 + original z5 */
+  int16x8_t z3 = vqdmulhq_lane_s16(tmp11, consts, 2);  /* tmp11 * F_0_707 */
+
+  int16x8_t z11 = vaddq_s16(tmp7, z3);        /* phase 5 */
+  int16x8_t z13 = vsubq_s16(tmp7, z3);
+
+  col5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  col3 = vsubq_s16(z13, z2);
+  col1 = vaddq_s16(z11, z4);
+  col7 = vsubq_s16(z11, z4);
+
+  /* Transpose to work on columns in pass 2. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns (same arithmetic as pass 1, on row0-row7). */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part */
+  tmp10 = vaddq_s16(tmp0, tmp3);              /* phase 2 */
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vaddq_s16(tmp10, tmp11);             /* phase 3 */
+  row4 = vsubq_s16(tmp10, tmp11);
+
+  z1 = vqdmulhq_lane_s16(vaddq_s16(tmp12, tmp13), consts, 2);
+  row2 = vaddq_s16(tmp13, z1);                /* phase 5 */
+  row6 = vsubq_s16(tmp13, z1);
+
+  /* Odd part */
+  tmp10 = vaddq_s16(tmp4, tmp5);              /* phase 2 */
+  tmp11 = vaddq_s16(tmp5, tmp6);
+  tmp12 = vaddq_s16(tmp6, tmp7);
+
+  z5 = vqdmulhq_lane_s16(vsubq_s16(tmp10, tmp12), consts, 0);
+  z2 = vqdmulhq_lane_s16(tmp10, consts, 1);   /* tmp10 * F_0_541 */
+  z2 = vaddq_s16(z2, z5);
+  z4 = vqdmulhq_lane_s16(tmp12, consts, 3);   /* tmp12 * F_0_306 */
+  z5 = vaddq_s16(tmp12, z5);
+  z4 = vaddq_s16(z4, z5);  /* = tmp12 * F_0_306 + tmp12 + original z5 */
+  z3 = vqdmulhq_lane_s16(tmp11, consts, 2);   /* tmp11 * F_0_707 */
+
+  z11 = vaddq_s16(tmp7, z3);                  /* phase 5 */
+  z13 = vsubq_s16(tmp7, z3);
+
+  row5 = vaddq_s16(z13, z2);                  /* phase 6 */
+  row3 = vsubq_s16(z13, z2);
+  row1 = vaddq_s16(z11, z4);
+  row7 = vsubq_s16(z11, z4);
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);  /* write the result rows back */
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/external/jpeg/simd/arm/jfdctint-neon.c b/external/jpeg/simd/arm/jfdctint-neon.c
new file mode 100644
index 000000000000..ccfc07b15d94
--- /dev/null
+++ b/external/jpeg/simd/arm/jfdctint-neon.c
@@ -0,0 +1,376 @@
+/*
+ * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
+ * (Discrete Cosine Transform) on one block of samples.  It uses the same
+ * calculations and produces exactly the same output as IJG's original
+ * jpeg_fdct_islow() function, which can be found in jfdctint.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ *
+ * See jfdctint.c for further details of the DCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_fdct_islow_neon() match up
+ * with those in jpeg_fdct_islow().
+ */
+
+#define CONST_BITS  13  /* fractional bits in the F_* fixed-point constants */
+#define PASS1_BITS  2   /* extra scale carried from pass 1 into pass 2 */
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)  /* pass 1 narrowing shift */
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS)  /* pass 2 narrowing shift */
+
+#define F_0_298  2446   /* F_a_bcd holds a.bcd... * 2^CONST_BITS */
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+
+
+ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
+  F_0_298, -F_0_390,  F_0_541,  F_0_765,   /* loaded as consts.val[0] */
+ -F_0_899,  F_1_175,  F_1_501, -F_1_847,   /* loaded as consts.val[1] */
+ -F_1_961,  F_2_053, -F_2_562,  F_3_072    /* loaded as consts.val[2] */
+};
+
+void jsimd_fdct_islow_neon(DCTELEM *data)
+{
+  /* Load the 12 DCT constants as three 4-lane vectors, consts.val[0..2]. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
+#else
+  /* GCC does not currently support vld1_<type>_x3(); use three plain loads. */
+  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Load the 8x8 block.  Each vld4q_s16() reads four rows and de-interleaves
+   * them by 4; vuzpq_s16() then splits even/odd lanes, leaving one column of
+   * samples per vector (one lane per row) so all rows are processed at once.
+   */
+  int16x8x4_t s_rows_0123 = vld4q_s16(data);
+  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
+
+  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
+  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
+  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
+  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
+
+  int16x8_t col0 = cols_04.val[0];
+  int16x8_t col1 = cols_15.val[0];
+  int16x8_t col2 = cols_26.val[0];
+  int16x8_t col3 = cols_37.val[0];
+  int16x8_t col4 = cols_04.val[1];
+  int16x8_t col5 = cols_15.val[1];
+  int16x8_t col6 = cols_26.val[1];
+  int16x8_t col7 = cols_37.val[1];
+
+  /* Pass 1: process rows.  Outputs are left scaled up by 2^PASS1_BITS. */
+
+  int16x8_t tmp0 = vaddq_s16(col0, col7);
+  int16x8_t tmp7 = vsubq_s16(col0, col7);
+  int16x8_t tmp1 = vaddq_s16(col1, col6);
+  int16x8_t tmp6 = vsubq_s16(col1, col6);
+  int16x8_t tmp2 = vaddq_s16(col2, col5);
+  int16x8_t tmp5 = vsubq_s16(col2, col5);
+  int16x8_t tmp3 = vaddq_s16(col3, col4);
+  int16x8_t tmp4 = vsubq_s16(col3, col4);
+
+  /* Even part: yields outputs 0, 4, 2, and 6. */
+  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
+  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
+  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
+  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
+
+  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
+  int32x4_t z1_l =                          /* (tmp12 + tmp13) * F_0_541 */
+    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  int32x4_t z1_h =
+    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t col2_scaled_l =                 /* z1 + tmp13 * F_0_765 */
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t col2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
+
+  int32x4_t col6_scaled_l =                 /* z1 + tmp12 * -F_1_847 */
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t col6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
+                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
+
+  /* Odd part: yields outputs 7, 5, 3, and 1. */
+  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
+  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
+  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
+  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
+  /* z5 = (z3 + z4) * F_1_175;  sqrt(2) * c3 */
+  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* tmp4 * F_0_298;  sqrt(2) * (-c1+c3+c5-c7) */
+  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* tmp5 * F_2_053;  sqrt(2) * ( c1+c3-c5+c7) */
+  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* tmp6 * F_3_072;  sqrt(2) * ( c1+c3+c5-c7) */
+  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* tmp7 * F_1_501;  sqrt(2) * ( c1+c3-c5-c7) */
+  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* z1 * -F_0_899;  sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* z2 * -F_2_562;  sqrt(2) * (-c1-c3) */
+  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* z3 * -F_1_961;  sqrt(2) * (-c3-c5) */
+  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* z4 * -F_0_390;  sqrt(2) * (c5-c3) */
+  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);             /* fold z5 into z3 and z4 */
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);         /* output 7 = tmp4 + z1 + z3 */
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P1));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);         /* output 5 = tmp5 + z2 + z4 */
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P1));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);         /* output 3 = tmp6 + z2 + z3 */
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P1));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);         /* output 1 = tmp7 + z1 + z4 */
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P1));
+
+  /* Transpose so each vector holds one row; pass 2 then works on columns. */
+  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
+  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
+  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
+  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
+
+  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
+                                      vreinterpretq_s32_s16(cols_45.val[0]));
+  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
+                                      vreinterpretq_s32_s16(cols_45.val[1]));
+  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
+                                      vreinterpretq_s32_s16(cols_67.val[0]));
+  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
+                                      vreinterpretq_s32_s16(cols_67.val[1]));
+
+  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
+  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
+  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
+  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
+
+  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
+  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
+  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
+  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
+  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
+  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
+  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
+  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
+
+  /* Pass 2: process columns.  The 2^PASS1_BITS scaling is removed here. */
+
+  tmp0 = vaddq_s16(row0, row7);
+  tmp7 = vsubq_s16(row0, row7);
+  tmp1 = vaddq_s16(row1, row6);
+  tmp6 = vsubq_s16(row1, row6);
+  tmp2 = vaddq_s16(row2, row5);
+  tmp5 = vsubq_s16(row2, row5);
+  tmp3 = vaddq_s16(row3, row4);
+  tmp4 = vsubq_s16(row3, row4);
+
+  /* Even part: yields outputs 0, 4, 2, and 6. */
+  tmp10 = vaddq_s16(tmp0, tmp3);
+  tmp13 = vsubq_s16(tmp0, tmp3);
+  tmp11 = vaddq_s16(tmp1, tmp2);
+  tmp12 = vsubq_s16(tmp1, tmp2);
+
+  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
+  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
+
+  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);  /* z1 = this sum * F_0_541 */
+  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
+  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
+
+  int32x4_t row2_scaled_l =                 /* z1 + tmp13 * F_0_765 */
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
+  int32x4_t row2_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
+  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
+
+  int32x4_t row6_scaled_l =                 /* z1 + tmp12 * -F_1_847 */
+    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
+  int32x4_t row6_scaled_h =
+    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
+  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
+                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
+
+  /* Odd part: yields outputs 7, 5, 3, and 1. */
+  z1 = vaddq_s16(tmp4, tmp7);
+  z2 = vaddq_s16(tmp5, tmp6);
+  z3 = vaddq_s16(tmp4, tmp6);
+  z4 = vaddq_s16(tmp5, tmp7);
+  /* z5 = (z3 + z4) * F_1_175;  sqrt(2) * c3 */
+  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
+  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
+  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
+  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
+
+  /* tmp4 * F_0_298;  sqrt(2) * (-c1+c3+c5-c7) */
+  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
+  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
+  /* tmp5 * F_2_053;  sqrt(2) * ( c1+c3-c5+c7) */
+  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
+  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
+  /* tmp6 * F_3_072;  sqrt(2) * ( c1+c3+c5-c7) */
+  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
+  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
+  /* tmp7 * F_1_501;  sqrt(2) * ( c1+c3-c5-c7) */
+  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
+  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
+
+  /* z1 * -F_0_899;  sqrt(2) * (c7-c3) */
+  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
+  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
+  /* z2 * -F_2_562;  sqrt(2) * (-c1-c3) */
+  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
+  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
+  /* z3 * -F_1_961;  sqrt(2) * (-c3-c5) */
+  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
+  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
+  /* z4 * -F_0_390;  sqrt(2) * (c5-c3) */
+  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
+  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
+
+  z3_l = vaddq_s32(z3_l, z5_l);             /* fold z5 into z3 and z4 */
+  z3_h = vaddq_s32(z3_h, z5_h);
+  z4_l = vaddq_s32(z4_l, z5_l);
+  z4_h = vaddq_s32(z4_h, z5_h);
+
+  tmp4_l = vaddq_s32(tmp4_l, z1_l);         /* output 7 = tmp4 + z1 + z3 */
+  tmp4_h = vaddq_s32(tmp4_h, z1_h);
+  tmp4_l = vaddq_s32(tmp4_l, z3_l);
+  tmp4_h = vaddq_s32(tmp4_h, z3_h);
+  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp4_h, DESCALE_P2));
+
+  tmp5_l = vaddq_s32(tmp5_l, z2_l);         /* output 5 = tmp5 + z2 + z4 */
+  tmp5_h = vaddq_s32(tmp5_h, z2_h);
+  tmp5_l = vaddq_s32(tmp5_l, z4_l);
+  tmp5_h = vaddq_s32(tmp5_h, z4_h);
+  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp5_h, DESCALE_P2));
+
+  tmp6_l = vaddq_s32(tmp6_l, z2_l);         /* output 3 = tmp6 + z2 + z3 */
+  tmp6_h = vaddq_s32(tmp6_h, z2_h);
+  tmp6_l = vaddq_s32(tmp6_l, z3_l);
+  tmp6_h = vaddq_s32(tmp6_h, z3_h);
+  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp6_h, DESCALE_P2));
+
+  tmp7_l = vaddq_s32(tmp7_l, z1_l);         /* output 1 = tmp7 + z1 + z4 */
+  tmp7_h = vaddq_s32(tmp7_h, z1_h);
+  tmp7_l = vaddq_s32(tmp7_l, z4_l);
+  tmp7_h = vaddq_s32(tmp7_h, z4_h);
+  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
+                      vrshrn_n_s32(tmp7_h, DESCALE_P2));
+
+  vst1q_s16(data + 0 * DCTSIZE, row0);      /* results overwrite the input */
+  vst1q_s16(data + 1 * DCTSIZE, row1);
+  vst1q_s16(data + 2 * DCTSIZE, row2);
+  vst1q_s16(data + 3 * DCTSIZE, row3);
+  vst1q_s16(data + 4 * DCTSIZE, row4);
+  vst1q_s16(data + 5 * DCTSIZE, row5);
+  vst1q_s16(data + 6 * DCTSIZE, row6);
+  vst1q_s16(data + 7 * DCTSIZE, row7);
+}
diff --git a/external/jpeg/simd/arm/jidctfst-neon.c b/external/jpeg/simd/arm/jidctfst-neon.c
new file mode 100644
index 000000000000..a91be5362ebe
--- /dev/null
+++ b/external/jpeg/simd/arm/jidctfst-neon.c
@@ -0,0 +1,472 @@
+/*
+ * jidctfst-neon.c - fast integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+
+#include <arm_neon.h>
+
+
+/* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate
+ * inverse DCT (Discrete Cosine Transform) on one block of coefficients.  It
+ * uses the same calculations and produces exactly the same output as IJG's
+ * original jpeg_idct_ifast() function, which can be found in jidctfst.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.082392200 =  2688 * 2^-15
+ *    0.414213562 = 13568 * 2^-15
+ *    0.847759065 = 27776 * 2^-15
+ *    0.613125930 = 20096 * 2^-15
+ *
+ * See jidctfst.c for further details of the IDCT algorithm.  Where possible,
+ * the variable names and comments here in jsimd_idct_ifast_neon() match up
+ * with those in jpeg_idct_ifast().
+ */
+
+#define PASS1_BITS  2  /* part of the final narrowing shift, PASS1_BITS + 3 */
+
+#define F_0_082  2688   /* vqdmulh(x, F_0_082) + x   ~= x * 1.082392200 */
+#define F_0_414  13568  /* vqdmulh(x, F_0_414) + x   ~= x * 1.414213562 */
+#define F_0_847  27776  /* vqdmulh(x, F_0_847) + x   ~= x * 1.847759065 */
+#define F_0_613  20096  /* vqdmulh(x, F_0_613) + 2*x ~= x * 2.613125930 */
+
+
+ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = {
+  F_0_082, F_0_414, F_0_847, F_0_613    /* lanes 0-3 of the consts vector */
+};
+
+void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  IFAST_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients. */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row4);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts);
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into vectors.
+     */
+    int16x8_t dcval = row0;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+    row4 = dcval;
+    row5 = dcval;
+    row6 = dcval;
+    row7 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_low_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_high_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7));
+    row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7));
+    row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6));
+    row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6));
+    row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5));
+    row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5));
+    row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4));
+    row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Use DC values for these columns.
+     */
+    int16x4_t dcval = vget_high_s16(row0);
+
+    /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x4_t tmp0 = vget_low_s16(row0);
+    int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4);
+    int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int16x4_t tmp10 = vadd_s16(tmp0, tmp2);   /* phase 3 */
+    int16x4_t tmp11 = vsub_s16(tmp0, tmp2);
+
+    int16x4_t tmp13 = vadd_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3);
+    int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsub_s16(tmp12, tmp13);
+
+    tmp0 = vadd_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsub_s16(tmp10, tmp13);
+    tmp1 = vadd_s16(tmp11, tmp12);
+    tmp2 = vsub_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1);
+    int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5);
+    int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7);
+
+    int16x4_t z13 = vadd_s16(tmp6, tmp5);     /* phase 6 */
+    int16x4_t neg_z10 = vsub_s16(tmp5, tmp6);
+    int16x4_t z11 = vadd_s16(tmp4, tmp7);
+    int16x4_t z12 = vsub_s16(tmp4, tmp7);
+
+    tmp7 = vadd_s16(z11, z13);                /* phase 5 */
+    int16x4_t z11_sub_z13 = vsub_s16(z11, z13);
+    tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vadd_s16(tmp11, z11_sub_z13);
+
+    int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10);
+    int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2);
+    z5 = vadd_s16(z5, z10_add_z12);
+    tmp10 = vqdmulh_lane_s16(z12, consts, 0);
+    tmp10 = vadd_s16(tmp10, z12);
+    tmp10 = vsub_s16(tmp10, z5);
+    tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3);
+    tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10));
+    tmp12 = vadd_s16(tmp12, z5);
+
+    tmp6 = vsub_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsub_s16(tmp11, tmp6);
+    tmp4 = vadd_s16(tmp10, tmp5);
+
+    row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval);
+    row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval);
+    row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval);
+    row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval);
+    row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval);
+    row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval);
+    row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval);
+    row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval);
+  } else {
+    /* Some AC coefficients are non-zero; full IDCT calculation required. */
+
+    /* Load quantization table. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part: dequantize DCT coefficients. */
+    int16x8_t tmp0 = row0;
+    int16x8_t tmp1 = vmulq_s16(row2, quant_row2);
+    int16x8_t tmp2 = vmulq_s16(row4, quant_row4);
+    int16x8_t tmp3 = vmulq_s16(row6, quant_row6);
+
+    int16x8_t tmp10 = vaddq_s16(tmp0, tmp2);   /* phase 3 */
+    int16x8_t tmp11 = vsubq_s16(tmp0, tmp2);
+
+    int16x8_t tmp13 = vaddq_s16(tmp1, tmp3);   /* phases 5-3 */
+    int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3);
+    int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1);
+    tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3);
+    tmp12 = vsubq_s16(tmp12, tmp13);
+
+    tmp0 = vaddq_s16(tmp10, tmp13);            /* phase 2 */
+    tmp3 = vsubq_s16(tmp10, tmp13);
+    tmp1 = vaddq_s16(tmp11, tmp12);
+    tmp2 = vsubq_s16(tmp11, tmp12);
+
+    /* Odd part: dequantize DCT coefficients. */
+    int16x8_t tmp4 = vmulq_s16(row1, quant_row1);
+    int16x8_t tmp5 = vmulq_s16(row3, quant_row3);
+    int16x8_t tmp6 = vmulq_s16(row5, quant_row5);
+    int16x8_t tmp7 = vmulq_s16(row7, quant_row7);
+
+    int16x8_t z13 = vaddq_s16(tmp6, tmp5);     /* phase 6 */
+    int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6);
+    int16x8_t z11 = vaddq_s16(tmp4, tmp7);
+    int16x8_t z12 = vsubq_s16(tmp4, tmp7);
+
+    tmp7 = vaddq_s16(z11, z13);                /* phase 5 */
+    int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+    tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+    tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+    int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+    int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+    z5 = vaddq_s16(z5, z10_add_z12);
+    tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+    tmp10 = vaddq_s16(tmp10, z12);
+    tmp10 = vsubq_s16(tmp10, z5);
+    tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+    tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+    tmp12 = vaddq_s16(tmp12, z5);
+
+    tmp6 = vsubq_s16(tmp12, tmp7);             /* phase 2 */
+    tmp5 = vsubq_s16(tmp11, tmp6);
+    tmp4 = vaddq_s16(tmp10, tmp5);
+
+    row0 = vaddq_s16(tmp0, tmp7);
+    row7 = vsubq_s16(tmp0, tmp7);
+    row1 = vaddq_s16(tmp1, tmp6);
+    row6 = vsubq_s16(tmp1, tmp6);
+    row2 = vaddq_s16(tmp2, tmp5);
+    row5 = vsubq_s16(tmp2, tmp5);
+    row4 = vaddq_s16(tmp3, tmp4);
+    row3 = vsubq_s16(tmp3, tmp4);
+  }
+
+  /* Transpose rows to work on columns in pass 2. */
+  int16x8x2_t rows_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t rows_23 = vtrnq_s16(row2, row3);
+  int16x8x2_t rows_45 = vtrnq_s16(row4, row5);
+  int16x8x2_t rows_67 = vtrnq_s16(row6, row7);
+
+  int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]),
+                                      vreinterpretq_s32_s16(rows_45.val[0]));
+  int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]),
+                                      vreinterpretq_s32_s16(rows_45.val[1]));
+  int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]),
+                                      vreinterpretq_s32_s16(rows_67.val[0]));
+  int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]),
+                                      vreinterpretq_s32_s16(rows_67.val[1]));
+
+  int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]);
+  int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]);
+  int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]);
+  int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]);
+
+  int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]);
+  int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]);
+  int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]);
+  int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]);
+  int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]);
+  int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]);
+  int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]);
+  int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]);
+
+  /* 1-D IDCT, pass 2 */
+
+  /* Even part */
+  int16x8_t tmp10 = vaddq_s16(col0, col4);
+  int16x8_t tmp11 = vsubq_s16(col0, col4);
+
+  int16x8_t tmp13 = vaddq_s16(col2, col6);
+  int16x8_t col2_sub_col6 = vsubq_s16(col2, col6);
+  int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1);
+  tmp12 = vaddq_s16(tmp12, col2_sub_col6);
+  tmp12 = vsubq_s16(tmp12, tmp13);
+
+  int16x8_t tmp0 = vaddq_s16(tmp10, tmp13);
+  int16x8_t tmp3 = vsubq_s16(tmp10, tmp13);
+  int16x8_t tmp1 = vaddq_s16(tmp11, tmp12);
+  int16x8_t tmp2 = vsubq_s16(tmp11, tmp12);
+
+  /* Odd part */
+  int16x8_t z13 = vaddq_s16(col5, col3);
+  int16x8_t neg_z10 = vsubq_s16(col3, col5);
+  int16x8_t z11 = vaddq_s16(col1, col7);
+  int16x8_t z12 = vsubq_s16(col1, col7);
+
+  int16x8_t tmp7 = vaddq_s16(z11, z13);      /* phase 5 */
+  int16x8_t z11_sub_z13 = vsubq_s16(z11, z13);
+  tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1);
+  tmp11 = vaddq_s16(tmp11, z11_sub_z13);
+
+  int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10);
+  int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2);
+  z5 = vaddq_s16(z5, z10_add_z12);
+  tmp10 = vqdmulhq_lane_s16(z12, consts, 0);
+  tmp10 = vaddq_s16(tmp10, z12);
+  tmp10 = vsubq_s16(tmp10, z5);
+  tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3);
+  tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10));
+  tmp12 = vaddq_s16(tmp12, z5);
+
+  int16x8_t tmp6 = vsubq_s16(tmp12, tmp7);   /* phase 2 */
+  int16x8_t tmp5 = vsubq_s16(tmp11, tmp6);
+  int16x8_t tmp4 = vaddq_s16(tmp10, tmp5);
+
+  col0 = vaddq_s16(tmp0, tmp7);
+  col7 = vsubq_s16(tmp0, tmp7);
+  col1 = vaddq_s16(tmp1, tmp6);
+  col6 = vsubq_s16(tmp1, tmp6);
+  col2 = vaddq_s16(tmp2, tmp5);
+  col5 = vsubq_s16(tmp2, tmp5);
+  col4 = vaddq_s16(tmp3, tmp4);
+  col3 = vsubq_s16(tmp3, tmp4);
+
+  /* Scale down by a factor of 8, narrowing to 8-bit. */
+  int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col1, PASS1_BITS + 3));
+  int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col5, PASS1_BITS + 3));
+  int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col3, PASS1_BITS + 3));
+  int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3),
+                                     vqshrn_n_s16(col7, PASS1_BITS + 3));
+  /* Clamp to range [0-255]. */
+  uint8x16_t cols_01 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_45 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_23 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+  uint8x16_t cols_67 =
+    vreinterpretq_u8_s8
+      (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE))));
+
+  /* Transpose block to prepare for store. */
+  uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01),
+                                     vreinterpretq_u32_u8(cols_45));
+  uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23),
+                                     vreinterpretq_u32_u8(cols_67));
+
+  uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]),
+                                    vreinterpretq_u8_u32(cols_0415.val[1]));
+  uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]),
+                                    vreinterpretq_u8_u32(cols_2637.val[1]));
+  uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]),
+                                     vreinterpretq_u16_u8(cols_2367.val[0]));
+  uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]),
+                                     vreinterpretq_u16_u8(cols_2367.val[1]));
+
+  uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]);
+  uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]);
+  uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]);
+  uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]);
+
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  JSAMPROW outptr4 = output_buf[4] + output_col;
+  JSAMPROW outptr5 = output_buf[5] + output_col;
+  JSAMPROW outptr6 = output_buf[6] + output_col;
+  JSAMPROW outptr7 = output_buf[7] + output_col;
+
+  /* Store DCT block to memory. */
+  vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0);
+  vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0);
+  vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0);
+  vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0);
+  vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1);
+  vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1);
+  vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1);
+  vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1);
+}
diff --git a/external/jpeg/simd/arm/jidctint-neon.c b/external/jpeg/simd/arm/jidctint-neon.c
new file mode 100644
index 000000000000..043b652e6c55
--- /dev/null
+++ b/external/jpeg/simd/arm/jidctint-neon.c
@@ -0,0 +1,802 @@
+/*
+ * jidctint-neon.c - accurate integer IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "jconfigint.h"
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13  /* fractional bits of the F_* constants below */
+#define PASS1_BITS  2  /* extra fractional bits kept in the pass 1 output */
+
+#define DESCALE_P1  (CONST_BITS - PASS1_BITS)  /* pass 1 right shift */
+#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)  /* pass 2 right shift */
+
+/* The computation of the inverse DCT requires the use of constants known at
+ * compile time.  Scaled integer constants are used to avoid floating-point
+ * arithmetic:
+ *    0.298631336 =  2446 * 2^-13
+ *    0.390180644 =  3196 * 2^-13
+ *    0.541196100 =  4433 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.175875602 =  9633 * 2^-13
+ *    1.501321110 = 12299 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    1.961570560 = 16069 * 2^-13
+ *    2.053119869 = 16819 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *    3.072711026 = 25172 * 2^-13
+ */
+
+#define F_0_298  2446
+#define F_0_390  3196
+#define F_0_541  4433
+#define F_0_765  6270
+#define F_0_899  7373
+#define F_1_175  9633
+#define F_1_501  12299
+#define F_1_847  15137
+#define F_1_961  16069
+#define F_2_053  16819
+#define F_2_562  20995
+#define F_3_072  25172
+/* Pre-computed sums and differences of the constants above. */
+#define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)
+#define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)
+#define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)
+#define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)
+#define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)
+#define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)
+#define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)
+#define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)
+
+ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
+  F_0_899,             F_0_541,              /* consts.val[0], lanes 0-1 */
+  F_2_562,             F_0_298_MINUS_0_899,  /* consts.val[0], lanes 2-3 */
+  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,  /* consts.val[1], lanes 0-1 */
+  F_0_541_PLUS_0_765,  F_1_175,              /* consts.val[1], lanes 2-3 */
+  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,  /* consts.val[2], lanes 0-1 */
+  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,  /* consts.val[2], lanes 2-3 */
+  0, 0, 0, 0  /* pads the table to 16 entries */
+};
+
+
+/* Forward declarations of the pass 1 and pass 2 helpers (regular and sparse) */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2);
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset);
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset);
+
+
+/* Perform dequantization and inverse DCT on one block of coefficients.  For
+ * reference, the C implementation (jpeg_idct_islow()) can be found in
+ * jidctint.c.
+ *
+ * Optimization techniques used for fast data access:
+ *
+ * In each pass, the inverse DCT is computed for the left and right 4x8 halves
+ * of the DCT block.  This avoids spilling due to register pressure, and the
+ * increased granularity allows for an optimized calculation depending on the
+ * values of the DCT coefficients.  Between passes, intermediate data is stored
+ * in 4x8 workspace buffers.
+ *
+ * Transposing the 8x8 DCT block after each pass can be achieved by transposing
+ * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
+ * diagram below.)  Swapping quadrants is cheap, since the second pass can just
+ * swap the workspace buffer pointers.
+ *
+ *      +-------+-------+                   +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   0   |   1   |                   |   0   |   2   |
+ *      |       |       |    transpose      |       |       |
+ *      +-------+-------+     ------>       +-------+-------+
+ *      |       |       |                   |       |       |
+ *      |   2   |   3   |                   |   1   |   3   |
+ *      |       |       |                   |       |       |
+ *      +-------+-------+                   +-------+-------+
+ *
+ * Optimization techniques used to accelerate the inverse DCT calculation:
+ *
+ * In a DCT coefficient block, the coefficients are increasingly likely to be 0
+ * as you move diagonally from top left to bottom right.  If whole rows of
+ * coefficients are 0, then the inverse DCT calculation can be simplified.  On
+ * the first pass of the inverse DCT, we test for three special cases before
+ * defaulting to a full "regular" inverse DCT:
+ *
+ * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
+ *    "sparse" simplified inverse DCT on rows 0-3.
+ * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
+ *    result is equal to the dequantized DC coefficients.
+ * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
+ *    result is all zero.  For the left 4x8 half, this is handled identically
+ *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
+ *    the "sparse" algorithm is required for the second pass.
+ *
+ * In the second pass, only a single special case is tested: whether the AC and
+ * DC coefficients were all zero in the right 4x8 block during the first pass
+ * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
+ * the second pass is performed for both the left and right halves of the DCT
+ * block.  (The transposition after the first pass means that the right 4x8
+ * block during the first pass becomes rows 4-7 during the second pass.)
+ */
+
+void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
+                           JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;  /* dequantization multipliers */
+
+  int16_t workspace_l[8 * DCTSIZE / 2];  /* pass 1 output rows 0-3 */
+  int16_t workspace_r[8 * DCTSIZE / 2];  /* pass 1 output rows 4-7 */
+
+  /* Compute IDCT first pass on left 4x8 coefficient block. */
+
+  /* Load DCT coefficients in left 4x8 block. */
+  int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
+  int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
+  int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
+  int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
+  int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
+  int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
+  int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
+  int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table for left 4x8 block. */
+  int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
+  int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+  int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+  int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+  int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
+  int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+  int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+  int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+  /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
+  int16x4_t bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  if (bitmap_rows_4567 == 0) {  /* rows 4-7 are all zero */
+    bitmap = vorr_s16(bitmap, row3);
+    bitmap = vorr_s16(bitmap, row2);
+    bitmap = vorr_s16(bitmap, row1);
+    int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (left_ac_bitmap == 0) {  /* rows 1-7 are all zero: DC only */
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Every output row equals dcval.  VST4 stores each 4x4 transposed. */
+      vst4_s16(workspace_l, quadrant);
+      vst4_s16(workspace_r, quadrant);
+    } else {  /* rows 4-7 zero, but some of rows 1-3 non-zero */
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l, workspace_r);
+    }
+  } else {  /* rows 4-7 contain a non-zero coefficient */
+    jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                   row6, row7, quant_row0, quant_row1,
+                                   quant_row2, quant_row3, quant_row4,
+                                   quant_row5, quant_row6, quant_row7,
+                                   workspace_l, workspace_r);
+  }
+
+  /* Compute IDCT first pass on right 4x8 coefficient block. */
+
+  /* Load DCT coefficients in right 4x8 block. */
+  row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
+  row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
+  row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
+  row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
+  row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
+  row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
+  row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
+  row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
+
+  /* Load quantization table for right 4x8 block. */
+  quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
+  quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+  quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+  quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+  quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
+  quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+  quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+  quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+  /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
+  bitmap = vorr_s16(row7, row6);
+  bitmap = vorr_s16(bitmap, row5);
+  bitmap = vorr_s16(bitmap, row4);
+  bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+  bitmap = vorr_s16(bitmap, row3);
+  bitmap = vorr_s16(bitmap, row2);
+  bitmap = vorr_s16(bitmap, row1);
+  int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+  /* If this remains non-zero, a "regular" second pass will be performed. */
+  int64_t right_ac_dc_bitmap = 1;
+
+  if (right_ac_bitmap == 0) {  /* rows 1-7 are all zero */
+    bitmap = vorr_s16(bitmap, row0);  /* fold in the DC row as well */
+    right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
+
+    if (right_ac_dc_bitmap != 0) {  /* DC only; if all zero, store nothing */
+      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
+      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
+      /* Store 4x4 blocks to second half of each workspace, transposing. */
+      vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
+      vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
+    }
+  } else {  /* some AC coefficient is non-zero */
+    if (bitmap_rows_4567 == 0) {  /* ...but none in rows 4-7 */
+      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
+                                    quant_row1, quant_row2, quant_row3,
+                                    workspace_l + 4 * DCTSIZE / 2,
+                                    workspace_r + 4 * DCTSIZE / 2);
+    } else {
+      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
+                                     row6, row7, quant_row0, quant_row1,
+                                     quant_row2, quant_row3, quant_row4,
+                                     quant_row5, quant_row6, quant_row7,
+                                     workspace_l + 4 * DCTSIZE / 2,
+                                     workspace_r + 4 * DCTSIZE / 2);
+    }
+  }
+
+  /* Second pass: compute IDCT on rows in workspace. */
+
+  /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
+  if (right_ac_dc_bitmap == 0) {
+    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
+  } else {
+    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
+    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
+  }
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.  (To process the full 8x8 DCT block, this
+ * function-- or some other optimized variant-- needs to be called for both the
+ * left and right 4x8 blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of AC coefficients is all 0.
+ *
+ * The original C implementation of the accurate IDCT (jpeg_idct_islow()) is
+ * found in jidctint.c.  Algorithmic changes made here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
+                                                  int16x4_t row1,
+                                                  int16x4_t row2,
+                                                  int16x4_t row3,
+                                                  int16x4_t row4,
+                                                  int16x4_t row5,
+                                                  int16x4_t row6,
+                                                  int16x4_t row7,
+                                                  int16x4_t quant_row0,
+                                                  int16x4_t quant_row1,
+                                                  int16x4_t quant_row2,
+                                                  int16x4_t quant_row3,
+                                                  int16x4_t quant_row4,
+                                                  int16x4_t quant_row5,
+                                                  int16x4_t quant_row6,
+                                                  int16x4_t quant_row7,
+                                                  int16_t *workspace_1,
+                                                  int16_t *workspace_2)
+{
+  /* Load the 12 IDCT constants as three 4-lane vectors (consts.val[0..2]). */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part: dequantized rows 0, 2, 4 and 6 */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+  int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);  /* 0.541 */
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);  /* 0.541+0.765 */
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);  /* 0.541-1.847 */
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);  /* 0.541 */
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  z3_s16 = vmul_s16(row4, quant_row4);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part: dequantized rows 1, 3, 5 and 7 */
+  int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
+  int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);  /* 1.175-1.961 */
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);  /* 1.175 */
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);  /* 1.175 */
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);  /* 1.175-0.390 */
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);  /* 0.298-0.899 */
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);  /* 2.053-2.562 */
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);  /* 3.072-2.562 */
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);  /* 1.501-0.899 */
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);  /* 0.899 */
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);  /* 2.562 */
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);  /* 2.562 */
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);  /* 0.899 */
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale (rounding) and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store rows 0-3 to workspace_1 and rows 4-7 to workspace_2, ready for the
+   * second pass.  (VST4 transposes each 4x4 block.  We need to operate on rows
+   * in the next pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform dequantization and the first pass of the accurate inverse DCT on a
+ * 4x8 block of coefficients.
+ *
+ * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
+ * 0.  This simplifies the IDCT calculation, accelerating overall performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
+                                                 int16x4_t row1,
+                                                 int16x4_t row2,
+                                                 int16x4_t row3,
+                                                 int16x4_t quant_row0,
+                                                 int16x4_t quant_row1,
+                                                 int16x4_t quant_row2,
+                                                 int16x4_t quant_row3,
+                                                 int16_t *workspace_1,
+                                                 int16_t *workspace_2)
+{
+  /* Load the 12 IDCT constants as three 4-lane vectors (consts.val[0..2]). */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (z3, i.e. dequantized row 6, is all 0; so is row 4) */
+  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);  /* 0.541 */
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);  /* 0.541+0.765 */
+
+  z2_s16 = vmul_s16(row0, quant_row0);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);  /* row0 + row4, row4 = 0 */
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);  /* row0 - row4, row4 = 0 */
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1, i.e. rows 7 and 5, are both all 0) */
+  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
+  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
+
+  int16x4_t z3_s16 = tmp2_s16;  /* z3 = tmp0 + tmp2, tmp0 = 0 */
+  int16x4_t z4_s16 = tmp3_s16;  /* z4 = tmp1 + tmp3, tmp1 = 0 */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);  /* 1.175-1.961 */
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);  /* 1.175 */
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);  /* 1.175 */
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);  /* 1.175-0.390 */
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);  /* 0.899 */
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);  /* 2.562 */
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);  /* 3.072-2.562 */
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);  /* 1.501-0.899 */
+
+  /* Final output stage: descale (rounding) and narrow to 16-bit. */
+  int16x4x4_t rows_0123 = { {
+    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
+  } };
+  int16x4x4_t rows_4567 = { {
+    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
+    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
+  } };
+
+  /* Store rows 0-3 to workspace_1 and rows 4-7 to workspace_2, ready for the
+   * second pass.  (VST4 transposes each 4x4 block.  We need to operate on rows
+   * in the next pass.)
+   */
+  vst4_s16(workspace_1, rows_0123);
+  vst4_s16(workspace_2, rows_4567);
+}
+
+
+/* Perform the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients.  (To process the full 8x8 DCT block, this function-- or some
+ * other optimized variant-- needs to be called for both the right and left 4x8
+ * blocks.)
+ *
+ * This "regular" version assumes that no optimization can be made to the IDCT
+ * calculation, since no useful set of coefficient values are all 0 after the
+ * first pass.
+ *
+ * Again, the original C implementation of the accurate IDCT
+ * (jpeg_idct_islow()) can be found in jidctint.c.  Algorithmic changes made
+ * here are documented inline.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
+                                                  JSAMPARRAY output_buf,
+                                                  JDIMENSION output_col,
+                                                  unsigned buf_offset)
+{
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+  int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
+  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
+
+  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part */
+  int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
+  int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
+  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z5 = (z3 + z4) * 1.175875602;
+   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
+   *   z3 += z5;  z4 += z5;
+   *
+   * This implementation:
+   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
+   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
+   */
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  /* Implementation as per jpeg_idct_islow() in jidctint.c:
+   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
+   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
+   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
+   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
+   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
+   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
+   *
+   * This implementation:
+   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
+   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
+   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
+   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
+   *   tmp0 += z3;  tmp1 += z4;
+   *   tmp2 += z3;  tmp3 += z4;
+   */
+
+  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
+  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
+  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
+  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
+
+  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
+  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
+
+  tmp0 = vaddq_s32(tmp0, z3);
+  tmp1 = vaddq_s32(tmp1, z4);
+  tmp2 = vaddq_s32(tmp2, z3);
+  tmp3 = vaddq_s32(tmp3, z4);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
+
+
+/* Performs the second pass of the accurate inverse DCT on a 4x8 block of
+ * coefficients (workspace), writing rows output_buf[buf_offset + 0 .. 3].
+ *
+ * This "sparse" version assumes that the coefficient values (after the first
+ * pass) in rows 4-7 are all 0, so only workspace rows 0-3 are loaded and the
+ * terms that depend on rows 4-7 are omitted, accelerating performance.
+ */
+
+static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
+                                                 JSAMPARRAY output_buf,
+                                                 JDIMENSION output_col,
+                                                 unsigned buf_offset)
+{
+  /* Load the three 4-lane constant vectors used by the IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
+#else
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  /* Even part (rows 4 and 6 are all 0, so z3 vanishes and tmp0 == tmp1) */
+  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
+
+  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
+  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
+
+  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
+  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
+  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
+  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
+  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
+
+  /* Odd part (tmp0 and tmp1, i.e. rows 7 and 5, are both all 0) */
+  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
+  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
+
+  int16x4_t z3_s16 = tmp2_s16;
+  int16x4_t z4_s16 = tmp3_s16;
+
+  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
+  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
+  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
+  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
+
+  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
+  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
+  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
+  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
+                                       vaddhn_s32(tmp12, tmp1));
+  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
+                                       vaddhn_s32(tmp13, tmp0));
+  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
+                                       vsubhn_s32(tmp11, tmp2));
+  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
+                                       vsubhn_s32(tmp10, tmp3));
+  /* Descale and narrow to 8-bit, saturating to [-128, 127]. */
+  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
+  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
+  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
+  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
+  /* Add CENTERJSAMPLE (wrapping), completing the clamp to range [0-255]. */
+  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
+                                 vdup_n_u8(CENTERJSAMPLE));
+
+  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
+   * together allows us to store 16-bit elements.)
+   */
+  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
+  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
+  uint16x4x4_t cols_01_23_45_67 = { {
+    vreinterpret_u16_u8(cols_01_23.val[0]),
+    vreinterpret_u16_u8(cols_01_23.val[1]),
+    vreinterpret_u16_u8(cols_45_67.val[0]),
+    vreinterpret_u16_u8(cols_45_67.val[1])
+  } };
+
+  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
+  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
+  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
+  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
+  /* VST4 of 16-bit elements completes the transpose. */
+  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
+  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
+  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
+  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
+}
diff --git a/external/jpeg/simd/arm/jidctred-neon.c b/external/jpeg/simd/arm/jidctred-neon.c
new file mode 100644
index 000000000000..be9627e61d47
--- /dev/null
+++ b/external/jpeg/simd/arm/jidctred-neon.c
@@ -0,0 +1,486 @@
+/*
+ * jidctred-neon.c - reduced-size IDCT (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+#include "align.h"
+#include "neon-compat.h"
+
+#include <arm_neon.h>
+
+
+#define CONST_BITS  13  /* fraction bits of the F_* fixed-point constants */
+#define PASS1_BITS  2   /* extra fraction bits kept in pass 1 output */
+
+#define F_0_211  1730   /* 0.211164243 * 2^13 */
+#define F_0_509  4176   /* 0.509795579 * 2^13 */
+#define F_0_601  4926   /* 0.601344887 * 2^13 */
+#define F_0_720  5906   /* 0.720959822 * 2^13 */
+#define F_0_765  6270   /* 0.765366865 * 2^13 */
+#define F_0_850  6967   /* 0.850430095 * 2^13 */
+#define F_0_899  7373   /* 0.899976223 * 2^13 */
+#define F_1_061  8697   /* 1.061594337 * 2^13 */
+#define F_1_272  10426  /* 1.272758580 * 2^13 */
+#define F_1_451  11893  /* 1.451774981 * 2^13 */
+#define F_1_847  15137  /* 1.847759065 * 2^13 */
+#define F_2_172  17799  /* 2.172734803 * 2^13 */
+#define F_2_562  20995  /* 2.562915447 * 2^13 */
+#define F_3_624  29692  /* 3.624509785 * 2^13 */
+
+
+/* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
+ * 2x2 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_2x2() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.720959822 =  5906 * 2^-13
+ *    0.850430095 =  6967 * 2^-13
+ *    1.272758580 = 10426 * 2^-13
+ *    3.624509785 = 29692 * 2^-13
+ *
+ * See jidctred.c for further details of the 2x2 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_2x2_neon()
+ * match up with those in jpeg_idct_2x2().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
+  -F_0_720, F_0_850, -F_1_272, F_3_624  /* lanes 0-3: rows 7, 5, 3, 1 */
+};
+
+void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients.  (Rows 2, 4, and 6 are not used by this IDCT.) */
+  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+  int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+  int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+  int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+  /* Dequantize DCT coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+  row1 = vmulq_s16(row1, quant_row1);
+  row3 = vmulq_s16(row3, quant_row3);
+  row5 = vmulq_s16(row5, quant_row5);
+  row7 = vmulq_s16(row7, quant_row7);
+
+  /* Load IDCT conversion constants. */
+  const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);
+
+  /* Pass 1: process columns from input, put results in vectors row0 and
+   * row1.
+   */
+
+  /* Even part: only row 0 contributes. */
+  int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
+  int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);
+
+  /* Odd part: weighted sum of rows 1, 3, 5, and 7. */
+  int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
+  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
+  int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
+  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);
+
+  /* Final output stage: descale and narrow to 16-bit. */
+  row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
+  row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
+                      vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));
+
+  /* Transpose two rows, ready for second pass. */
+  int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
+  int16x8_t cols_0246 = cols_0246_1357.val[0];
+  int16x8_t cols_1357 = cols_0246_1357.val[1];
+  /* Duplicate columns such that each is accessible in its own vector. */
+  int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
+                                         vreinterpretq_s32_s16(cols_1357));
+  int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
+  int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);
+
+  /* Pass 2: process two rows, store to output array. */
+
+  /* Even part: we're only interested in col0; the top half of tmp10 is "don't
+   * care."
+   */
+  int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);
+
+  /* Odd part: we're only interested in the bottom half of tmp0. */
+  int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
+  tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
+  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
+                                      vsubhn_s32(tmp10, tmp0));
+  output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
+                            CONST_BITS + PASS1_BITS + 3 + 2 - 16);
+  /* Narrow to 8-bit and convert to unsigned, saturating to [0, 255]. */
+  uint8x8_t output_u8 = vqmovun_s16(output_s16);
+
+  /* Store 2x2 block to memory (lanes 0-1: column 0; lanes 4-5: column 1). */
+  vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
+  vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
+  vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
+  vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
+}
+
+
+/* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
+ * 4x4 output from an 8x8 DCT block.  It uses the same calculations and
+ * produces exactly the same output as IJG's original jpeg_idct_4x4() function
+ * from jpeg-6b, which can be found in jidctred.c.
+ *
+ * Scaled integer constants are used to avoid floating-point arithmetic:
+ *    0.211164243 =  1730 * 2^-13
+ *    0.509795579 =  4176 * 2^-13
+ *    0.601344887 =  4926 * 2^-13
+ *    0.765366865 =  6270 * 2^-13
+ *    0.899976223 =  7373 * 2^-13
+ *    1.061594337 =  8697 * 2^-13
+ *    1.451774981 = 11893 * 2^-13
+ *    1.847759065 = 15137 * 2^-13
+ *    2.172734803 = 17799 * 2^-13
+ *    2.562915447 = 20995 * 2^-13
+ *
+ * See jidctred.c for further details of the 4x4 IDCT algorithm.  Where
+ * possible, the variable names and comments here in jsimd_idct_4x4_neon()
+ * match up with those in jpeg_idct_4x4().
+ */
+
+ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
+  F_1_847, -F_0_765, -F_0_211,  F_1_451,  /* consts.val[0] */
+ -F_2_172,  F_1_061, -F_0_509, -F_0_601,  /* consts.val[1] */
+  F_0_899,  F_2_562,        0,        0   /* consts.val[2], zero-padded */
+};
+
+void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
+                         JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  ISLOW_MULT_TYPE *quantptr = dct_table;
+
+  /* Load DCT coefficients.  (Row 4 is not used by this IDCT.) */
+  int16x8_t row0  = vld1q_s16(coef_block + 0 * DCTSIZE);
+  int16x8_t row1  = vld1q_s16(coef_block + 1 * DCTSIZE);
+  int16x8_t row2  = vld1q_s16(coef_block + 2 * DCTSIZE);
+  int16x8_t row3  = vld1q_s16(coef_block + 3 * DCTSIZE);
+  int16x8_t row5  = vld1q_s16(coef_block + 5 * DCTSIZE);
+  int16x8_t row6  = vld1q_s16(coef_block + 6 * DCTSIZE);
+  int16x8_t row7  = vld1q_s16(coef_block + 7 * DCTSIZE);
+
+  /* Load quantization table values for DC coefficients. */
+  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
+  /* Dequantize DC coefficients. */
+  row0 = vmulq_s16(row0, quant_row0);
+
+  /* Construct bitmap to test if all AC coefficients (rows 1-3, 5-7) are 0. */
+  int16x8_t bitmap = vorrq_s16(row1, row2);
+  bitmap = vorrq_s16(bitmap, row3);
+  bitmap = vorrq_s16(bitmap, row5);
+  bitmap = vorrq_s16(bitmap, row6);
+  bitmap = vorrq_s16(bitmap, row7);
+
+  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
+  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);
+
+  /* Load constants for IDCT computation. */
+#ifdef HAVE_VLD1_S16_X3
+  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
+#else
+  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
+  const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
+  const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
+  const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
+  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
+#endif
+
+  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
+    /* All AC coefficients are zero.
+     * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
+     */
+    int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
+    row0 = dcval;
+    row1 = dcval;
+    row2 = dcval;
+    row3 = dcval;
+  } else if (left_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 0, 1, 2, and 3.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_high_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_high_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                            CONST_BITS - PASS1_BITS + 1));
+  } else if (right_ac_bitmap == 0) {
+    /* AC coefficients are zero for columns 4, 5, 6, and 7.
+     * Compute DC values for these columns.
+     */
+    int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);
+
+    /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */
+
+    /* Load quantization table. */
+    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
+    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
+    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
+    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
+    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
+    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+
+    int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
+    int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);
+
+    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);
+
+    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+    /* Odd part */
+    int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
+    z2 = vmul_s16(vget_low_s16(row5), quant_row5);
+    z3 = vmul_s16(vget_low_s16(row3), quant_row3);
+    int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);
+
+    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
+    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
+    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
+    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);
+
+    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
+    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
+    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
+    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
+                                     CONST_BITS - PASS1_BITS + 1), dcval);
+  } else {
+    /* Both halves contain non-zero AC coefficients; full IDCT required. */
+    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
+    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
+    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
+    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
+    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
+    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);
+
+    /* Even part */
+    int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
+    int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);
+
+    int16x8_t z2 = vmulq_s16(row2, quant_row2);
+    int16x8_t z3 = vmulq_s16(row6, quant_row6);
+
+    int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
+    int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);
+
+    int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
+    int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
+    int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);
+
+    /* Odd part */
+    int16x8_t z1 = vmulq_s16(row7, quant_row7);
+    z2 = vmulq_s16(row5, quant_row5);
+    z3 = vmulq_s16(row3, quant_row3);
+    int16x8_t z4 = vmulq_s16(row1, quant_row1);
+
+    tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
+    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
+    tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
+    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);
+
+    tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
+    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
+    tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
+    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);
+
+    /* Final output stage: descale and narrow to 16-bit. */
+    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
+                                     CONST_BITS - PASS1_BITS + 1),
+                        vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
+                                     CONST_BITS - PASS1_BITS + 1));
+  }
+
+  /* Transpose 8x4 block to perform IDCT on rows in second pass. */
+  int16x8x2_t row_01 = vtrnq_s16(row0, row1);
+  int16x8x2_t row_23 = vtrnq_s16(row2, row3);
+
+  int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
+                                    vreinterpretq_s32_s16(row_23.val[0]));
+  int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
+                                    vreinterpretq_s32_s16(row_23.val[1]));
+
+  int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
+  int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
+  int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
+  int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
+  int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
+  int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
+  int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));
+
+  /* Commence second pass of IDCT.  (Column 4 is not used.) */
+
+  /* Even part */
+  int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
+  int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);
+
+  int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
+  int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);
+
+  /* Odd part */
+  tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
+  tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
+  tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
+  tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);
+
+  tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
+  tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
+  tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
+  tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);
+
+  /* Final output stage: descale and clamp to range [0-255]. */
+  int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
+                                          vsubhn_s32(tmp12, tmp0));
+  int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
+                                          vsubhn_s32(tmp10, tmp2));
+  output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
+                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
+  /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
+   * An interleaving store completes the transpose.
+   */
+  uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
+                                    vqmovun_s16(output_cols_13));
+  uint16x4x2_t output_01_23 = { {
+    vreinterpret_u16_u8(output_0123.val[0]),
+    vreinterpret_u16_u8(output_0123.val[1])
+  } };
+
+  /* Store 4x4 block to memory, one 4-sample row per interleaving store. */
+  JSAMPROW outptr0 = output_buf[0] + output_col;
+  JSAMPROW outptr1 = output_buf[1] + output_col;
+  JSAMPROW outptr2 = output_buf[2] + output_col;
+  JSAMPROW outptr3 = output_buf[3] + output_col;
+  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
+  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
+  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
+  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
+}
diff --git a/external/jpeg/simd/arm/jquanti-neon.c b/external/jpeg/simd/arm/jquanti-neon.c
new file mode 100644
index 000000000000..a7eb6f1983b8
--- /dev/null
+++ b/external/jpeg/simd/arm/jquanti-neon.c
@@ -0,0 +1,190 @@
+/*
+ * jquanti-neon.c - sample data conversion and quantization (Arm Neon)
+ *
+ * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#define JPEG_INTERNALS
+#include "../../jinclude.h"
+#include "../../jpeglib.h"
+#include "../../jsimd.h"
+#include "../../jdct.h"
+#include "../../jsimddct.h"
+#include "../jsimd.h"
+
+#include <arm_neon.h>
+
+
+/* After downsampling, sample values are in the range [0, 255], but the
+ * Discrete Cosine Transform (DCT) operates on values centered around 0.
+ *
+ * This function reads 8 samples from each of the rows sample_data[0..7],
+ * starting at column start_col, subtracts CENTERJSAMPLE (128) so that they
+ * lie in the range [-128, 127], widens them from 8- to 16-bit, and stores
+ * row r at workspace + r * DCTSIZE.
+ *
+ * The equivalent scalar C function convsamp() can be found in jcdctmgr.c.
+ */
+
+void jsimd_convsamp_neon(JSAMPARRAY sample_data, JDIMENSION start_col,
+                         DCTELEM *workspace)
+{
+  uint8x8_t samp_row0 = vld1_u8(sample_data[0] + start_col);
+  uint8x8_t samp_row1 = vld1_u8(sample_data[1] + start_col);
+  uint8x8_t samp_row2 = vld1_u8(sample_data[2] + start_col);
+  uint8x8_t samp_row3 = vld1_u8(sample_data[3] + start_col);
+  uint8x8_t samp_row4 = vld1_u8(sample_data[4] + start_col);
+  uint8x8_t samp_row5 = vld1_u8(sample_data[5] + start_col);
+  uint8x8_t samp_row6 = vld1_u8(sample_data[6] + start_col);
+  uint8x8_t samp_row7 = vld1_u8(sample_data[7] + start_col);
+
+  int16x8_t row0 =  /* vsubl_u8 widens to u16; the s16 view restores sign */
+    vreinterpretq_s16_u16(vsubl_u8(samp_row0, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row1 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row1, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row2 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row2, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row3 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row3, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row4 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row4, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row5 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row5, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row6 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row6, vdup_n_u8(CENTERJSAMPLE)));
+  int16x8_t row7 =
+    vreinterpretq_s16_u16(vsubl_u8(samp_row7, vdup_n_u8(CENTERJSAMPLE)));
+
+  vst1q_s16(workspace + 0 * DCTSIZE, row0);
+  vst1q_s16(workspace + 1 * DCTSIZE, row1);
+  vst1q_s16(workspace + 2 * DCTSIZE, row2);
+  vst1q_s16(workspace + 3 * DCTSIZE, row3);
+  vst1q_s16(workspace + 4 * DCTSIZE, row4);
+  vst1q_s16(workspace + 5 * DCTSIZE, row5);
+  vst1q_s16(workspace + 6 * DCTSIZE, row6);
+  vst1q_s16(workspace + 7 * DCTSIZE, row7);
+}
+
+
+/* After the DCT, the resulting array of coefficient values needs to be divided
+ * by an array of quantization values.
+ *
+ * To avoid a slow division operation, the DCT coefficients are multiplied by
+ * the (scaled) reciprocals of the quantization values and then right-shifted.
+ * Input is read from workspace; the quantized output goes to coef_block.
+ * The equivalent scalar C function quantize() can be found in jcdctmgr.c.
+ */
+
+void jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
+                         DCTELEM *workspace)
+{
+  JCOEFPTR out_ptr = coef_block;
+  UDCTELEM *recip_ptr = (UDCTELEM *)divisors;  /* reciprocal table */
+  UDCTELEM *corr_ptr = (UDCTELEM *)divisors + DCTSIZE2;  /* correction table */
+  DCTELEM *shift_ptr = divisors + 3 * DCTSIZE2;  /* shift-count table */
+  int i;
+
+  for (i = 0; i < DCTSIZE; i += DCTSIZE / 2) {  /* 4 rows per iteration */
+    /* Load DCT coefficients. */
+    int16x8_t row0 = vld1q_s16(workspace + (i + 0) * DCTSIZE);
+    int16x8_t row1 = vld1q_s16(workspace + (i + 1) * DCTSIZE);
+    int16x8_t row2 = vld1q_s16(workspace + (i + 2) * DCTSIZE);
+    int16x8_t row3 = vld1q_s16(workspace + (i + 3) * DCTSIZE);
+    /* Load reciprocals, corrections and shift counts for these rows. */
+    uint16x8_t recip0 = vld1q_u16(recip_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t recip1 = vld1q_u16(recip_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t recip2 = vld1q_u16(recip_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t recip3 = vld1q_u16(recip_ptr + (i + 3) * DCTSIZE);
+    uint16x8_t corr0 = vld1q_u16(corr_ptr + (i + 0) * DCTSIZE);
+    uint16x8_t corr1 = vld1q_u16(corr_ptr + (i + 1) * DCTSIZE);
+    uint16x8_t corr2 = vld1q_u16(corr_ptr + (i + 2) * DCTSIZE);
+    uint16x8_t corr3 = vld1q_u16(corr_ptr + (i + 3) * DCTSIZE);
+    int16x8_t shift0 = vld1q_s16(shift_ptr + (i + 0) * DCTSIZE);
+    int16x8_t shift1 = vld1q_s16(shift_ptr + (i + 1) * DCTSIZE);
+    int16x8_t shift2 = vld1q_s16(shift_ptr + (i + 2) * DCTSIZE);
+    int16x8_t shift3 = vld1q_s16(shift_ptr + (i + 3) * DCTSIZE);
+
+    /* Extract sign from coefficients: all ones if negative, else zero. */
+    int16x8_t sign_row0 = vshrq_n_s16(row0, 15);
+    int16x8_t sign_row1 = vshrq_n_s16(row1, 15);
+    int16x8_t sign_row2 = vshrq_n_s16(row2, 15);
+    int16x8_t sign_row3 = vshrq_n_s16(row3, 15);
+    /* Get absolute value of DCT coefficients. */
+    uint16x8_t abs_row0 = vreinterpretq_u16_s16(vabsq_s16(row0));
+    uint16x8_t abs_row1 = vreinterpretq_u16_s16(vabsq_s16(row1));
+    uint16x8_t abs_row2 = vreinterpretq_u16_s16(vabsq_s16(row2));
+    uint16x8_t abs_row3 = vreinterpretq_u16_s16(vabsq_s16(row3));
+    /* Add correction. */
+    abs_row0 = vaddq_u16(abs_row0, corr0);
+    abs_row1 = vaddq_u16(abs_row1, corr1);
+    abs_row2 = vaddq_u16(abs_row2, corr2);
+    abs_row3 = vaddq_u16(abs_row3, corr3);
+
+    /* Multiply DCT coefficients by quantization reciprocals. */
+    int32x4_t row0_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row0),
+                                                       vget_low_u16(recip0)));
+    int32x4_t row0_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row0),
+                                                       vget_high_u16(recip0)));
+    int32x4_t row1_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row1),
+                                                       vget_low_u16(recip1)));
+    int32x4_t row1_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row1),
+                                                       vget_high_u16(recip1)));
+    int32x4_t row2_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row2),
+                                                       vget_low_u16(recip2)));
+    int32x4_t row2_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row2),
+                                                       vget_high_u16(recip2)));
+    int32x4_t row3_l = vreinterpretq_s32_u32(vmull_u16(vget_low_u16(abs_row3),
+                                                       vget_low_u16(recip3)));
+    int32x4_t row3_h = vreinterpretq_s32_u32(vmull_u16(vget_high_u16(abs_row3),
+                                                       vget_high_u16(recip3)));
+    /* Narrow back to 16-bit, keeping the upper half of each product. */
+    row0 = vcombine_s16(vshrn_n_s32(row0_l, 16), vshrn_n_s32(row0_h, 16));
+    row1 = vcombine_s16(vshrn_n_s32(row1_l, 16), vshrn_n_s32(row1_h, 16));
+    row2 = vcombine_s16(vshrn_n_s32(row2_l, 16), vshrn_n_s32(row2_h, 16));
+    row3 = vcombine_s16(vshrn_n_s32(row3_l, 16), vshrn_n_s32(row3_h, 16));
+
+    /* Since VSHR only supports an immediate as its second argument, negate the
+     * shift value and shift left.
+     */
+    row0 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row0),
+                                           vnegq_s16(shift0)));
+    row1 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row1),
+                                           vnegq_s16(shift1)));
+    row2 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row2),
+                                           vnegq_s16(shift2)));
+    row3 = vreinterpretq_s16_u16(vshlq_u16(vreinterpretq_u16_s16(row3),
+                                           vnegq_s16(shift3)));
+
+    /* Restore sign: (x ^ sign) - sign negates x where sign is all ones. */
+    row0 = veorq_s16(row0, sign_row0);
+    row0 = vsubq_s16(row0, sign_row0);
+    row1 = veorq_s16(row1, sign_row1);
+    row1 = vsubq_s16(row1, sign_row1);
+    row2 = veorq_s16(row2, sign_row2);
+    row2 = vsubq_s16(row2, sign_row2);
+    row3 = veorq_s16(row3, sign_row3);
+    row3 = vsubq_s16(row3, sign_row3);
+
+    /* Store quantized coefficients to memory. */
+    vst1q_s16(out_ptr + (i + 0) * DCTSIZE, row0);
+    vst1q_s16(out_ptr + (i + 1) * DCTSIZE, row1);
+    vst1q_s16(out_ptr + (i + 2) * DCTSIZE, row2);
+    vst1q_s16(out_ptr + (i + 3) * DCTSIZE, row3);
+  }
+}
diff --git a/external/jpeg/simd/arm/jsimd_neon.S b/external/jpeg/simd/arm/jsimd_neon.S
deleted file mode 100644
index 345e060c3f68..000000000000
--- a/external/jpeg/simd/arm/jsimd_neon.S
+++ /dev/null
@@ -1,2880 +0,0 @@
-/*
- * Armv7 Neon optimizations for libjpeg-turbo
- *
- * Copyright (C) 2009-2011, Nokia Corporation and/or its subsidiary(-ies).
- *                          All Rights Reserved.
- * Author:  Siarhei Siamashka <siarhei.siamashka@nokia.com>
- * Copyright (C) 2014, Siarhei Siamashka.  All Rights Reserved.
- * Copyright (C) 2014, Linaro Limited.  All Rights Reserved.
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2015-2016, 2018, Matthieu Darbois.  All Rights Reserved.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-#if defined(__linux__) && defined(__ELF__)
-.section .note.GNU-stack, "", %progbits  /* mark stack as non-executable */
-#endif
-
-.text
-#if !defined(__APPLE__)
-.fpu neon
-.arch armv7a
-.object_arch armv4
-#endif
-
-.arm
-.syntax unified
-
-#define RESPECT_STRICT_ALIGNMENT  1
-
-
-/*****************************************************************************/
-
-/* Supplementary macro for setting function attributes */
-.macro asm_function fname
-#ifdef __APPLE__
-    .private_extern _\fname
-    .globl _\fname
-_\fname:
-#else
-    .global \fname
-#ifdef __ELF__
-    .hidden \fname
-    .type \fname, %function
-#endif
-\fname:
-#endif
-.endm
-
-/* Transpose a block of 4x4 coefficients in four 64-bit registers */
-.macro transpose_4x4 x0, x1, x2, x3
-    vtrn.16         \x0, \x1
-    vtrn.16         \x2, \x3
-    vtrn.32         \x0, \x2
-    vtrn.32         \x1, \x3
-.endm
-
-
-#define CENTERJSAMPLE  128
-
-/*****************************************************************************/
-
-/*
- * Perform dequantization and inverse DCT on one block of coefficients.
- *
- * GLOBAL(void)
- * jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
- *                       JSAMPARRAY output_buf, JDIMENSION output_col)
- */
-
-#define FIX_0_298631336  (2446)
-#define FIX_0_390180644  (3196)
-#define FIX_0_541196100  (4433)
-#define FIX_0_765366865  (6270)
-#define FIX_0_899976223  (7373)
-#define FIX_1_175875602  (9633)
-#define FIX_1_501321110  (12299)
-#define FIX_1_847759065  (15137)
-#define FIX_1_961570560  (16069)
-#define FIX_2_053119869  (16819)
-#define FIX_2_562915447  (20995)
-#define FIX_3_072711026  (25172)
-
-#define FIX_1_175875602_MINUS_1_961570560  (FIX_1_175875602 - FIX_1_961570560)
-#define FIX_1_175875602_MINUS_0_390180644  (FIX_1_175875602 - FIX_0_390180644)
-#define FIX_0_541196100_MINUS_1_847759065  (FIX_0_541196100 - FIX_1_847759065)
-#define FIX_3_072711026_MINUS_2_562915447  (FIX_3_072711026 - FIX_2_562915447)
-#define FIX_0_298631336_MINUS_0_899976223  (FIX_0_298631336 - FIX_0_899976223)
-#define FIX_1_501321110_MINUS_0_899976223  (FIX_1_501321110 - FIX_0_899976223)
-#define FIX_2_053119869_MINUS_2_562915447  (FIX_2_053119869 - FIX_2_562915447)
-#define FIX_0_541196100_PLUS_0_765366865   (FIX_0_541196100 + FIX_0_765366865)
-
-/*
- * Reference SIMD-friendly 1-D ISLOW iDCT C implementation.
- * Uses some ideas from the comments in 'simd/jiss2int-64.asm'
- */
-#define REF_1D_IDCT(xrow0, xrow1, xrow2, xrow3, xrow4, xrow5, xrow6, xrow7) { \
-  DCTELEM row0, row1, row2, row3, row4, row5, row6, row7; \
-  JLONG   q1, q2, q3, q4, q5, q6, q7; \
-  JLONG   tmp11_plus_tmp2, tmp11_minus_tmp2; \
-  \
-  /* 1-D iDCT input data */ \
-  row0 = xrow0; \
-  row1 = xrow1; \
-  row2 = xrow2; \
-  row3 = xrow3; \
-  row4 = xrow4; \
-  row5 = xrow5; \
-  row6 = xrow6; \
-  row7 = xrow7; \
-  \
-  q5 = row7 + row3; \
-  q4 = row5 + row1; \
-  q6 = MULTIPLY(q5, FIX_1_175875602_MINUS_1_961570560) + \
-       MULTIPLY(q4, FIX_1_175875602); \
-  q7 = MULTIPLY(q5, FIX_1_175875602) + \
-       MULTIPLY(q4, FIX_1_175875602_MINUS_0_390180644); \
-  q2 = MULTIPLY(row2, FIX_0_541196100) + \
-       MULTIPLY(row6, FIX_0_541196100_MINUS_1_847759065); \
-  q4 = q6; \
-  q3 = ((JLONG)row0 - (JLONG)row4) << 13; \
-  q6 += MULTIPLY(row5, -FIX_2_562915447) + \
-        MULTIPLY(row3, FIX_3_072711026_MINUS_2_562915447); \
-  /* now we can use q1 (reloadable constants have been used up) */ \
-  q1 = q3 + q2; \
-  q4 += MULTIPLY(row7, FIX_0_298631336_MINUS_0_899976223) + \
-        MULTIPLY(row1, -FIX_0_899976223); \
-  q5 = q7; \
-  q1 = q1 + q6; \
-  q7 += MULTIPLY(row7, -FIX_0_899976223) + \
-        MULTIPLY(row1, FIX_1_501321110_MINUS_0_899976223); \
-  \
-  /* (tmp11 + tmp2) has been calculated (out_row1 before descale) */ \
-  tmp11_plus_tmp2 = q1; \
-  row1 = 0; \
-  \
-  q1 = q1 - q6; \
-  q5 += MULTIPLY(row5, FIX_2_053119869_MINUS_2_562915447) + \
-        MULTIPLY(row3, -FIX_2_562915447); \
-  q1 = q1 - q6; \
-  q6 = MULTIPLY(row2, FIX_0_541196100_PLUS_0_765366865) + \
-       MULTIPLY(row6, FIX_0_541196100); \
-  q3 = q3 - q2; \
-  \
-  /* (tmp11 - tmp2) has been calculated (out_row6 before descale) */ \
-  tmp11_minus_tmp2 = q1; \
-  \
-  q1 = ((JLONG)row0 + (JLONG)row4) << 13; \
-  q2 = q1 + q6; \
-  q1 = q1 - q6; \
-  \
-  /* pick up the results */ \
-  tmp0  = q4; \
-  tmp1  = q5; \
-  tmp2  = (tmp11_plus_tmp2 - tmp11_minus_tmp2) / 2; \
-  tmp3  = q7; \
-  tmp10 = q2; \
-  tmp11 = (tmp11_plus_tmp2 + tmp11_minus_tmp2) / 2; \
-  tmp12 = q3; \
-  tmp13 = q1; \
-}
-
-#define XFIX_0_899976223                    d0[0]
-#define XFIX_0_541196100                    d0[1]
-#define XFIX_2_562915447                    d0[2]
-#define XFIX_0_298631336_MINUS_0_899976223  d0[3]
-#define XFIX_1_501321110_MINUS_0_899976223  d1[0]
-#define XFIX_2_053119869_MINUS_2_562915447  d1[1]
-#define XFIX_0_541196100_PLUS_0_765366865   d1[2]
-#define XFIX_1_175875602                    d1[3]
-#define XFIX_1_175875602_MINUS_0_390180644  d2[0]
-#define XFIX_0_541196100_MINUS_1_847759065  d2[1]
-#define XFIX_3_072711026_MINUS_2_562915447  d2[2]
-#define XFIX_1_175875602_MINUS_1_961570560  d2[3]
-
-.balign 16
-jsimd_idct_islow_neon_consts:
-  .short FIX_0_899976223                    /* d0[0] */
-  .short FIX_0_541196100                    /* d0[1] */
-  .short FIX_2_562915447                    /* d0[2] */
-  .short FIX_0_298631336_MINUS_0_899976223  /* d0[3] */
-  .short FIX_1_501321110_MINUS_0_899976223  /* d1[0] */
-  .short FIX_2_053119869_MINUS_2_562915447  /* d1[1] */
-  .short FIX_0_541196100_PLUS_0_765366865   /* d1[2] */
-  .short FIX_1_175875602                    /* d1[3] */
-  /* reloadable constants */
-  .short FIX_1_175875602_MINUS_0_390180644  /* d2[0] */
-  .short FIX_0_541196100_MINUS_1_847759065  /* d2[1] */
-  .short FIX_3_072711026_MINUS_2_562915447  /* d2[2] */
-  .short FIX_1_175875602_MINUS_1_961570560  /* d2[3] */
-
-asm_function jsimd_idct_islow_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0
-    TMP2            .req r1
-    TMP3            .req r2
-    TMP4            .req ip
-
-    ROW0L           .req d16
-    ROW0R           .req d17
-    ROW1L           .req d18
-    ROW1R           .req d19
-    ROW2L           .req d20
-    ROW2R           .req d21
-    ROW3L           .req d22
-    ROW3R           .req d23
-    ROW4L           .req d24
-    ROW4R           .req d25
-    ROW5L           .req d26
-    ROW5R           .req d27
-    ROW6L           .req d28
-    ROW6R           .req d29
-    ROW7L           .req d30
-    ROW7R           .req d31
-
-    /* Load and dequantize coefficients into Neon registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( q8  )
-     *   1 | d18     | d19     ( q9  )
-     *   2 | d20     | d21     ( q10 )
-     *   3 | d22     | d23     ( q11 )
-     *   4 | d24     | d25     ( q12 )
-     *   5 | d26     | d27     ( q13 )
-     *   6 | d28     | d29     ( q14 )
-     *   7 | d30     | d31     ( q15 )
-     */
-    adr             ip, jsimd_idct_islow_neon_consts
-    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!
-    vmul.s16        q8, q8, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q9, q9, q1
-    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!
-    vmul.s16        q10, q10, q2
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!
-    vmul.s16        q11, q11, q3
-    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]
-    vmul.s16        q12, q12, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!
-    vmul.s16        q14, q14, q2
-    vmul.s16        q13, q13, q1
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]  /* load constants */
-    add             ip, ip, #16
-    vmul.s16        q15, q15, q3
-    vpush           {d8 - d15}                    /* save Neon registers */
-    /* 1-D IDCT, pass 1, left 4x8 half */
-    vadd.s16        d4, ROW7L, ROW3L
-    vadd.s16        d5, ROW5L, ROW1L
-    vmull.s16       q6, d4, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, d5, XFIX_1_175875602
-    vmull.s16       q7, d4, XFIX_1_175875602
-      /* Check for the zero coefficients in the right 4x8 half */
-      push            {r4, r5}
-    vmlal.s16       q7, d5, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3, ROW0L, ROW4L
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 1 * 8))]
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vmlal.s16       q2, ROW6L, XFIX_0_541196100_MINUS_1_847759065
-      orr             r0, r4, r5
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5L, XFIX_2_562915447
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 2 * 8))]
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3, q3, #13
-      orr             r0, r0, r4
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-      orr             r0, r0, r5
-    vadd.s32        q1, q3, q2
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 3 * 8))]
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-      orr             r0, r0, r4
-    vmlsl.s16       q7, ROW7L, XFIX_0_899976223
-      orr             r0, r0, r5
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1L, q1, #11
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 4 * 8))]
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5L, XFIX_2_053119869_MINUS_2_562915447
-      orr             r0, r0, r4
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-      orr             r0, r0, r5
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 5 * 8))]
-    vmlal.s16       q6, ROW6L, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-      orr             r0, r0, r4
-    vrshrn.s32      ROW6L, q1, #11
-      orr             r0, r0, r5
-    vadd.s32        q1, q3, q5
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 6 * 8))]
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0L, ROW4L
-      orr             r0, r0, r4
-    vrshrn.s32      ROW2L, q1, #11
-      orr             r0, r0, r5
-    vrshrn.s32      ROW5L, q3, #11
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 7 * 8))]
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7L, XFIX_0_298631336_MINUS_0_899976223
-      orr             r0, r0, r4
-    vadd.s32        q2, q5, q6
-      orrs            r0, r0, r5
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-      ldrd            r4, [COEF_BLOCK, #(-96 + 2 * (4 + 0 * 8))]
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-      orr             r0, r4, r5
-    vsub.s32        q3, q1, q4
-      pop             {r4, r5}
-    vrshrn.s32      ROW7L, q2, #11
-    vrshrn.s32      ROW3L, q5, #11
-    vrshrn.s32      ROW0L, q6, #11
-    vrshrn.s32      ROW4L, q3, #11
-
-      beq             3f  /* Go to do some special handling for the sparse
-                             right 4x8 half */
-
-    /* 1-D IDCT, pass 1, right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]  /* reload constants */
-    vadd.s16        d10, ROW7R, ROW3R
-    vadd.s16        d8, ROW5R, ROW1R
-      /* Transpose left 4x8 half */
-      vtrn.16         ROW6L, ROW7L
-    vmull.s16       q6, d10, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, d8, XFIX_1_175875602
-      vtrn.16         ROW2L, ROW3L
-    vmull.s16       q7, d10, XFIX_1_175875602
-    vmlal.s16       q7, d8, XFIX_1_175875602_MINUS_0_390180644
-      vtrn.16         ROW0L, ROW1L
-    vsubl.s16       q3, ROW0R, ROW4R
-    vmull.s16       q2, ROW2R, XFIX_0_541196100
-    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
-      vtrn.16         ROW4L, ROW5L
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
-    vmlal.s16       q6, ROW3R, XFIX_3_072711026_MINUS_2_562915447
-      vtrn.32         ROW1L, ROW3L
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW1R, XFIX_0_899976223
-      vtrn.32         ROW4L, ROW6L
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-      vtrn.32         ROW0L, ROW2L
-    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
-    vmlal.s16       q7, ROW1R, XFIX_1_501321110_MINUS_0_899976223
-    vrshrn.s32      ROW1R, q1, #11
-      vtrn.32         ROW5L, ROW7L
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5, ROW3R, XFIX_2_562915447
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2R, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6, ROW6R, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-    vrshrn.s32      ROW6R, q1, #11
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0R, ROW4R
-    vrshrn.s32      ROW2R, q1, #11
-    vrshrn.s32      ROW5R, q3, #11
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vrshrn.s32      ROW7R, q2, #11
-    vrshrn.s32      ROW3R, q5, #11
-    vrshrn.s32      ROW0R, q6, #11
-    vrshrn.s32      ROW4R, q3, #11
-    /* Transpose right 4x8 half */
-    vtrn.16         ROW6R, ROW7R
-    vtrn.16         ROW2R, ROW3R
-    vtrn.16         ROW0R, ROW1R
-    vtrn.16         ROW4R, ROW5R
-    vtrn.32         ROW1R, ROW3R
-    vtrn.32         ROW4R, ROW6R
-    vtrn.32         ROW0R, ROW2R
-    vtrn.32         ROW5R, ROW7R
-
-1:  /* 1-D IDCT, pass 2 (normal variant), left 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW1R, XFIX_1_175875602   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW1L, XFIX_1_175875602
-    vmlal.s16       q6, ROW3R, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
-    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW3R, XFIX_1_175875602   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW3L, XFIX_1_175875602
-    vmlal.s16       q7, ROW1R, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
-    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vsubl.s16       q3, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vmlal.s16       q2, ROW2R, XFIX_0_541196100_MINUS_1_847759065  /* ROW6L <-> ROW2R */
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW1R, XFIX_2_562915447   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-    vmlsl.s16       q7, ROW3R, XFIX_0_899976223   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vshrn.s32       ROW1L, q1, #16
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW1R, XFIX_2_053119869_MINUS_2_562915447  /* ROW5L <-> ROW1R */
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vmlal.s16       q6, ROW2R, XFIX_0_541196100   /* ROW6L <-> ROW2R */
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW0L, ROW0R              /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW2L, q1, #16
-    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW3R, XFIX_0_298631336_MINUS_0_899976223  /* ROW7L <-> ROW3R */
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5, #16
-    vshrn.s32       ROW0L, q6, #16
-    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
-    /* 1-D IDCT, pass 2, right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW5R, XFIX_1_175875602
-    vmlal.s16       q6, ROW5L, XFIX_1_175875602   /* ROW5L <-> ROW1R */
-    vmlal.s16       q6, ROW7R, XFIX_1_175875602_MINUS_1_961570560
-    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560  /* ROW7L <-> ROW3R */
-    vmull.s16       q7, ROW7R, XFIX_1_175875602
-    vmlal.s16       q7, ROW7L, XFIX_1_175875602   /* ROW7L <-> ROW3R */
-    vmlal.s16       q7, ROW5R, XFIX_1_175875602_MINUS_0_390180644
-    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644  /* ROW5L <-> ROW1R */
-    vsubl.s16       q3, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
-    vmull.s16       q2, ROW6L, XFIX_0_541196100   /* ROW6L <-> ROW2R */
-    vmlal.s16       q2, ROW6R, XFIX_0_541196100_MINUS_1_847759065
-    vmov            q4, q6
-    vmlsl.s16       q6, ROW5R, XFIX_2_562915447
-    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447  /* ROW7L <-> ROW3R */
-    vshl.s32        q3, q3, #13
-    vmlsl.s16       q4, ROW5L, XFIX_0_899976223   /* ROW5L <-> ROW1R */
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vadd.s32        q1, q1, q6
-    vmlsl.s16       q7, ROW7R, XFIX_0_899976223
-    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223  /* ROW5L <-> ROW1R */
-    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
-    vsub.s32        q1, q1, q6
-    vmlal.s16       q5, ROW5R, XFIX_2_053119869_MINUS_2_562915447
-    vmlsl.s16       q5, ROW7L, XFIX_2_562915447   /* ROW7L <-> ROW3R */
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865  /* ROW6L <-> ROW2R */
-    vmlal.s16       q6, ROW6R, XFIX_0_541196100
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW6R, q1, #16
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vaddl.s16       q5, ROW4L, ROW4R              /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3, #16
-    vshl.s32        q5, q5, #13
-    vmlal.s16       q4, ROW7R, XFIX_0_298631336_MINUS_0_899976223
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW7R, q2, #16
-    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3, #16
-
-2:  /* Descale to 8-bit and range limit */
-    vqrshrn.s16     d16, q8, #2
-    vqrshrn.s16     d17, q9, #2
-    vqrshrn.s16     d18, q10, #2
-    vqrshrn.s16     d19, q11, #2
-    vpop            {d8 - d15}                    /* restore Neon registers */
-    vqrshrn.s16     d20, q12, #2
-      /* Transpose the final 8-bit samples and do signed->unsigned conversion */
-      vtrn.16         q8, q9
-    vqrshrn.s16     d21, q13, #2
-    vqrshrn.s16     d22, q14, #2
-      vmov.u8         q0, #(CENTERJSAMPLE)
-    vqrshrn.s16     d23, q15, #2
-      vtrn.8          d16, d17
-      vtrn.8          d18, d19
-      vadd.u8         q8, q8, q0
-      vadd.u8         q9, q9, q0
-      vtrn.16         q10, q11
-        /* Store results to the output buffer */
-        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        vst1.8          {d16}, [TMP1]
-      vtrn.8          d20, d21
-        vst1.8          {d17}, [TMP2]
-        ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        vst1.8          {d18}, [TMP1]
-      vadd.u8         q10, q10, q0
-        vst1.8          {d19}, [TMP2]
-        ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-        add             TMP1, TMP1, OUTPUT_COL
-        add             TMP2, TMP2, OUTPUT_COL
-        add             TMP3, TMP3, OUTPUT_COL
-        add             TMP4, TMP4, OUTPUT_COL
-      vtrn.8          d22, d23
-        vst1.8          {d20}, [TMP1]
-      vadd.u8         q11, q11, q0
-        vst1.8          {d21}, [TMP2]
-        vst1.8          {d22}, [TMP3]
-        vst1.8          {d23}, [TMP4]
-    bx              lr
-
-3:  /* Left 4x8 half is done, right 4x8 half contains mostly zeros */
-
-    /* Transpose left 4x8 half */
-    vtrn.16         ROW6L, ROW7L
-    vtrn.16         ROW2L, ROW3L
-    vtrn.16         ROW0L, ROW1L
-    vtrn.16         ROW4L, ROW5L
-    vshl.s16        ROW0R, ROW0R, #2  /* PASS1_BITS */
-    vtrn.32         ROW1L, ROW3L
-    vtrn.32         ROW4L, ROW6L
-    vtrn.32         ROW0L, ROW2L
-    vtrn.32         ROW5L, ROW7L
-
-    cmp             r0, #0
-    beq             4f  /* Right 4x8 half has all zeros, go to 'sparse' second
-                           pass */
-
-    /* Only row 0 is non-zero for the right 4x8 half  */
-    vdup.s16        ROW1R, ROW0R[1]
-    vdup.s16        ROW2R, ROW0R[2]
-    vdup.s16        ROW3R, ROW0R[3]
-    vdup.s16        ROW4R, ROW0R[0]
-    vdup.s16        ROW5R, ROW0R[1]
-    vdup.s16        ROW6R, ROW0R[2]
-    vdup.s16        ROW7R, ROW0R[3]
-    vdup.s16        ROW0R, ROW0R[0]
-    b               1b  /* Go to 'normal' second pass */
-
-4:  /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), left 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW1L, XFIX_1_175875602
-    vmlal.s16       q6, ROW3L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW3L, XFIX_1_175875602
-    vmlal.s16       q7, ROW1L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2, ROW2L, XFIX_0_541196100
-    vshll.s16       q3, ROW0L, #13
-    vmov            q4, q6
-    vmlal.s16       q6, ROW3L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4, ROW1L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vmlal.s16       q7, ROW1L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1, q1, q6
-    vadd.s32        q6, q6, q6
-    vmlsl.s16       q5, ROW3L, XFIX_2_562915447
-    vshrn.s32       ROW1L, q1, #16
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW2L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW2R, q1, #16                /* ROW6L <-> ROW2R */
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vshll.s16       q5, ROW0L, #13
-    vshrn.s32       ROW2L, q1, #16
-    vshrn.s32       ROW1R, q3, #16                /* ROW5L <-> ROW1R */
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW3R, q2, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW3L, q5, #16
-    vshrn.s32       ROW0L, q6, #16
-    vshrn.s32       ROW0R, q3, #16                /* ROW4L <-> ROW0R */
-    /* 1-D IDCT, pass 2 (sparse variant with zero rows 4-7), right 4x8 half */
-    vld1.s16        {d2}, [ip, :64]               /* reload constants */
-    vmull.s16       q6, ROW5L, XFIX_1_175875602
-    vmlal.s16       q6, ROW7L, XFIX_1_175875602_MINUS_1_961570560
-    vmull.s16       q7, ROW7L, XFIX_1_175875602
-    vmlal.s16       q7, ROW5L, XFIX_1_175875602_MINUS_0_390180644
-    vmull.s16       q2, ROW6L, XFIX_0_541196100
-    vshll.s16       q3, ROW4L, #13
-    vmov            q4, q6
-    vmlal.s16       q6, ROW7L, XFIX_3_072711026_MINUS_2_562915447
-    vmlsl.s16       q4, ROW5L, XFIX_0_899976223
-    vadd.s32        q1, q3, q2
-    vmov            q5, q7
-    vmlal.s16       q7, ROW5L, XFIX_1_501321110_MINUS_0_899976223
-    vadd.s32        q1, q1, q6
-    vadd.s32        q6, q6, q6
-    vmlsl.s16       q5, ROW7L, XFIX_2_562915447
-    vshrn.s32       ROW5L, q1, #16                /* ROW5L <-> ROW1R */
-    vsub.s32        q1, q1, q6
-    vmull.s16       q6, ROW6L, XFIX_0_541196100_PLUS_0_765366865
-    vsub.s32        q3, q3, q2
-    vshrn.s32       ROW6R, q1, #16
-    vadd.s32        q1, q3, q5
-    vsub.s32        q3, q3, q5
-    vshll.s16       q5, ROW4L, #13
-    vshrn.s32       ROW6L, q1, #16                /* ROW6L <-> ROW2R */
-    vshrn.s32       ROW5R, q3, #16
-    vadd.s32        q2, q5, q6
-    vsub.s32        q1, q5, q6
-    vadd.s32        q6, q2, q7
-    vsub.s32        q2, q2, q7
-    vadd.s32        q5, q1, q4
-    vsub.s32        q3, q1, q4
-    vshrn.s32       ROW7R, q2, #16
-    vshrn.s32       ROW7L, q5, #16                /* ROW7L <-> ROW3R */
-    vshrn.s32       ROW4L, q6, #16                /* ROW4L <-> ROW0R */
-    vshrn.s32       ROW4R, q3, #16
-    b               2b                            /* Go to epilogue */
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-    .unreq          ROW0L
-    .unreq          ROW0R
-    .unreq          ROW1L
-    .unreq          ROW1R
-    .unreq          ROW2L
-    .unreq          ROW2R
-    .unreq          ROW3L
-    .unreq          ROW3R
-    .unreq          ROW4L
-    .unreq          ROW4R
-    .unreq          ROW5L
-    .unreq          ROW5R
-    .unreq          ROW6L
-    .unreq          ROW6R
-    .unreq          ROW7L
-    .unreq          ROW7R
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the inverse DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_ifast'
- * function from jidctfst.c
- *
- * Normally 1-D AAN DCT needs 5 multiplications and 29 additions.
- * But in Arm Neon case some extra additions are required because VQDMULH
- * instruction can't handle the constants larger than 1. So the expressions
- * like "x * 1.082392200" have to be converted to "x * 0.082392200 + x",
- * which introduces an extra addition. Overall, there are 6 extra additions
- * per 1-D IDCT pass, totalling to 5 VQDMULH and 35 VADD/VSUB instructions.
- */
-
-#define XFIX_1_082392200  d0[0]
-#define XFIX_1_414213562  d0[1]
-#define XFIX_1_847759065  d0[2]
-#define XFIX_2_613125930  d0[3]
-/* The XFIX_* names above select lanes of d0, which the function loads from the table below. */
-.balign 16
-jsimd_idct_ifast_neon_consts:
-  .short (277 * 128 - 256 * 128)  /* XFIX_1_082392200: (277/256 - 1) * 2^15, fractional part only */
-  .short (362 * 128 - 256 * 128)  /* XFIX_1_414213562: (362/256 - 1) * 2^15, fractional part only */
-  .short (473 * 128 - 256 * 128)  /* XFIX_1_847759065: (473/256 - 1) * 2^15, fractional part only */
-  .short (669 * 128 - 512 * 128)  /* XFIX_2_613125930: (669/256 - 2) * 2^15; the code adds x twice for the integer part */
-
-asm_function jsimd_idct_ifast_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0  /* same register as DCT_TABLE; first written in the store sequence */
-    TMP2            .req r1  /* same register as COEF_BLOCK */
-    TMP3            .req r2  /* same register as OUTPUT_BUF */
-    TMP4            .req ip  /* same register that holds the constants address */
-
-    /* Load and dequantize coefficients into Neon registers
-     * with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17     ( q8  )
-     *   1 | d18     | d19     ( q9  )
-     *   2 | d20     | d21     ( q10 )
-     *   3 | d22     | d23     ( q11 )
-     *   4 | d24     | d25     ( q12 )
-     *   5 | d26     | d27     ( q13 )
-     *   6 | d28     | d29     ( q14 )
-     *   7 | d30     | d31     ( q15 )
-     */
-    adr             ip, jsimd_idct_ifast_neon_consts
-    vld1.16         {d16, d17, d18, d19}, [COEF_BLOCK, :128]!  /* coefficient rows 0-1 */
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!  /* DCT_TABLE rows 0-1 */
-    vld1.16         {d20, d21, d22, d23}, [COEF_BLOCK, :128]!  /* coefficient rows 2-3 */
-    vmul.s16        q8, q8, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!  /* DCT_TABLE rows 2-3 */
-    vmul.s16        q9, q9, q1
-    vld1.16         {d24, d25, d26, d27}, [COEF_BLOCK, :128]!  /* coefficient rows 4-5 */
-    vmul.s16        q10, q10, q2
-    vld1.16         {d0, d1, d2, d3}, [DCT_TABLE, :128]!  /* DCT_TABLE rows 4-5 (q0/q1 reused) */
-    vmul.s16        q11, q11, q3
-    vld1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]  /* coefficient rows 6-7 (last load, no writeback) */
-    vmul.s16        q12, q12, q0
-    vld1.16         {d4, d5, d6, d7}, [DCT_TABLE, :128]!  /* DCT_TABLE rows 6-7 (q2/q3 reused) */
-    vmul.s16        q14, q14, q2
-    vmul.s16        q13, q13, q1
-    vld1.16         {d0}, [ip, :64]  /* load constants (the four XFIX_* lanes of d0) */
-    vmul.s16        q15, q15, q3
-    vpush           {d8 - d13}       /* save Neon registers (q4-q6 are used as scratch below) */
-    /* 1-D IDCT, pass 1 */
-    vsub.s16        q2, q10, q14  /* row 2 - row 6 */
-    vadd.s16        q14, q10, q14  /* row 2 + row 6 */
-    vsub.s16        q1, q11, q13  /* row 3 - row 5 */
-    vadd.s16        q13, q11, q13  /* row 3 + row 5 */
-    vsub.s16        q5, q9, q15  /* row 1 - row 7 */
-    vadd.s16        q15, q9, q15  /* row 1 + row 7 */
-    vqdmulh.s16     q4, q2, XFIX_1_414213562
-    vqdmulh.s16     q6, q1, XFIX_2_613125930
-    vadd.s16        q3, q1, q1  /* integer part (2 * q1) of the 2.613 product; fractional part q6 is added below */
-    vsub.s16        q1, q5, q1
-    vadd.s16        q10, q2, q4  /* q2 plus its fractional product, i.e. q2 * 1.414 */
-    vqdmulh.s16     q4, q1, XFIX_1_847759065
-    vsub.s16        q2, q15, q13
-    vadd.s16        q3, q3, q6
-    vqdmulh.s16     q6, q2, XFIX_1_414213562
-    vadd.s16        q1, q1, q4
-    vqdmulh.s16     q4, q5, XFIX_1_082392200
-    vsub.s16        q10, q10, q14
-    vadd.s16        q2, q2, q6
-    vsub.s16        q6, q8, q12
-    vadd.s16        q12, q8, q12
-    vadd.s16        q9, q5, q4
-    vadd.s16        q5, q6, q10
-    vsub.s16        q10, q6, q10
-    vadd.s16        q6, q15, q13
-    vadd.s16        q8, q12, q14
-    vsub.s16        q3, q6, q3
-    vsub.s16        q12, q12, q14
-    vsub.s16        q3, q3, q1
-    vsub.s16        q1, q9, q1
-    vadd.s16        q2, q3, q2
-    vsub.s16        q15, q8, q6
-    vadd.s16        q1, q1, q2
-    vadd.s16        q8, q8, q6
-    vadd.s16        q14, q5, q3
-    vsub.s16        q9, q5, q3
-    vsub.s16        q13, q10, q2
-    vadd.s16        q10, q10, q2
-      /* Transpose (the deeper-indented lines are interleaved with the IDCT arithmetic) */
-      vtrn.16         q8, q9
-    vsub.s16        q11, q12, q1
-      vtrn.16         q14, q15
-    vadd.s16        q12, q12, q1
-      vtrn.16         q10, q11
-      vtrn.16         q12, q13
-      vtrn.32         q9, q11
-      vtrn.32         q12, q14
-      vtrn.32         q8, q10
-      vtrn.32         q13, q15
-      vswp            d28, d21
-      vswp            d26, d19
-    /* 1-D IDCT, pass 2 (same arithmetic sequence as pass 1, applied to the transposed data) */
-    vsub.s16        q2, q10, q14
-      vswp            d30, d23
-    vadd.s16        q14, q10, q14
-      vswp            d24, d17
-    vsub.s16        q1, q11, q13
-    vadd.s16        q13, q11, q13
-    vsub.s16        q5, q9, q15
-    vadd.s16        q15, q9, q15
-    vqdmulh.s16     q4, q2, XFIX_1_414213562
-    vqdmulh.s16     q6, q1, XFIX_2_613125930
-    vadd.s16        q3, q1, q1
-    vsub.s16        q1, q5, q1
-    vadd.s16        q10, q2, q4
-    vqdmulh.s16     q4, q1, XFIX_1_847759065
-    vsub.s16        q2, q15, q13
-    vadd.s16        q3, q3, q6
-    vqdmulh.s16     q6, q2, XFIX_1_414213562
-    vadd.s16        q1, q1, q4
-    vqdmulh.s16     q4, q5, XFIX_1_082392200
-    vsub.s16        q10, q10, q14
-    vadd.s16        q2, q2, q6
-    vsub.s16        q6, q8, q12
-    vadd.s16        q12, q8, q12
-    vadd.s16        q9, q5, q4
-    vadd.s16        q5, q6, q10
-    vsub.s16        q10, q6, q10
-    vadd.s16        q6, q15, q13
-    vadd.s16        q8, q12, q14
-    vsub.s16        q3, q6, q3
-    vsub.s16        q12, q12, q14
-    vsub.s16        q3, q3, q1
-    vsub.s16        q1, q9, q1
-    vadd.s16        q2, q3, q2
-    vsub.s16        q15, q8, q6
-    vadd.s16        q1, q1, q2
-    vadd.s16        q8, q8, q6
-    vadd.s16        q14, q5, q3
-    vsub.s16        q9, q5, q3
-    vsub.s16        q13, q10, q2
-    vpop            {d8 - d13}    /* restore Neon registers (q4-q6 are not written after this point) */
-    vadd.s16        q10, q10, q2
-    vsub.s16        q11, q12, q1
-    vadd.s16        q12, q12, q1
-    /* Descale to 8-bit and range limit (saturating narrow to signed 8-bit, then add 0x80 per byte) */
-    vmov.u8         q0, #0x80
-    vqshrn.s16      d16, q8, #5
-    vqshrn.s16      d17, q9, #5
-    vqshrn.s16      d18, q10, #5
-    vqshrn.s16      d19, q11, #5
-    vqshrn.s16      d20, q12, #5
-    vqshrn.s16      d21, q13, #5
-    vqshrn.s16      d22, q14, #5
-    vqshrn.s16      d23, q15, #5
-    vadd.u8         q8, q8, q0
-    vadd.u8         q9, q9, q0
-    vadd.u8         q10, q10, q0
-    vadd.u8         q11, q11, q0
-    /* Transpose the final 8-bit samples */
-    vtrn.16         q8, q9
-    vtrn.16         q10, q11
-    vtrn.32         q8, q10
-    vtrn.32         q9, q11
-    vtrn.8          d16, d17
-    vtrn.8          d18, d19
-      /* Store results to the output buffer (one 8-byte d register per pointer read from OUTPUT_BUF, offset by OUTPUT_COL) */
-      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      vst1.8          {d16}, [TMP1]
-      vst1.8          {d17}, [TMP2]
-      ldmia           OUTPUT_BUF!, {TMP1, TMP2}
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      vst1.8          {d18}, [TMP1]
-    vtrn.8          d20, d21
-      vst1.8          {d19}, [TMP2]
-      ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}  /* no writeback: the list overwrites r2 (OUTPUT_BUF/TMP3) */
-      add             TMP1, TMP1, OUTPUT_COL
-      add             TMP2, TMP2, OUTPUT_COL
-      add             TMP3, TMP3, OUTPUT_COL
-      add             TMP4, TMP4, OUTPUT_COL
-      vst1.8          {d20}, [TMP1]
-    vtrn.8          d22, d23
-      vst1.8          {d21}, [TMP2]
-      vst1.8          {d22}, [TMP3]
-      vst1.8          {d23}, [TMP4]
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_4x4_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 4x4 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_4x4'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 4x4 inverse-DCT, which
- *       requires far fewer arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon-optimized function is
- *       bit-exact compatibility with jpeg-6b.
- *
- * TODO: slightly better instruction scheduling can be achieved by expanding
- *       the idct_helper/transpose_4x4 macros and reordering instructions,
- *       but readability will suffer somewhat.
- */
-
-#define CONST_BITS  13
-/* Each FIX_* value below equals its real constant multiplied by 2^CONST_BITS and rounded. */
-#define FIX_0_211164243  (1730)   /* FIX(0.211164243) */
-#define FIX_0_509795579  (4176)   /* FIX(0.509795579) */
-#define FIX_0_601344887  (4926)   /* FIX(0.601344887) */
-#define FIX_0_720959822  (5906)   /* FIX(0.720959822) */
-#define FIX_0_765366865  (6270)   /* FIX(0.765366865) */
-#define FIX_0_850430095  (6967)   /* FIX(0.850430095) */
-#define FIX_0_899976223  (7373)   /* FIX(0.899976223) */
-#define FIX_1_061594337  (8697)   /* FIX(1.061594337) */
-#define FIX_1_272758580  (10426)  /* FIX(1.272758580) */
-#define FIX_1_451774981  (11893)  /* FIX(1.451774981) */
-#define FIX_1_847759065  (15137)  /* FIX(1.847759065) */
-#define FIX_2_172734803  (17799)  /* FIX(2.172734803) */
-#define FIX_2_562915447  (20995)  /* FIX(2.562915447) */
-#define FIX_3_624509785  (29692)  /* FIX(3.624509785) */
-/* The lane named in each comment below is where the value lands once the table is loaded into d0-d2. */
-.balign 16
-jsimd_idct_4x4_neon_consts:
-  .short FIX_1_847759065        /* d0[0] */
-  .short -FIX_0_765366865       /* d0[1] */
-  .short -FIX_0_211164243       /* d0[2] */
-  .short FIX_1_451774981        /* d0[3] */
-  .short -FIX_2_172734803       /* d1[0] */
-  .short FIX_1_061594337        /* d1[1] */
-  .short -FIX_0_509795579       /* d1[2] */
-  .short -FIX_0_601344887       /* d1[3] */
-  .short FIX_0_899976223        /* d2[0] */
-  .short FIX_2_562915447        /* d2[1] */
-  .short 1 << (CONST_BITS + 1)  /* d2[2]: 2^14, the factor idct_helper applies to its x4 input */
-  .short 0                      /* d2[3]: not referenced by idct_helper */
-
-.macro idct_helper x4, x6, x8, x10, x12, x14, x16, shift, y26, y27, y28, y29
-    vmull.s16       q14, \x4, d2[2]  /* q14 = x4*d2[2] + x8*d0[0] + x14*d0[1] */
-    vmlal.s16       q14, \x8, d0[0]
-    vmlal.s16       q14, \x14, d0[1]
-
-    vmull.s16       q13, \x16, d1[2]  /* q13 = x16*d1[2] + x12*d1[3] + x10*d2[0] + x6*d2[1] */
-    vmlal.s16       q13, \x12, d1[3]
-    vmlal.s16       q13, \x10, d2[0]
-    vmlal.s16       q13, \x6, d2[1]
-
-    vmull.s16       q15, \x4, d2[2]  /* q15 = x4*d2[2] - x8*d0[0] - x14*d0[1] */
-    vmlsl.s16       q15, \x8, d0[0]
-    vmlsl.s16       q15, \x14, d0[1]
-
-    vmull.s16       q12, \x16, d0[2]  /* q12 = x16*d0[2] + x12*d0[3] + x10*d1[0] + x6*d1[1] */
-    vmlal.s16       q12, \x12, d0[3]
-    vmlal.s16       q12, \x10, d1[0]
-    vmlal.s16       q12, \x6, d1[1]
-
-    vadd.s32        q10, q14, q13  /* y26 source: q14 + q13 */
-    vsub.s32        q14, q14, q13  /* y29 source: q14 - q13 */
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift  /* shift > 16: rounding shift in 32 bits, then narrow (vrshrn.s32 accepts at most #16) */
-    vrshr.s32       q14, q14, #\shift
-    vmovn.s32       \y26, q10
-    vmovn.s32       \y29, q14
-  .else
-    vrshrn.s32      \y26, q10, #\shift  /* shift <= 16: single rounding narrowing shift */
-    vrshrn.s32      \y29, q14, #\shift
-  .endif
-
-    vadd.s32        q10, q15, q12  /* y27 source: q15 + q12 */
-    vsub.s32        q15, q15, q12  /* y28 source: q15 - q12 */
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift
-    vrshr.s32       q15, q15, #\shift
-    vmovn.s32       \y27, q10
-    vmovn.s32       \y28, q15
-  .else
-    vrshrn.s32      \y27, q10, #\shift
-    vrshrn.s32      \y28, q15, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_4x4_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0  /* same register as DCT_TABLE; first written in the store sequence */
-    TMP2            .req r1  /* same register as COEF_BLOCK */
-    TMP3            .req r2  /* same register as OUTPUT_BUF */
-    TMP4            .req ip  /* also used first as the constants pointer */
-
-    vpush           {d8 - d15}  /* d8-d15 are loaded with coefficients below; restored before return */
-
-    /* Load constants (d3 is just used for padding: the table defines 12 halfwords, so d3 receives the 8 bytes that follow it) */
-    adr             TMP4, jsimd_idct_4x4_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [TMP4, :128]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d4      | d5
-     *   1 | d6      | d7
-     *   2 | d8      | d9
-     *   3 | d10     | d11
-     *   4 | -       | -
-     *   5 | d12     | d13
-     *   6 | d14     | d15
-     *   7 | d16     | d17
-     */
-    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
-    vld1.16         {d8, d9, d10, d11}, [COEF_BLOCK, :128]!
-    add COEF_BLOCK, COEF_BLOCK, #16  /* skip row 4 (8 halfwords = 16 bytes) */
-    vld1.16         {d12, d13, d14, d15}, [COEF_BLOCK, :128]!
-    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
-    /* dequantize (multiply each loaded row by the matching DCT_TABLE row) */
-    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
-    vmul.s16        q2, q2, q9
-    vld1.16         {d22, d23, d24, d25}, [DCT_TABLE, :128]!
-    vmul.s16        q3, q3, q10
-    vmul.s16        q4, q4, q11
-    add             DCT_TABLE, DCT_TABLE, #16  /* skip DCT_TABLE row 4 as well */
-    vld1.16         {d26, d27, d28, d29}, [DCT_TABLE, :128]!
-    vmul.s16        q5, q5, q12
-    vmul.s16        q6, q6, q13
-    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
-    vmul.s16        q7, q7, q14
-    vmul.s16        q8, q8, q15
-
-    /* Pass 1 (shift 12): columns 0-3 first, then columns 4-7, results written back in place */
-    idct_helper     d4, d6, d8, d10, d12, d14, d16, 12, d4, d6, d8, d10
-    transpose_4x4   d4, d6, d8, d10
-    idct_helper     d5, d7, d9, d11, d13, d15, d17, 12, d5, d7, d9, d11
-    transpose_4x4   d5, d7, d9, d11
-
-    /* Pass 2 (shift 19): outputs go to d26-d29; d5 is not an input here */
-    idct_helper     d4, d6, d8, d10, d7, d9, d11, 19, d26, d27, d28, d29
-    transpose_4x4   d26, d27, d28, d29
-
-    /* Range limit (add 0x80 to each 16-bit lane, then saturate to unsigned 8-bit) */
-    vmov.u16        q15, #0x80
-    vadd.s16        q13, q13, q15
-    vadd.s16        q14, q14, q15
-    vqmovun.s16     d26, q13
-    vqmovun.s16     d27, q14
-
-    /* Store results to the output buffer (four pointers read from OUTPUT_BUF, each offset by OUTPUT_COL) */
-    ldmia           OUTPUT_BUF, {TMP1, TMP2, TMP3, TMP4}
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-    add             TMP3, TMP3, OUTPUT_COL
-    add             TMP4, TMP4, OUTPUT_COL
-
-#if defined(__ARMEL__) && !RESPECT_STRICT_ALIGNMENT
-    /* We can use much less instructions on little endian systems if the
-     * OS kernel is not configured to trap unaligned memory accesses
-     */
-    vst1.32         {d26[0]}, [TMP1]!
-    vst1.32         {d27[0]}, [TMP3]!
-    vst1.32         {d26[1]}, [TMP2]!
-    vst1.32         {d27[1]}, [TMP4]!
-#else
-    vst1.8          {d26[0]}, [TMP1]!  /* byte-wise fallback: bytes 0-3 of d26/d27 go to TMP1/TMP3 */
-    vst1.8          {d27[0]}, [TMP3]!
-    vst1.8          {d26[1]}, [TMP1]!
-    vst1.8          {d27[1]}, [TMP3]!
-    vst1.8          {d26[2]}, [TMP1]!
-    vst1.8          {d27[2]}, [TMP3]!
-    vst1.8          {d26[3]}, [TMP1]!
-    vst1.8          {d27[3]}, [TMP3]!
-
-    vst1.8          {d26[4]}, [TMP2]!  /* bytes 4-7 of d26/d27 go to TMP2/TMP4 */
-    vst1.8          {d27[4]}, [TMP4]!
-    vst1.8          {d26[5]}, [TMP2]!
-    vst1.8          {d27[5]}, [TMP4]!
-    vst1.8          {d26[6]}, [TMP2]!
-    vst1.8          {d27[6]}, [TMP4]!
-    vst1.8          {d26[7]}, [TMP2]!
-    vst1.8          {d27[7]}, [TMP4]!
-#endif
-
-    vpop            {d8 - d15}
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-/* Drop this idct_helper definition so the name can be defined again later in the file. */
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_idct_2x2_neon
- *
- * This function contains inverse-DCT code for getting reduced-size
- * 2x2 pixel output from an 8x8 DCT block. It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_idct_2x2'
- * function from jpeg-6b (jidctred.c).
- *
- * NOTE: jpeg-8 has an improved implementation of 2x2 inverse-DCT, which
- *       requires far fewer arithmetic operations and hence should be faster.
- *       The primary purpose of this particular Neon-optimized function is
- *       bit-exact compatibility with jpeg-6b.
- */
-
-.balign 8
-jsimd_idct_2x2_neon_consts:
-  .short -FIX_0_720959822  /* d0[0]: multiplies the x16 input of idct_helper */
-  .short FIX_0_850430095   /* d0[1]: multiplies the x12 input of idct_helper */
-  .short -FIX_1_272758580  /* d0[2]: multiplies the x10 input of idct_helper */
-  .short FIX_3_624509785   /* d0[3]: multiplies the x6 input of idct_helper */
-
-.macro idct_helper x4, x6, x10, x12, x16, shift, y26, y27
-    vshll.s16       q14, \x4, #15  /* q14 = x4 << 15, widened to 32 bits */
-    vmull.s16       q13, \x6, d0[3]  /* q13 = x6*d0[3] + x10*d0[2] + x12*d0[1] + x16*d0[0] */
-    vmlal.s16       q13, \x10, d0[2]
-    vmlal.s16       q13, \x12, d0[1]
-    vmlal.s16       q13, \x16, d0[0]
-
-    vadd.s32        q10, q14, q13  /* y26 source: q14 + q13 */
-    vsub.s32        q14, q14, q13  /* y27 source: q14 - q13 */
-
-  .if \shift > 16
-    vrshr.s32       q10, q10, #\shift  /* shift > 16: rounding shift in 32 bits, then narrow (vrshrn.s32 accepts at most #16) */
-    vrshr.s32       q14, q14, #\shift
-    vmovn.s32       \y26, q10
-    vmovn.s32       \y27, q14
-  .else
-    vrshrn.s32      \y26, q10, #\shift  /* shift <= 16: single rounding narrowing shift */
-    vrshrn.s32      \y27, q14, #\shift
-  .endif
-.endm
-
-asm_function jsimd_idct_2x2_neon
-
-    DCT_TABLE       .req r0
-    COEF_BLOCK      .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_COL      .req r3
-    TMP1            .req r0  /* same register as DCT_TABLE; first written in the store sequence */
-    TMP2            .req ip  /* also used first as the constants pointer */
-
-    vpush           {d8 - d15}
-
-    /* Load constants (four halfwords into d0) */
-    adr             TMP2, jsimd_idct_2x2_neon_consts
-    vld1.16         {d0}, [TMP2, :64]
-
-    /* Load all COEF_BLOCK into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d4      | d5
-     *   1 | d6      | d7
-     *   2 | -       | -
-     *   3 | d10     | d11
-     *   4 | -       | -
-     *   5 | d12     | d13
-     *   6 | -       | -
-     *   7 | d16     | d17
-     */
-    vld1.16         {d4, d5, d6, d7}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16  /* skip row 2 (8 halfwords = 16 bytes) */
-    vld1.16         {d10, d11}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16  /* skip row 4 */
-    vld1.16         {d12, d13}, [COEF_BLOCK, :128]!
-    add             COEF_BLOCK, COEF_BLOCK, #16  /* skip row 6 */
-    vld1.16         {d16, d17}, [COEF_BLOCK, :128]!
-    /* Dequantize (same row-skipping pattern applied to DCT_TABLE) */
-    vld1.16         {d18, d19, d20, d21}, [DCT_TABLE, :128]!
-    vmul.s16        q2, q2, q9
-    vmul.s16        q3, q3, q10
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d24, d25}, [DCT_TABLE, :128]!
-    vmul.s16        q5, q5, q12
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d26, d27}, [DCT_TABLE, :128]!
-    vmul.s16        q6, q6, q13
-    add             DCT_TABLE, DCT_TABLE, #16
-    vld1.16         {d30, d31}, [DCT_TABLE, :128]!
-    vmul.s16        q8, q8, q15
-
-    /* Pass 1 (the #if 0 branch is compiled out; presumably the macro-based form of the #else code) */
-#if 0
-    idct_helper     d4, d6, d10, d12, d16, 13, d4, d6
-    transpose_4x4   d4, d6, d8, d10
-    idct_helper     d5, d7, d11, d13, d17, 13, d5, d7
-    transpose_4x4   d5, d7, d9, d11
-#else
-    vmull.s16       q13, d6, d0[3]  /* q13: weighted sum of rows 1, 3, 5, 7 for columns 0-3 */
-    vmlal.s16       q13, d10, d0[2]
-    vmlal.s16       q13, d12, d0[1]
-    vmlal.s16       q13, d16, d0[0]
-    vmull.s16       q12, d7, d0[3]  /* q12: the same sum for columns 4-7 */
-    vmlal.s16       q12, d11, d0[2]
-    vmlal.s16       q12, d13, d0[1]
-    vmlal.s16       q12, d17, d0[0]
-    vshll.s16       q14, d4, #15  /* row 0 << 15, columns 0-3 */
-    vshll.s16       q15, d5, #15  /* row 0 << 15, columns 4-7 */
-    vadd.s32        q10, q14, q13
-    vsub.s32        q14, q14, q13
-    vrshrn.s32      d4, q10, #13
-    vrshrn.s32      d6, q14, #13
-    vadd.s32        q10, q15, q12
-    vsub.s32        q14, q15, q12
-    vrshrn.s32      d5, q10, #13
-    vrshrn.s32      d7, q14, #13
-    vtrn.16         q2, q3
-    vtrn.32         q3, q5
-#endif
-
-    /* Pass 2 (shift 20, so the macro takes its vrshr + vmovn path) */
-    idct_helper     d4, d6, d10, d7, d11, 20, d26, d27
-
-    /* Range limit (add 0x80 to each 16-bit lane of q13 = d26:d27, then saturate to unsigned 8-bit) */
-    vmov.u16        q15, #0x80
-    vadd.s16        q13, q13, q15
-    vqmovun.s16     d26, q13
-    vqmovun.s16     d27, q13  /* re-reads q13 after d26 was overwritten; only d27[4]/d27[5], which come from the untouched d27 half, are stored */
-
-    /* Store results to the output buffer (two pointers read from OUTPUT_BUF, two bytes written through each) */
-    ldmia           OUTPUT_BUF, {TMP1, TMP2}
-    add             TMP1, TMP1, OUTPUT_COL
-    add             TMP2, TMP2, OUTPUT_COL
-
-    vst1.8          {d26[0]}, [TMP1]!
-    vst1.8          {d27[4]}, [TMP1]!
-    vst1.8          {d26[1]}, [TMP2]!
-    vst1.8          {d27[5]}, [TMP2]!
-
-    vpop            {d8 - d15}
-    bx              lr
-
-    .unreq          DCT_TABLE
-    .unreq          COEF_BLOCK
-    .unreq          OUTPUT_BUF
-    .unreq          OUTPUT_COL
-    .unreq          TMP1
-    .unreq          TMP2
-/* Drop this idct_helper definition so the name is free for later code. */
-.purgem idct_helper
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_ycc_extrgb_convert_neon
- * jsimd_ycc_extbgr_convert_neon
- * jsimd_ycc_extrgbx_convert_neon
- * jsimd_ycc_extbgrx_convert_neon
- * jsimd_ycc_extxbgr_convert_neon
- * jsimd_ycc_extxrgb_convert_neon
- *
- * Colorspace conversion YCbCr -> RGB
- */
-
-
-.macro do_load size
-  .if \size == 8
-    vld1.8          {d4}, [U, :64]!  /* full group: 8 bytes each of U -> d4, V -> d5, Y -> d0, then prefetch */
-    vld1.8          {d5}, [V, :64]!
-    vld1.8          {d0}, [Y, :64]!
-    pld             [U, #64]
-    pld             [V, #64]
-    pld             [Y, #64]
-  .elseif \size == 4
-    vld1.8          {d4[0]}, [U]!  /* 4 bytes: byte lanes 0-3, loaded one byte at a time */
-    vld1.8          {d4[1]}, [U]!
-    vld1.8          {d4[2]}, [U]!
-    vld1.8          {d4[3]}, [U]!
-    vld1.8          {d5[0]}, [V]!
-    vld1.8          {d5[1]}, [V]!
-    vld1.8          {d5[2]}, [V]!
-    vld1.8          {d5[3]}, [V]!
-    vld1.8          {d0[0]}, [Y]!
-    vld1.8          {d0[1]}, [Y]!
-    vld1.8          {d0[2]}, [Y]!
-    vld1.8          {d0[3]}, [Y]!
-  .elseif \size == 2
-    vld1.8          {d4[4]}, [U]!  /* 2 bytes: lanes 4-5, so they do not overwrite a preceding size-4 load */
-    vld1.8          {d4[5]}, [U]!
-    vld1.8          {d5[4]}, [V]!
-    vld1.8          {d5[5]}, [V]!
-    vld1.8          {d0[4]}, [Y]!
-    vld1.8          {d0[5]}, [Y]!
-  .elseif \size == 1
-    vld1.8          {d4[6]}, [U]!  /* 1 byte: lane 6 */
-    vld1.8          {d5[6]}, [V]!
-    vld1.8          {d0[6]}, [Y]!
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-.macro do_store bpp, size
-  .if \bpp == 24
-    .if \size == 8
-      vst3.8        {d10, d11, d12}, [RGB]!  /* 24 bpp: 3-way interleaved store of d10, d11, d12 */
-    .elseif \size == 4
-      vst3.8        {d10[0], d11[0], d12[0]}, [RGB]!  /* lanes 0-3, one pixel per store */
-      vst3.8        {d10[1], d11[1], d12[1]}, [RGB]!
-      vst3.8        {d10[2], d11[2], d12[2]}, [RGB]!
-      vst3.8        {d10[3], d11[3], d12[3]}, [RGB]!
-    .elseif \size == 2
-      vst3.8        {d10[4], d11[4], d12[4]}, [RGB]!  /* lanes 4-5 */
-      vst3.8        {d10[5], d11[5], d12[5]}, [RGB]!
-    .elseif \size == 1
-      vst3.8        {d10[6], d11[6], d12[6]}, [RGB]!  /* lane 6 */
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 32
-    .if \size == 8
-      vst4.8        {d10, d11, d12, d13}, [RGB]!  /* 32 bpp: 4-way interleaved store of d10-d13 */
-    .elseif \size == 4
-      vst4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-      vst4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-      vst4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-      vst4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-    .elseif \size == 2
-      vst4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-      vst4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-    .elseif \size == 1
-      vst4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 16
-    .if \size == 8
-      vst1.16       {q15}, [RGB]!  /* 16 bpp: pixels are already packed as halfwords in q15 (d30:d31) */
-    .elseif \size == 4
-      vst1.16       {d30}, [RGB]!  /* halfword lanes 0-3 of q15 */
-    .elseif \size == 2
-      vst1.16       {d31[0]}, [RGB]!  /* halfword lanes 4-5 of q15 */
-      vst1.16       {d31[1]}, [RGB]!
-    .elseif \size == 1
-      vst1.16       {d31[2]}, [RGB]!  /* halfword lane 6 of q15 */
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-.macro generate_jsimd_ycc_rgb_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined YCbCr->RGB conversion
- */
-
-.macro do_yuv_to_rgb_stage1
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 (q1 holds -128 in every 16-bit lane) */
-    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
-    vmull.s16       q10, d6, d1[1]  /* q10 = (u - 128) * -11277, lanes 0-3 */
-    vmlal.s16       q10, d8, d1[2]  /* q10 += (v - 128) * -23401 */
-    vmull.s16       q11, d7, d1[1]  /* q11 = (u - 128) * -11277, lanes 4-7 */
-    vmlal.s16       q11, d9, d1[2]  /* q11 += (v - 128) * -23401 */
-    vmull.s16       q12, d8, d1[0]  /* q12 = (v - 128) * 22971, lanes 0-3 */
-    vmull.s16       q13, d9, d1[0]  /* q13 = (v - 128) * 22971, lanes 4-7 */
-    vmull.s16       q14, d6, d1[3]  /* q14 = (u - 128) * 29033, lanes 0-3 */
-    vmull.s16       q15, d7, d1[3]  /* q15 = (u - 128) * 29033, lanes 4-7 */
-.endm
-
-.macro do_yuv_to_rgb_stage2
-    vrshrn.s32      d20, q10, #15  /* q10 = rounded (q10:q11 >> 15): the term stored via g_offs */
-    vrshrn.s32      d21, q11, #15
-    vrshrn.s32      d24, q12, #14  /* q12 = rounded (q12:q13 >> 14): the term stored via r_offs */
-    vrshrn.s32      d25, q13, #14
-    vrshrn.s32      d28, q14, #14  /* q14 = rounded (q14:q15 >> 14): the term stored via b_offs */
-    vrshrn.s32      d29, q15, #14
-    vaddw.u8        q11, q10, d0  /* add the 8 Y bytes held in d0 to each term */
-    vaddw.u8        q12, q12, d0
-    vaddw.u8        q14, q14, d0
-  .if \bpp != 16
-    vqmovun.s16     d1\g_offs, q11  /* saturate to unsigned 8-bit; the offset digit is pasted onto "d1" to name the target register */
-    vqmovun.s16     d1\r_offs, q12
-    vqmovun.s16     d1\b_offs, q14
-  .else  /* rgb565 */
-    vqshlu.s16      q13, q11, #8  /* saturate each component to unsigned and move it to the top byte of its halfword */
-    vqshlu.s16      q15, q12, #8
-    vqshlu.s16      q14, q14, #8
-    vsri.u16        q15, q13, #5  /* insert q13 below the top 5 bits of q15 */
-    vsri.u16        q15, q14, #11  /* insert q14 into the low 5 bits of q15 */
-  .endif
-.endm
-
-.macro do_yuv_to_rgb_stage2_store_load_stage1
-                                       /* "do_yuv_to_rgb_stage2" and "store" (right-hand column: finishes the current 8 pixels) */
-                                       vrshrn.s32      d20, q10, #15
-    /* "load" and "do_yuv_to_rgb_stage1" (left-hand column: starts the next 8 pixels) */
-    pld             [U, #64]
-                                       vrshrn.s32      d21, q11, #15
-    pld             [V, #64]
-                                       vrshrn.s32      d24, q12, #14
-                                       vrshrn.s32      d25, q13, #14
-    vld1.8          {d4}, [U, :64]!
-                                       vrshrn.s32      d28, q14, #14
-    vld1.8          {d5}, [V, :64]!
-                                       vrshrn.s32      d29, q15, #14
-    vaddw.u8        q3, q1, d4      /* q3 = u - 128 */
-    vaddw.u8        q4, q1, d5      /* q4 = v - 128 */
-                                       vaddw.u8        q11, q10, d0
-    vmull.s16       q10, d6, d1[1]  /* multiply by -11277 (q10 may be reused: it was consumed by the vaddw just above) */
-    vmlal.s16       q10, d8, d1[2]  /* multiply by -23401 */
-                                       vaddw.u8        q12, q12, d0
-                                       vaddw.u8        q14, q14, d0
-  .if \bpp != 16  /**************** rgb24/rgb32 ******************************/
-                                       vqmovun.s16     d1\g_offs, q11
-    pld             [Y, #64]
-                                       vqmovun.s16     d1\r_offs, q12
-    vld1.8          {d0}, [Y, :64]!  /* reload Y only after the three vaddw instructions above have read d0 */
-                                       vqmovun.s16     d1\b_offs, q14
-    vmull.s16       q11, d7, d1[1]  /* multiply by -11277 */
-    vmlal.s16       q11, d9, d1[2]  /* multiply by -23401 */
-                                       do_store        \bpp, 8
-    vmull.s16       q12, d8, d1[0]  /* multiply by 22971 */
-    vmull.s16       q13, d9, d1[0]  /* multiply by 22971 */
-    vmull.s16       q14, d6, d1[3]  /* multiply by 29033 */
-    vmull.s16       q15, d7, d1[3]  /* multiply by 29033 */
-  .else  /**************************** rgb565 ********************************/
-                                       vqshlu.s16      q13, q11, #8
-    pld             [Y, #64]
-                                       vqshlu.s16      q15, q12, #8
-                                       vqshlu.s16      q14, q14, #8
-    vld1.8          {d0}, [Y, :64]!  /* reload Y only after the three vaddw instructions above have read d0 */
-    vmull.s16       q11, d7, d1[1]
-    vmlal.s16       q11, d9, d1[2]
-                                       vsri.u16        q15, q13, #5
-    vmull.s16       q12, d8, d1[0]
-                                       vsri.u16        q15, q14, #11
-    vmull.s16       q13, d9, d1[0]
-    vmull.s16       q14, d6, d1[3]
-                                       do_store        \bpp, 8
-    vmull.s16       q15, d7, d1[3]  /* q15 is overwritten only after do_store has written it out */
-  .endif
-.endm
-
-/*
- * Non-pipelined YCbCr->RGB conversion: stage 1 immediately followed by
- * stage 2. The convert function below invokes it for the final group of
- * fewer than 8 pixels, where there is no following group to overlap with.
- */
-.macro do_yuv_to_rgb
-    do_yuv_to_rgb_stage1
-    do_yuv_to_rgb_stage2
-.endm
-
-/* Apple gas crashes on adrl, work around that by using adr.
- * But this requires a copy of these constants for each function.
- *
- * The convert function loads the table as four 64-bit rows into d0-d3:
- *   row 0    -> d0 : zeros (padding only)
- *   row 1    -> d1 : the four multipliers referenced as d1[0]..d1[3]
- *   rows 2-3 -> q1 : -128 in every lane, added to the widened U/V samples
- */
-
-.balign 16
-jsimd_ycc_\colorid\()_neon_consts:
-  .short 0,      0,     0,      0
-  .short 22971, -11277, -23401, 29033
-  .short -128,  -128,   -128,   -128
-  .short -128,  -128,   -128,   -128
-
-/*
- * Entry point generated for one output pixel format.
- *
- * Register arguments as used below:
- *   r0 = OUTPUT_WIDTH  pixels per row (copied into N for every row)
- *   r1 = INPUT_BUF     address of three pointers, one row table per plane
- *   r2 = INPUT_ROW     index of the first row to read in each row table
- *   r3 = OUTPUT_BUF    table of output row pointers, walked with post-increment
- * NUM_ROWS is read from the stack, just above the eight registers pushed here.
- */
-asm_function jsimd_ycc_\colorid\()_convert_neon
-    OUTPUT_WIDTH    .req r0
-    INPUT_BUF       .req r1
-    INPUT_ROW       .req r2
-    OUTPUT_BUF      .req r3
-    NUM_ROWS        .req r4
-
-    INPUT_BUF0      .req r5
-    INPUT_BUF1      .req r6
-    INPUT_BUF2      .req INPUT_BUF
-
-    RGB             .req r7
-    Y               .req r8
-    U               .req r9
-    V               .req r10
-    N               .req ip
-
-    /* Load constants to d1, d2, d3 (d0 is just used for padding) */
-    adr             ip, jsimd_ycc_\colorid\()_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]
-
-    /* Save Arm registers and handle input arguments */
-    push            {r4, r5, r6, r7, r8, r9, r10, lr}
-    /* 8 registers were pushed, so the stacked argument sits at sp + 4 * 8 */
-    ldr             NUM_ROWS, [sp, #(4 * 8)]
-    ldr             INPUT_BUF0, [INPUT_BUF]
-    ldr             INPUT_BUF1, [INPUT_BUF, #4]
-    /* INPUT_BUF2 shares r1 with INPUT_BUF, so it has to be loaded last */
-    ldr             INPUT_BUF2, [INPUT_BUF, #8]
-    .unreq          INPUT_BUF
-
-    /* Save Neon registers */
-    vpush           {d8 - d15}
-
-    /* Initially set d10, d11, d12, d13 to 0xFF */
-    /* These are the registers named by the d1<offset> operands; a register
-     * that the conversion never writes keeps this fill value (presumably the
-     * unused fourth byte of the 32-bit formats - confirm against do_store). */
-    vmov.u8         q5, #255
-    vmov.u8         q6, #255
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    blt             9f
-0:
-    /* Row tables hold 4-byte pointers, hence the index scaled by lsl #2 */
-    ldr             Y, [INPUT_BUF0, INPUT_ROW, lsl #2]
-    ldr             U, [INPUT_BUF1, INPUT_ROW, lsl #2]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [INPUT_BUF2, INPUT_ROW, lsl #2]
-    add             INPUT_ROW, INPUT_ROW, #1
-    ldr             RGB, [OUTPUT_BUF], #4
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    blt             3f              /* fewer than 8 pixels in the row */
-    do_load         8
-    do_yuv_to_rgb_stage1
-    subs            N, N, #8
-    blt             2f              /* only one full group: skip the pipelined loop */
-1:
-    /* Finish and store the previous group while starting the next one */
-    do_yuv_to_rgb_stage2_store_load_stage1
-    subs            N, N, #8
-    bge             1b
-2:
-    /* Drain the pipeline: finish and store the last full group of 8 */
-    do_yuv_to_rgb_stage2
-    do_store        \bpp, 8
-    /* N is negative here, but subtracting 8 never changes its low three
-     * bits, so they still hold the number of leftover pixels (0-7). */
-    tst             N, #7
-    beq             8f
-3:
-    /* Leftover pixels: load 4, 2 and 1 pixels as selected by the bits of N.
-     * The "beq 3f" below targets the second "3:" label. */
-    tst             N, #4
-    beq             3f
-    do_load         4
-3:
-    tst             N, #2
-    beq             4f
-    do_load         2
-4:
-    tst             N, #1
-    beq             5f
-    do_load         1
-5:
-    do_yuv_to_rgb
-    /* Store the leftover pixels with the same 4/2/1 decomposition */
-    tst             N, #4
-    beq             6f
-    do_store        \bpp, 4
-6:
-    tst             N, #2
-    beq             7f
-    do_store        \bpp, 2
-7:
-    tst             N, #1
-    beq             8f
-    do_store        \bpp, 1
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
-9:
-    /* Restore all registers and return */
-    vpop            {d8 - d15}
-    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
-
-    .unreq          OUTPUT_WIDTH
-    .unreq          INPUT_ROW
-    .unreq          OUTPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          INPUT_BUF0
-    .unreq          INPUT_BUF1
-    .unreq          INPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-/* Drop the per-format helper macros so the next expansion can define them again */
-.purgem do_yuv_to_rgb
-.purgem do_yuv_to_rgb_stage1
-.purgem do_yuv_to_rgb_stage2
-.purgem do_yuv_to_rgb_stage2_store_load_stage1
-
-.endm
-
-/*
- * Expand the YCbCr->RGB generator once per output format. The R/G/B values
- * are pasted after "d1" inside the generator, so 0-3 select which of d10-d13
- * carries each channel. The rgb565 entry passes 0 for all three offsets.
- */
-/*--------------------------------- id ----- bpp R  G  B */
-generate_jsimd_ycc_rgb_convert_neon extrgb,  24, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgr,  24, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_ycc_rgb_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_ycc_rgb_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_ycc_rgb_convert_neon extxrgb, 32, 1, 2, 3
-generate_jsimd_ycc_rgb_convert_neon rgb565,  16, 0, 0, 0
-
-/* The RGB->YCbCr section that follows defines its own do_load/do_store */
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_extrgb_ycc_convert_neon
- * jsimd_extbgr_ycc_convert_neon
- * jsimd_extrgbx_ycc_convert_neon
- * jsimd_extbgrx_ycc_convert_neon
- * jsimd_extxbgr_ycc_convert_neon
- * jsimd_extxrgb_ycc_convert_neon
- *
- * Colorspace conversion RGB -> YCbCr
- */
-
-/*
- * Store "size" converted pixels: one byte per pixel from d20 to [Y], from d21
- * to [U] and from d22 to [V], advancing each pointer.
- *
- * Partial sizes use disjoint lanes (4 -> lanes 0-3, 2 -> lanes 4-5,
- * 1 -> lane 6), mirroring do_load, so up to 7 leftover pixels can be written
- * as a 4 + 2 + 1 combination out of a single register set.
- */
-.macro do_store size
-  .if \size == 8
-    vst1.8          {d20}, [Y]!
-    vst1.8          {d21}, [U]!
-    vst1.8          {d22}, [V]!
-  .elseif \size == 4
-    vst1.8          {d20[0]}, [Y]!
-    vst1.8          {d20[1]}, [Y]!
-    vst1.8          {d20[2]}, [Y]!
-    vst1.8          {d20[3]}, [Y]!
-    vst1.8          {d21[0]}, [U]!
-    vst1.8          {d21[1]}, [U]!
-    vst1.8          {d21[2]}, [U]!
-    vst1.8          {d21[3]}, [U]!
-    vst1.8          {d22[0]}, [V]!
-    vst1.8          {d22[1]}, [V]!
-    vst1.8          {d22[2]}, [V]!
-    vst1.8          {d22[3]}, [V]!
-  .elseif \size == 2
-    vst1.8          {d20[4]}, [Y]!
-    vst1.8          {d20[5]}, [Y]!
-    vst1.8          {d21[4]}, [U]!
-    vst1.8          {d21[5]}, [U]!
-    vst1.8          {d22[4]}, [V]!
-    vst1.8          {d22[5]}, [V]!
-  .elseif \size == 1
-    vst1.8          {d20[6]}, [Y]!
-    vst1.8          {d21[6]}, [U]!
-    vst1.8          {d22[6]}, [V]!
-  .else
-    .error unsupported macroblock size
-  .endif
-.endm
-
-/*
- * Load "size" pixels from [RGB] (post-incremented), de-interleaving the bytes
- * of each pixel into separate registers: d10-d12 for 24 bpp (vld3.8) and
- * d10-d13 for 32 bpp (vld4.8).
- *
- * A full group of 8 also issues a prefetch 128 bytes ahead. Partial sizes
- * fill disjoint lanes (4 -> lanes 0-3, 2 -> lanes 4-5, 1 -> lane 6) so that
- * 4 + 2 + 1 leftover pixels can share one register set.
- */
-.macro do_load bpp, size
-  .if \bpp == 24
-    .if \size == 8
-      vld3.8        {d10, d11, d12}, [RGB]!
-      pld           [RGB, #128]
-    .elseif \size == 4
-      vld3.8        {d10[0], d11[0], d12[0]}, [RGB]!
-      vld3.8        {d10[1], d11[1], d12[1]}, [RGB]!
-      vld3.8        {d10[2], d11[2], d12[2]}, [RGB]!
-      vld3.8        {d10[3], d11[3], d12[3]}, [RGB]!
-    .elseif \size == 2
-      vld3.8        {d10[4], d11[4], d12[4]}, [RGB]!
-      vld3.8        {d10[5], d11[5], d12[5]}, [RGB]!
-    .elseif \size == 1
-      vld3.8        {d10[6], d11[6], d12[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .elseif \bpp == 32
-    .if \size == 8
-      vld4.8        {d10, d11, d12, d13}, [RGB]!
-      pld           [RGB, #128]
-    .elseif \size == 4
-      vld4.8        {d10[0], d11[0], d12[0], d13[0]}, [RGB]!
-      vld4.8        {d10[1], d11[1], d12[1], d13[1]}, [RGB]!
-      vld4.8        {d10[2], d11[2], d12[2], d13[2]}, [RGB]!
-      vld4.8        {d10[3], d11[3], d12[3], d13[3]}, [RGB]!
-    .elseif \size == 2
-      vld4.8        {d10[4], d11[4], d12[4], d13[4]}, [RGB]!
-      vld4.8        {d10[5], d11[5], d12[5], d13[5]}, [RGB]!
-    .elseif \size == 1
-      vld4.8        {d10[6], d11[6], d12[6], d13[6]}, [RGB]!
-    .else
-      .error unsupported macroblock size
-    .endif
-  .else
-    .error unsupported bpp
-  .endif
-.endm
-
-/*
- * Generator for one RGB->YCbCr entry point.
- *   colorid                 name fragment used in the symbol names
- *   bpp                     bytes-per-pixel selector passed to do_load (24 or 32)
- *   r_offs, g_offs, b_offs  digit pasted after "d1" to pick which of d10-d13
- *                           holds each input channel after do_load
- */
-.macro generate_jsimd_rgb_ycc_convert_neon colorid, bpp, r_offs, g_offs, b_offs
-
-/*
- * 2-stage pipelined RGB->YCbCr conversion
- */
-
-/*
- * Stage 1: widen the 8 loaded pixels to 16 bits and build 32-bit sums.
- *   q7/q8   = luma sums for pixels 0-3 / 4-7
- *   q9/q13  = first chroma sums (narrowed into "u" by stage 2)
- *   q14/q15 = second chroma sums (narrowed into "v" by stage 2)
- * The chroma accumulators start from q1 (table rows 2-3, the repeating pair
- * 32767, 128). Every 32-bit word of q1 is identical, so vrev64.32 acts as a
- * plain copy that leaves q1 intact.
- */
-.macro do_rgb_to_yuv_stage1
-    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
-    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
-    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
-    vmull.u16       q7, d4, d0[0]
-    vmlal.u16       q7, d6, d0[1]
-    vmlal.u16       q7, d8, d0[2]
-    vmull.u16       q8, d5, d0[0]
-    vmlal.u16       q8, d7, d0[1]
-    vmlal.u16       q8, d9, d0[2]
-    vrev64.32       q9, q1
-    vrev64.32       q13, q1
-    vmlsl.u16       q9, d4, d0[3]
-    vmlsl.u16       q9, d6, d1[0]
-    vmlal.u16       q9, d8, d1[1]
-    vmlsl.u16       q13, d5, d0[3]
-    vmlsl.u16       q13, d7, d1[0]
-    vmlal.u16       q13, d9, d1[1]
-    vrev64.32       q14, q1
-    vrev64.32       q15, q1
-    vmlal.u16       q14, d4, d1[1]
-    vmlsl.u16       q14, d6, d1[2]
-    vmlsl.u16       q14, d8, d1[3]
-    vmlal.u16       q15, d5, d1[1]
-    vmlsl.u16       q15, d7, d1[2]
-    vmlsl.u16       q15, d9, d1[3]
-.endm
-
-/*
- * Stage 2: drop the 16 fractional bits and narrow to bytes.
- * Luma uses a rounding shift (vrshrn); chroma uses a truncating shift (vshrn).
- * The vmovn order matters: each one overwrites a d register belonging to a
- * q register that has already been consumed by the previous narrow.
- */
-.macro do_rgb_to_yuv_stage2
-    vrshrn.u32      d20, q7, #16
-    vrshrn.u32      d21, q8, #16
-    vshrn.u32       d22, q9, #16
-    vshrn.u32       d23, q13, #16
-    vshrn.u32       d24, q14, #16
-    vshrn.u32       d25, q15, #16
-    vmovn.u16       d20, q10       /* d20 = y */
-    vmovn.u16       d21, q11       /* d21 = u */
-    vmovn.u16       d22, q12       /* d22 = v */
-.endm
-
-/* Non-pipelined conversion of one group; used for the leftover pixels */
-.macro do_rgb_to_yuv
-    do_rgb_to_yuv_stage1
-    do_rgb_to_yuv_stage2
-.endm
-
-/*
- * Software-pipelined loop body. The deeper-indented instructions are stage 2
- * plus the full-width stores for the previous 8 pixels; the others are the
- * load and stage 1 for the next 8 pixels.
- */
-.macro do_rgb_to_yuv_stage2_store_load_stage1
-      vrshrn.u32      d20, q7, #16
-      vrshrn.u32      d21, q8, #16
-      vshrn.u32       d22, q9, #16
-    vrev64.32       q9, q1
-      vshrn.u32       d23, q13, #16
-    vrev64.32       q13, q1
-      vshrn.u32       d24, q14, #16
-      vshrn.u32       d25, q15, #16
-    do_load         \bpp, 8
-      vmovn.u16       d20, q10     /* d20 = y */
-    vmovl.u8        q2, d1\r_offs  /* r = { d4, d5 } */
-      vmovn.u16       d21, q11     /* d21 = u */
-    vmovl.u8        q3, d1\g_offs  /* g = { d6, d7 } */
-      vmovn.u16       d22, q12     /* d22 = v */
-    vmovl.u8        q4, d1\b_offs  /* b = { d8, d9 } */
-    vmull.u16       q7, d4, d0[0]
-    vmlal.u16       q7, d6, d0[1]
-    vmlal.u16       q7, d8, d0[2]
-      vst1.8          {d20}, [Y]!
-    vmull.u16       q8, d5, d0[0]
-    vmlal.u16       q8, d7, d0[1]
-    vmlal.u16       q8, d9, d0[2]
-    vmlsl.u16       q9, d4, d0[3]
-    vmlsl.u16       q9, d6, d1[0]
-    vmlal.u16       q9, d8, d1[1]
-      vst1.8          {d21}, [U]!
-    vmlsl.u16       q13, d5, d0[3]
-    vmlsl.u16       q13, d7, d1[0]
-    vmlal.u16       q13, d9, d1[1]
-    vrev64.32       q14, q1
-    vrev64.32       q15, q1
-    vmlal.u16       q14, d4, d1[1]
-    vmlsl.u16       q14, d6, d1[2]
-    vmlsl.u16       q14, d8, d1[3]
-      vst1.8          {d22}, [V]!
-    vmlal.u16       q15, d5, d1[1]
-    vmlsl.u16       q15, d7, d1[2]
-    vmlsl.u16       q15, d9, d1[3]
-.endm
-
-/*
- * Per-function constant table, loaded into d0-d3.
- *   row 0 -> d0 : 19595, 38470, 7471 are the luma weights (they sum to 65536,
- *                 about 0.299 / 0.587 / 0.114 scaled by 65536); 11059 is the
- *                 first chroma weight
- *   row 1 -> d1 : remaining chroma weights; 32768 is 0.5 scaled by 65536
- *   rows 2-3 -> q1 : initial value of the chroma accumulators; presumably a
- *                 128 offset in the high half plus a 32767 rounding bias in
- *                 the low half on a little-endian target - TODO confirm
- */
-.balign 16
-jsimd_\colorid\()_ycc_neon_consts:
-  .short 19595, 38470, 7471,  11059
-  .short 21709, 32768, 27439, 5329
-  .short 32767, 128,   32767, 128
-  .short 32767, 128,   32767, 128
-
-/*
- * Generated entry point. Register arguments as used below:
- *   r0 = OUTPUT_WIDTH  pixels per row (copied into N for every row)
- *   r1 = INPUT_BUF     table of input row pointers, walked with post-increment
- *   r2 = OUTPUT_BUF    address of three pointers, one row table per plane
- *   r3 = OUTPUT_ROW    index of the first row to write in each row table
- * NUM_ROWS is read from the stack, just above the eight registers pushed here.
- */
-asm_function jsimd_\colorid\()_ycc_convert_neon
-    OUTPUT_WIDTH    .req r0
-    INPUT_BUF       .req r1
-    OUTPUT_BUF      .req r2
-    OUTPUT_ROW      .req r3
-    NUM_ROWS        .req r4
-
-    OUTPUT_BUF0     .req r5
-    OUTPUT_BUF1     .req r6
-    OUTPUT_BUF2     .req OUTPUT_BUF
-
-    RGB             .req r7
-    Y               .req r8
-    U               .req r9
-    V               .req r10
-    N               .req ip
-
-    /* Load constants to d0, d1, d2, d3 */
-    adr             ip, jsimd_\colorid\()_ycc_neon_consts
-    vld1.16         {d0, d1, d2, d3}, [ip, :128]
-
-    /* Save Arm registers and handle input arguments */
-    push            {r4, r5, r6, r7, r8, r9, r10, lr}
-    /* 8 registers were pushed, so the stacked argument sits at sp + 4 * 8 */
-    ldr             NUM_ROWS, [sp, #(4 * 8)]
-    ldr             OUTPUT_BUF0, [OUTPUT_BUF]
-    ldr             OUTPUT_BUF1, [OUTPUT_BUF, #4]
-    /* OUTPUT_BUF2 shares r2 with OUTPUT_BUF, so it has to be loaded last */
-    ldr             OUTPUT_BUF2, [OUTPUT_BUF, #8]
-    .unreq          OUTPUT_BUF
-
-    /* Save Neon registers */
-    vpush           {d8 - d15}
-
-    /* Outer loop over scanlines */
-    cmp             NUM_ROWS, #1
-    blt             9f
-0:
-    /* Row tables hold 4-byte pointers, hence the index scaled by lsl #2 */
-    ldr             Y, [OUTPUT_BUF0, OUTPUT_ROW, lsl #2]
-    ldr             U, [OUTPUT_BUF1, OUTPUT_ROW, lsl #2]
-    mov             N, OUTPUT_WIDTH
-    ldr             V, [OUTPUT_BUF2, OUTPUT_ROW, lsl #2]
-    add             OUTPUT_ROW, OUTPUT_ROW, #1
-    ldr             RGB, [INPUT_BUF], #4
-
-    /* Inner loop over pixels */
-    subs            N, N, #8
-    blt             3f              /* fewer than 8 pixels in the row */
-    do_load         \bpp, 8
-    do_rgb_to_yuv_stage1
-    subs            N, N, #8
-    blt             2f              /* only one full group: skip the pipelined loop */
-1:
-    do_rgb_to_yuv_stage2_store_load_stage1
-    subs            N, N, #8
-    bge             1b
-2:
-    /* Drain the pipeline: finish and store the last full group of 8 */
-    do_rgb_to_yuv_stage2
-    do_store        8
-    /* N is negative here, but subtracting 8 never changes its low three
-     * bits, so they still hold the number of leftover pixels (0-7). */
-    tst             N, #7
-    beq             8f
-3:
-    /* Leftover pixels: load 4, 2 and 1 pixels as selected by the bits of N.
-     * The "beq 3f" below targets the second "3:" label. */
-    tst             N, #4
-    beq             3f
-    do_load         \bpp, 4
-3:
-    tst             N, #2
-    beq             4f
-    do_load         \bpp, 2
-4:
-    tst             N, #1
-    beq             5f
-    do_load         \bpp, 1
-5:
-    do_rgb_to_yuv
-    /* Store the leftover pixels with the same 4/2/1 decomposition */
-    tst             N, #4
-    beq             6f
-    do_store        4
-6:
-    tst             N, #2
-    beq             7f
-    do_store        2
-7:
-    tst             N, #1
-    beq             8f
-    do_store        1
-8:
-    subs            NUM_ROWS, NUM_ROWS, #1
-    bgt             0b
-9:
-    /* Restore all registers and return */
-    vpop            {d8 - d15}
-    pop             {r4, r5, r6, r7, r8, r9, r10, pc}
-
-    .unreq          OUTPUT_WIDTH
-    .unreq          OUTPUT_ROW
-    .unreq          INPUT_BUF
-    .unreq          NUM_ROWS
-    .unreq          OUTPUT_BUF0
-    .unreq          OUTPUT_BUF1
-    .unreq          OUTPUT_BUF2
-    .unreq          RGB
-    .unreq          Y
-    .unreq          U
-    .unreq          V
-    .unreq          N
-
-/* Drop the per-format helper macros so the next expansion can define them again */
-.purgem do_rgb_to_yuv
-.purgem do_rgb_to_yuv_stage1
-.purgem do_rgb_to_yuv_stage2
-.purgem do_rgb_to_yuv_stage2_store_load_stage1
-
-.endm
-
-/*
- * Expand the RGB->YCbCr generator once per input format. The R/G/B values are
- * pasted after "d1" inside the generator, so 0-3 select which of d10-d13
- * holds each channel after do_load.
- */
-/*--------------------------------- id ----- bpp R  G  B */
-generate_jsimd_rgb_ycc_convert_neon extrgb,  24, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgr,  24, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extrgbx, 32, 0, 1, 2
-generate_jsimd_rgb_ycc_convert_neon extbgrx, 32, 2, 1, 0
-generate_jsimd_rgb_ycc_convert_neon extxbgr, 32, 3, 2, 1
-generate_jsimd_rgb_ycc_convert_neon extxrgb, 32, 1, 2, 3
-
-/* The load/store helpers are not needed past this point */
-.purgem do_load
-.purgem do_store
-
-
-/*****************************************************************************/
-
-/*
- * Load data into workspace, applying unsigned->signed conversion
- *
- * TODO: can be combined with 'jsimd_fdct_ifast_neon' to get
- *       rid of VST1.16 instructions
- */
-
-/*
- * Register arguments as used below:
- *   r0 = SAMPLE_DATA  table of 8 row pointers (read four at a time)
- *   r1 = START_COL    byte offset added to every row pointer
- *   r2 = WORKSPACE    destination for 8 rows of eight 16-bit values
- * Each row contributes 8 bytes, which are widened to 16 bits while 128 is
- * subtracted from them.
- */
-asm_function jsimd_convsamp_neon
-    SAMPLE_DATA     .req r0
-    START_COL       .req r1
-    WORKSPACE       .req r2
-    TMP1            .req r3
-    TMP2            .req r4
-    TMP3            .req r5
-    TMP4            .req ip
-
-    /* r4 and r5 serve as TMP2/TMP3 and are restored before returning */
-    push            {r4, r5}
-    vmov.u8         d0, #128
-
-    /* Rows 0-3 */
-    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    vld1.8          {d16}, [TMP1]
-    vsubl.u8        q8, d16, d0
-    vld1.8          {d18}, [TMP2]
-    vsubl.u8        q9, d18, d0
-    vld1.8          {d20}, [TMP3]
-    vsubl.u8        q10, d20, d0
-    vld1.8          {d22}, [TMP4]
-    /* Pointers for rows 4-7 are fetched while rows 0-3 are still in flight */
-    ldmia           SAMPLE_DATA!, {TMP1, TMP2, TMP3, TMP4}
-    vsubl.u8        q11, d22, d0
-    vst1.16         {d16, d17, d18, d19}, [WORKSPACE, :128]!
-    add             TMP1, TMP1, START_COL
-    add             TMP2, TMP2, START_COL
-    vst1.16         {d20, d21, d22, d23}, [WORKSPACE, :128]!
-    add             TMP3, TMP3, START_COL
-    add             TMP4, TMP4, START_COL
-    vld1.8          {d24}, [TMP1]
-    vsubl.u8        q12, d24, d0
-    vld1.8          {d26}, [TMP2]
-    vsubl.u8        q13, d26, d0
-    vld1.8          {d28}, [TMP3]
-    vsubl.u8        q14, d28, d0
-    vld1.8          {d30}, [TMP4]
-    vsubl.u8        q15, d30, d0
-    vst1.16         {d24, d25, d26, d27}, [WORKSPACE, :128]!
-    vst1.16         {d28, d29, d30, d31}, [WORKSPACE, :128]!
-    pop             {r4, r5}
-    bx              lr
-
-    .unreq          SAMPLE_DATA
-    .unreq          START_COL
-    .unreq          WORKSPACE
-    .unreq          TMP1
-    .unreq          TMP2
-    .unreq          TMP3
-    .unreq          TMP4
-
-
-/*****************************************************************************/
-
-/*
- * jsimd_fdct_ifast_neon
- *
- * This function contains a fast, not so accurate integer implementation of
- * the forward DCT (Discrete Cosine Transform). It uses the same calculations
- * and produces exactly the same output as IJG's original 'jpeg_fdct_ifast'
- * function from jfdctfst.c
- *
- * TODO: can be combined with 'jsimd_convsamp_neon' to get
- *       rid of a bunch of VLD1.16 instructions
- */
-
-/*
- * Names for the four 16-bit lanes of d0, which the function loads from the
- * table below. The values are used with vqdmulh.s16, which returns the high
- * half of the doubled product, so a constant of k * 128 multiplies by k / 256.
- * The last entry stores only the part of its factor above 1.0 (334 - 256);
- * the function adds the multiplicand itself back in separately.
- */
-#define XFIX_0_382683433  d0[0]
-#define XFIX_0_541196100  d0[1]
-#define XFIX_0_707106781  d0[2]
-#define XFIX_1_306562965  d0[3]
-
-.balign 16
-jsimd_fdct_ifast_neon_consts:
-  .short (98 * 128)               /* XFIX_0_382683433 */
-  .short (139 * 128)              /* XFIX_0_541196100 */
-  .short (181 * 128)              /* XFIX_0_707106781 */
-  .short (334 * 128 - 256 * 128)  /* XFIX_1_306562965 */
-
-/*
- * In-place transform of the 8x8 block of 16-bit values at r0 (DATA).
- * The loop body (transpose followed by a 1-D FDCT over all eight rows held in
- * q8-q15) runs twice, so the 1-D transform is applied along both dimensions
- * and the second transpose restores the original orientation.
- */
-asm_function jsimd_fdct_ifast_neon
-
-    DATA            .req r0
-    TMP             .req ip
-
-    /* q4-q7 (d8-d15) are used as scratch below, so preserve them */
-    vpush           {d8 - d15}
-
-    /* Load constants */
-    adr             TMP, jsimd_fdct_ifast_neon_consts
-    vld1.16         {d0}, [TMP, :64]
-
-    /* Load all DATA into Neon registers with the following allocation:
-     *       0 1 2 3 | 4 5 6 7
-     *      ---------+--------
-     *   0 | d16     | d17    | q8
-     *   1 | d18     | d19    | q9
-     *   2 | d20     | d21    | q10
-     *   3 | d22     | d23    | q11
-     *   4 | d24     | d25    | q12
-     *   5 | d26     | d27    | q13
-     *   6 | d28     | d29    | q14
-     *   7 | d30     | d31    | q15
-     */
-
-    vld1.16         {d16, d17, d18, d19}, [DATA, :128]!
-    vld1.16         {d20, d21, d22, d23}, [DATA, :128]!
-    vld1.16         {d24, d25, d26, d27}, [DATA, :128]!
-    vld1.16         {d28, d29, d30, d31}, [DATA, :128]
-    /* Undo the three 32-byte post-increments so DATA points at the block again */
-    sub             DATA, DATA, #(128 - 32)
-
-    /* TMP is reused as the pass counter now that the constants are loaded */
-    mov             TMP, #2
-1:
-    /* Transpose */
-    vtrn.16         q12, q13
-    vtrn.16         q10, q11
-    vtrn.16         q8, q9
-    vtrn.16         q14, q15
-    vtrn.32         q9, q11
-    vtrn.32         q13, q15
-    vtrn.32         q8, q10
-    vtrn.32         q12, q14
-    vswp            d30, d23
-    vswp            d24, d17
-    vswp            d26, d19
-      /* 1-D FDCT */
-      vadd.s16        q2, q11, q12
-    vswp            d28, d21
-      vsub.s16        q12, q11, q12
-      vsub.s16        q6, q10, q13
-      vadd.s16        q10, q10, q13
-      vsub.s16        q7, q9, q14
-      vadd.s16        q9, q9, q14
-      vsub.s16        q1, q8, q15
-      vadd.s16        q8, q8, q15
-      vsub.s16        q4, q9, q10
-      vsub.s16        q5, q8, q2
-      vadd.s16        q3, q9, q10
-      vadd.s16        q4, q4, q5
-      vadd.s16        q2, q8, q2
-      vqdmulh.s16     q4, q4, XFIX_0_707106781
-      vadd.s16        q11, q12, q6
-      vadd.s16        q8, q2, q3
-      vsub.s16        q12, q2, q3
-      vadd.s16        q3, q6, q7
-      vadd.s16        q7, q7, q1
-      vqdmulh.s16     q3, q3, XFIX_0_707106781
-      vsub.s16        q6, q11, q7
-      vadd.s16        q10, q5, q4
-      vqdmulh.s16     q6, q6, XFIX_0_382683433
-      vsub.s16        q14, q5, q4
-      vqdmulh.s16     q11, q11, XFIX_0_541196100
-      /* q5 holds only the fractional part of the product; q7 itself is
-       * added back by the "vadd q7, q7, q5" a few lines below */
-      vqdmulh.s16     q5, q7, XFIX_1_306562965
-      vadd.s16        q4, q1, q3
-      vsub.s16        q3, q1, q3
-      vadd.s16        q7, q7, q6
-      vadd.s16        q11, q11, q6
-      vadd.s16        q7, q7, q5
-      vadd.s16        q13, q3, q11
-      vsub.s16        q11, q3, q11
-      vadd.s16        q9, q4, q7
-      vsub.s16        q15, q4, q7
-    subs            TMP, TMP, #1
-    bne             1b
-
-    /* store results */
-    vst1.16         {d16, d17, d18, d19}, [DATA, :128]!
-    vst1.16         {d20, d21, d22, d23}, [DATA, :128]!
-    vst1.16         {d24, d25, d26, d27}, [DATA, :128]!
-    vst1.16         {d28, d29, d30, d31}, [DATA, :128]
-
-    vpop            {d8 - d15}
-    bx              lr
-
-    .unreq          DATA
-    .unreq          TMP
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_quantize_neon(JCOEFPTR coef_block, DCTELEM *divisors,
- *                     DCTELEM *workspace);
- *
- * Note: the code uses 2 stage pipelining in order to improve instructions
- *       scheduling and eliminate stalls (this provides ~15% better
- *       performance for this function on both Arm Cortex-A8 and
- *       Arm Cortex-A9 when compared to the non-pipelined variant).
- *       The instructions which belong to the second stage use different
- *       indentation for better readability.
- */
-/*
- * Quantizes 64 coefficients, 16 per step: the code before the loop starts the
- * first group, each of the 3 loop iterations finishes one group while
- * starting the next, and the code after the loop finishes the last group.
- *
- * The divisors table is addressed as three sub-tables: reciprocals at byte
- * offset 0, corrections at 64 * 2 and shift counts at 64 * 6.
- * Per coefficient: ((abs(x) + correction) * reciprocal) >> 16, shifted right
- * by the shift count, with the sign of x re-applied afterwards.
- */
-asm_function jsimd_quantize_neon
-
-    COEF_BLOCK      .req r0
-    DIVISORS        .req r1
-    WORKSPACE       .req r2
-
-    RECIPROCAL      .req DIVISORS
-    CORRECTION      .req r3
-    SHIFT           .req ip
-    LOOP_COUNT      .req r4
-
-    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
-    vabs.s16        q12, q0
-    add             CORRECTION, DIVISORS, #(64 * 2)
-    add             SHIFT, DIVISORS, #(64 * 6)
-    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
-    vabs.s16        q13, q1
-    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10  /* add correction */
-    vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
-    vmull.u16       q11, d25, d17
-    vmull.u16       q8, d26, d18
-    vmull.u16       q9, d27, d19
-    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
-    vshrn.u32       d20, q10, #16
-    vshrn.u32       d21, q11, #16
-    vshrn.u32       d22, q8, #16
-    vshrn.u32       d23, q9, #16
-    /* Negated counts turn the vshl below into a right shift */
-    vneg.s16        q12, q12
-    vneg.s16        q13, q13
-    vshr.s16        q2, q0, #15    /* extract sign */
-    vshr.s16        q3, q1, #15
-    vshl.u16        q14, q10, q12  /* shift */
-    vshl.u16        q15, q11, q13
-
-    /* r4 is LOOP_COUNT; it is saved only around the loop that uses it */
-    push            {r4, r5}
-    mov             LOOP_COUNT, #3
-1:
-    vld1.16         {d0, d1, d2, d3}, [WORKSPACE, :128]!
-      /* (v ^ sign) - sign negates v where the sign mask is all ones */
-      veor.u16        q14, q14, q2  /* restore sign */
-    vabs.s16        q12, q0
-    vld1.16         {d20, d21, d22, d23}, [CORRECTION, :128]!
-    vabs.s16        q13, q1
-      veor.u16        q15, q15, q3
-    vld1.16         {d16, d17, d18, d19}, [RECIPROCAL, :128]!
-    vadd.u16        q12, q12, q10  /* add correction */
-    vadd.u16        q13, q13, q11
-    vmull.u16       q10, d24, d16  /* multiply by reciprocal */
-    vmull.u16       q11, d25, d17
-    vmull.u16       q8, d26, d18
-    vmull.u16       q9, d27, d19
-      vsub.u16        q14, q14, q2
-    vld1.16         {d24, d25, d26, d27}, [SHIFT, :128]!
-      vsub.u16        q15, q15, q3
-    vshrn.u32       d20, q10, #16
-    vshrn.u32       d21, q11, #16
-      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-    vshrn.u32       d22, q8, #16
-    vshrn.u32       d23, q9, #16
-    vneg.s16        q12, q12
-    vneg.s16        q13, q13
-    vshr.s16        q2, q0, #15    /* extract sign */
-    vshr.s16        q3, q1, #15
-    vshl.u16        q14, q10, q12  /* shift */
-    vshl.u16        q15, q11, q13
-    subs            LOOP_COUNT, LOOP_COUNT, #1
-    bne             1b
-    pop             {r4, r5}
-
-      /* Second stage for the final group of 16 coefficients */
-      veor.u16        q14, q14, q2  /* restore sign */
-      veor.u16        q15, q15, q3
-      vsub.u16        q14, q14, q2
-      vsub.u16        q15, q15, q3
-      vst1.16         {d28, d29, d30, d31}, [COEF_BLOCK, :128]!
-
-    bx              lr  /* return */
-
-    .unreq          COEF_BLOCK
-    .unreq          DIVISORS
-    .unreq          WORKSPACE
-    .unreq          RECIPROCAL
-    .unreq          CORRECTION
-    .unreq          SHIFT
-    .unreq          LOOP_COUNT
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(void)
- * jsimd_h2v1_fancy_upsample_neon(int max_v_samp_factor,
- *                                JDIMENSION downsampled_width,
- *                                JSAMPARRAY input_data,
- *                                JSAMPARRAY *output_data_ptr);
- *
- * Note: the use of unaligned writes is the main remaining bottleneck in
- *       this code, which can be potentially solved to get up to tens
- *       of percents performance improvement on Cortex-A8/Cortex-A9.
- */
-
-/*
- * Upsample 16 source pixels to 32 destination pixels. The new 16 source
- * pixels are loaded to q0. The previous 16 source pixels are in q1. The
- * shifted-by-one source pixels are constructed in q2 by using q0 and q1.
- * Register d28 is used for multiplication by 3. Register q15 is used
- * for adding +1 bias.
- */
-/*
- * With cur = a freshly loaded pixel and prev = the pixel just before it,
- * each source pixel produces two output bytes, stored interleaved:
- *   (cur + 3 * prev + 2) >> 2   (rounding narrow of q8/q9)
- *   (3 * cur + prev + 1) >> 2   (truncating narrow of q10/q11, +1 from q15)
- */
-.macro upsample16 OUTPTR, INPTR
-    vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8, d0
-    /* q2 = last byte of q1 followed by the first 15 bytes of q0 */
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-    vmov            q1, q0        /* backup source pixels to q1 */
-    vrshrn.u16      d6, q8, #2
-    vrshrn.u16      d7, q9, #2
-    vshrn.u16       d8, q10, #2
-    vshrn.u16       d9, q11, #2
-    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample 32 source pixels to 64 destination pixels. Compared to 'upsample16'
- * macro, the roles of q0 and q1 registers are reversed for even and odd
- * groups of 16 pixels, that's why "vmov q1, q0" instructions are not needed.
- * Also this unrolling allows to reorder loads and stores to compensate
- * multiplication latency and reduce stalls.
- */
-/*
- * Two back-to-back 16-pixel groups. The first group loads into q0 and takes
- * its previous pixels from q1; the second (deeper-indented) group loads into
- * q1 and takes its previous pixels from q0. On exit q1 therefore holds the
- * most recent 16 source pixels, as the 16-pixel variant expects.
- */
-.macro upsample32 OUTPTR, INPTR
-    /* even 16 pixels group */
-    vld1.8          {q0}, [\INPTR]!
-    vmovl.u8        q8, d0
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-      /* odd 16 pixels group */
-      vld1.8          {q1}, [\INPTR]!
-    vrshrn.u16      d6, q8, #2
-    vrshrn.u16      d7, q9, #2
-    vshrn.u16       d8, q10, #2
-    vshrn.u16       d9, q11, #2
-      vmovl.u8        q8, d2
-      vext.8          q2, q0, q1, #15
-      vmovl.u8        q9, d3
-      vaddw.u8        q10, q15, d4
-      vaddw.u8        q11, q15, d5
-      vmlal.u8        q8, d4, d28
-      vmlal.u8        q9, d5, d28
-      vmlal.u8        q10, d2, d28
-      vmlal.u8        q11, d3, d28
-    /* The even-group store is delayed until the odd group's multiplies are issued */
-    vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-      vrshrn.u16      d6, q8, #2
-      vrshrn.u16      d7, q9, #2
-      vshrn.u16       d8, q10, #2
-      vshrn.u16       d9, q11, #2
-      vst2.8          {d6, d7, d8, d9}, [\OUTPTR]!
-.endm
-
-/*
- * Upsample a row of WIDTH pixels from INPTR to OUTPTR.
- */
-/*
- * Arguments are core registers: OUTPTR and INPTR are advanced, WIDTH is
- * consumed as the remaining-pixel counter and TMP1 is scratch.
- * Expects d28 = 3 and q15 = 1, as set up by the calling function.
- */
-.macro upsample_row OUTPTR, INPTR, WIDTH, TMP1
-    /* special case for the first and last pixels */
-    sub             \WIDTH, \WIDTH, #1
-    add             \OUTPTR, \OUTPTR, #1
-    /* last input pixel -> last output byte */
-    ldrb            \TMP1, [\INPTR, \WIDTH]
-    strb            \TMP1, [\OUTPTR, \WIDTH, asl #1]
-    /* first input pixel -> first output byte */
-    ldrb            \TMP1, [\INPTR], #1
-    strb            \TMP1, [\OUTPTR, #-1]
-    /* seed the top byte of q1 so the first pixel acts as "previous pixel" */
-    vmov.8          d3[7], \TMP1
-
-    subs            \WIDTH, \WIDTH, #32
-    blt             5f
-0:  /* process 32 pixels per iteration */
-    upsample32      \OUTPTR, \INPTR
-    subs            \WIDTH, \WIDTH, #32
-    bge             0b
-5:
-    /* WIDTH is now (remaining - 32); adding 16 tests for a 16-pixel group */
-    adds            \WIDTH, \WIDTH, #16
-    blt             1f
-0:  /* process 16 pixels if needed */
-    upsample16      \OUTPTR, \INPTR
-    subs            \WIDTH, \WIDTH, #16
-1:
-    /* restore WIDTH to the true remainder (0-15) */
-    adds            \WIDTH, \WIDTH, #16
-    beq             9f
-
-    /* load the remaining 1-15 pixels */
-    /* INPTR is moved to the end of the row and the pixels are read backwards
-     * in chunks of 1, 2, 4 and 8; earlier chunks are moved up inside q0
-     * before each new chunk lands in the low lanes, keeping source order. */
-    add             \INPTR, \INPTR, \WIDTH
-    tst             \WIDTH, #1
-    beq             2f
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #2
-    beq             2f
-    vext.8          d0, d0, d0, #6  /* rotate: lane 0 moves to lane 2 */
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[1]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #4
-    beq             2f
-    vrev64.32       d0, d0          /* swap halves: lanes 0-3 move to 4-7 */
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[3]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[2]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[1]}, [\INPTR]
-    sub             \INPTR, \INPTR, #1
-    vld1.8          {d0[0]}, [\INPTR]
-2:
-    tst             \WIDTH, #8
-    beq             2f
-    vmov            d1, d0          /* up to 7 later pixels move to d1 */
-    sub             \INPTR, \INPTR, #8
-    vld1.8          {d0}, [\INPTR]
-2:  /* upsample the remaining pixels */
-    vmovl.u8        q8, d0
-    vext.8          q2, q1, q0, #15
-    vmovl.u8        q9, d1
-    vaddw.u8        q10, q15, d4
-    vaddw.u8        q11, q15, d5
-    vmlal.u8        q8, d4, d28
-    vmlal.u8        q9, d5, d28
-    vmlal.u8        q10, d0, d28
-    vmlal.u8        q11, d1, d28
-    vrshrn.u16      d10, q8, #2
-    vrshrn.u16      d12, q9, #2
-    vshrn.u16       d11, q10, #2
-    vshrn.u16       d13, q11, #2
-    /* interleave the two result streams into output order (q5, then q6) */
-    vzip.8          d10, d11
-    vzip.8          d12, d13
-    /* store the remaining pixels */
-    /* each step writes two output bytes per source pixel and then moves the
-     * not yet stored bytes down to the low end of d10 */
-    tst             \WIDTH, #8
-    beq             2f
-    vst1.8          {d10, d11}, [\OUTPTR]!
-    vmov            q5, q6
-2:
-    tst             \WIDTH, #4
-    beq             2f
-    vst1.8          {d10}, [\OUTPTR]!
-    vmov            d10, d11
-2:
-    tst             \WIDTH, #2
-    beq             2f
-    vst1.8          {d10[0]}, [\OUTPTR]!
-    vst1.8          {d10[1]}, [\OUTPTR]!
-    vst1.8          {d10[2]}, [\OUTPTR]!
-    vst1.8          {d10[3]}, [\OUTPTR]!
-    vext.8          d10, d10, d10, #4
-2:
-    tst             \WIDTH, #1
-    beq             2f
-    vst1.8          {d10[0]}, [\OUTPTR]!
-    vst1.8          {d10[1]}, [\OUTPTR]!
-2:
-9:
-.endm
-
-/*
- * Register arguments as used below:
- *   r0 = MAX_V_SAMP_FACTOR  number of rows to process (loop counter)
- *   r1 = DOWNSAMPLED_WIDTH  input pixels per row
- *   r2 = INPUT_DATA         table of input row pointers
- *   r3 = OUTPUT_DATA_PTR    address of the output row-pointer table; it is
- *                           dereferenced once and r3 is reused as OUTPUT_DATA
- */
-asm_function jsimd_h2v1_fancy_upsample_neon
-
-    MAX_V_SAMP_FACTOR .req r0
-    DOWNSAMPLED_WIDTH .req r1
-    INPUT_DATA        .req r2
-    OUTPUT_DATA_PTR   .req r3
-    OUTPUT_DATA       .req OUTPUT_DATA_PTR
-
-    OUTPTR            .req r4
-    INPTR             .req r5
-    WIDTH             .req ip
-    TMP               .req lr
-
-    /* r6 is saved but not used below - presumably pushed only to keep the
-     * stack 8-byte aligned; TODO confirm */
-    push            {r4, r5, r6, lr}
-    vpush           {d8 - d15}
-
-    ldr             OUTPUT_DATA, [OUTPUT_DATA_PTR]
-    cmp             MAX_V_SAMP_FACTOR, #0
-    ble             99f
-
-    /* initialize constants */
-    vmov.u8         d28, #3
-    vmov.u16        q15, #1
-11:
-    /* one row per iteration; both row tables are walked with post-increment */
-    ldr             INPTR, [INPUT_DATA], #4
-    ldr             OUTPTR, [OUTPUT_DATA], #4
-    mov             WIDTH, DOWNSAMPLED_WIDTH
-    upsample_row    OUTPTR, INPTR, WIDTH, TMP
-    subs            MAX_V_SAMP_FACTOR, MAX_V_SAMP_FACTOR, #1
-    bgt             11b
-
-99:
-    vpop            {d8 - d15}
-    pop             {r4, r5, r6, pc}
-
-    .unreq          MAX_V_SAMP_FACTOR
-    .unreq          DOWNSAMPLED_WIDTH
-    .unreq          INPUT_DATA
-    .unreq          OUTPUT_DATA_PTR
-    .unreq          OUTPUT_DATA
-
-    .unreq          OUTPTR
-    .unreq          INPTR
-    .unreq          WIDTH
-    .unreq          TMP
-
-/* The upsampling helper macros are not needed past this point */
-.purgem upsample16
-.purgem upsample32
-.purgem upsample_row
-
-
-/*****************************************************************************/
-
-/*
- * GLOBAL(JOCTET *)
- * jsimd_huff_encode_one_block(working_state *state, JOCTET *buffer,
- *                             JCOEFPTR block, int last_dc_val,
- *                             c_derived_tbl *dctbl, c_derived_tbl *actbl)
- *
- */
-
-/*
- * Write the next 8 bits of PUT_BUFFER to the output.
- * PUT_BITS is reduced by 8, the byte at that bit position is extracted into
- * TMP and stored with pre-increment (BUFFER always points at the last byte
- * written). If the byte equals 0xff, the low byte of ZERO is stored after it.
- */
-.macro emit_byte BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
-    sub             \PUT_BITS, \PUT_BITS, #0x8
-    lsr             \TMP, \PUT_BUFFER, \PUT_BITS
-    uxtb            \TMP, \TMP
-    strb            \TMP, [\BUFFER, #1]!
-    cmp             \TMP, #0xff
-    /*it eq*/
-    strbeq          \ZERO, [\BUFFER, #1]!
-.endm
-
-.macro put_bits PUT_BUFFER, PUT_BITS, CODE, SIZE
-    /*lsl             \PUT_BUFFER, \PUT_BUFFER, \SIZE*/
-    add             \PUT_BITS, \SIZE
-    /*orr             \PUT_BUFFER, \PUT_BUFFER, \CODE*/
-    orr             \PUT_BUFFER, \CODE, \PUT_BUFFER, lsl \SIZE
-.endm
-
-.macro checkbuf15 BUFFER, PUT_BUFFER, PUT_BITS, ZERO, TMP
-  cmp               \PUT_BITS, #0x10
-  blt               15f
-    eor               \ZERO, \ZERO, \ZERO
-    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-    emit_byte         \BUFFER, \PUT_BUFFER, \PUT_BITS, \ZERO, \TMP
-15:
-.endm
-
-.balign 16
-jsimd_huff_encode_one_block_neon_consts:
-  .byte 0x01
-  .byte 0x02
-  .byte 0x04
-  .byte 0x08
-  .byte 0x10
-  .byte 0x20
-  .byte 0x40
-  .byte 0x80
-
-asm_function jsimd_huff_encode_one_block_neon
-    push            {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-    add             r7, sp, #0x1c
-    sub             r4, sp, #0x40
-    bfc             r4, #0, #5
-    mov             sp, r4           /* align sp on 32 bytes */
-    vst1.64         {d8, d9, d10, d11}, [r4, :128]!
-    vst1.64         {d12, d13, d14, d15}, [r4, :128]
-    sub             sp, #0x140       /* reserve 320 bytes */
-    str             r0, [sp, #0x18]  /* working state > sp + Ox18 */
-    add             r4, sp, #0x20    /* r4 = t1 */
-    ldr             lr, [r7, #0x8]   /* lr = dctbl */
-    sub             r10, r1, #0x1    /* r10=buffer-- */
-    ldrsh           r1, [r2]
-    mov             r9, #0x10
-    mov             r8, #0x1
-    adr             r5, jsimd_huff_encode_one_block_neon_consts
-    /* prepare data */
-    vld1.8          {d26}, [r5, :64]
-    veor            q8, q8, q8
-    veor            q9, q9, q9
-    vdup.16         q14, r9
-    vdup.16         q15, r8
-    veor            q10, q10, q10
-    veor            q11, q11, q11
-    sub             r1, r1, r3
-    add             r9, r2, #0x22
-    add             r8, r2, #0x18
-    add             r3, r2, #0x36
-    vmov.16         d0[0], r1
-    vld1.16         {d2[0]}, [r9, :16]
-    vld1.16         {d4[0]}, [r8, :16]
-    vld1.16         {d6[0]}, [r3, :16]
-    add             r1, r2, #0x2
-    add             r9, r2, #0x30
-    add             r8, r2, #0x26
-    add             r3, r2, #0x28
-    vld1.16         {d0[1]}, [r1, :16]
-    vld1.16         {d2[1]}, [r9, :16]
-    vld1.16         {d4[1]}, [r8, :16]
-    vld1.16         {d6[1]}, [r3, :16]
-    add             r1, r2, #0x10
-    add             r9, r2, #0x40
-    add             r8, r2, #0x34
-    add             r3, r2, #0x1a
-    vld1.16         {d0[2]}, [r1, :16]
-    vld1.16         {d2[2]}, [r9, :16]
-    vld1.16         {d4[2]}, [r8, :16]
-    vld1.16         {d6[2]}, [r3, :16]
-    add             r1, r2, #0x20
-    add             r9, r2, #0x32
-    add             r8, r2, #0x42
-    add             r3, r2, #0xc
-    vld1.16         {d0[3]}, [r1, :16]
-    vld1.16         {d2[3]}, [r9, :16]
-    vld1.16         {d4[3]}, [r8, :16]
-    vld1.16         {d6[3]}, [r3, :16]
-    add             r1, r2, #0x12
-    add             r9, r2, #0x24
-    add             r8, r2, #0x50
-    add             r3, r2, #0xe
-    vld1.16         {d1[0]}, [r1, :16]
-    vld1.16         {d3[0]}, [r9, :16]
-    vld1.16         {d5[0]}, [r8, :16]
-    vld1.16         {d7[0]}, [r3, :16]
-    add             r1, r2, #0x4
-    add             r9, r2, #0x16
-    add             r8, r2, #0x60
-    add             r3, r2, #0x1c
-    vld1.16         {d1[1]}, [r1, :16]
-    vld1.16         {d3[1]}, [r9, :16]
-    vld1.16         {d5[1]}, [r8, :16]
-    vld1.16         {d7[1]}, [r3, :16]
-    add             r1, r2, #0x6
-    add             r9, r2, #0x8
-    add             r8, r2, #0x52
-    add             r3, r2, #0x2a
-    vld1.16         {d1[2]}, [r1, :16]
-    vld1.16         {d3[2]}, [r9, :16]
-    vld1.16         {d5[2]}, [r8, :16]
-    vld1.16         {d7[2]}, [r3, :16]
-    add             r1, r2, #0x14
-    add             r9, r2, #0xa
-    add             r8, r2, #0x44
-    add             r3, r2, #0x38
-    vld1.16         {d1[3]}, [r1, :16]
-    vld1.16         {d3[3]}, [r9, :16]
-    vld1.16         {d5[3]}, [r8, :16]
-    vld1.16         {d7[3]}, [r3, :16]
-    vcgt.s16        q8, q8, q0
-    vcgt.s16        q9, q9, q1
-    vcgt.s16        q10, q10, q2
-    vcgt.s16        q11, q11, q3
-    vabs.s16        q0, q0
-    vabs.s16        q1, q1
-    vabs.s16        q2, q2
-    vabs.s16        q3, q3
-    veor            q8, q8, q0
-    veor            q9, q9, q1
-    veor            q10, q10, q2
-    veor            q11, q11, q3
-    add             r9, r4, #0x20
-    add             r8, r4, #0x80
-    add             r3, r4, #0xa0
-    vclz.i16        q0, q0
-    vclz.i16        q1, q1
-    vclz.i16        q2, q2
-    vclz.i16        q3, q3
-    vsub.i16        q0, q14, q0
-    vsub.i16        q1, q14, q1
-    vsub.i16        q2, q14, q2
-    vsub.i16        q3, q14, q3
-    vst1.16         {d0, d1, d2, d3}, [r4, :256]
-    vst1.16         {d4, d5, d6, d7}, [r9, :256]
-    vshl.s16        q0, q15, q0
-    vshl.s16        q1, q15, q1
-    vshl.s16        q2, q15, q2
-    vshl.s16        q3, q15, q3
-    vsub.i16        q0, q0, q15
-    vsub.i16        q1, q1, q15
-    vsub.i16        q2, q2, q15
-    vsub.i16        q3, q3, q15
-    vand            q8, q8, q0
-    vand            q9, q9, q1
-    vand            q10, q10, q2
-    vand            q11, q11, q3
-    vst1.16         {d16, d17, d18, d19}, [r8, :256]
-    vst1.16         {d20, d21, d22, d23}, [r3, :256]
-    add             r1, r2, #0x46
-    add             r9, r2, #0x3a
-    add             r8, r2, #0x74
-    add             r3, r2, #0x6a
-    vld1.16         {d8[0]}, [r1, :16]
-    vld1.16         {d10[0]}, [r9, :16]
-    vld1.16         {d12[0]}, [r8, :16]
-    vld1.16         {d14[0]}, [r3, :16]
-    veor            q8, q8, q8
-    veor            q9, q9, q9
-    veor            q10, q10, q10
-    veor            q11, q11, q11
-    add             r1, r2, #0x54
-    add             r9, r2, #0x2c
-    add             r8, r2, #0x76
-    add             r3, r2, #0x78
-    vld1.16         {d8[1]}, [r1, :16]
-    vld1.16         {d10[1]}, [r9, :16]
-    vld1.16         {d12[1]}, [r8, :16]
-    vld1.16         {d14[1]}, [r3, :16]
-    add             r1, r2, #0x62
-    add             r9, r2, #0x1e
-    add             r8, r2, #0x68
-    add             r3, r2, #0x7a
-    vld1.16         {d8[2]}, [r1, :16]
-    vld1.16         {d10[2]}, [r9, :16]
-    vld1.16         {d12[2]}, [r8, :16]
-    vld1.16         {d14[2]}, [r3, :16]
-    add             r1, r2, #0x70
-    add             r9, r2, #0x2e
-    add             r8, r2, #0x5a
-    add             r3, r2, #0x6c
-    vld1.16         {d8[3]}, [r1, :16]
-    vld1.16         {d10[3]}, [r9, :16]
-    vld1.16         {d12[3]}, [r8, :16]
-    vld1.16         {d14[3]}, [r3, :16]
-    add             r1, r2, #0x72
-    add             r9, r2, #0x3c
-    add             r8, r2, #0x4c
-    add             r3, r2, #0x5e
-    vld1.16         {d9[0]}, [r1, :16]
-    vld1.16         {d11[0]}, [r9, :16]
-    vld1.16         {d13[0]}, [r8, :16]
-    vld1.16         {d15[0]}, [r3, :16]
-    add             r1, r2, #0x64
-    add             r9, r2, #0x4a
-    add             r8, r2, #0x3e
-    add             r3, r2, #0x6e
-    vld1.16         {d9[1]}, [r1, :16]
-    vld1.16         {d11[1]}, [r9, :16]
-    vld1.16         {d13[1]}, [r8, :16]
-    vld1.16         {d15[1]}, [r3, :16]
-    add             r1, r2, #0x56
-    add             r9, r2, #0x58
-    add             r8, r2, #0x4e
-    add             r3, r2, #0x7c
-    vld1.16         {d9[2]}, [r1, :16]
-    vld1.16         {d11[2]}, [r9, :16]
-    vld1.16         {d13[2]}, [r8, :16]
-    vld1.16         {d15[2]}, [r3, :16]
-    add             r1, r2, #0x48
-    add             r9, r2, #0x66
-    add             r8, r2, #0x5c
-    add             r3, r2, #0x7e
-    vld1.16         {d9[3]}, [r1, :16]
-    vld1.16         {d11[3]}, [r9, :16]
-    vld1.16         {d13[3]}, [r8, :16]
-    vld1.16         {d15[3]}, [r3, :16]
-    vcgt.s16        q8, q8, q4
-    vcgt.s16        q9, q9, q5
-    vcgt.s16        q10, q10, q6
-    vcgt.s16        q11, q11, q7
-    vabs.s16        q4, q4
-    vabs.s16        q5, q5
-    vabs.s16        q6, q6
-    vabs.s16        q7, q7
-    veor            q8, q8, q4
-    veor            q9, q9, q5
-    veor            q10, q10, q6
-    veor            q11, q11, q7
-    add             r1, r4, #0x40
-    add             r9, r4, #0x60
-    add             r8, r4, #0xc0
-    add             r3, r4, #0xe0
-    vclz.i16        q4, q4
-    vclz.i16        q5, q5
-    vclz.i16        q6, q6
-    vclz.i16        q7, q7
-    vsub.i16        q4, q14, q4
-    vsub.i16        q5, q14, q5
-    vsub.i16        q6, q14, q6
-    vsub.i16        q7, q14, q7
-    vst1.16         {d8, d9, d10, d11}, [r1, :256]
-    vst1.16         {d12, d13, d14, d15}, [r9, :256]
-    vshl.s16        q4, q15, q4
-    vshl.s16        q5, q15, q5
-    vshl.s16        q6, q15, q6
-    vshl.s16        q7, q15, q7
-    vsub.i16        q4, q4, q15
-    vsub.i16        q5, q5, q15
-    vsub.i16        q6, q6, q15
-    vsub.i16        q7, q7, q15
-    vand            q8, q8, q4
-    vand            q9, q9, q5
-    vand            q10, q10, q6
-    vand            q11, q11, q7
-    vst1.16         {d16, d17, d18, d19}, [r8, :256]
-    vst1.16         {d20, d21, d22, d23}, [r3, :256]
-    ldr             r12, [r7, #0xc]       /* r12 = actbl */
-    add             r1, lr, #0x400        /* r1 = dctbl->ehufsi */
-    mov             r9, r12               /* r9 = actbl */
-    add             r6, r4, #0x80         /* r6 = t2 */
-    ldr             r11, [r0, #0x8]       /* r11 = put_buffer */
-    ldr             r4, [r0, #0xc]        /* r4  = put_bits */
-    ldrh            r2, [r6, #-128]       /* r2  = nbits */
-    ldrh            r3, [r6]              /* r3  = temp2 & (((JLONG)1)<<nbits) - 1; */
-    ldr             r0, [lr, r2, lsl #2]
-    ldrb            r5, [r1, r2]
-    put_bits        r11, r4, r0, r5
-    checkbuf15      r10, r11, r4, r5, r0
-    put_bits        r11, r4, r3, r2
-    checkbuf15      r10, r11, r4, r5, r0
-    mov             lr, r6                /* lr = t2 */
-    add             r5, r9, #0x400        /* r5 = actbl->ehufsi */
-    ldrsb           r6, [r5, #0xf0]       /* r6 = actbl->ehufsi[0xf0] */
-    veor            q8, q8, q8
-    vceq.i16        q0, q0, q8
-    vceq.i16        q1, q1, q8
-    vceq.i16        q2, q2, q8
-    vceq.i16        q3, q3, q8
-    vceq.i16        q4, q4, q8
-    vceq.i16        q5, q5, q8
-    vceq.i16        q6, q6, q8
-    vceq.i16        q7, q7, q8
-    vmovn.i16       d0, q0
-    vmovn.i16       d2, q1
-    vmovn.i16       d4, q2
-    vmovn.i16       d6, q3
-    vmovn.i16       d8, q4
-    vmovn.i16       d10, q5
-    vmovn.i16       d12, q6
-    vmovn.i16       d14, q7
-    vand            d0, d0, d26
-    vand            d2, d2, d26
-    vand            d4, d4, d26
-    vand            d6, d6, d26
-    vand            d8, d8, d26
-    vand            d10, d10, d26
-    vand            d12, d12, d26
-    vand            d14, d14, d26
-    vpadd.i8        d0, d0, d2
-    vpadd.i8        d4, d4, d6
-    vpadd.i8        d8, d8, d10
-    vpadd.i8        d12, d12, d14
-    vpadd.i8        d0, d0, d4
-    vpadd.i8        d8, d8, d12
-    vpadd.i8        d0, d0, d8
-    vmov.32         r1, d0[1]
-    vmov.32         r8, d0[0]
-    mvn             r1, r1
-    mvn             r8, r8
-    lsrs            r1, r1, #0x1
-    rrx             r8, r8            /* shift in last r1 bit while shifting out DC bit */
-    rbit            r1, r1            /* r1 = index1 */
-    rbit            r8, r8            /* r8 = index0 */
-    ldr             r0, [r9, #0x3c0]  /* r0 = actbl->ehufco[0xf0] */
-    str             r1, [sp, #0x14]   /* index1 > sp + 0x14 */
-    cmp             r8, #0x0
-    beq             6f
-1:
-    clz             r2, r8
-    add             lr, lr, r2, lsl #1
-    lsl             r8, r8, r2
-    ldrh            r1, [lr, #-126]
-2:
-    cmp             r2, #0x10
-    blt             3f
-    sub             r2, r2, #0x10
-    put_bits        r11, r4, r0, r6
-    cmp             r4, #0x10
-    blt             2b
-    eor             r3, r3, r3
-    emit_byte       r10, r11, r4, r3, r12
-    emit_byte       r10, r11, r4, r3, r12
-    b               2b
-3:
-    add             r2, r1, r2, lsl #4
-    ldrh            r3, [lr, #2]!
-    ldr             r12, [r9, r2, lsl #2]
-    ldrb            r2, [r5, r2]
-    put_bits        r11, r4, r12, r2
-    checkbuf15      r10, r11, r4, r2, r12
-    put_bits        r11, r4, r3, r1
-    checkbuf15      r10, r11, r4, r2, r12
-    lsls            r8, r8, #0x1
-    bne             1b
-6:
-    add             r12, sp, #0x20   /* r12 = t1 */
-    ldr             r8, [sp, #0x14]  /* r8 = index1 */
-    adds            r12, #0xc0       /* r12 = t2 + (DCTSIZE2/2) */
-    cmp             r8, #0x0
-    beq             6f
-    clz             r2, r8
-    sub             r12, r12, lr
-    lsl             r8, r8, r2
-    add             r2, r2, r12, lsr #1
-    add             lr, lr, r2, lsl #1
-    b               7f
-1:
-    clz             r2, r8
-    add             lr, lr, r2, lsl #1
-    lsl             r8, r8, r2
-7:
-    ldrh            r1, [lr, #-126]
-2:
-    cmp             r2, #0x10
-    blt             3f
-    sub             r2, r2, #0x10
-    put_bits        r11, r4, r0, r6
-    cmp             r4, #0x10
-    blt             2b
-    eor             r3, r3, r3
-    emit_byte       r10, r11, r4, r3, r12
-    emit_byte       r10, r11, r4, r3, r12
-    b               2b
-3:
-    add             r2, r1, r2, lsl #4
-    ldrh            r3, [lr, #2]!
-    ldr             r12, [r9, r2, lsl #2]
-    ldrb            r2, [r5, r2]
-    put_bits        r11, r4, r12, r2
-    checkbuf15      r10, r11, r4, r2, r12
-    put_bits        r11, r4, r3, r1
-    checkbuf15      r10, r11, r4, r2, r12
-    lsls            r8, r8, #0x1
-    bne             1b
-6:
-    add             r0, sp, #0x20
-    add             r0, #0xfe
-    cmp             lr, r0
-    bhs             1f
-    ldr             r1, [r9]
-    ldrb            r0, [r5]
-    put_bits        r11, r4, r1, r0
-    checkbuf15      r10, r11, r4, r0, r1
-1:
-    ldr             r12, [sp, #0x18]
-    str             r11, [r12, #0x8]
-    str             r4, [r12, #0xc]
-    add             r0, r10, #0x1
-    add             r4, sp, #0x140
-    vld1.64         {d8, d9, d10, d11}, [r4, :128]!
-    vld1.64         {d12, d13, d14, d15}, [r4, :128]
-    sub             r4, r7, #0x1c
-    mov             sp, r4
-    pop             {r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-.purgem emit_byte
-.purgem put_bits
-.purgem checkbuf15
diff --git a/external/jpeg/simd/arm/neon-compat.h.in b/external/jpeg/simd/arm/neon-compat.h.in
new file mode 100644
index 000000000000..436c402a1778
--- /dev/null
+++ b/external/jpeg/simd/arm/neon-compat.h.in
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+#cmakedefine HAVE_VLD1_S16_X3
+#cmakedefine HAVE_VLD1_U16_X2
+#cmakedefine HAVE_VLD1Q_U8_X4
+
+/* Define compiler-independent count-leading-zeros and byte-swap macros */
+#if defined(_MSC_VER) && !defined(__clang__)
+#define BUILTIN_CLZ(x)  _CountLeadingZeros(x)
+#define BUILTIN_CLZLL(x)  _CountLeadingZeros64(x)
+#define BUILTIN_BSWAP32(x)  _byteswap_ulong(x)
+#define BUILTIN_BSWAP64(x)  _byteswap_uint64(x)
+#elif defined(__clang__) || defined(__GNUC__)
+#define BUILTIN_CLZ(x)  __builtin_clz(x)
+#define BUILTIN_CLZLL(x)  __builtin_clzll(x)
+#define BUILTIN_BSWAP32(x)  __builtin_bswap32(x)
+#define BUILTIN_BSWAP64(x)  __builtin_bswap64(x)
+#else
+#error "Unknown compiler"
+#endif
diff --git a/external/jpeg/simd/gas-preprocessor.in b/external/jpeg/simd/gas-preprocessor.in
deleted file mode 100644
index 560f788b5541..000000000000
--- a/external/jpeg/simd/gas-preprocessor.in
+++ /dev/null
@@ -1 +0,0 @@
-gas-preprocessor.pl @CMAKE_ASM_COMPILER@ ${1+"$@"}
diff --git a/external/jpeg/simd/i386/jchuff-sse2.asm b/external/jpeg/simd/i386/jchuff-sse2.asm
index 79f0ca52cc7d..278cf5e83af3 100644
--- a/external/jpeg/simd/i386/jchuff-sse2.asm
+++ b/external/jpeg/simd/i386/jchuff-sse2.asm
@@ -1,8 +1,9 @@
 ;
 ; jchuff-sse2.asm - Huffman entropy encoding (SSE2)
 ;
-; Copyright (C) 2009-2011, 2014-2017, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2017, 2019, D. R. Commander.
 ; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,134 +16,255 @@
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
+; The following code is based on jchuff.c; see jchuff.c for more details.
 
 %include "jsimdext.inc"
 
+struc working_state
+.next_output_byte:   resp 1     ; => next byte to write in buffer
+.free_in_buffer:     resp 1     ; # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer
+.cur.free_bits       resd 1     ; # of bits available in it
+.cur.last_dc_val     resd 4     ; last DC coef for each component
+.cinfo:              resp 1     ; dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol
+.ehufsi:             resb 256   ; length of code for each symbol
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
-    alignz      32
     GLOBAL_DATA(jconst_huff_encode_one_block)
 
 EXTN(jconst_huff_encode_one_block):
 
-%include "jpeg_nbits_table.inc"
+    alignz      32
+
+jpeg_mask_bits dq 0x0000, 0x0001, 0x0003, 0x0007
+               dq 0x000f, 0x001f, 0x003f, 0x007f
+               dq 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dq 0x0fff, 0x1fff, 0x3fff, 0x7fff
+
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
 
     alignz      32
 
+%ifdef PIC
+%define NBITS(x)      nbits_base + x
+%else
+%define NBITS(x)      jpeg_nbits_table + x
+%endif
+%define MASK_BITS(x)  NBITS((x) * 8) + (jpeg_mask_bits - jpeg_nbits_table)
+
 ; --------------------------------------------------------------------------
     SECTION     SEG_TEXT
     BITS        32
 
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
-    sub         put_bits, 8             ; put_bits -= 8;
-    mov         edx, put_buffer
-    mov         ecx, put_bits
-    shr         edx, cl                 ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-    mov         byte [eax], dl          ; *buffer++ = c;
-    add         eax, 1
-    cmp         dl, 0xFF                ; need to stuff a zero byte?
-    jne         %%.EMIT_BYTE_END
-    mov         byte [eax], 0           ; *buffer++ = 0;
-    add         eax, 1
-%%.EMIT_BYTE_END:
-%endmacro
-
-%macro PUT_BITS 1
-    add         put_bits, ecx           ; put_bits += size;
-    shl         put_buffer, cl          ; put_buffer = (put_buffer << size);
-    or          put_buffer, %1
+%define mm_put_buffer     mm0
+%define mm_all_0xff       mm1
+%define mm_temp           mm2
+%define mm_nbits          mm3
+%define mm_code_bits      mm3
+%define mm_code           mm4
+%define mm_overflow_bits  mm5
+%define mm_save_nbits     mm6
+
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values, or
+;      mmN treated as eight unsigned 8-bit values
+; bN[i]:  perform the same operation on all unsigned 8-bit values,
+;         i=0..15 (SSE register) or i=0..7 (MMX register)
+; Contents of SIMD registers are shown in memory order.
+
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - temp register
+; %2 - low byte of temp register
+; %3 - second byte of temp register
+; %4-%8 (optional) - extra instructions to execute before the macro completes
+; %9 - the label to which to jump when the macro completes
+;
+; Upon completion, free_bits will be set to the number of remaining bits from
+; code, and put_buffer will contain those remaining bits.  temp and code will
+; be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 9
+%define %%temp   %1
+%define %%tempb  %2
+%define %%temph  %3
+    add         nbits, free_bits             ; nbits += free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    movq        mm_temp, mm_code             ; temp = code;
+    movd        mm_nbits, nbits              ; nbits --> MMX register
+    movd        mm_overflow_bits, free_bits  ; overflow_bits (temp register) = free_bits;
+    neg         free_bits                    ; free_bits = -free_bits;
+    psllq       mm_put_buffer, mm_nbits      ; put_buffer <<= nbits;
+    psrlq       mm_temp, mm_overflow_bits    ; temp >>= overflow_bits;
+    add         free_bits, 64                ; free_bits += 64;
+    por         mm_temp, mm_put_buffer       ; temp |= put_buffer;
+%ifidn %%temp, nbits_base
+    movd        mm_save_nbits, nbits_base    ; save nbits_base
+%endif
+    movq        mm_code_bits, mm_temp        ; code_bits (temp register) = temp;
+    movq        mm_put_buffer, mm_code       ; put_buffer = code;
+    pcmpeqb     mm_temp, mm_all_0xff         ; b_temp[i] = (b_temp[i] == 0xFF ? 0xFF : 0);
+    movq        mm_code, mm_code_bits        ; code = code_bits;
+    psrlq       mm_code_bits, 32             ; code_bits >>= 32;
+    pmovmskb    nbits, mm_temp               ; nbits = 0;  nbits |= ((b_temp[i] >> 7) << i);
+    movd        %%temp, mm_code_bits         ; temp = code_bits;
+    bswap       %%temp                       ; temp = htonl(temp);
+    test        nbits, nbits                 ; if (nbits != 0)  /* Some 0xFF bytes */
+    jnz         %%.SLOW                      ;   goto %%.SLOW
+    mov         dword [buffer], %%temp       ; *(uint32_t *)buffer = temp;
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits    ; restore nbits_base
+%endif
+    %4
+    movd        nbits, mm_code               ; nbits = (uint32_t)(code);
+    %5
+    bswap       nbits                        ; nbits = htonl(nbits);
+    mov         dword [buffer + 4], nbits    ; *(uint32_t *)(buffer + 4) = nbits;
+    lea         buffer, [buffer + 8]         ; buffer += 8;
+    %6
+    %7
+    %8
+    jmp %9                                   ; return
+%%.SLOW:
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         %%temp, 16                 ; temp >>= 16;
+    mov         byte [buffer], %%tempb     ; buffer[0] = temp[0];
+    cmp         %%tempb, 0xFF              ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], %%temph     ; buffer[0] = temp[1];
+    cmp         %%temph, 0xFF              ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    movd        nbits, mm_code             ; nbits (temp register) = (uint32_t)(code)
+%ifidn %%temp, nbits_base
+    movd        nbits_base, mm_save_nbits  ; restore nbits_base
+%endif
+    bswap       nbits                      ; nbits = htonl(nbits)
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    shr         nbits, 16                  ; nbits >>= 16;
+    mov         byte [buffer], nbitsb      ; buffer[0] = nbits[0];
+    cmp         nbitsb, 0xFF               ; Set CF if nbits[0] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], nbitsh      ; buffer[0] = nbits[1];
+    %4
+    cmp         nbitsh, 0xFF               ; Set CF if nbits[1] < 0xFF
+    mov         byte [buffer+1], 0         ; buffer[1] = 0;
+    sbb         buffer, -2                 ; buffer -= (-2 + (nbits[1] < 0xFF ? 1 : 0));
+    %5
+    %6
+    %7
+    %8
+    jmp %9                                 ; return;
 %endmacro
 
-%macro CHECKBUF15 0
-    cmp         put_bits, 16            ; if (put_bits > 31) {
-    jl          %%.CHECKBUF15_END
-    mov         eax, POINTER [esp+buffer]
-    EMIT_BYTE
-    EMIT_BYTE
-    mov         POINTER [esp+buffer], eax
-%%.CHECKBUF15_END:
+%macro PUSH 1
+    push        %1
+%assign stack_offset  stack_offset + 4
 %endmacro
 
-%macro EMIT_BITS 1
-    PUT_BITS    %1
-    CHECKBUF15
+%macro POP 1
+    pop         %1
+%assign stack_offset  stack_offset - 4
 %endmacro
 
-%macro kloop_prepare 37                 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor        xmm4, xmm4              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm5, xmm5              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm6, xmm6              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm7, xmm7              ; __m128i neg = _mm_setzero_si128();
-    pinsrw      %34, word [esi + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw      %35, word [esi + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw      %36, word [esi + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw      %37, word [esi + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw      %34, word [esi + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw      %35, word [esi + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw      %36, word [esi + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw      %37, word [esi + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw      %34, word [esi + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw      %35, word [esi + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw      %36, word [esi + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw      %37, word [esi + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw      %34, word [esi + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw      %35, word [esi + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw      %36, word [esi + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw      %37, word [esi + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw      %34, word [esi + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw      %35, word [esi + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw      %36, word [esi + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw      %37, word [esi + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw      %34, word [esi + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw      %35, word [esi + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw      %36, word [esi + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw      %37, word [esi + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw      %34, word [esi + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw      %35, word [esi + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw      %36, word [esi + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw      %37, word [esi + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw      %34, word [esi + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw      %35, word [esi + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw      %36, word [esi + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw      %37, word [esi + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
+; If PIC is defined, load the address of a symbol defined in this file into a
+; register.  Equivalent to
+;   get_GOT     %1
+;   lea         %1, [GOTOFF(%1, %2)]
+; without using the GOT.
+;
+; Usage:
+; %1 - register into which to load the address of the symbol
+; %2 - symbol whose address should be loaded
+; %3 - optional multi-line macro to execute before the symbol address is loaded
+; %4 - optional multi-line macro to execute after the symbol address is loaded
+;
+; If PIC is not defined, then %3 and %4 are executed in order.
+
+%macro GET_SYM 2-4
+%ifdef PIC
+    call        %%.geteip
+%%.ref:
+    %4
+    add         %1, %2 - %%.ref
+    jmp         short %%.done
+    align       32
+%%.geteip:
+    %3          4               ; must adjust stack pointer because of call
+    mov         %1, POINTER [esp]
+    ret
+    align       32
+%%.done:
 %else
-    pinsrw      %37, ecx, 7             ; xmm_shadow[31] = block[jno31];
+    %3          0
+    %4
 %endif
-    pcmpgtw     xmm4, %34               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm5, %35               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm6, %36               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm7, %37               ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw       %34, xmm4               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %35, xmm5               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %36, xmm6               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %37, xmm7               ; x1 = _mm_add_epi16(x1, neg);
-    pxor        %34, xmm4               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %35, xmm5               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %36, xmm6               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %37, xmm7               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        xmm4, %34               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm5, %35               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm6, %36               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm7, %37               ; neg = _mm_xor_si128(neg, x1);
-    movdqa      XMMWORD [esp + t1 + %1 * SIZEOF_WORD], %34          ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 8) * SIZEOF_WORD], %35    ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 16) * SIZEOF_WORD], %36   ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa      XMMWORD [esp + t1 + (%1 + 24) * SIZEOF_WORD], %37   ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa      XMMWORD [esp + t2 + %1 * SIZEOF_WORD], xmm4         ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 8) * SIZEOF_WORD], xmm5   ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 16) * SIZEOF_WORD], xmm6  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa      XMMWORD [esp + t2 + (%1 + 24) * SIZEOF_WORD], xmm7  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
 %endmacro
 
 ;
@@ -153,272 +275,487 @@ EXTN(jconst_huff_encode_one_block):
 ;                                  JCOEFPTR block, int last_dc_val,
 ;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 ;
-
-; eax + 8 = working_state *state
-; eax + 12 = JOCTET *buffer
-; eax + 16 = JCOEFPTR block
-; eax + 20 = int last_dc_val
-; eax + 24 = c_derived_tbl *dctbl
-; eax + 28 = c_derived_tbl *actbl
-
-%define pad         6 * SIZEOF_DWORD    ; Align to 16 bytes
-%define t1          pad
-%define t2          t1 + (DCTSIZE2 * SIZEOF_WORD)
-%define block       t2 + (DCTSIZE2 * SIZEOF_WORD)
-%define actbl       block + SIZEOF_DWORD
-%define buffer      actbl + SIZEOF_DWORD
-%define temp        buffer + SIZEOF_DWORD
-%define temp2       temp + SIZEOF_DWORD
-%define temp3       temp2 + SIZEOF_DWORD
-%define temp4       temp3 + SIZEOF_DWORD
-%define temp5       temp4 + SIZEOF_DWORD
-%define gotptr      temp5 + SIZEOF_DWORD  ; void *gotptr
-%define put_buffer  ebx
-%define put_bits    edi
+; Stack layout:
+; Function args
+; Return address
+; Saved ebx
+; Saved ebp
+; Saved esi
+; Saved edi <-- esp_save
+; ...
+; esp_save
+; t_ 64*2 bytes (aligned to 128 bytes)
+;
+; esp is used (as t) to point into t_ (data at lower indices is not used once
+; esp passes over it, so this is signal-safe.)  Aligning t_ to 128 bytes lets
+; us find the rest of the data again by masking the low bits of the pointer.
+;
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; eax - frame --> buffer
+; ebx - nbits_base (PIC) / emit_temp
+; ecx - dctbl --> size --> state
+; edx - block --> nbits
+; esi - code_temp --> state --> actbl
+; edi - index_temp --> free_bits
+; esp - t
+; ebp - index
+
+%define frame       eax
+%ifdef PIC
+%define nbits_base  ebx
+%endif
+%define emit_temp   ebx
+%define emit_tempb  bl
+%define emit_temph  bh
+%define dctbl       ecx
+%define block       edx
+%define code_temp   esi
+%define index_temp  edi
+%define t           esp
+%define index       ebp
+
+%assign save_frame  DCTSIZE2 * SIZEOF_WORD
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
 
     align       32
     GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
 
 EXTN(jsimd_huff_encode_one_block_sse2):
-    push        ebp
-    mov         eax, esp                     ; eax = original ebp
-    sub         esp, byte 4
-    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [esp], eax
-    mov         ebp, esp                     ; ebp = aligned ebp
-    sub         esp, temp5+9*SIZEOF_DWORD-pad
-    push        ebx
-    push        ecx
-;   push        edx                     ; need not be preserved
-    push        esi
-    push        edi
-    push        ebp
-
-    mov         esi, POINTER [eax+8]       ; (working_state *state)
-    mov         put_buffer, dword [esi+8]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits, dword [esi+12]   ; put_bits = state->cur.put_bits;
-    push        esi                        ; esi is now scratch
-
-    get_GOT     edx                        ; get GOT address
-    movpic      POINTER [esp+gotptr], edx  ; save GOT address
-
-    mov         ecx, POINTER [eax+28]
-    mov         edx, POINTER [eax+16]
-    mov         esi, POINTER [eax+12]
-    mov         POINTER [esp+actbl], ecx
-    mov         POINTER [esp+block], edx
-    mov         POINTER [esp+buffer], esi
-
-    ; Encode the DC coefficient difference per section F.1.2.1
-    mov         esi, POINTER [esp+block]  ; block
-    movsx       ecx, word [esi]           ; temp = temp2 = block[0] - last_dc_val;
-    sub         ecx, dword [eax+20]
-    mov         esi, ecx
-
-    ; This is a well-known technique for obtaining the absolute value
-    ; with out a branch.  It is derived from an assembly language technique
-    ; presented in "How to Optimize for the Pentium Processors",
-    ; Copyright (c) 1996, 1997 by Agner Fog.
-    mov         edx, ecx
-    sar         edx, 31                 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-    xor         ecx, edx                ; temp ^= temp3;
-    sub         ecx, edx                ; temp -= temp3;
-
-    ; For a negative input, want temp2 = bitwise complement of abs(input)
-    ; This code assumes we are on a two's complement machine
-    add         esi, edx                ; temp2 += temp3;
-    mov         dword [esp+temp], esi   ; backup temp2 in temp
-
-    ; Find the number of bits needed for the magnitude of the coefficient
-    movpic      ebp, POINTER [esp+gotptr]                        ; load GOT address (ebp)
-    movzx       edx, byte [GOTOFF(ebp, jpeg_nbits_table + ecx)]  ; nbits = JPEG_NBITS(temp);
-    mov         dword [esp+temp2], edx                           ; backup nbits in temp2
-
-    ; Emit the Huffman-coded symbol for the number of bits
-    mov         ebp, POINTER [eax+24]         ; After this point, arguments are not accessible anymore
-    mov         eax,  INT [ebp + edx * 4]     ; code = dctbl->ehufco[nbits];
-    movzx       ecx, byte [ebp + edx + 1024]  ; size = dctbl->ehufsi[nbits];
-    EMIT_BITS   eax                           ; EMIT_BITS(code, size)
-
-    mov         ecx, dword [esp+temp2]        ; restore nbits
-
-    ; Mask off any extra bits in code
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, dword [esp+temp]   ; temp2 &= (((JLONG)1)<<nbits) - 1;
-
-    ; Emit that number of bits of the value, if positive,
-    ; or the complement of its magnitude, if negative.
-    EMIT_BITS   eax                     ; EMIT_BITS(temp2, nbits)
-
-    ; Prepare data
-    xor         ecx, ecx
-    mov         esi, POINTER [esp+block]
-    kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                   18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                   27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                   xmm0, xmm1, xmm2, xmm3
-    kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                   30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                   53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                   xmm0, xmm1, xmm2, xmm3
-
-    pxor        xmm7, xmm7
-    movdqa      xmm0, XMMWORD [esp + t1 + 0 * SIZEOF_WORD]   ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-    movdqa      xmm1, XMMWORD [esp + t1 + 8 * SIZEOF_WORD]   ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-    movdqa      xmm2, XMMWORD [esp + t1 + 16 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-    movdqa      xmm3, XMMWORD [esp + t1 + 24 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-    pmovmskb    edx, xmm0               ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-    pmovmskb    ecx, xmm2               ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-    shl         ecx, 16
-    or          edx, ecx
-    not         edx                     ; index = ~index;
-
-    lea         esi, [esp+t1]
-    mov         ebp, POINTER [esp+actbl]  ; ebp = actbl
-
-.BLOOP:
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP
-    lea         esi, [esi+ecx*2]        ; k += r;
-    shr         edx, cl                 ; index >>= r;
-    mov         dword [esp+temp3], edx
-.BRLOOP:
-    cmp         ecx, 16                       ; while (r > 15) {
-    jl          near .ERLOOP
-    sub         ecx, 16                       ; r -= 16;
-    mov         dword [esp+temp], ecx
-    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
-    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-    EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, dword [esp+temp]
-    jmp         .BRLOOP
-.ERLOOP:
-    movsx       eax, word [esi]                                  ; temp = t1[k];
-    movpic      edx, POINTER [esp+gotptr]                        ; load GOT address (edx)
-    movzx       eax, byte [GOTOFF(edx, jpeg_nbits_table + eax)]  ; nbits = JPEG_NBITS(temp);
-    mov         dword [esp+temp2], eax
-    ; Emit Huffman symbol for run length / number of bits
-    shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
-    add         ecx, eax
-    mov         eax,  INT [ebp + ecx * 4]     ; code = actbl->ehufco[temp3];
-    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-    EMIT_BITS   eax
-
-    movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
-    ; Mask off any extra bits in code
-    mov         ecx, dword [esp+temp2]
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-    EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, dword [esp+temp3]
-    add         esi, 2                  ; ++k;
-    shr         edx, 1                  ; index >>= 1;
-
-    jmp         .BLOOP
-.ELOOP:
-    movdqa      xmm0, XMMWORD [esp + t1 + 32 * SIZEOF_WORD]  ; __m128i tmp0 = _mm_loadu_si128((__m128i *)(t1 + 0));
-    movdqa      xmm1, XMMWORD [esp + t1 + 40 * SIZEOF_WORD]  ; __m128i tmp1 = _mm_loadu_si128((__m128i *)(t1 + 8));
-    movdqa      xmm2, XMMWORD [esp + t1 + 48 * SIZEOF_WORD]  ; __m128i tmp2 = _mm_loadu_si128((__m128i *)(t1 + 16));
-    movdqa      xmm3, XMMWORD [esp + t1 + 56 * SIZEOF_WORD]  ; __m128i tmp3 = _mm_loadu_si128((__m128i *)(t1 + 24));
-    pcmpeqw     xmm0, xmm7              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-    pcmpeqw     xmm1, xmm7              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-    pcmpeqw     xmm2, xmm7              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-    pcmpeqw     xmm3, xmm7              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-    pmovmskb    edx, xmm0               ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-    pmovmskb    ecx, xmm2               ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-    shl         ecx, 16
-    or          edx, ecx
-    not         edx                     ; index = ~index;
-
-    lea         eax, [esp + t1 + (DCTSIZE2/2) * 2]
-    sub         eax, esi
-    shr         eax, 1
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP2
-    shr         edx, cl                 ; index >>= r;
-    add         ecx, eax
-    lea         esi, [esi+ecx*2]        ; k += r;
-    mov         dword [esp+temp3], edx
-    jmp         .BRLOOP2
-.BLOOP2:
-    bsf         ecx, edx                ; r = __builtin_ctzl(index);
-    jz          near .ELOOP2
-    lea         esi, [esi+ecx*2]        ; k += r;
-    shr         edx, cl                 ; index >>= r;
-    mov         dword [esp+temp3], edx
-.BRLOOP2:
-    cmp         ecx, 16                       ; while (r > 15) {
-    jl          near .ERLOOP2
-    sub         ecx, 16                       ; r -= 16;
-    mov         dword [esp+temp], ecx
-    mov         eax, INT [ebp + 240 * 4]      ; code_0xf0 = actbl->ehufco[0xf0];
-    movzx       ecx, byte [ebp + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-    EMIT_BITS   eax                           ; EMIT_BITS(code_0xf0, size_0xf0)
-    mov         ecx, dword [esp+temp]
-    jmp         .BRLOOP2
-.ERLOOP2:
-    movsx       eax, word [esi]         ; temp = t1[k];
-    bsr         eax, eax                ; nbits = 32 - __builtin_clz(temp);
-    inc         eax
-    mov         dword [esp+temp2], eax
-    ; Emit Huffman symbol for run length / number of bits
-    shl         ecx, 4                        ; temp3 = (r << 4) + nbits;
-    add         ecx, eax
-    mov         eax,  INT [ebp + ecx * 4]     ; code = actbl->ehufco[temp3];
-    movzx       ecx, byte [ebp + ecx + 1024]  ; size = actbl->ehufsi[temp3];
-    EMIT_BITS   eax
-
-    movsx       edx, word [esi+DCTSIZE2*2]    ; temp2 = t2[k];
-    ; Mask off any extra bits in code
-    mov         ecx, dword [esp+temp2]
-    mov         eax, 1
-    shl         eax, cl
-    dec         eax
-    and         eax, edx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-    EMIT_BITS   eax                     ; PUT_BITS(temp2, nbits)
-    mov         edx, dword [esp+temp3]
-    add         esi, 2                  ; ++k;
-    shr         edx, 1                  ; index >>= 1;
-
-    jmp         .BLOOP2
-.ELOOP2:
-    ; If the last coef(s) were zero, emit an end-of-block code
-    lea         edx, [esp + t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-    cmp         edx, esi                            ; if (r > 0) {
-    je          .EFN
-    mov         eax,  INT [ebp]                     ; code = actbl->ehufco[0];
-    movzx       ecx, byte [ebp + 1024]              ; size = actbl->ehufsi[0];
-    EMIT_BITS   eax
-.EFN:
-    mov         eax, [esp+buffer]
-    pop         esi
-    ; Save put_buffer & put_bits
-    mov         dword [esi+8], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         dword [esi+12], put_bits   ; state->cur.put_bits = put_bits;
-
-    pop         ebp
-    pop         edi
-    pop         esi
-;   pop         edx                     ; need not be preserved
-    pop         ecx
-    pop         ebx
-    mov         esp, ebp                ; esp <- aligned ebp
-    pop         esp                     ; esp <- original ebp
-    pop         ebp
+
+%assign stack_offset      0
+%define arg_state         4 + stack_offset
+%define arg_buffer        8 + stack_offset
+%define arg_block        12 + stack_offset
+%define arg_last_dc_val  16 + stack_offset
+%define arg_dctbl        20 + stack_offset
+%define arg_actbl        24 + stack_offset
+
+                                                          ;X: X = code stream
+    mov         block, [esp + arg_block]
+    PUSH        ebx
+    PUSH        ebp
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    PUSH        esi
+    PUSH        edi
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    mov         frame, esp
+    lea         t, [frame - (save_frame + 4)]
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    and         t, -DCTSIZE2 * SIZEOF_WORD                                             ; t = &t_[0]
+    mov         [t + save_frame], frame
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
+
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movsx       code_temp, word [block]                   ;Z:     code_temp = block[0];
+
+; GET_SYM_BEFORE (3rd argument of the GET_SYM invocation below): %1 - stack pointer adjustment, a byte offset added to the t-relative store address
+%macro GET_SYM_BEFORE 1
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD + %1], xmm2
+                                                          ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+    sub         code_temp, [frame + arg_last_dc_val]      ;Z:     code_temp -= last_dc_val;
+
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    index_temp, xmm2                          ;Z:     index_temp = 0;  index_temp |= ((b2[i] >> 7) << i);
+    pmovmskb    index, xmm0                               ;Z:     index = 0;  index |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         index_temp, 16                            ;Z:     index_temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    or          index, index_temp                         ;Z:     index |= index_temp;  /* bit i set <=> t[i] == 0 (rows A-D) */
+%undef index_temp
+%define free_bits  edi
+%endmacro
+; GET_SYM_AFTER (4th argument of the GET_SYM invocation below): continues the row E-H shuffles, inverts index, loads dctbl and sets mm_all_0xff to all ones
+%macro GET_SYM_AFTER 0
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    not         index                                     ;Z:     index = ~index;  /* bit i set <=> t[i] != 0 */
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    mov         dctbl, [frame + arg_dctbl]                ;Z:     dctbl = dctbl argument (read from the stack frame);
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    pcmpeqw     mm_all_0xff, mm_all_0xff                  ;Z:     all_0xff[i] = 0xFF;
+%endmacro
+
+    GET_SYM     nbits_base, jpeg_nbits_table, GET_SYM_BEFORE, GET_SYM_AFTER
+
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    cmp         code_temp, 1 << 31                        ;Z:     Set CF if code_temp < 0x80000000,
+                                                          ;Z:     i.e. if code_temp is non-negative
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    adc         code_temp, -1                             ;Z:     code_temp += -1 + (code_temp >= 0 ? 1 : 0);
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    movd        mm_temp, code_temp                        ;Z:     temp = code_temp
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    lea         t, [t - SIZEOF_WORD]                      ;Z:     t = &t[-1]
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + (40+1) * SIZEOF_WORD], xmm1  ;F: t[40+i] = w1[i];
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define nbits  edx
+%define nbitsb  dl
+%define nbitsh  dh
+    movzx       nbits, byte [NBITS(code_temp)]            ;Z:     nbits = JPEG_NBITS(code_temp);
+%undef code_temp
+%define state  esi
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    mov         state, [frame + arg_state]
+    movd        mm_nbits, nbits                           ;Z:     nbits --> MMX register
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    movd        mm_code, dword [dctbl + c_derived_tbl.ehufco + nbits * 4]
+                                                          ;Z:     code = dctbl->ehufco[nbits];
+%define size  ecx
+%define sizeb  cl
+%define sizeh  ch
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    movaps      XMMWORD [t + (32+1) * SIZEOF_WORD], xmm5  ;E: t[32+i] = w5[i];
+    movzx       size, byte [dctbl + c_derived_tbl.ehufsi + nbits]
+                                                          ;Z:     size = dctbl->ehufsi[nbits];
+%undef dctbl
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    movq        mm_put_buffer, [state + working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    mov         free_bits, [state + working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+%undef state
+%define actbl  esi
+    mov         actbl, [frame + arg_actbl]
+%define buffer  eax
+    mov         buffer, [frame + arg_buffer]
+%undef frame
+    jmp        .BEGIN
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+; size <= 32, so this is not really a loop
+.BRLOOP1:                                                 ; .BRLOOP1:
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ; nbits = actbl->ehufsi[0xf0];
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ; code = actbl->ehufco[0xf0];
+    and         index, 0x7ffffff                          ; clear index if size == 32
+    sub         size, 16                                  ; size -= 16;
+    sub         free_bits, nbits                          ; if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP1                             ;   goto .EMIT_BRLOOP1;
+    movd        mm_nbits, nbits                           ; nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ; put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ; put_buffer |= code;
+    jmp         .ERLOOP1                                  ; goto .ERLOOP1;
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+%ifdef PIC
+    times 6     nop
+%else
+    times 2     nop
+%endif
+.BLOOP1:                                                  ; do {  /* size = # of zero bits/elements to skip */
+; If size == 32, the shift leaves index unchanged.  Corrected in .BRLOOP1.
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP1                                  ;     goto .BRLOOP1;
+.ERLOOP1:                                                 ; .ERLOOP1:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+%ifdef PIC
+    add         size, size                                ;   size += size;
+%else
+    lea         size, [size * 2]                          ;   size += size;
+%endif
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+.BEGIN:                                                   ; .BEGIN:
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    add         nbits, size                               ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP1                             ;     insert code, flush buffer, init size, goto .BLOOP1
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP1                                   ; } while (index != 0);
+; Round 2
+; t points to the last used word, possibly below t_ if the previous index had 32 zero bits.
+.ELOOP1:                                                  ; .ELOOP1:
+    pmovmskb    size, xmm4                                ; size = 0;  size |= ((b4[i] >> 7) << i);
+    pmovmskb    index, xmm5                               ; index = 0;  index |= ((b5[i] >> 7) << i);
+    shl         size, 16                                  ; size <<= 16;
+    or          index, size                               ; index |= size;
+    not         index                                     ; index = ~index;
+    lea         nbits, [t + (1 + DCTSIZE2) * SIZEOF_WORD]
+                                                          ; nbits = t + 1 + 64;
+    and         nbits, -DCTSIZE2 * SIZEOF_WORD            ; nbits &= -128;  /* now points to &t_[64] */
+    sub         nbits, t                                  ; nbits -= t;
+    shr         nbits, 1                                  ; nbits >>= 1;  /* # of leading 0 bits in old index + 33 */
+    tzcnt       size, index                               ; size = # of trailing 0 bits in index
+    inc         size                                      ; ++size;
+    test        index, index                              ; if (index == 0)
+    jz          .ELOOP2                                   ;   goto .ELOOP2;
+; NOTE: size == 32 cannot happen, since the last element is always 0.
+    shr         index, sizeb                              ; index >>= size;
+    lea         size, [size + nbits - 33]                 ; size = size + nbits - 33;
+    lea         t, [t + size * SIZEOF_WORD]               ; t += size;
+    cmp         size, 16                                  ; if (size <= 16)
+    jle         .ERLOOP2                                  ;   goto .ERLOOP2;
+.BRLOOP2:                                                 ; do {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    sub         size, 16                                  ;   size -= 16;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP2                             ;     insert code and flush put_buffer
+    movd        mm_nbits, nbits                           ;   else { nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+    cmp         size, 16                                  ;     if (size <= 16)
+    jle        .ERLOOP2                                   ;       goto .ERLOOP2;
+    jmp        .BRLOOP2                                   ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align      16
+.BLOOP2:                                                  ; do {  /* size = # of zero bits/elements to skip */
+    shr         index, sizeb                              ;   index >>= size;
+    lea         t, [t + size * SIZEOF_WORD]               ;   t += size;
+    cmp         size, 16                                  ;   if (size > 16)
+    jg          .BRLOOP2                                  ;     goto .BRLOOP2;
+.ERLOOP2:                                                 ; .ERLOOP2:
+    movsx       nbits, word [t]                           ;   nbits = *t;
+    add         size, size                                ;   size += size;
+    movd        mm_temp, nbits                            ;   temp = nbits;
+    movzx       nbits, byte [NBITS(nbits)]                ;   nbits = JPEG_NBITS(nbits);
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    lea         size, [size * 8 + nbits]                  ;   size = size * 8 + nbits;
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + (size - 16) * 4]
+                                                          ;   code = actbl->ehufco[size-16];
+    movzx       size, byte [actbl + c_derived_tbl.ehufsi + (size - 16)]
+                                                          ;   size = actbl->ehufsi[size-16];
+    psllq       mm_code, mm_nbits                         ;   code <<= nbits;
+    pand        mm_temp, [MASK_BITS(nbits)]               ;   temp &= (1 << nbits) - 1;
+    lea         nbits, [nbits + size]                     ;   nbits += size;
+    por         mm_code, mm_temp                          ;   code |= temp;
+    xor         size, size                                ;   size = 0;  /* kill tzcnt input dependency */
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_ERLOOP2                             ;     insert code, flush buffer, init size, goto .BLOOP2
+    tzcnt       size, index                               ;   size = # of trailing 0 bits in index
+    movd        mm_nbits, nbits                           ;   nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;   put_buffer <<= nbits;
+    inc         size                                      ;   ++size;
+    por         mm_put_buffer, mm_code                    ;   put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP2                                   ; } while (index != 0);
+.ELOOP2:                                                  ; .ELOOP2:
+    mov         nbits, t                                  ; nbits = t;
+    lea         t, [t + SIZEOF_WORD]                      ; t = &t[1];
+    and         nbits, DCTSIZE2 * SIZEOF_WORD - 1         ; nbits &= 127;
+    and         t, -DCTSIZE2 * SIZEOF_WORD                ; t &= -128;  /* t = &t_[0]; */
+    cmp         nbits, (DCTSIZE2 - 2) * SIZEOF_WORD       ; if (nbits != 62 * 2)
+    je          .EFN                                      ; {
+    movd        mm_code, dword [actbl + c_derived_tbl.ehufco + 0]
+                                                          ;   code = actbl->ehufco[0];
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    sub         free_bits, nbits                          ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  size, sizeb, sizeh, , , , , , .EFN        ;     insert code, flush put_buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    movd        mm_nbits, nbits                           ;     nbits --> MMX register
+    psllq       mm_put_buffer, mm_nbits                   ;     put_buffer <<= nbits;
+    por         mm_put_buffer, mm_code                    ;     put_buffer |= code;
+.EFN:                                                     ; } }
+%define frame  esp
+    mov         frame, [t + save_frame]
+%define state  ecx
+    mov         state, [frame + arg_state]
+    movq        [state + working_state.cur.put_buffer.simd], mm_put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    emms
+    mov         [state + working_state.cur.free_bits], free_bits
+                                                          ; state->cur.free_bits = free_bits;
+    POP         edi
+    POP         esi
+    POP         ebp
+    POP         ebx
     ret
 
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP1:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , , , \
+      .ERLOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP1:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP1 }, \
+      .ELOOP1
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP2:
+    EMIT_QWORD  emit_temp, emit_tempb, emit_temph, , , , \
+      { cmp     size, 16 }, \
+      { jle     .ERLOOP2 }, \
+      .BRLOOP2
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_ERLOOP2:
+    EMIT_QWORD  size, sizeb, sizeh, \
+      { xor     size, size }, \
+      { tzcnt   size, index }, \
+      { inc     size }, \
+      { test    index, index }, \
+      { jnz     .BLOOP2 }, \
+      .ELOOP2
+
 ; For some reason, the OS X linker does not honor the request to align the
 ; segment unless we do this.
     align       32
diff --git a/external/jpeg/simd/i386/jcphuff-sse2.asm b/external/jpeg/simd/i386/jcphuff-sse2.asm
index 8b731783760f..c26b48a47d85 100644
--- a/external/jpeg/simd/i386/jcphuff-sse2.asm
+++ b/external/jpeg/simd/i386/jcphuff-sse2.asm
@@ -523,6 +523,8 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
     add         KK, 2
     dec         K
     jnz         .BLOOPR16
+    test        LEN, 15                 ; if ((LEN & 15) == 0)
+    je          .PADDINGR               ;   goto .PADDINGR;  /* bypass .ELOOPR16 */
 .ELOOPR16:
     mov         LENEND, LEN
 
diff --git a/external/jpeg/simd/jsimd.c b/external/jpeg/simd/jsimd.c
deleted file mode 100644
index 443dc484b4bf..000000000000
--- a/external/jpeg/simd/jsimd.c
+++ /dev/null
@@ -1,6 +0,0 @@
-// By halx99, This file is only workaround for build ios combined archs
-#if defined(__arm__)
-    #include "arm/jsimd.c"
-#else // assume arm64
-    #include "arm64/jsimd.c"
-#endif
diff --git a/external/jpeg/simd/jsimd.h b/external/jpeg/simd/jsimd.h
index fdcc61ebf6b6..64747c6360c1 100644
--- a/external/jpeg/simd/jsimd.h
+++ b/external/jpeg/simd/jsimd.h
@@ -6,7 +6,8 @@
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2014, Linaro Limited.
  * Copyright (C) 2015-2016, 2018, Matthieu Darbois.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2020, Arm Limited.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -121,6 +122,8 @@ EXTERN(void) jsimd_extxrgb_ycc_convert_neon
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+#ifndef NEON_INTRINSICS
+
 EXTERN(void) jsimd_extrgb_ycc_convert_neon_slowld3
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -128,6 +131,8 @@ EXTERN(void) jsimd_extbgr_ycc_convert_neon_slowld3
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+#endif /* !NEON_INTRINSICS */
+
 EXTERN(void) jsimd_rgb_ycc_convert_dspr2
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -263,6 +268,28 @@ EXTERN(void) jsimd_extxrgb_gray_convert_avx2
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_neon
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
 EXTERN(void) jsimd_rgb_gray_convert_dspr2
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -285,6 +312,28 @@ EXTERN(void) jsimd_extxrgb_gray_convert_dspr2
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
 
+EXTERN(void) jsimd_rgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extrgbx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extbgrx_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxbgr_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+EXTERN(void) jsimd_extxrgb_gray_convert_mmi
+  (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
+   JDIMENSION output_row, int num_rows);
+
 EXTERN(void) jsimd_rgb_gray_convert_altivec
   (JDIMENSION img_width, JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
    JDIMENSION output_row, int num_rows);
@@ -401,6 +450,8 @@ EXTERN(void) jsimd_ycc_rgb565_convert_neon
   (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
    JSAMPARRAY output_buf, int num_rows);
 
+#ifndef NEON_INTRINSICS
+
 EXTERN(void) jsimd_ycc_extrgb_convert_neon_slowst3
   (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
    JSAMPARRAY output_buf, int num_rows);
@@ -408,6 +459,8 @@ EXTERN(void) jsimd_ycc_extbgr_convert_neon_slowst3
   (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
    JSAMPARRAY output_buf, int num_rows);
 
+#endif /* !NEON_INTRINSICS */
+
 EXTERN(void) jsimd_ycc_rgb_convert_dspr2
   (JDIMENSION out_width, JSAMPIMAGE input_buf, JDIMENSION input_row,
    JSAMPARRAY output_buf, int num_rows);
@@ -562,6 +615,13 @@ EXTERN(void) jsimd_h2v2_upsample_avx2
   (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
 
+EXTERN(void) jsimd_h2v1_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_upsample_neon
+  (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+
 EXTERN(void) jsimd_h2v1_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION output_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
@@ -608,6 +668,12 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_avx2
 EXTERN(void) jsimd_h2v1_fancy_upsample_neon
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h2v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
+EXTERN(void) jsimd_h1v2_fancy_upsample_neon
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 
 EXTERN(void) jsimd_h2v1_fancy_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
@@ -616,6 +682,9 @@ EXTERN(void) jsimd_h2v2_fancy_upsample_dspr2
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
 
+EXTERN(void) jsimd_h2v1_fancy_upsample_mmi
+  (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
+   JSAMPARRAY *output_data_ptr);
 EXTERN(void) jsimd_h2v2_fancy_upsample_mmi
   (int max_v_samp_factor, JDIMENSION downsampled_width, JSAMPARRAY input_data,
    JSAMPARRAY *output_data_ptr);
@@ -762,6 +831,50 @@ EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_avx2
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf);
 
+EXTERN(void) jsimd_h2v1_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_neon
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
 EXTERN(void) jsimd_h2v1_merged_upsample_dspr2
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf, JSAMPLE *range);
@@ -806,6 +919,50 @@ EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_dspr2
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf, JSAMPLE *range);
 
+EXTERN(void) jsimd_h2v1_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v1_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
+EXTERN(void) jsimd_h2v2_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extrgbx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extbgrx_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxbgr_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+EXTERN(void) jsimd_h2v2_extxrgb_merged_upsample_mmi
+  (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
+   JSAMPARRAY output_buf);
+
 EXTERN(void) jsimd_h2v1_merged_upsample_altivec
   (JDIMENSION output_width, JSAMPIMAGE input_buf, JDIMENSION in_row_group_ctr,
    JSAMPARRAY output_buf);
@@ -909,6 +1066,8 @@ EXTERN(void) jsimd_fdct_ifast_neon(DCTELEM *data);
 
 EXTERN(void) jsimd_fdct_ifast_dspr2(DCTELEM *data);
 
+EXTERN(void) jsimd_fdct_ifast_mmi(DCTELEM *data);
+
 EXTERN(void) jsimd_fdct_ifast_altivec(DCTELEM *data);
 
 /* Floating Point Forward DCT */
@@ -1040,6 +1199,10 @@ EXTERN(void) jsimd_idct_ifast_rows_dspr2
   (DCTELEM *wsptr, JSAMPARRAY output_buf, JDIMENSION output_col,
    const int *idct_coefs);
 
+EXTERN(void) jsimd_idct_ifast_mmi
+  (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
+   JDIMENSION output_col);
+
 EXTERN(void) jsimd_idct_ifast_altivec
   (void *dct_table, JCOEFPTR coef_block, JSAMPARRAY output_buf,
    JDIMENSION output_col);
@@ -1069,15 +1232,27 @@ EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon
   (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
    c_derived_tbl *dctbl, c_derived_tbl *actbl);
 
+#ifndef NEON_INTRINSICS
+
 EXTERN(JOCTET *) jsimd_huff_encode_one_block_neon_slowtbl
   (void *state, JOCTET *buffer, JCOEFPTR block, int last_dc_val,
    c_derived_tbl *dctbl, c_derived_tbl *actbl);
 
+#endif /* !NEON_INTRINSICS */
+
 /* Progressive Huffman encoding */
 EXTERN(void) jsimd_encode_mcu_AC_first_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
    JCOEF *values, size_t *zerobits);
 
+EXTERN(void) jsimd_encode_mcu_AC_first_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *values, size_t *zerobits);
+
 EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_sse2
   (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
    JCOEF *absvalues, size_t *bits);
+
+EXTERN(int) jsimd_encode_mcu_AC_refine_prepare_neon
+  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
+   JCOEF *absvalues, size_t *bits);
diff --git a/external/jpeg/simd/jsimd_neon.S b/external/jpeg/simd/jsimd_neon.S
deleted file mode 100644
index 3a1c2248ed1d..000000000000
--- a/external/jpeg/simd/jsimd_neon.S
+++ /dev/null
@@ -1,6 +0,0 @@
-// By halx99, This file is only workaround for build ios combined archs
-#if defined(__arm__)
-    #include "arm/jsimd_neon.S"
-#else // assume arm64
-    #include "arm64/jsimd_neon.S"
-#endif
diff --git a/external/jpeg/simd/loongson/jccolext-mmi.c b/external/jpeg/simd/loongson/jccolext-mmi.c
deleted file mode 100644
index 6cdeb5e09a6f..000000000000
--- a/external/jpeg/simd/loongson/jccolext-mmi.c
+++ /dev/null
@@ -1,483 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           ZhangLixia  <zhanglixia-hf@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jccolor-mmi.c */
-
-
-#if RGB_RED == 0
-#define mmA  mm0
-#define mmB  mm1
-#elif RGB_GREEN == 0
-#define mmA  mm2
-#define mmB  mm3
-#elif RGB_BLUE == 0
-#define mmA  mm4
-#define mmB  mm5
-#else
-#define mmA  mm6
-#define mmB  mm7
-#endif
-
-#if RGB_RED == 1
-#define mmC  mm0
-#define mmD  mm1
-#elif RGB_GREEN == 1
-#define mmC  mm2
-#define mmD  mm3
-#elif RGB_BLUE == 1
-#define mmC  mm4
-#define mmD  mm5
-#else
-#define mmC  mm6
-#define mmD  mm7
-#endif
-
-#if RGB_RED == 2
-#define mmE  mm0
-#define mmF  mm1
-#elif RGB_GREEN == 2
-#define mmE  mm2
-#define mmF  mm3
-#elif RGB_BLUE == 2
-#define mmE  mm4
-#define mmF  mm5
-#else
-#define mmE  mm6
-#define mmF  mm7
-#endif
-
-#if RGB_RED == 3
-#define mmG  mm0
-#define mmH  mm1
-#elif RGB_GREEN == 3
-#define mmG  mm2
-#define mmH  mm3
-#elif RGB_BLUE == 3
-#define mmG  mm4
-#define mmH  mm5
-#else
-#define mmG  mm6
-#define mmH  mm7
-#endif
-
-
-void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
-                               JSAMPIMAGE output_buf, JDIMENSION output_row,
-                               int num_rows)
-{
-  JSAMPROW inptr, outptr0, outptr1, outptr2;
-  int num_cols, col;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 wk[7];
-  __m64 Y_BG, Cb_RG, Cr_BG;
-
-  while (--num_rows >= 0) {
-    inptr = *input_buf++;
-    outptr0 = output_buf[0][output_row];
-    outptr1 = output_buf[1][output_row];
-    outptr2 = output_buf[2][output_row];
-    output_row++;
-
-    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
-         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
-
-#if RGB_PIXELSIZE == 3
-
-      if (num_cols < 8) {
-        col = num_cols * 3;
-        asm(".set noreorder\r\n"
-
-            "li     $8, 1\r\n"
-            "move   $9, %3\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 1f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 1\r\n"
-            "xor    $12, $12, $12\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lbu    $12, 0($13)\r\n"
-
-            "1:     \r\n"
-            "li     $8, 2\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 2f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 2\r\n"
-            "xor    $11, $11, $11\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lhu    $11, 0($13)\r\n"
-            "sll    $12, $12, 16\r\n"
-            "or     $12, $12, $11\r\n"
-
-            "2:     \r\n"
-            "dmtc1  $12, %0\r\n"
-            "li     $8, 4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 3f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 4\r\n"
-            "move   $13, %5\r\n"
-            "dadd   $13, $13, $9\r\n"
-            "lwu    $14, 0($13)\r\n"
-            "dmtc1  $14, %1\r\n"
-            "dsll32 $12, $12, 0\r\n"
-            "or     $12, $12, $14\r\n"
-            "dmtc1  $12, %0\r\n"
-
-            "3:     \r\n"
-            "li     $8, 8\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 4f\r\n"
-            "nop    \r\n"
-            "mov.s  %1, %0\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "li     $9, 8\r\n"
-            "j      5f\r\n"
-            "nop    \r\n"
-
-            "4:     \r\n"
-            "li     $8, 16\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 5f\r\n"
-            "nop    \r\n"
-            "mov.s  %2, %0\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "ldc1   %1, 8(%5)\r\n"
-
-            "5:     \r\n"
-            "nop    \r\n"
-            ".set reorder\r\n"
-
-            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
-            : "r" (col), "r" (num_rows), "r" (inptr)
-            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
-              "$14", "memory"
-           );
-      } else {
-        if (!(((long)inptr) & 7)) {
-          mmA = _mm_load_si64((__m64 *)&inptr[0]);
-          mmG = _mm_load_si64((__m64 *)&inptr[8]);
-          mmF = _mm_load_si64((__m64 *)&inptr[16]);
-        } else {
-          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
-          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
-          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
-        }
-        inptr += RGB_PIXELSIZE * 8;
-      }
-      mmD = mmA;
-      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
-      mmD = _mm_srli_si64(mmD, 4 * BYTE_BIT);
-
-      mmA = _mm_unpackhi_pi8(mmA, mmG);
-      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
-
-      mmD = _mm_unpacklo_pi8(mmD, mmF);
-      mmG = _mm_unpackhi_pi8(mmG, mmF);
-
-      mmE = mmA;
-      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
-      mmE = _mm_srli_si64(mmE, 4 * BYTE_BIT);
-
-      mmA = _mm_unpackhi_pi8(mmA, mmD);
-      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
-
-      mmE = _mm_unpacklo_pi8(mmE, mmG);
-      mmD = _mm_unpackhi_pi8(mmD, mmG);
-      mmC = mmA;
-      mmA = _mm_loadlo_pi8_f(mmA);
-      mmC = _mm_loadhi_pi8_f(mmC);
-
-      mmB = mmE;
-      mmE = _mm_loadlo_pi8_f(mmE);
-      mmB = _mm_loadhi_pi8_f(mmB);
-
-      mmF = mmD;
-      mmD = _mm_loadlo_pi8_f(mmD);
-      mmF = _mm_loadhi_pi8_f(mmF);
-
-#else  /* RGB_PIXELSIZE == 4 */
-
-      if (num_cols < 8) {
-        col = num_cols;
-        asm(".set noreorder\r\n"
-
-            "li     $8, 1\r\n"
-            "move   $9, %4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 1f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 1\r\n"
-            "dsll   $11, $9, 2\r\n"
-            "move   $13, %5\r\n"
-            "daddu  $13, $13, $11\r\n"
-            "lwc1   %0, 0($13)\r\n"
-
-            "1:     \r\n"
-            "li     $8, 2\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 2f\r\n"
-            "nop    \r\n"
-            "subu   $9, $9, 2\r\n"
-            "dsll   $11, $9, 2\r\n"
-            "move   $13, %5\r\n"
-            "daddu  $13, $13, $11\r\n"
-            "mov.s  %1, %0\r\n"
-            "ldc1   %0, 0($13)\r\n"
-
-            "2:     \r\n"
-            "li     $8, 4\r\n"
-            "and    $10, $9, $8\r\n"
-            "beqz   $10, 3f\r\n"
-            "nop    \r\n"
-            "mov.s  %2, %0\r\n"
-            "mov.s  %3, %1\r\n"
-            "ldc1   %0, 0(%5)\r\n"
-            "ldc1   %1, 8(%5)\r\n"
-
-            "3:     \r\n"
-            "nop    \r\n"
-            ".set reorder\r\n"
-
-            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
-            : "r" (col), "r" (inptr)
-            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
-           );
-      } else {
-        if (!(((long)inptr) & 7)) {
-          mmA = _mm_load_si64((__m64 *)&inptr[0]);
-          mmF = _mm_load_si64((__m64 *)&inptr[8]);
-          mmD = _mm_load_si64((__m64 *)&inptr[16]);
-          mmC = _mm_load_si64((__m64 *)&inptr[24]);
-        } else {
-          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
-          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
-          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
-          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
-        }
-        inptr += RGB_PIXELSIZE * 8;
-      }
-      mmB = mmA;
-      mmA = _mm_unpacklo_pi8(mmA, mmF);
-      mmB = _mm_unpackhi_pi8(mmB, mmF);
-
-      mmG = mmD;
-      mmD = _mm_unpacklo_pi8(mmD, mmC);
-      mmG = _mm_unpackhi_pi8(mmG, mmC);
-
-      mmE = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmD);
-      mmE = _mm_unpackhi_pi16(mmE, mmD);
-
-      mmH = mmB;
-      mmB = _mm_unpacklo_pi16(mmB, mmG);
-      mmH = _mm_unpackhi_pi16(mmH, mmG);
-
-      mmC = mmA;
-      mmA = _mm_loadlo_pi8_f(mmA);
-      mmC = _mm_loadhi_pi8_f(mmC);
-
-      mmD = mmB;
-      mmB = _mm_loadlo_pi8_f(mmB);
-      mmD = _mm_loadhi_pi8_f(mmD);
-
-      mmG = mmE;
-      mmE = _mm_loadlo_pi8_f(mmE);
-      mmG = _mm_loadhi_pi8_f(mmG);
-
-      mmF = mmH;
-      mmF = _mm_unpacklo_pi8(mmF, mmH);
-      mmH = _mm_unpackhi_pi8(mmH, mmH);
-      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
-      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
-
-#endif
-
-      wk[0] = mm0;
-      wk[1] = mm1;
-      wk[2] = mm4;
-      wk[3] = mm5;
-
-      mm6 = mm1;
-      mm1 = _mm_unpacklo_pi16(mm1, mm3);
-      mm6 = _mm_unpackhi_pi16(mm6, mm3);
-      mm7 = mm1;
-      mm4 = mm6;
-      mm1 = _mm_madd_pi16(mm1, PW_F0299_F0337);
-      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
-      mm7 = _mm_madd_pi16(mm7, PW_MF016_MF033);
-      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
-      wk[4] = mm1;
-      wk[5] = mm6;
-
-      mm1 = _mm_loadlo_pi16_f(mm5);
-      mm6 = _mm_loadhi_pi16_f(mm5);
-      mm1 = _mm_srli_pi32(mm1, 1);
-      mm6 = _mm_srli_pi32(mm6, 1);
-
-      mm5 = PD_ONEHALFM1_CJ;
-      mm7 = _mm_add_pi32(mm7, mm1);
-      mm4 = _mm_add_pi32(mm4, mm6);
-      mm7 = _mm_add_pi32(mm7, mm5);
-      mm4 = _mm_add_pi32(mm4, mm5);
-      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm7 = _mm_packs_pi32(mm7, mm4);
-
-      mm1 = wk[2];
-      mm6 = mm0;
-      mm0 = _mm_unpacklo_pi16(mm0, mm2);
-      mm6 = _mm_unpackhi_pi16(mm6, mm2);
-      mm5 = mm0;
-      mm4 = mm6;
-      mm0 = _mm_madd_pi16(mm0, PW_F0299_F0337);
-      mm6 = _mm_madd_pi16(mm6, PW_F0299_F0337);
-      mm5 = _mm_madd_pi16(mm5, PW_MF016_MF033);
-      mm4 = _mm_madd_pi16(mm4, PW_MF016_MF033);
-
-      wk[6] = mm0;
-      wk[7] = mm6;
-      mm0 = _mm_loadlo_pi16_f(mm1);
-      mm6 = _mm_loadhi_pi16_f(mm1);
-      mm0 = _mm_srli_pi32(mm0, 1);
-      mm6 = _mm_srli_pi32(mm6, 1);
-
-      mm1 = PD_ONEHALFM1_CJ;
-      mm5 = _mm_add_pi32(mm5, mm0);
-      mm4 = _mm_add_pi32(mm4, mm6);
-      mm5 = _mm_add_pi32(mm5, mm1);
-      mm4 = _mm_add_pi32(mm4, mm1);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm5 = _mm_packs_pi32(mm5, mm4);
-
-      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
-      mm5  = _mm_or_si64(mm5, mm7);
-      Cb_RG = mm5;
-
-      mm0 = wk[3];
-      mm6 = wk[2];
-      mm1 = wk[1];
-
-      mm4 = mm0;
-      mm0 = _mm_unpacklo_pi16(mm0, mm3);
-      mm4 = _mm_unpackhi_pi16(mm4, mm3);
-      mm7 = mm0;
-      mm5 = mm4;
-      mm0 = _mm_madd_pi16(mm0, PW_F0114_F0250);
-      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
-      mm7 = _mm_madd_pi16(mm7, PW_MF008_MF041);
-      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
-      mm3 = PD_ONEHALF;
-      mm0 = _mm_add_pi32(mm0, wk[4]);
-      mm4 = _mm_add_pi32(mm4, wk[5]);
-      mm0 = _mm_add_pi32(mm0, mm3);
-      mm4 = _mm_add_pi32(mm4, mm3);
-      mm0 = _mm_srli_pi32(mm0, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm0 = _mm_packs_pi32(mm0, mm4);
-
-      mm3 = _mm_loadlo_pi16_f(mm1);
-      mm4 = _mm_loadhi_pi16_f(mm1);
-      mm3 = _mm_srli_pi32(mm3, 1);
-      mm4 = _mm_srli_pi32(mm4, 1);
-
-      mm1 = PD_ONEHALFM1_CJ;
-      mm7 = _mm_add_pi32(mm7, mm3);
-      mm5 = _mm_add_pi32(mm5, mm4);
-      mm7 = _mm_add_pi32(mm7, mm1);
-      mm5 = _mm_add_pi32(mm5, mm1);
-      mm7 = _mm_srli_pi32(mm7, SCALEBITS);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm7 = _mm_packs_pi32(mm7, mm5);
-
-      mm3 = wk[0];
-      mm4 = mm6;
-      mm6 = _mm_unpacklo_pi16(mm6, mm2);
-      mm4 = _mm_unpackhi_pi16(mm4, mm2);
-      mm1 = mm6;
-      mm5 = mm4;
-      mm6 = _mm_madd_pi16(mm6, PW_F0114_F0250);
-      mm4 = _mm_madd_pi16(mm4, PW_F0114_F0250);
-      mm1 = _mm_madd_pi16(mm1, PW_MF008_MF041);
-      mm5 = _mm_madd_pi16(mm5, PW_MF008_MF041);
-
-      mm2 = PD_ONEHALF;
-      mm6 = _mm_add_pi32(mm6, wk[6]);
-      mm4 = _mm_add_pi32(mm4, wk[7]);
-      mm6 = _mm_add_pi32(mm6, mm2);
-      mm4 = _mm_add_pi32(mm4, mm2);
-      mm6 = _mm_srli_pi32(mm6, SCALEBITS);
-      mm4 = _mm_srli_pi32(mm4, SCALEBITS);
-      mm6 = _mm_packs_pi32(mm6, mm4);
-
-      mm0 = _mm_slli_pi16(mm0, BYTE_BIT);
-      mm6 = _mm_or_si64(mm6, mm0);
-      Y_BG = mm6;
-
-      mm2 = _mm_loadlo_pi16_f(mm3);
-      mm4 = _mm_loadhi_pi16_f(mm3);
-      mm2 = _mm_srli_pi32(mm2, 1);
-      mm4 = _mm_srli_pi32(mm4, 1);
-
-      mm0 = PD_ONEHALFM1_CJ;
-      mm1 = _mm_add_pi32(mm1, mm2);
-      mm5 = _mm_add_pi32(mm5, mm4);
-      mm1 = _mm_add_pi32(mm1, mm0);
-      mm5 = _mm_add_pi32(mm5, mm0);
-      mm1 = _mm_srli_pi32(mm1, SCALEBITS);
-      mm5 = _mm_srli_pi32(mm5, SCALEBITS);
-      mm1 = _mm_packs_pi32(mm1, mm5);
-
-      mm7 = _mm_slli_pi16(mm7, BYTE_BIT);
-      mm1 = _mm_or_si64(mm1, mm7);
-      Cr_BG = mm1;
-
-      _mm_store_si64((__m64 *)&outptr0[0], Y_BG);
-      _mm_store_si64((__m64 *)&outptr1[0], Cb_RG);
-      _mm_store_si64((__m64 *)&outptr2[0], Cr_BG);
-    }
-  }
-}
-
-#undef mmA
-#undef mmB
-#undef mmC
-#undef mmD
-#undef mmE
-#undef mmF
-#undef mmG
-#undef mmH
diff --git a/external/jpeg/simd/loongson/jdcolext-mmi.c b/external/jpeg/simd/loongson/jdcolext-mmi.c
deleted file mode 100644
index 560d9b022786..000000000000
--- a/external/jpeg/simd/loongson/jdcolext-mmi.c
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
- * Copyright (C) 2015, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* This file is included by jdcolor-mmi.c */
-
-
-#if RGB_RED == 0
-#define mmA  mm0
-#define mmB  mm1
-#elif RGB_GREEN == 0
-#define mmA  mm2
-#define mmB  mm3
-#elif RGB_BLUE == 0
-#define mmA  mm4
-#define mmB  mm5
-#else
-#define mmA  mm6
-#define mmB  mm7
-#endif
-
-#if RGB_RED == 1
-#define mmC  mm0
-#define mmD  mm1
-#elif RGB_GREEN == 1
-#define mmC  mm2
-#define mmD  mm3
-#elif RGB_BLUE == 1
-#define mmC  mm4
-#define mmD  mm5
-#else
-#define mmC  mm6
-#define mmD  mm7
-#endif
-
-#if RGB_RED == 2
-#define mmE  mm0
-#define mmF  mm1
-#elif RGB_GREEN == 2
-#define mmE  mm2
-#define mmF  mm3
-#elif RGB_BLUE == 2
-#define mmE  mm4
-#define mmF  mm5
-#else
-#define mmE  mm6
-#define mmF  mm7
-#endif
-
-#if RGB_RED == 3
-#define mmG  mm0
-#define mmH  mm1
-#elif RGB_GREEN == 3
-#define mmG  mm2
-#define mmH  mm3
-#elif RGB_BLUE == 3
-#define mmG  mm4
-#define mmH  mm5
-#else
-#define mmG  mm6
-#define mmH  mm7
-#endif
-
-
-void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
-                               JDIMENSION input_row, JSAMPARRAY output_buf,
-                               int num_rows)
-{
-  JSAMPROW outptr, inptr0, inptr1, inptr2;
-  int num_cols, col;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 mm8, wk[2];
-
-  while (--num_rows >= 0) {
-    inptr0 = input_buf[0][input_row];
-    inptr1 = input_buf[1][input_row];
-    inptr2 = input_buf[2][input_row];
-    input_row++;
-    outptr = *output_buf++;
-
-    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
-         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
-
-      mm5 = _mm_load_si64((__m64 *)inptr1);
-      mm1 = _mm_load_si64((__m64 *)inptr2);
-      mm8 = _mm_load_si64((__m64 *)inptr0);
-      mm4 = 0;
-      mm7 = 0;
-      mm4 = _mm_cmpeq_pi16(mm4, mm4);
-      mm7 = _mm_cmpeq_pi16(mm7, mm7);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm7 = _mm_slli_pi16(mm7, 7);      /* mm7={0xFF80 0xFF80 0xFF80 0xFF80} */
-      mm0 = mm4;                        /* mm0=mm4={0xFF 0x00 0xFF 0x00 ..} */
-
-      mm4 = _mm_and_si64(mm4, mm5);           /* mm4=Cb(0246)=CbE */
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);     /* mm5=Cb(1357)=CbO */
-      mm0 = _mm_and_si64(mm0, mm1);           /* mm0=Cr(0246)=CrE */
-      mm1 = _mm_srli_pi16(mm1, BYTE_BIT);     /* mm1=Cr(1357)=CrO */
-      mm4 = _mm_add_pi16(mm4, mm7);
-      mm5 = _mm_add_pi16(mm5, mm7);
-      mm0 = _mm_add_pi16(mm0, mm7);
-      mm1 = _mm_add_pi16(mm1, mm7);
-
-      /* (Original)
-       * R = Y                + 1.40200 * Cr
-       * G = Y - 0.34414 * Cb - 0.71414 * Cr
-       * B = Y + 1.77200 * Cb
-       *
-       * (This implementation)
-       * R = Y                + 0.40200 * Cr + Cr
-       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
-       * B = Y - 0.22800 * Cb + Cb + Cb
-       */
-
-      mm2 = mm4;                              /* mm2 = CbE */
-      mm3 = mm5;                              /* mm3 = CbO */
-      mm4 = _mm_add_pi16(mm4, mm4);           /* mm4 = 2*CbE */
-      mm5 = _mm_add_pi16(mm5, mm5);           /* mm5 = 2*CbO */
-      mm6 = mm0;                              /* mm6 = CrE */
-      mm7 = mm1;                              /* mm7 = CrO */
-      mm0 = _mm_add_pi16(mm0, mm0);           /* mm0 = 2*CrE */
-      mm1 = _mm_add_pi16(mm1, mm1);           /* mm1 = 2*CrO */
-
-      mm4 = _mm_mulhi_pi16(mm4, PW_MF0228);   /* mm4=(2*CbE * -FIX(0.22800) */
-      mm5 = _mm_mulhi_pi16(mm5, PW_MF0228);   /* mm5=(2*CbO * -FIX(0.22800) */
-      mm0 = _mm_mulhi_pi16(mm0, PW_F0402);    /* mm0=(2*CrE * FIX(0.40200)) */
-      mm1 = _mm_mulhi_pi16(mm1, PW_F0402);    /* mm1=(2*CrO * FIX(0.40200)) */
-
-      mm4 = _mm_add_pi16(mm4, PW_ONE);
-      mm5 = _mm_add_pi16(mm5, PW_ONE);
-      mm4 = _mm_srai_pi16(mm4, 1);            /* mm4=(CbE * -FIX(0.22800)) */
-      mm5 = _mm_srai_pi16(mm5, 1);            /* mm5=(CbO * -FIX(0.22800)) */
-      mm0 = _mm_add_pi16(mm0, PW_ONE);
-      mm1 = _mm_add_pi16(mm1, PW_ONE);
-      mm0 = _mm_srai_pi16(mm0, 1);            /* mm0=(CrE * FIX(0.40200)) */
-      mm1 = _mm_srai_pi16(mm1, 1);            /* mm1=(CrO * FIX(0.40200)) */
-
-      mm4 = _mm_add_pi16(mm4, mm2);
-      mm5 = _mm_add_pi16(mm5, mm3);
-      mm4 = _mm_add_pi16(mm4, mm2);       /* mm4=(CbE * FIX(1.77200))=(B-Y)E */
-      mm5 = _mm_add_pi16(mm5, mm3);       /* mm5=(CbO * FIX(1.77200))=(B-Y)O */
-      mm0 = _mm_add_pi16(mm0, mm6);       /* mm0=(CrE * FIX(1.40200))=(R-Y)E */
-      mm1 = _mm_add_pi16(mm1, mm7);       /* mm1=(CrO * FIX(1.40200))=(R-Y)O */
-
-      wk[0] = mm4;                            /* wk(0)=(B-Y)E */
-      wk[1] = mm5;                            /* wk(1)=(B-Y)O */
-
-      mm4 = mm2;
-      mm5 = mm3;
-      mm2 = _mm_unpacklo_pi16(mm2, mm6);
-      mm4 = _mm_unpackhi_pi16(mm4, mm6);
-      mm2 = _mm_madd_pi16(mm2, PW_MF0344_F0285);
-      mm4 = _mm_madd_pi16(mm4, PW_MF0344_F0285);
-      mm3 = _mm_unpacklo_pi16(mm3, mm7);
-      mm5 = _mm_unpackhi_pi16(mm5, mm7);
-      mm3 = _mm_madd_pi16(mm3, PW_MF0344_F0285);
-      mm5 = _mm_madd_pi16(mm5, PW_MF0344_F0285);
-
-      mm2 = _mm_add_pi32(mm2, PD_ONEHALF);
-      mm4 = _mm_add_pi32(mm4, PD_ONEHALF);
-      mm2 = _mm_srai_pi32(mm2, SCALEBITS);
-      mm4 = _mm_srai_pi32(mm4, SCALEBITS);
-      mm3 = _mm_add_pi32(mm3, PD_ONEHALF);
-      mm5 = _mm_add_pi32(mm5, PD_ONEHALF);
-      mm3 = _mm_srai_pi32(mm3, SCALEBITS);
-      mm5 = _mm_srai_pi32(mm5, SCALEBITS);
-
-      mm2 = _mm_packs_pi32(mm2, mm4);  /* mm2=CbE*-FIX(0.344)+CrE*FIX(0.285) */
-      mm3 = _mm_packs_pi32(mm3, mm5);  /* mm3=CbO*-FIX(0.344)+CrO*FIX(0.285) */
-      mm2 = _mm_sub_pi16(mm2, mm6);  /* mm2=CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
-      mm3 = _mm_sub_pi16(mm3, mm7);  /* mm3=CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
-
-      mm5 = mm8;                              /* mm5=Y(01234567) */
-
-      mm4 = _mm_cmpeq_pi16(mm4, mm4);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);    /* mm4={0xFF 0x00 0xFF 0x00 ..} */
-      mm4 = _mm_and_si64(mm4, mm5);          /* mm4=Y(0246)=YE */
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);    /* mm5=Y(1357)=YO */
-
-      mm0 = _mm_add_pi16(mm0, mm4);      /* mm0=((R-Y)E+YE)=RE=(R0 R2 R4 R6) */
-      mm1 = _mm_add_pi16(mm1, mm5);      /* mm1=((R-Y)O+YO)=RO=(R1 R3 R5 R7) */
-      mm0 = _mm_packs_pu16(mm0, mm0);    /* mm0=(R0 R2 R4 R6 ** ** ** **) */
-      mm1 = _mm_packs_pu16(mm1, mm1);    /* mm1=(R1 R3 R5 R7 ** ** ** **) */
-
-      mm2 = _mm_add_pi16(mm2, mm4);      /* mm2=((G-Y)E+YE)=GE=(G0 G2 G4 G6) */
-      mm3 = _mm_add_pi16(mm3, mm5);      /* mm3=((G-Y)O+YO)=GO=(G1 G3 G5 G7) */
-      mm2 = _mm_packs_pu16(mm2, mm2);    /* mm2=(G0 G2 G4 G6 ** ** ** **) */
-      mm3 = _mm_packs_pu16(mm3, mm3);    /* mm3=(G1 G3 G5 G7 ** ** ** **) */
-
-      mm4 = _mm_add_pi16(mm4, wk[0]);    /* mm4=(YE+(B-Y)E)=BE=(B0 B2 B4 B6) */
-      mm5 = _mm_add_pi16(mm5, wk[1]);    /* mm5=(YO+(B-Y)O)=BO=(B1 B3 B5 B7) */
-      mm4 = _mm_packs_pu16(mm4, mm4);    /* mm4=(B0 B2 B4 B6 ** ** ** **) */
-      mm5 = _mm_packs_pu16(mm5, mm5);    /* mm5=(B1 B3 B5 B7 ** ** ** **) */
-
-#if RGB_PIXELSIZE == 3
-
-      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
-      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
-      mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
-      mmE = _mm_unpacklo_pi8(mmE, mmB);     /* mmE=(20 01 22 03 24 05 26 07) */
-      mmD = _mm_unpacklo_pi8(mmD, mmF);     /* mmD=(11 21 13 23 15 25 17 27) */
-
-      mmG = mmA;
-      mmH = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 01 02 12 22 03) */
-      mmG = _mm_unpackhi_pi16(mmG, mmE);    /* mmG=(04 14 24 05 06 16 26 07) */
-
-      mmH = _mm_srli_si64(mmH, 2 * BYTE_BIT);
-      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
-
-      mmC = mmD;
-      mmB = mmD;
-      mmD = _mm_unpacklo_pi16(mmD, mmH);    /* mmD=(11 21 02 12 13 23 04 14) */
-      mmC = _mm_unpackhi_pi16(mmC, mmH);    /* mmC=(15 25 06 16 17 27 -- --) */
-
-      mmB = _mm_srli_si64(mmB, 2 * BYTE_BIT); /* mmB=(13 23 15 25 17 27 -- --) */
-
-      mmF = mmE;
-      mmE = _mm_unpacklo_pi16(mmE, mmB);    /* mmE=(22 03 13 23 24 05 15 25) */
-      mmF = _mm_unpackhi_pi16(mmF, mmB);    /* mmF=(26 07 17 27 -- -- -- --) */
-
-      mmA = _mm_unpacklo_pi32(mmA, mmD);    /* mmA=(00 10 20 01 11 21 02 12) */
-      mmE = _mm_unpacklo_pi32(mmE, mmG);    /* mmE=(22 03 13 23 04 14 24 05) */
-      mmC = _mm_unpacklo_pi32(mmC, mmF);    /* mmC=(15 25 06 16 26 07 17 27) */
-
-      if (num_cols >= 8) {
-        _mm_store_si64((__m64 *)outptr, mmA);
-        _mm_store_si64((__m64 *)(outptr + 8), mmE);
-        _mm_store_si64((__m64 *)(outptr + 16), mmC);
-        outptr += RGB_PIXELSIZE * 8;
-      } else {
-        col = num_cols * 3;
-        asm(".set noreorder\r\n"
-
-            "li      $8, 16\r\n"
-            "move    $9, %4\r\n"
-            "mov.s   $f4, %1\r\n"
-            "mov.s   $f6, %3\r\n"
-            "move    $10, %5\r\n"
-            "bltu    $9, $8, 1f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, 0($10)\r\n"
-            "gssdlc1 $f6, 7+8($10)\r\n"
-            "gssdrc1 $f6, 8($10)\r\n"
-            "mov.s   $f4, %2\r\n"
-            "subu    $9, $9, 16\r\n"
-            "daddu   $10, $10, 16\r\n"
-            "b       2f\r\n"
-            "nop     \r\n"
-
-            "1:      \r\n"
-            "li      $8, 8\r\n"               /* st8 */
-            "bltu    $9, $8, 2f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, ($10)\r\n"
-            "mov.s   $f4, %3\r\n"
-            "subu    $9, $9, 8\r\n"
-            "daddu   $10, $10, 8\r\n"
-
-            "2:      \r\n"
-            "li      $8, 4\r\n"               /* st4 */
-            "mfc1    $11, $f4\r\n"
-            "bltu    $9, $8, 3f\r\n"
-            "nop     \r\n"
-            "swl     $11, 3($10)\r\n"
-            "swr     $11, 0($10)\r\n"
-            "li      $8, 32\r\n"
-            "mtc1    $8, $f6\r\n"
-            "dsrl    $f4, $f4, $f6\r\n"
-            "mfc1    $11, $f4\r\n"
-            "subu    $9, $9, 4\r\n"
-            "daddu   $10, $10, 4\r\n"
-
-            "3:      \r\n"
-            "li      $8, 2\r\n"               /* st2 */
-            "bltu    $9, $8, 4f\r\n"
-            "nop     \r\n"
-            "ush     $11, 0($10)\r\n"
-            "srl     $11, 16\r\n"
-            "subu    $9, $9, 2\r\n"
-            "daddu   $10, $10, 2\r\n"
-
-            "4:      \r\n"
-            "li      $8, 1\r\n"               /* st1 */
-            "bltu    $9, $8, 5f\r\n"
-            "nop     \r\n"
-            "sb      $11, 0($10)\r\n"
-
-            "5:      \r\n"
-            "nop     \r\n"                    /* end */
-            : "=m" (*outptr)
-            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
-            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
-           );
-      }
-
-#else  /* RGB_PIXELSIZE == 4 */
-
-#ifdef RGBX_FILLER_0XFF
-      mm6 = _mm_cmpeq_pi8(mm6, mm6);
-      mm7 = _mm_cmpeq_pi8(mm7, mm7);
-#else
-      mm6 = _mm_xor_si64(mm6, mm6);
-      mm7 = _mm_xor_si64(mm7, mm7);
-#endif
-      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
-      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
-      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
-      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
-
-      mmA = _mm_unpacklo_pi8(mmA, mmC);     /* mmA=(00 10 02 12 04 14 06 16) */
-      mmE = _mm_unpacklo_pi8(mmE, mmG);     /* mmE=(20 30 22 32 24 34 26 36) */
-      mmB = _mm_unpacklo_pi8(mmB, mmD);     /* mmB=(01 11 03 13 05 15 07 17) */
-      mmF = _mm_unpacklo_pi8(mmF, mmH);     /* mmF=(21 31 23 33 25 35 27 37) */
-
-      mmC = mmA;
-      mmA = _mm_unpacklo_pi16(mmA, mmE);    /* mmA=(00 10 20 30 02 12 22 32) */
-      mmC = _mm_unpackhi_pi16(mmC, mmE);    /* mmC=(04 14 24 34 06 16 26 36) */
-      mmG = mmB;
-      mmB = _mm_unpacklo_pi16(mmB, mmF);    /* mmB=(01 11 21 31 03 13 23 33) */
-      mmG = _mm_unpackhi_pi16(mmG, mmF);    /* mmG=(05 15 25 35 07 17 27 37) */
-
-      mmD = mmA;
-      mmA = _mm_unpacklo_pi32(mmA, mmB);    /* mmA=(00 10 20 30 01 11 21 31) */
-      mmD = _mm_unpackhi_pi32(mmD, mmB);    /* mmD=(02 12 22 32 03 13 23 33) */
-      mmH = mmC;
-      mmC = _mm_unpacklo_pi32(mmC, mmG);    /* mmC=(04 14 24 34 05 15 25 35) */
-      mmH = _mm_unpackhi_pi32(mmH, mmG);    /* mmH=(06 16 26 36 07 17 27 37) */
-
-      if (num_cols >= 8) {
-        _mm_store_si64((__m64 *)outptr, mmA);
-        _mm_store_si64((__m64 *)(outptr + 8), mmD);
-        _mm_store_si64((__m64 *)(outptr + 16), mmC);
-        _mm_store_si64((__m64 *)(outptr + 24), mmH);
-        outptr += RGB_PIXELSIZE * 8;
-      } else {
-        col = num_cols;
-        asm(".set noreorder\r\n"              /* st16 */
-
-            "li      $8, 4\r\n"
-            "move    $9, %6\r\n"
-            "move    $10, %7\r\n"
-            "mov.s   $f4, %2\r\n"
-            "mov.s   $f6, %4\r\n"
-            "bltu    $9, $8, 1f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, ($10)\r\n"
-            "gssdlc1 $f6, 7+8($10)\r\n"
-            "gssdrc1 $f6, 8($10)\r\n"
-            "mov.s   $f4, %3\r\n"
-            "mov.s   $f6, %5\r\n"
-            "subu    $9, $9, 4\r\n"
-            "daddu   $10, $10, 16\r\n"
-
-            "1:      \r\n"
-            "li      $8, 2\r\n"               /* st8 */
-            "bltu    $9, $8, 2f\r\n"
-            "nop     \r\n"
-            "gssdlc1 $f4, 7($10)\r\n"
-            "gssdrc1 $f4, 0($10)\r\n"
-            "mov.s   $f4, $f6\r\n"
-            "subu    $9, $9, 2\r\n"
-            "daddu   $10, $10, 8\r\n"
-
-            "2:      \r\n"
-            "li      $8, 1\r\n"               /* st4 */
-            "bltu    $9, $8, 3f\r\n"
-            "nop     \r\n"
-            "gsswlc1 $f4, 3($10)\r\n"
-            "gsswrc1 $f4, 0($10)\r\n"
-
-            "3:      \r\n"
-            "li      %1, 0\r\n"               /* end */
-            : "=m" (*outptr), "=r" (col)
-            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
-              "r" (outptr)
-            : "$f4", "$f6", "$8", "$9", "$10", "memory"
-           );
-      }
-
-#endif
-
-    }
-  }
-}
-
-#undef mmA
-#undef mmB
-#undef mmC
-#undef mmD
-#undef mmE
-#undef mmF
-#undef mmG
-#undef mmH
diff --git a/external/jpeg/simd/loongson/jdsample-mmi.c b/external/jpeg/simd/loongson/jdsample-mmi.c
deleted file mode 100644
index 00a6265176e3..000000000000
--- a/external/jpeg/simd/loongson/jdsample-mmi.c
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* CHROMA UPSAMPLING */
-
-#include "jsimd_mmi.h"
-
-
-enum const_index {
-  index_PW_THREE,
-  index_PW_SEVEN,
-  index_PW_EIGHT,
-};
-
-static uint64_t const_value[] = {
-  _uint64_set_pi16(3, 3, 3, 3),
-  _uint64_set_pi16(7, 7, 7, 7),
-  _uint64_set_pi16(8, 8, 8, 8),
-};
-
-#define PW_THREE  get_const_value(index_PW_THREE)
-#define PW_SEVEN  get_const_value(index_PW_SEVEN)
-#define PW_EIGHT  get_const_value(index_PW_EIGHT)
-
-
-#define PROCESS_ROW(r) { \
-  mm7 = _mm_load_si64((__m64 *)outptr##r);      /* mm7=IntrL=( 0 1 2 3) */ \
-  mm3 = _mm_load_si64((__m64 *)outptr##r + 1);  /* mm3=IntrH=( 4 5 6 7) */ \
-  \
-  mm0 = mm7; \
-  mm4 = mm3; \
-  mm0 = _mm_srli_si64(mm0, 2 * BYTE_BIT);                   /* mm0=( 1 2 3 -) */ \
-  mm4 = _mm_slli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( - - - 4) */ \
-  mm5 = mm7; \
-  mm6 = mm3; \
-  mm5 = _mm_srli_si64(mm5, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm5=( 3 - - -) */ \
-  mm6 = _mm_slli_si64(mm6, 2 * BYTE_BIT);                   /* mm6=( - 4 5 6) */ \
-  \
-  mm0 = _mm_or_si64(mm0, mm4);                /* mm0=( 1 2 3 4) */ \
-  mm5 = _mm_or_si64(mm5, mm6);                /* mm5=( 3 4 5 6) */ \
-  \
-  mm1 = mm7; \
-  mm2 = mm3; \
-  mm1 = _mm_slli_si64(mm1, 2 * BYTE_BIT);     /* mm1=( - 0 1 2) */ \
-  mm2 = _mm_srli_si64(mm2, 2 * BYTE_BIT);     /* mm2=( 5 6 7 -) */ \
-  mm4 = mm3; \
-  mm4 = _mm_srli_si64(mm4, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm4=( 7 - - -) */ \
-  \
-  mm1 = _mm_or_si64(mm1, wk[r]);              /* mm1=(-1 0 1 2) */ \
-  mm2 = _mm_or_si64(mm2, wk[r + 2]);          /* mm2=( 5 6 6 8) */ \
-  \
-  wk[r] = mm4; \
-  \
-  mm7 = _mm_mullo_pi16(mm7, PW_THREE); \
-  mm3 = _mm_mullo_pi16(mm3, PW_THREE); \
-  mm1 = _mm_add_pi16(mm1, PW_EIGHT); \
-  mm5 = _mm_add_pi16(mm5, PW_EIGHT); \
-  mm0 = _mm_add_pi16(mm0, PW_SEVEN); \
-  mm2 = _mm_add_pi16(mm2, PW_SEVEN); \
-  \
-  mm1 = _mm_add_pi16(mm1, mm7); \
-  mm5 = _mm_add_pi16(mm5, mm3); \
-  mm1 = _mm_srli_pi16(mm1, 4);                /* mm1=OutrLE=( 0  2  4  6) */ \
-  mm5 = _mm_srli_pi16(mm5, 4);                /* mm5=OutrHE=( 8 10 12 14) */ \
-  mm0 = _mm_add_pi16(mm0, mm7); \
-  mm2 = _mm_add_pi16(mm2, mm3); \
-  mm0 = _mm_srli_pi16(mm0, 4);                /* mm0=OutrLO=( 1  3  5  7) */ \
-  mm2 = _mm_srli_pi16(mm2, 4);                /* mm2=OutrHO=( 9 11 13 15) */ \
-  \
-  mm0 = _mm_slli_pi16(mm0, BYTE_BIT); \
-  mm2 = _mm_slli_pi16(mm2, BYTE_BIT); \
-  mm1 = _mm_or_si64(mm1, mm0);     /* mm1=OutrL=( 0  1  2  3  4  5  6  7) */ \
-  mm5 = _mm_or_si64(mm5, mm2);     /* mm5=OutrH=( 8  9 10 11 12 13 14 15) */ \
-  \
-  _mm_store_si64((__m64 *)outptr##r, mm1); \
-  _mm_store_si64((__m64 *)outptr##r + 1, mm5); \
-}
-
-void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
-                                   JDIMENSION downsampled_width,
-                                   JSAMPARRAY input_data,
-                                   JSAMPARRAY *output_data_ptr)
-{
-  JSAMPARRAY output_data = *output_data_ptr;
-  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
-  int inrow, outrow, incol, tmp, tmp1;
-  __m64 mm0, mm1, mm2, mm3 = 0.0, mm4, mm5, mm6, mm7 = 0.0;
-  __m64 wk[4], mm_tmp;
-
-  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
-
-    inptr_1 = input_data[inrow - 1];
-    inptr0 = input_data[inrow];
-    inptr1 = input_data[inrow + 1];
-    outptr0 = output_data[outrow++];
-    outptr1 = output_data[outrow++];
-
-    if (downsampled_width & 7) {
-      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
-      tmp1 =  downsampled_width * sizeof(JSAMPLE);
-      asm("daddu  $8, %3, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %3, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          "daddu  $8, %4, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %4, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          "daddu  $8, %5, %6\r\n"
-          "lb     $9, ($8)\r\n"
-          "daddu  $8, %5, %7\r\n"
-          "sb     $9, ($8)\r\n"
-          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
-          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
-          : "$8", "$9"
-         );
-    }
-
-    /* process the first column block */
-    mm0 = _mm_load_si64((__m64 *)inptr0);     /* mm0 = row[ 0][0] */
-    mm1 = _mm_load_si64((__m64 *)inptr_1);    /* mm1 = row[-1][0] */
-    mm2 = _mm_load_si64((__m64 *)inptr1);     /* mm2 = row[ 1][0] */
-
-    mm3 = _mm_xor_si64(mm3, mm3);             /* mm3 = (all 0's) */
-    mm4 = mm0;
-    mm0 = _mm_unpacklo_pi8(mm0, mm3);         /* mm0 = row[ 0][0]( 0 1 2 3) */
-    mm4 = _mm_unpackhi_pi8(mm4, mm3);         /* mm4 = row[ 0][0]( 4 5 6 7) */
-    mm5 = mm1;
-    mm1 = _mm_unpacklo_pi8(mm1, mm3);         /* mm1 = row[-1][0]( 0 1 2 3) */
-    mm5 = _mm_unpackhi_pi8(mm5, mm3);         /* mm5 = row[-1][0]( 4 5 6 7) */
-    mm6 = mm2;
-    mm2 = _mm_unpacklo_pi8(mm2, mm3);         /* mm2 = row[+1][0]( 0 1 2 3) */
-    mm6 = _mm_unpackhi_pi8(mm6, mm3);         /* mm6 = row[+1][0]( 4 5 6 7) */
-
-    mm0 = _mm_mullo_pi16(mm0, PW_THREE);
-    mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
-    mm7 = _mm_cmpeq_pi8(mm7, mm7);
-    mm7 = _mm_srli_si64(mm7, (SIZEOF_MMWORD - 2) * BYTE_BIT);
-
-    mm1 = _mm_add_pi16(mm1, mm0);             /* mm1=Int0L=( 0 1 2 3) */
-    mm5 = _mm_add_pi16(mm5, mm4);             /* mm5=Int0H=( 4 5 6 7) */
-    mm2 = _mm_add_pi16(mm2, mm0);             /* mm2=Int1L=( 0 1 2 3) */
-    mm6 = _mm_add_pi16(mm6, mm4);             /* mm6=Int1H=( 4 5 6 7) */
-
-    _mm_store_si64((__m64 *)outptr0, mm1);      /* temporarily save */
-    _mm_store_si64((__m64 *)outptr0 + 1, mm5);  /* the intermediate data */
-    _mm_store_si64((__m64 *)outptr1, mm2);
-    _mm_store_si64((__m64 *)outptr1 + 1, mm6);
-
-    mm1 = _mm_and_si64(mm1, mm7);             /* mm1=( 0 - - -) */
-    mm2 = _mm_and_si64(mm2, mm7);             /* mm2=( 0 - - -) */
-
-    wk[0] = mm1;
-    wk[1] = mm2;
-
-    for (incol = downsampled_width; incol > 0;
-         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
-         outptr0 += 16, outptr1 += 16) {
-
-      if (incol > 8) {
-        /* process the next column block */
-        mm0 = _mm_load_si64((__m64 *)inptr0 + 1);   /* mm0 = row[ 0][1] */
-        mm1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* mm1 = row[-1][1] */
-        mm2 = _mm_load_si64((__m64 *)inptr1 + 1);   /* mm2 = row[+1][1] */
-
-        mm3 = _mm_setzero_si64();             /* mm3 = (all 0's) */
-        mm4 = mm0;
-        mm0 = _mm_unpacklo_pi8(mm0, mm3);     /* mm0 = row[ 0][1]( 0 1 2 3) */
-        mm4 = _mm_unpackhi_pi8(mm4, mm3);     /* mm4 = row[ 0][1]( 4 5 6 7) */
-        mm5 = mm1;
-        mm1 = _mm_unpacklo_pi8(mm1, mm3);     /* mm1 = row[-1][1]( 0 1 2 3) */
-        mm5 = _mm_unpackhi_pi8(mm5, mm3);     /* mm5 = row[-1][1]( 4 5 6 7) */
-        mm6 = mm2;
-        mm2 = _mm_unpacklo_pi8(mm2, mm3);     /* mm2 = row[+1][1]( 0 1 2 3) */
-        mm6 = _mm_unpackhi_pi8(mm6, mm3);     /* mm6 = row[+1][1]( 4 5 6 7) */
-
-        mm0 = _mm_mullo_pi16(mm0, PW_THREE);
-        mm4 = _mm_mullo_pi16(mm4, PW_THREE);
-
-        mm1 = _mm_add_pi16(mm1, mm0);         /* mm1 = Int0L = ( 0 1 2 3) */
-        mm5 = _mm_add_pi16(mm5, mm4);         /* mm5 = Int0H = ( 4 5 6 7) */
-        mm2 = _mm_add_pi16(mm2, mm0);         /* mm2 = Int1L = ( 0 1 2 3) */
-        mm6 = _mm_add_pi16(mm6, mm4);         /* mm6 = Int1H = ( 4 5 6 7) */
-
-        _mm_store_si64((__m64 *)outptr0 + 2, mm1);  /* temporarily save */
-        _mm_store_si64((__m64 *)outptr0 + 3, mm5);  /* the intermediate data */
-        _mm_store_si64((__m64 *)outptr1 + 2, mm2);
-        _mm_store_si64((__m64 *)outptr1 + 3, mm6);
-
-        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm1=( - - - 0) */
-        mm2 = _mm_slli_si64(mm2, (SIZEOF_MMWORD - 2) * BYTE_BIT); /* mm2=( - - - 0) */
-
-        wk[2] = mm1;
-        wk[3] = mm2;
-      } else {
-        /* process the last column block */
-        mm1 = _mm_cmpeq_pi8(mm1, mm1);
-        mm1 = _mm_slli_si64(mm1, (SIZEOF_MMWORD - 2) * BYTE_BIT);
-        mm2 = mm1;
-
-        mm_tmp = _mm_load_si64((__m64 *)outptr0 + 1);
-        mm1 = _mm_and_si64(mm1, mm_tmp);      /* mm1=( - - - 7) */
-        mm_tmp = _mm_load_si64((__m64 *)outptr1 + 1);
-        mm2 = _mm_and_si64(mm2, mm_tmp);      /* mm2=( - - - 7) */
-
-        wk[2] = mm1;
-        wk[3] = mm2;
-      }
-
-      /* process the upper row */
-      PROCESS_ROW(0)
-
-      /* process the lower row */
-      PROCESS_ROW(1)
-    }
-  }
-}
diff --git a/external/jpeg/simd/loongson/jquanti-mmi.c b/external/jpeg/simd/loongson/jquanti-mmi.c
deleted file mode 100644
index f9a3f8199672..000000000000
--- a/external/jpeg/simd/loongson/jquanti-mmi.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Loongson MMI optimizations for libjpeg-turbo
- *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
- *                          All Rights Reserved.
- * Authors:  ZhuChen     <zhuchen@loongson.cn>
- *           CaiWanwei   <caiwanwei@loongson.cn>
- *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
- * Copyright (C) 2018, D. R. Commander.  All Rights Reserved.
- *
- * Based on the x86 SIMD extension for IJG JPEG library
- * Copyright (C) 1999-2006, MIYASAKA Masaru.
- *
- * This software is provided 'as-is', without any express or implied
- * warranty.  In no event will the authors be held liable for any damages
- * arising from the use of this software.
- *
- * Permission is granted to anyone to use this software for any purpose,
- * including commercial applications, and to alter it and redistribute it
- * freely, subject to the following restrictions:
- *
- * 1. The origin of this software must not be misrepresented; you must not
- *    claim that you wrote the original software. If you use this software
- *    in a product, an acknowledgment in the product documentation would be
- *    appreciated but is not required.
- * 2. Altered source versions must be plainly marked as such, and must not be
- *    misrepresented as being the original software.
- * 3. This notice may not be removed or altered from any source distribution.
- */
-
-/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
-
-#include "jsimd_mmi.h"
-
-
-#define DO_QUANT() { \
-  mm2 = _mm_load_si64((__m64 *)&workspace[0]); \
-  mm3 = _mm_load_si64((__m64 *)&workspace[4]); \
-  \
-  mm0 = mm2; \
-  mm1 = mm3; \
-  \
-  mm2 = _mm_srai_pi16(mm2, (WORD_BIT - 1));   /* -1 if value < 0, */ \
-                                              /* 0 otherwise */ \
-  mm3 = _mm_srai_pi16(mm3, (WORD_BIT - 1)); \
-  \
-  mm0 = _mm_xor_si64(mm0, mm2);               /* val = -val */ \
-  mm1 = _mm_xor_si64(mm1, mm3); \
-  mm0 = _mm_sub_pi16(mm0, mm2); \
-  mm1 = _mm_sub_pi16(mm1, mm3); \
-  \
-  corr0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
-  corr1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
-  \
-  mm0 = _mm_add_pi16(mm0, corr0);             /* correction + roundfactor */ \
-  mm1 = _mm_add_pi16(mm1, corr1); \
-  \
-  mm4 = mm0; \
-  mm5 = mm1; \
-  \
-  recip0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
-  recip1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
-  \
-  mm0 = _mm_mulhi_pi16(mm0, recip0); \
-  mm1 = _mm_mulhi_pi16(mm1, recip1); \
-  \
-  mm0 = _mm_add_pi16(mm0, mm4);  /* reciprocal is always negative */ \
-  mm1 = _mm_add_pi16(mm1, mm5);  /* (MSB=1), so we always need to add the */ \
-                                 /* initial value (input value is never */ \
-                                 /* negative as we inverted it at the */ \
-                                 /* start of this routine) */ \
-  \
-  scale0 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
-  scale1 = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
-  \
-  mm6 = scale0; \
-  mm7 = scale1; \
-  mm4 = mm0; \
-  mm5 = mm1; \
-  \
-  mm0 = _mm_mulhi_pi16(mm0, mm6); \
-  mm1 = _mm_mulhi_pi16(mm1, mm7); \
-  \
-  mm6 = _mm_srai_pi16(mm6, (WORD_BIT - 1));   /* determine if scale... */ \
-                                              /* is negative */ \
-  mm7 = _mm_srai_pi16(mm7, (WORD_BIT - 1)); \
-  \
-  mm6 = _mm_and_si64(mm6, mm4);               /* and add input if it is */ \
-  mm7 = _mm_and_si64(mm7, mm5); \
-  mm0 = _mm_add_pi16(mm0, mm6); \
-  mm1 = _mm_add_pi16(mm1, mm7); \
-  \
-  mm4 = _mm_srai_pi16(mm4, (WORD_BIT - 1));   /* then check if... */ \
-  mm5 = _mm_srai_pi16(mm5, (WORD_BIT - 1));   /* negative input */ \
-  \
-  mm4 = _mm_and_si64(mm4, scale0);            /* and add scale if it is */ \
-  mm5 = _mm_and_si64(mm5, scale1); \
-  mm0 = _mm_add_pi16(mm0, mm4); \
-  mm1 = _mm_add_pi16(mm1, mm5); \
-  \
-  mm0 = _mm_xor_si64(mm0, mm2);               /* val = -val */ \
-  mm1 = _mm_xor_si64(mm1, mm3); \
-  mm0 = _mm_sub_pi16(mm0, mm2); \
-  mm1 = _mm_sub_pi16(mm1, mm3); \
-  \
-  _mm_store_si64((__m64 *)&output_ptr[0], mm0); \
-  _mm_store_si64((__m64 *)&output_ptr[4], mm1); \
-  \
-  workspace += DCTSIZE; \
-  divisors += DCTSIZE; \
-  output_ptr += DCTSIZE; \
-}
-
-
-void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
-                        DCTELEM *workspace)
-{
-  JCOEFPTR output_ptr = coef_block;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7;
-  __m64 corr0, corr1, recip0, recip1, scale0, scale1;
-
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-  DO_QUANT()
-}
diff --git a/external/jpeg/simd/mips64/jccolext-mmi.c b/external/jpeg/simd/mips64/jccolext-mmi.c
new file mode 100644
index 000000000000..558eb2ab1020
--- /dev/null
+++ b/external/jpeg/simd/mips64/jccolext-mmi.c
@@ -0,0 +1,455 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jccolor-mmi.c */
+
+
+#if RGB_RED == 0        /* mmA/mmB: the component whose RGB_* index is 0 */
+#define mmA  re         /* re/ge/be: even pixels once unpacked (see below) */
+#define mmB  ro         /* ro/go/bo: odd pixels once unpacked (see below) */
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else                   /* index 0 is not R, G or B: use the spare xe/xo */
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1        /* mmC/mmD: the component whose RGB_* index is 1 */
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else                   /* index 1 is not R, G or B: use the spare xe/xo */
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2        /* mmE/mmF: the component whose RGB_* index is 2 */
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else                   /* index 2 is not R, G or B: use the spare xe/xo */
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3        /* mmG/mmH: the component whose RGB_* index is 3 */
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else                   /* index 3 is not R, G or B: use the spare xe/xo */
+#define mmG  xe
+#define mmH  xo
+#endif
+
+/* Converts packed RGB rows to planar Y, Cb and Cr rows, 8 pixels at a time. */
+void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                               JSAMPIMAGE output_buf, JDIMENSION output_row,
+                               int num_rows)
+{
+  JSAMPROW inptr, outptr0, outptr1, outptr2;  /* outptr0/1/2: Y/Cb/Cr rows */
+  int num_cols, col;  /* col: size of a partial (< 8 pixel) tail block */
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
+  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
+  __m64 crle, crhe, cre, crlo, crho, cro, cr;
+
+  while (--num_rows >= 0) {  /* one pass per input row */
+    inptr = *input_buf++;
+    outptr0 = output_buf[0][output_row];
+    outptr1 = output_buf[1][output_row];
+    outptr2 = output_buf[2][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr0 += 8, outptr1 += 8, outptr2 += 8) {  /* 8 pixels per pass */
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {  /* tail: fewer than 8 pixels remain in the row */
+        col = num_cols * 3;  /* remaining bytes, 3 per pixel */
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"  /* col & 1: pick up the last byte */
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"  /* col & 2: pick up 2 more bytes */
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"  /* col & 4: pick up 4 more bytes */
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"  /* col & 8: 8 bytes from the start of inptr */
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"  /* col & 16: 16 bytes from the start of inptr */
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)  /* %3, %4, %5; %4 is never referenced */
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {  /* address is a multiple of 8 */
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {  /* otherwise use the loadu variants */
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;  /* step past the 8 pixels just loaded */
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);  /* regroup the 24 packed bytes */
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);  /* into the layout noted below */
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {  /* tail: fewer than 8 pixels remain in the row */
+        col = num_cols;  /* remaining pixels; the asm scales by 4 bytes */
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"  /* col & 1: pick up the last pixel */
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"  /* col & 2: pick up 2 more pixels */
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"  /* col & 4: 16 bytes from the start of inptr */
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)  /* %4, %5 */
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {  /* address is a multiple of 8 */
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {  /* otherwise use the loadu variants */
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;  /* step past the 8 pixels just loaded */
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);  /* regroup the 32 packed bytes */
+      mmA = _mm_unpacklo_pi8(mmA, mmF);  /* into the layout noted below */
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
+       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);  /* odd pixels: (R, G) pairs */
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);  /* R and G part of Y */
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);  /* R and G part of Cb */
+      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
+
+      blo = _mm_loadlo_pi16_f(bo);
+      bho = _mm_loadhi_pi16_f(bo);
+      halfblo = _mm_srli_pi32(blo, 1);  /* the 0.5 * B term of Cb */
+      halfbho = _mm_srli_pi32(bho, 1);
+
+      cblo = _mm_add_pi32(cblo, halfblo);
+      cbho = _mm_add_pi32(cbho, halfbho);
+      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);  /* presumably round + centre */
+      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
+      cblo = _mm_srli_pi32(cblo, SCALEBITS);  /* drop the fixed-point fraction */
+      cbho = _mm_srli_pi32(cbho, SCALEBITS);
+      cbo = _mm_packs_pi32(cblo, cbho);  /* Cb of the odd pixels */
+
+      rgle = _mm_unpacklo_pi16(re, ge);  /* even pixels: same Cb steps */
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
+      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
+
+      ble = _mm_loadlo_pi16_f(be);
+      bhe = _mm_loadhi_pi16_f(be);
+      halfble = _mm_srli_pi32(ble, 1);
+      halfbhe = _mm_srli_pi32(bhe, 1);
+
+      cble = _mm_add_pi32(cble, halfble);
+      cbhe = _mm_add_pi32(cbhe, halfbhe);
+      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
+      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
+      cble = _mm_srli_pi32(cble, SCALEBITS);
+      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
+      cbe = _mm_packs_pi32(cble, cbhe);  /* Cb of the even pixels */
+
+      cbo = _mm_slli_pi16(cbo, BYTE_BIT);  /* odd -> high byte of each lane */
+      cb = _mm_or_si64(cbe, cbo);  /* even and odd merged: 8 Cb bytes */
+
+      bglo = _mm_unpacklo_pi16(bo, go);  /* odd pixels: (B, G) pairs */
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);  /* B and 0.25 G part of Y */
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);  /* B and G part of Cr */
+      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);  /* both halves of the Y sum */
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);  /* Y of the odd pixels */
+
+      rlo = _mm_loadlo_pi16_f(ro);
+      rho = _mm_loadhi_pi16_f(ro);
+      halfrlo = _mm_srli_pi32(rlo, 1);  /* the 0.5 * R term of Cr */
+      halfrho = _mm_srli_pi32(rho, 1);
+
+      crlo = _mm_add_pi32(crlo, halfrlo);
+      crho = _mm_add_pi32(crho, halfrho);
+      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
+      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
+      crlo = _mm_srli_pi32(crlo, SCALEBITS);
+      crho = _mm_srli_pi32(crho, SCALEBITS);
+      cro = _mm_packs_pi32(crlo, crho);  /* Cr of the odd pixels */
+
+      bgle = _mm_unpacklo_pi16(be, ge);  /* even pixels: same Y and Cr steps */
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
+      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);  /* Y of the even pixels */
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);  /* odd -> high byte of each lane */
+      y = _mm_or_si64(ye, yo);  /* even and odd merged: 8 Y bytes */
+
+      rle = _mm_loadlo_pi16_f(re);
+      rhe = _mm_loadhi_pi16_f(re);
+      halfrle = _mm_srli_pi32(rle, 1);
+      halfrhe = _mm_srli_pi32(rhe, 1);
+
+      crle = _mm_add_pi32(crle, halfrle);
+      crhe = _mm_add_pi32(crhe, halfrhe);
+      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
+      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
+      crle = _mm_srli_pi32(crle, SCALEBITS);
+      crhe = _mm_srli_pi32(crhe, SCALEBITS);
+      cre = _mm_packs_pi32(crle, crhe);  /* Cr of the even pixels */
+
+      cro = _mm_slli_pi16(cro, BYTE_BIT);  /* odd -> high byte of each lane */
+      cr = _mm_or_si64(cre, cro);  /* even and odd merged: 8 Cr bytes */
+
+      _mm_store_si64((__m64 *)&outptr0[0], y);  /* always 8 bytes per plane, */
+      _mm_store_si64((__m64 *)&outptr1[0], cb);  /* even when the block is a */
+      _mm_store_si64((__m64 *)&outptr2[0], cr);  /* partial tail */
+    }
+  }
+}
+
+#undef mmA  /* remove the register aliases defined at the top of this file */
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/external/jpeg/simd/loongson/jccolor-mmi.c b/external/jpeg/simd/mips64/jccolor-mmi.c
similarity index 100%
rename from external/jpeg/simd/loongson/jccolor-mmi.c
rename to external/jpeg/simd/mips64/jccolor-mmi.c
diff --git a/external/jpeg/simd/mips64/jcgray-mmi.c b/external/jpeg/simd/mips64/jcgray-mmi.c
new file mode 100644
index 000000000000..9c7b833f2e7b
--- /dev/null
+++ b/external/jpeg/simd/mips64/jcgray-mmi.c
@@ -0,0 +1,132 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2014, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* RGB --> GRAYSCALE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_114  ((short)7471)                /* FIX(0.11400) */
+#define F_0_250  ((short)16384)               /* FIX(0.25000) */
+#define F_0_299  ((short)19595)               /* FIX(0.29900) */
+#define F_0_587  ((short)38470)               /* FIX(0.58700), > 16-bit max */
+#define F_0_337  ((short)(F_0_587 - F_0_250)) /* FIX(0.58700) - FIX(0.25000) */
+
+enum const_index {  /* positions of the entries in const_value[] below */
+  index_PD_ONEHALF,
+  index_PW_F0299_F0337,
+  index_PW_F0114_F0250
+};
+
+static uint64_t const_value[] = {  /* same order as enum const_index */
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1))),
+  _uint64_set_pi16(F_0_337, F_0_299, F_0_337, F_0_299),
+  _uint64_set_pi16(F_0_250, F_0_114, F_0_250, F_0_114)
+};
+
+#define get_const_value(index)  (*(__m64 *)&const_value[index])  /* as __m64 */
+
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+#define PW_F0299_F0337   get_const_value(index_PW_F0299_F0337)
+#define PW_F0114_F0250   get_const_value(index_PW_F0114_F0250)
+
+
+#include "jcgryext-mmi.c"  /* inherited RGB_*: jsimd_rgb_gray_convert_mmi */
+#undef RGB_RED  /* clear the layout before selecting the next one */
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"  /* emits jsimd_extrgb_gray_convert_mmi */
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extrgbx_gray_convert_mmi
+#include "jcgryext-mmi.c"  /* emits jsimd_extrgbx_gray_convert_mmi */
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"  /* emits jsimd_extbgr_gray_convert_mmi */
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extbgrx_gray_convert_mmi
+#include "jcgryext-mmi.c"  /* emits jsimd_extbgrx_gray_convert_mmi */
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxbgr_gray_convert_mmi
+#include "jcgryext-mmi.c"  /* emits jsimd_extxbgr_gray_convert_mmi */
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_rgb_gray_convert_mmi  jsimd_extxrgb_gray_convert_mmi
+#include "jcgryext-mmi.c"  /* emits jsimd_extxrgb_gray_convert_mmi */
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_rgb_gray_convert_mmi
diff --git a/external/jpeg/simd/mips64/jcgryext-mmi.c b/external/jpeg/simd/mips64/jcgryext-mmi.c
new file mode 100644
index 000000000000..08a83d6699cb
--- /dev/null
+++ b/external/jpeg/simd/mips64/jcgryext-mmi.c
@@ -0,0 +1,374 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jcgray-mmi.c */
+
+
+#if RGB_RED == 0        /* mmA/mmB: the component whose RGB_* index is 0 */
+#define mmA  re         /* e suffix: presumably even pixels - TODO confirm */
+#define mmB  ro         /* o suffix: presumably odd pixels - TODO confirm */
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else                   /* index 0 is not R, G or B: use the spare xe/xo */
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1        /* mmC/mmD: the component whose RGB_* index is 1 */
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else                   /* index 1 is not R, G or B: use the spare xe/xo */
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2        /* mmE/mmF: the component whose RGB_* index is 2 */
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else                   /* index 2 is not R, G or B: use the spare xe/xo */
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3        /* mmG/mmH: the component whose RGB_* index is 3 */
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else                   /* index 3 is not R, G or B: use the spare xe/xo */
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
+                                JSAMPIMAGE output_buf, JDIMENSION output_row,
+                                int num_rows)
+{
+  JSAMPROW inptr, outptr;
+  int num_cols, col;
+  __m64 re, ro, ge, go, be, bo, xe;
+#if RGB_PIXELSIZE == 4
+  __m64 xo;
+#endif
+  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
+  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
+  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
+
+  while (--num_rows >= 0) {
+    inptr = *input_buf++;
+    outptr = output_buf[0][output_row];
+    output_row++;
+
+    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
+         outptr += 8) {
+
+#if RGB_PIXELSIZE == 3
+
+      if (num_cols < 8) {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %3\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            "xor      $12, $12, $12\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lbu      $12, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            "xor      $11, $11, $11\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lhu      $11, 0($13)\r\n"
+            "sll      $12, $12, 16\r\n"
+            "or       $12, $12, $11\r\n"
+
+            "2:       \r\n"
+            "dmtc1    $12, %0\r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 4\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $9\r\n"
+            "lwu      $14, 0($13)\r\n"
+            "dmtc1    $14, %1\r\n"
+            "dsll32   $12, $12, 0\r\n"
+            "or       $12, $12, $14\r\n"
+            "dmtc1    $12, %0\r\n"
+
+            "3:       \r\n"
+            "li       $8, 8\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 4f\r\n"
+            "nop      \r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "li       $9, 8\r\n"
+            "j        5f\r\n"
+            "nop      \r\n"
+
+            "4:       \r\n"
+            "li       $8, 16\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 5f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
+            : "r" (col), "r" (num_rows), "r" (inptr)
+            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
+              "$14", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmG = _mm_load_si64((__m64 *)&inptr[8]);
+          mmF = _mm_load_si64((__m64 *)&inptr[16]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmG);
+      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
+
+      mmD = _mm_unpacklo_pi8(mmD, mmF);
+      mmG = _mm_unpackhi_pi8(mmG, mmF);
+
+      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
+
+      mmA = _mm_unpackhi_pi8(mmA, mmD);
+      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
+
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmD = _mm_unpackhi_pi8(mmD, mmG);
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmB = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_loadhi_pi8_f(mmD);
+      mmD = _mm_loadlo_pi8_f(mmD);
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+      if (num_cols < 8) {
+        col = num_cols;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 1\r\n"
+            "move     $9, %4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 1f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 1\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "lwc1     %0, 0($13)\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 2f\r\n"
+            "nop      \r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_SLL   "$11, $9, 2\r\n"
+            "move     $13, %5\r\n"
+            PTR_ADDU  "$13, $13, $11\r\n"
+            "mov.s    %1, %0\r\n"
+            "ldc1     %0, 0($13)\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"
+            "and      $10, $9, $8\r\n"
+            "beqz     $10, 3f\r\n"
+            "nop      \r\n"
+            "mov.s    %2, %0\r\n"
+            "mov.s    %3, %1\r\n"
+            "ldc1     %0, 0(%5)\r\n"
+            "ldc1     %1, 8(%5)\r\n"
+
+            "3:       \r\n"
+            "nop      \r\n"
+            ".set reorder\r\n"
+
+            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
+            : "r" (col), "r" (inptr)
+            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
+           );
+      } else {
+        if (!(((long)inptr) & 7)) {
+          mmA = _mm_load_si64((__m64 *)&inptr[0]);
+          mmF = _mm_load_si64((__m64 *)&inptr[8]);
+          mmD = _mm_load_si64((__m64 *)&inptr[16]);
+          mmC = _mm_load_si64((__m64 *)&inptr[24]);
+        } else {
+          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
+          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
+          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
+          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
+        }
+        inptr += RGB_PIXELSIZE * 8;
+      }
+      mmB = _mm_unpackhi_pi8(mmA, mmF);
+      mmA = _mm_unpacklo_pi8(mmA, mmF);
+
+      mmG = _mm_unpackhi_pi8(mmD, mmC);
+      mmD = _mm_unpacklo_pi8(mmD, mmC);
+
+      mmE = _mm_unpackhi_pi16(mmA, mmD);
+      mmA = _mm_unpacklo_pi16(mmA, mmD);
+
+      mmH = _mm_unpackhi_pi16(mmB, mmG);
+      mmB = _mm_unpacklo_pi16(mmB, mmG);
+
+      mmC = _mm_loadhi_pi8_f(mmA);
+      mmA = _mm_loadlo_pi8_f(mmA);
+
+      mmD = _mm_loadhi_pi8_f(mmB);
+      mmB = _mm_loadlo_pi8_f(mmB);
+
+      mmG = _mm_loadhi_pi8_f(mmE);
+      mmE = _mm_loadlo_pi8_f(mmE);
+
+      mmF = _mm_unpacklo_pi8(mmH, mmH);
+      mmH = _mm_unpackhi_pi8(mmH, mmH);
+      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
+      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
+
+#endif
+
+      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
+       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
+       *
+       * (Original)
+       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
+       *
+       * (This implementation)
+       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
+       */
+
+      rglo = _mm_unpacklo_pi16(ro, go);
+      rgho = _mm_unpackhi_pi16(ro, go);
+      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
+      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
+
+      rgle = _mm_unpacklo_pi16(re, ge);
+      rghe = _mm_unpackhi_pi16(re, ge);
+      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
+      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
+
+      bglo = _mm_unpacklo_pi16(bo, go);
+      bgho = _mm_unpackhi_pi16(bo, go);
+      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
+      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
+
+      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
+      yho = _mm_add_pi32(yho_bg, yho_rg);
+      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
+      yho = _mm_add_pi32(yho, PD_ONEHALF);
+      ylo = _mm_srli_pi32(ylo, SCALEBITS);
+      yho = _mm_srli_pi32(yho, SCALEBITS);
+      yo = _mm_packs_pi32(ylo, yho);
+
+      bgle = _mm_unpacklo_pi16(be, ge);
+      bghe = _mm_unpackhi_pi16(be, ge);
+      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
+      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
+
+      yle = _mm_add_pi32(yle_bg, yle_rg);
+      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
+      yle = _mm_add_pi32(yle, PD_ONEHALF);
+      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
+      yle = _mm_srli_pi32(yle, SCALEBITS);
+      yhe = _mm_srli_pi32(yhe, SCALEBITS);
+      ye = _mm_packs_pi32(yle, yhe);
+
+      yo = _mm_slli_pi16(yo, BYTE_BIT);
+      y = _mm_or_si64(ye, yo);
+
+      _mm_store_si64((__m64 *)&outptr[0], y);
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/external/jpeg/simd/loongson/jcsample-mmi.c b/external/jpeg/simd/mips64/jcsample-mmi.c
similarity index 56%
rename from external/jpeg/simd/loongson/jcsample-mmi.c
rename to external/jpeg/simd/mips64/jcsample-mmi.c
index 2f2d85196ccc..0354dac0879c 100644
--- a/external/jpeg/simd/loongson/jcsample-mmi.c
+++ b/external/jpeg/simd/mips64/jcsample-mmi.c
@@ -1,7 +1,7 @@
 /*
  * Loongson MMI optimizations for libjpeg-turbo
  *
- * Copyright (C) 2015, 2018, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
  * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
  *                          All Rights Reserved.
  * Authors:  ZhuChen     <zhuchen@loongson.cn>
@@ -39,18 +39,20 @@ void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
                                JDIMENSION width_in_blocks,
                                JSAMPARRAY input_data, JSAMPARRAY output_data)
 {
-  int inrow, outrow, outcol, bias;
+  int inrow, outrow, outcol;
   JDIMENSION output_cols = width_in_blocks * DCTSIZE;
   JSAMPROW inptr0, inptr1, outptr;
-  __m64 mm0, mm1, mm2, mm3, mm4, mm5, mm6 = 0.0, mm7;
+  __m64 bias, mask = 0.0, thisavg, nextavg, avg;
+  __m64 this0o, this0e, this0, this0sum, next0o, next0e, next0, next0sum;
+  __m64 this1o, this1e, this1, this1sum, next1o, next1e, next1, next1sum;
 
   expand_right_edge(input_data, max_v_samp_factor, image_width,
                     output_cols * 2);
 
-  bias = (1 << 17) + 1;                      /* 0x00020001 (bias pattern) */
-  mm7 = _mm_set1_pi32(bias);                 /* mm7={1, 2, 1, 2} */
-  mm6 = _mm_cmpeq_pi16(mm6, mm6);
-  mm6 = _mm_srli_pi16(mm6, BYTE_BIT);        /* mm6={0xFF 0x00 0xFF 0x00 ..} */
+  bias = _mm_set1_pi32((1 << 17) + 1);   /* 0x00020001 (32-bit bias pattern) */
+                                         /* bias={1, 2, 1, 2} (16-bit) */
+  mask = _mm_cmpeq_pi16(mask, mask);
+  mask = _mm_srli_pi16(mask, BYTE_BIT);  /* {0xFF 0x00 0xFF 0x00 ..} */
 
   for (inrow = 0, outrow = 0; outrow < v_samp_factor;
        inrow += 2, outrow++) {
@@ -62,39 +64,35 @@ void jsimd_h2v2_downsample_mmi(JDIMENSION image_width, int max_v_samp_factor,
     for (outcol = output_cols; outcol > 0;
          outcol -= 8, inptr0 += 16, inptr1 += 16, outptr += 8) {
 
-      mm0 = _mm_load_si64((__m64 *)&inptr0[0]);
-      mm1 = _mm_load_si64((__m64 *)&inptr1[0]);
-      mm2 = _mm_load_si64((__m64 *)&inptr0[8]);
-      mm3 = _mm_load_si64((__m64 *)&inptr1[8]);
+      this0 = _mm_load_si64((__m64 *)&inptr0[0]);
+      this1 = _mm_load_si64((__m64 *)&inptr1[0]);
+      next0 = _mm_load_si64((__m64 *)&inptr0[8]);
+      next1 = _mm_load_si64((__m64 *)&inptr1[8]);
 
-      mm4 = mm0;
-      mm5 = mm1;
-      mm0 = _mm_and_si64(mm0, mm6);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm1 = _mm_and_si64(mm1, mm6);
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
-      mm0 = _mm_add_pi16(mm0, mm4);
-      mm1 = _mm_add_pi16(mm1, mm5);
+      this0o = _mm_and_si64(this0, mask);
+      this0e = _mm_srli_pi16(this0, BYTE_BIT);
+      this1o = _mm_and_si64(this1, mask);
+      this1e = _mm_srli_pi16(this1, BYTE_BIT);
+      this0sum = _mm_add_pi16(this0o, this0e);
+      this1sum = _mm_add_pi16(this1o, this1e);
 
-      mm4 = mm2;
-      mm5 = mm3;
-      mm2 = _mm_and_si64(mm2, mm6);
-      mm4 = _mm_srli_pi16(mm4, BYTE_BIT);
-      mm3 = _mm_and_si64(mm3, mm6);
-      mm5 = _mm_srli_pi16(mm5, BYTE_BIT);
-      mm2 = _mm_add_pi16(mm2, mm4);
-      mm3 = _mm_add_pi16(mm3, mm5);
+      next0o = _mm_and_si64(next0, mask);
+      next0e = _mm_srli_pi16(next0, BYTE_BIT);
+      next1o = _mm_and_si64(next1, mask);
+      next1e = _mm_srli_pi16(next1, BYTE_BIT);
+      next0sum = _mm_add_pi16(next0o, next0e);
+      next1sum = _mm_add_pi16(next1o, next1e);
 
-      mm0 = _mm_add_pi16(mm0, mm1);
-      mm2 = _mm_add_pi16(mm2, mm3);
-      mm0 = _mm_add_pi16(mm0, mm7);
-      mm2 = _mm_add_pi16(mm2, mm7);
-      mm0 = _mm_srli_pi16(mm0, 2);
-      mm2 = _mm_srli_pi16(mm2, 2);
+      thisavg = _mm_add_pi16(this0sum, this1sum);
+      nextavg = _mm_add_pi16(next0sum, next1sum);
+      thisavg = _mm_add_pi16(thisavg, bias);
+      nextavg = _mm_add_pi16(nextavg, bias);
+      thisavg = _mm_srli_pi16(thisavg, 2);
+      nextavg = _mm_srli_pi16(nextavg, 2);
 
-      mm0 = _mm_packs_pu16(mm0, mm2);
+      avg = _mm_packs_pu16(thisavg, nextavg);
 
-      _mm_store_si64((__m64 *)&outptr[0], mm0);
+      _mm_store_si64((__m64 *)&outptr[0], avg);
     }
   }
 }
diff --git a/external/jpeg/simd/loongson/jcsample.h b/external/jpeg/simd/mips64/jcsample.h
similarity index 90%
rename from external/jpeg/simd/loongson/jcsample.h
rename to external/jpeg/simd/mips64/jcsample.h
index 2ac48167fc20..bd07fcc4ed4a 100644
--- a/external/jpeg/simd/loongson/jcsample.h
+++ b/external/jpeg/simd/mips64/jcsample.h
@@ -20,7 +20,7 @@ expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
diff --git a/external/jpeg/simd/mips64/jdcolext-mmi.c b/external/jpeg/simd/mips64/jdcolext-mmi.c
new file mode 100644
index 000000000000..3b5b2f203078
--- /dev/null
+++ b/external/jpeg/simd/mips64/jdcolext-mmi.c
@@ -0,0 +1,415 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdcolor-mmi.c */
+
+
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else
+#define mmG  xe
+#define mmH  xo
+#endif
+
+
+void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
+                               JDIMENSION input_row, JSAMPARRAY output_buf,
+                               int num_rows)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
+  __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask;
+
+  while (--num_rows >= 0) {
+    inptr0 = input_buf[0][input_row];
+    inptr1 = input_buf[1][input_row];
+    inptr2 = input_buf[2][input_row];
+    input_row++;
+    outptr = *output_buf++;
+
+    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
+         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
+
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      mask = decenter = 0.0;
+      mask = _mm_cmpeq_pi16(mask, mask);
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbe = _mm_and_si64(mask, cb);           /* Cb(0246) */
+      cbo = _mm_srli_pi16(cb, BYTE_BIT);      /* Cb(1357) */
+      cre = _mm_and_si64(mask, cr);           /* Cr(0246) */
+      cro = _mm_srli_pi16(cr, BYTE_BIT);      /* Cr(1357) */
+      cbe = _mm_add_pi16(cbe, decenter);
+      cbo = _mm_add_pi16(cbo, decenter);
+      cre = _mm_add_pi16(cre, decenter);
+      cro = _mm_add_pi16(cro, decenter);
+
+      /* (Original)
+       * R = Y                + 1.40200 * Cr
+       * G = Y - 0.34414 * Cb - 0.71414 * Cr
+       * B = Y + 1.77200 * Cb
+       *
+       * (This implementation)
+       * R = Y                + 0.40200 * Cr + Cr
+       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+       * B = Y - 0.22800 * Cb + Cb + Cb
+       */
+
+      cbe2 = _mm_add_pi16(cbe, cbe);          /* 2*CbE */
+      cbo2 = _mm_add_pi16(cbo, cbo);          /* 2*CbO */
+      cre2 = _mm_add_pi16(cre, cre);          /* 2*CrE */
+      cro2 = _mm_add_pi16(cro, cro);          /* 2*CrO */
+
+      be = _mm_mulhi_pi16(cbe2, PW_MF0228);   /* (2*CbE * -FIX(0.22800)) */
+      bo = _mm_mulhi_pi16(cbo2, PW_MF0228);   /* (2*CbO * -FIX(0.22800)) */
+      re = _mm_mulhi_pi16(cre2, PW_F0402);    /* (2*CrE * FIX(0.40200)) */
+      ro = _mm_mulhi_pi16(cro2, PW_F0402);    /* (2*CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, PW_ONE);
+      bo = _mm_add_pi16(bo, PW_ONE);
+      be = _mm_srai_pi16(be, 1);              /* (CbE * -FIX(0.22800)) */
+      bo = _mm_srai_pi16(bo, 1);              /* (CbO * -FIX(0.22800)) */
+      re = _mm_add_pi16(re, PW_ONE);
+      ro = _mm_add_pi16(ro, PW_ONE);
+      re = _mm_srai_pi16(re, 1);              /* (CrE * FIX(0.40200)) */
+      ro = _mm_srai_pi16(ro, 1);              /* (CrO * FIX(0.40200)) */
+
+      be = _mm_add_pi16(be, cbe);
+      bo = _mm_add_pi16(bo, cbo);
+      be = _mm_add_pi16(be, cbe);             /* (CbE * FIX(1.77200))=(B-Y)E */
+      bo = _mm_add_pi16(bo, cbo);             /* (CbO * FIX(1.77200))=(B-Y)O */
+      re = _mm_add_pi16(re, cre);             /* (CrE * FIX(1.40200))=(R-Y)E */
+      ro = _mm_add_pi16(ro, cro);             /* (CrO * FIX(1.40200))=(R-Y)O */
+
+      gle = _mm_unpacklo_pi16(cbe, cre);
+      ghe = _mm_unpackhi_pi16(cbe, cre);
+      gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
+      ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
+      glo = _mm_unpacklo_pi16(cbo, cro);
+      gho = _mm_unpackhi_pi16(cbo, cro);
+      glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
+      gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
+
+      gle = _mm_add_pi32(gle, PD_ONEHALF);
+      ghe = _mm_add_pi32(ghe, PD_ONEHALF);
+      gle = _mm_srai_pi32(gle, SCALEBITS);
+      ghe = _mm_srai_pi32(ghe, SCALEBITS);
+      glo = _mm_add_pi32(glo, PD_ONEHALF);
+      gho = _mm_add_pi32(gho, PD_ONEHALF);
+      glo = _mm_srai_pi32(glo, SCALEBITS);
+      gho = _mm_srai_pi32(gho, SCALEBITS);
+
+      ge = _mm_packs_pi32(gle, ghe);       /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
+      go = _mm_packs_pi32(glo, gho);       /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
+      ge = _mm_sub_pi16(ge, cre);  /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
+      go = _mm_sub_pi16(go, cro);  /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
+
+      ye = _mm_and_si64(mask, y);             /* Y(0246) */
+      yo = _mm_srli_pi16(y, BYTE_BIT);        /* Y(1357) */
+
+      re = _mm_add_pi16(re, ye);              /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
+      ro = _mm_add_pi16(ro, yo);              /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
+      re = _mm_packs_pu16(re, re);            /* (R0 R2 R4 R6 ** ** ** **) */
+      ro = _mm_packs_pu16(ro, ro);            /* (R1 R3 R5 R7 ** ** ** **) */
+
+      ge = _mm_add_pi16(ge, ye);              /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
+      go = _mm_add_pi16(go, yo);              /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
+      ge = _mm_packs_pu16(ge, ge);            /* (G0 G2 G4 G6 ** ** ** **) */
+      go = _mm_packs_pu16(go, go);            /* (G1 G3 G5 G7 ** ** ** **) */
+
+      be = _mm_add_pi16(be, ye);              /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
+      bo = _mm_add_pi16(bo, yo);              /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
+      be = _mm_packs_pu16(be, be);            /* (B0 B2 B4 B6 ** ** ** **) */
+      bo = _mm_packs_pu16(bo, bo);            /* (B1 B3 B5 B7 ** ** ** **) */
+
+#if RGB_PIXELSIZE == 3
+
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmB);       /* (20 01 22 03 24 05 26 07) */
+      mmD = _mm_unpacklo_pi8(mmD, mmF);       /* (11 21 13 23 15 25 17 27) */
+
+      mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
+
+      mmG = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 05 06 16 26 07) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 01 02 12 22 03) */
+
+      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
+      mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT);  /* (13 23 15 25 17 27 -- --) */
+
+      mmC = _mm_unpackhi_pi16(mmD, mmH);      /* (15 25 06 16 17 27 -- --) */
+      mmD = _mm_unpacklo_pi16(mmD, mmH);      /* (11 21 02 12 13 23 04 14) */
+
+      mmF = _mm_unpackhi_pi16(mmE, mmB);      /* (26 07 17 27 -- -- -- --) */
+      mmE = _mm_unpacklo_pi16(mmE, mmB);      /* (22 03 13 23 24 05 15 25) */
+
+      mmA = _mm_unpacklo_pi32(mmA, mmD);      /* (00 10 20 01 11 21 02 12) */
+      mmE = _mm_unpacklo_pi32(mmE, mmG);      /* (22 03 13 23 04 14 24 05) */
+      mmC = _mm_unpacklo_pi32(mmC, mmF);      /* (15 25 06 16 26 07 17 27) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmE);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols * 3;
+        asm(".set noreorder\r\n"
+
+            "li       $8, 16\r\n"
+            "move     $9, %4\r\n"
+            "mov.s    $f4, %1\r\n"
+            "mov.s    $f6, %3\r\n"
+            "move     $10, %5\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %2\r\n"
+            "subu     $9, $9, 16\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+            "b        2f\r\n"
+            "nop      \r\n"
+
+            "1:       \r\n"
+            "li       $8, 8\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "subu     $9, $9, 8\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 4\r\n"              /* st4 */
+            "mfc1     $11, $f4\r\n"
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "swl      $11, 3($10)\r\n"
+            "swr      $11, 0($10)\r\n"
+            "li       $8, 32\r\n"
+            "mtc1     $8, $f6\r\n"
+            "dsrl     $f4, $f4, $f6\r\n"
+            "mfc1     $11, $f4\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 4\r\n"
+
+            "3:       \r\n"
+            "li       $8, 2\r\n"              /* st2 */
+            "bltu     $9, $8, 4f\r\n"
+            "nop      \r\n"
+            "ush      $11, 0($10)\r\n"
+            "srl      $11, 16\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 2\r\n"
+
+            "4:       \r\n"
+            "li       $8, 1\r\n"              /* st1 */
+            "bltu     $9, $8, 5f\r\n"
+            "nop      \r\n"
+            "sb       $11, 0($10)\r\n"
+
+            "5:       \r\n"
+            "nop      \r\n"                   /* end */
+            : "=m" (*outptr)
+            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
+           );
+      }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+      xo = _mm_cmpeq_pi8(xo, xo);
+#else
+      xe = _mm_xor_si64(xe, xe);
+      xo = _mm_xor_si64(xo, xo);
+#endif
+      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
+      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
+      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
+      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
+
+      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
+      mmE = _mm_unpacklo_pi8(mmE, mmG);       /* (20 30 22 32 24 34 26 36) */
+      mmB = _mm_unpacklo_pi8(mmB, mmD);       /* (01 11 03 13 05 15 07 17) */
+      mmF = _mm_unpacklo_pi8(mmF, mmH);       /* (21 31 23 33 25 35 27 37) */
+
+      mmC = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 34 06 16 26 36) */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 30 02 12 22 32) */
+      mmG = _mm_unpackhi_pi16(mmB, mmF);      /* (05 15 25 35 07 17 27 37) */
+      mmB = _mm_unpacklo_pi16(mmB, mmF);      /* (01 11 21 31 03 13 23 33) */
+
+      mmD = _mm_unpackhi_pi32(mmA, mmB);      /* (02 12 22 32 03 13 23 33) */
+      mmA = _mm_unpacklo_pi32(mmA, mmB);      /* (00 10 20 30 01 11 21 31) */
+      mmH = _mm_unpackhi_pi32(mmC, mmG);      /* (06 16 26 36 07 17 27 37) */
+      mmC = _mm_unpacklo_pi32(mmC, mmG);      /* (04 14 24 34 05 15 25 35) */
+
+      if (num_cols >= 8) {
+        if (!(((long)outptr) & 7)) {
+          _mm_store_si64((__m64 *)outptr, mmA);
+          _mm_store_si64((__m64 *)(outptr + 8), mmD);
+          _mm_store_si64((__m64 *)(outptr + 16), mmC);
+          _mm_store_si64((__m64 *)(outptr + 24), mmH);
+        } else {
+          _mm_storeu_si64((__m64 *)outptr, mmA);
+          _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
+          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+          _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
+        }
+        outptr += RGB_PIXELSIZE * 8;
+      } else {
+        col = num_cols;
+        asm(".set noreorder\r\n"              /* st16 */
+
+            "li       $8, 4\r\n"
+            "move     $9, %6\r\n"
+            "move     $10, %7\r\n"
+            "mov.s    $f4, %2\r\n"
+            "mov.s    $f6, %4\r\n"
+            "bltu     $9, $8, 1f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "gssdlc1  $f6, 7+8($10)\r\n"
+            "gssdrc1  $f6, 8($10)\r\n"
+            "mov.s    $f4, %3\r\n"
+            "mov.s    $f6, %5\r\n"
+            "subu     $9, $9, 4\r\n"
+            PTR_ADDU  "$10, $10, 16\r\n"
+
+            "1:       \r\n"
+            "li       $8, 2\r\n"              /* st8 */
+            "bltu     $9, $8, 2f\r\n"
+            "nop      \r\n"
+            "gssdlc1  $f4, 7($10)\r\n"
+            "gssdrc1  $f4, 0($10)\r\n"
+            "mov.s    $f4, $f6\r\n"
+            "subu     $9, $9, 2\r\n"
+            PTR_ADDU  "$10, $10, 8\r\n"
+
+            "2:       \r\n"
+            "li       $8, 1\r\n"              /* st4 */
+            "bltu     $9, $8, 3f\r\n"
+            "nop      \r\n"
+            "gsswlc1  $f4, 3($10)\r\n"
+            "gsswrc1  $f4, 0($10)\r\n"
+
+            "3:       \r\n"
+            "li       %1, 0\r\n"              /* end */
+            : "=m" (*outptr), "=r" (col)
+            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
+              "r" (outptr)
+            : "$f4", "$f6", "$8", "$9", "$10", "memory"
+           );
+      }
+
+#endif
+
+    }
+  }
+}
+
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/external/jpeg/simd/loongson/jdcolor-mmi.c b/external/jpeg/simd/mips64/jdcolor-mmi.c
similarity index 100%
rename from external/jpeg/simd/loongson/jdcolor-mmi.c
rename to external/jpeg/simd/mips64/jdcolor-mmi.c
diff --git a/external/jpeg/simd/mips64/jdmerge-mmi.c b/external/jpeg/simd/mips64/jdmerge-mmi.c
new file mode 100644
index 000000000000..0a39bd56805f
--- /dev/null
+++ b/external/jpeg/simd/mips64/jdmerge-mmi.c
@@ -0,0 +1,149 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2011, 2015, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* YCC --> RGB CONVERSION */
+
+#include "jsimd_mmi.h"
+
+
+#define F_0_344  ((short)22554)  /* FIX(0.34414) */
+#define F_0_402  ((short)26345)  /* FIX(1.40200) - FIX(1) */
+#define F_0_285  ((short)18734)  /* FIX(1) - FIX(0.71414) */
+#define F_0_228  ((short)14942)  /* FIX(2) - FIX(1.77200) */
+
+enum const_index {
+  index_PW_ONE,
+  index_PW_F0402,
+  index_PW_MF0228,
+  index_PW_MF0344_F0285,
+  index_PD_ONEHALF
+};
+
+static uint64_t const_value[] = {
+  _uint64_set_pi16(1, 1, 1, 1),
+  _uint64_set_pi16(F_0_402, F_0_402, F_0_402, F_0_402),
+  _uint64_set_pi16(-F_0_228, -F_0_228, -F_0_228, -F_0_228),
+  _uint64_set_pi16(F_0_285, -F_0_344, F_0_285, -F_0_344),
+  _uint64_set_pi32((int)(1 << (SCALEBITS - 1)), (int)(1 << (SCALEBITS - 1)))
+};
+
+#define PW_ONE           get_const_value(index_PW_ONE)
+#define PW_F0402         get_const_value(index_PW_F0402)
+#define PW_MF0228        get_const_value(index_PW_MF0228)
+#define PW_MF0344_F0285  get_const_value(index_PW_MF0344_F0285)
+#define PD_ONEHALF       get_const_value(index_PD_ONEHALF)
+
+#define RGBX_FILLER_0XFF  1
+
+
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+
+#define RGB_RED  EXT_RGB_RED
+#define RGB_GREEN  EXT_RGB_GREEN
+#define RGB_BLUE  EXT_RGB_BLUE
+#define RGB_PIXELSIZE  EXT_RGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_RGBX_RED
+#define RGB_GREEN  EXT_RGBX_GREEN
+#define RGB_BLUE  EXT_RGBX_BLUE
+#define RGB_PIXELSIZE  EXT_RGBX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extrgbx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extrgbx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGR_RED
+#define RGB_GREEN  EXT_BGR_GREEN
+#define RGB_BLUE  EXT_BGR_BLUE
+#define RGB_PIXELSIZE  EXT_BGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_BGRX_RED
+#define RGB_GREEN  EXT_BGRX_GREEN
+#define RGB_BLUE  EXT_BGRX_BLUE
+#define RGB_PIXELSIZE  EXT_BGRX_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extbgrx_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extbgrx_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XBGR_RED
+#define RGB_GREEN  EXT_XBGR_GREEN
+#define RGB_BLUE  EXT_XBGR_BLUE
+#define RGB_PIXELSIZE  EXT_XBGR_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxbgr_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxbgr_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
+
+#define RGB_RED  EXT_XRGB_RED
+#define RGB_GREEN  EXT_XRGB_GREEN
+#define RGB_BLUE  EXT_XRGB_BLUE
+#define RGB_PIXELSIZE  EXT_XRGB_PIXELSIZE
+#define jsimd_h2v1_merged_upsample_mmi  jsimd_h2v1_extxrgb_merged_upsample_mmi
+#define jsimd_h2v2_merged_upsample_mmi  jsimd_h2v2_extxrgb_merged_upsample_mmi
+#include "jdmrgext-mmi.c"
+#undef RGB_RED
+#undef RGB_GREEN
+#undef RGB_BLUE
+#undef RGB_PIXELSIZE
+#undef jsimd_h2v1_merged_upsample_mmi
+#undef jsimd_h2v2_merged_upsample_mmi
diff --git a/external/jpeg/simd/mips64/jdmrgext-mmi.c b/external/jpeg/simd/mips64/jdmrgext-mmi.c
new file mode 100644
index 000000000000..be09ff2a659e
--- /dev/null
+++ b/external/jpeg/simd/mips64/jdmrgext-mmi.c
@@ -0,0 +1,615 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
+ * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* This file is included by jdmerge-mmi.c */
+
+/* mmA/B, mmC/D, mmE/F, mmG/H: even/odd pixel vectors for output bytes 0..3. */
+#if RGB_RED == 0
+#define mmA  re
+#define mmB  ro
+#elif RGB_GREEN == 0
+#define mmA  ge
+#define mmB  go
+#elif RGB_BLUE == 0
+#define mmA  be
+#define mmB  bo
+#else  /* no color channel at byte 0: use the filler vectors */
+#define mmA  xe
+#define mmB  xo
+#endif
+
+#if RGB_RED == 1
+#define mmC  re
+#define mmD  ro
+#elif RGB_GREEN == 1
+#define mmC  ge
+#define mmD  go
+#elif RGB_BLUE == 1
+#define mmC  be
+#define mmD  bo
+#else  /* no color channel at byte 1: use the filler vectors */
+#define mmC  xe
+#define mmD  xo
+#endif
+
+#if RGB_RED == 2
+#define mmE  re
+#define mmF  ro
+#elif RGB_GREEN == 2
+#define mmE  ge
+#define mmF  go
+#elif RGB_BLUE == 2
+#define mmE  be
+#define mmF  bo
+#else  /* no color channel at byte 2: use the filler vectors */
+#define mmE  xe
+#define mmF  xo
+#endif
+
+#if RGB_RED == 3
+#define mmG  re
+#define mmH  ro
+#elif RGB_GREEN == 3
+#define mmG  ge
+#define mmH  go
+#elif RGB_BLUE == 3
+#define mmG  be
+#define mmH  bo
+#else  /* filler; the RGB_PIXELSIZE == 3 path reuses mmG/mmH as scratch */
+#define mmG  xe
+#define mmH  xo
+#endif
+
+/* Merged h2v1 upsample + YCbCr->RGB: one Cb/Cr pair colors two Y samples. */
+void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPROW outptr, inptr0, inptr1, inptr2;
+  int num_cols, col;
+  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
+  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
+  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
+  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
+  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
+  __m64 decenter, mask, zero = 0.0;
+#if RGB_PIXELSIZE == 4
+  __m64 mm8, mm9;
+#endif
+
+  inptr0 = input_buf[0][in_row_group_ctr];  /* Y row */
+  inptr1 = input_buf[1][in_row_group_ctr];  /* Cb row */
+  inptr2 = input_buf[2][in_row_group_ctr];  /* Cr row */
+  outptr = output_buf[0];                   /* the single output row */
+
+  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
+       inptr0 += 16, inptr1 += 8, inptr2 += 8) {
+    /* One pass: 16 Y samples plus the 8 Cb and 8 Cr samples they share. */
+    cb = _mm_load_si64((__m64 *)inptr1);
+    cr = _mm_load_si64((__m64 *)inptr2);
+    ythis = _mm_load_si64((__m64 *)inptr0);
+    ynext = _mm_load_si64((__m64 *)inptr0 + 1);
+    /* Build the word constants 0x00FF (even-byte mask) and 0xFF80 (-128). */
+    mask = decenter = 0.0;
+    mask = _mm_cmpeq_pi16(mask, mask);
+    decenter = _mm_cmpeq_pi16(decenter, decenter);
+    mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
+    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+    /* Widen chroma to 16 bits and recenter it around zero (subtract 128). */
+    cbl = _mm_unpacklo_pi8(cb, zero);         /* Cb(0123) */
+    cbh = _mm_unpackhi_pi8(cb, zero);         /* Cb(4567) */
+    crl = _mm_unpacklo_pi8(cr, zero);         /* Cr(0123) */
+    crh = _mm_unpackhi_pi8(cr, zero);         /* Cr(4567) */
+    cbl = _mm_add_pi16(cbl, decenter);
+    cbh = _mm_add_pi16(cbh, decenter);
+    crl = _mm_add_pi16(crl, decenter);
+    crh = _mm_add_pi16(crh, decenter);
+
+    /* (Original)
+     * R = Y                + 1.40200 * Cr
+     * G = Y - 0.34414 * Cb - 0.71414 * Cr
+     * B = Y + 1.77200 * Cb
+     *
+     * (This implementation)
+     * R = Y                + 0.40200 * Cr + Cr
+     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
+     * B = Y - 0.22800 * Cb + Cb + Cb
+     */
+
+    cbl2 = _mm_add_pi16(cbl, cbl);            /* 2*CbL */
+    cbh2 = _mm_add_pi16(cbh, cbh);            /* 2*CbH */
+    crl2 = _mm_add_pi16(crl, crl);            /* 2*CrL */
+    crh2 = _mm_add_pi16(crh, crh);            /* 2*CrH */
+
+    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);     /* (2*CbL * -FIX(0.22800)) */
+    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);     /* (2*CbH * -FIX(0.22800)) */
+    rl = _mm_mulhi_pi16(crl2, PW_F0402);      /* (2*CrL * FIX(0.40200)) */
+    rh = _mm_mulhi_pi16(crh2, PW_F0402);      /* (2*CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, PW_ONE);
+    bh = _mm_add_pi16(bh, PW_ONE);
+    bl = _mm_srai_pi16(bl, 1);                /* (CbL * -FIX(0.22800)) */
+    bh = _mm_srai_pi16(bh, 1);                /* (CbH * -FIX(0.22800)) */
+    rl = _mm_add_pi16(rl, PW_ONE);
+    rh = _mm_add_pi16(rh, PW_ONE);
+    rl = _mm_srai_pi16(rl, 1);                /* (CrL * FIX(0.40200)) */
+    rh = _mm_srai_pi16(rh, 1);                /* (CrH * FIX(0.40200)) */
+
+    bl = _mm_add_pi16(bl, cbl);
+    bh = _mm_add_pi16(bh, cbh);
+    bl = _mm_add_pi16(bl, cbl);               /* (CbL * FIX(1.77200))=(B-Y)L */
+    bh = _mm_add_pi16(bh, cbh);               /* (CbH * FIX(1.77200))=(B-Y)H */
+    rl = _mm_add_pi16(rl, crl);               /* (CrL * FIX(1.40200))=(R-Y)L */
+    rh = _mm_add_pi16(rh, crh);               /* (CrH * FIX(1.40200))=(R-Y)H */
+    /* Green term in 32 bits: madd, add PD_ONEHALF, shift, repack to 16 bits. */
+    ga = _mm_unpacklo_pi16(cbl, crl);
+    gb = _mm_unpackhi_pi16(cbl, crl);
+    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
+    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
+    gc = _mm_unpacklo_pi16(cbh, crh);
+    gd = _mm_unpackhi_pi16(cbh, crh);
+    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
+    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
+
+    ga = _mm_add_pi32(ga, PD_ONEHALF);
+    gb = _mm_add_pi32(gb, PD_ONEHALF);
+    ga = _mm_srai_pi32(ga, SCALEBITS);
+    gb = _mm_srai_pi32(gb, SCALEBITS);
+    gc = _mm_add_pi32(gc, PD_ONEHALF);
+    gd = _mm_add_pi32(gd, PD_ONEHALF);
+    gc = _mm_srai_pi32(gc, SCALEBITS);
+    gd = _mm_srai_pi32(gd, SCALEBITS);
+
+    gl = _mm_packs_pi32(ga, gb);           /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+    gh = _mm_packs_pi32(gc, gd);           /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
+    gl = _mm_sub_pi16(gl, crl);    /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+    gh = _mm_sub_pi16(gh, crh);    /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
+    /* Split the luma bytes into even and odd pixels as 16-bit words. */
+    ythise = _mm_and_si64(mask, ythis);       /* Y(0246) */
+    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
+    ynexte = _mm_and_si64(mask, ynext);       /* Y(8ACE) */
+    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */
+
+    rle = _mm_add_pi16(rl, ythise);           /* (R0 R2 R4 R6) */
+    rlo = _mm_add_pi16(rl, ythiso);           /* (R1 R3 R5 R7) */
+    rhe = _mm_add_pi16(rh, ynexte);           /* (R8 RA RC RE) */
+    rho = _mm_add_pi16(rh, ynexto);           /* (R9 RB RD RF) */
+    re = _mm_packs_pu16(rle, rhe);            /* (R0 R2 R4 R6 R8 RA RC RE) */
+    ro = _mm_packs_pu16(rlo, rho);            /* (R1 R3 R5 R7 R9 RB RD RF) */
+
+    gle = _mm_add_pi16(gl, ythise);           /* (G0 G2 G4 G6) */
+    glo = _mm_add_pi16(gl, ythiso);           /* (G1 G3 G5 G7) */
+    ghe = _mm_add_pi16(gh, ynexte);           /* (G8 GA GC GE) */
+    gho = _mm_add_pi16(gh, ynexto);           /* (G9 GB GD GF) */
+    ge = _mm_packs_pu16(gle, ghe);            /* (G0 G2 G4 G6 G8 GA GC GE) */
+    go = _mm_packs_pu16(glo, gho);            /* (G1 G3 G5 G7 G9 GB GD GF) */
+
+    ble = _mm_add_pi16(bl, ythise);           /* (B0 B2 B4 B6) */
+    blo = _mm_add_pi16(bl, ythiso);           /* (B1 B3 B5 B7) */
+    bhe = _mm_add_pi16(bh, ynexte);           /* (B8 BA BC BE) */
+    bho = _mm_add_pi16(bh, ynexto);           /* (B9 BB BD BF) */
+    be = _mm_packs_pu16(ble, bhe);            /* (B0 B2 B4 B6 B8 BA BC BE) */
+    bo = _mm_packs_pu16(blo, bho);            /* (B1 B3 B5 B7 B9 BB BD BF) */
+
+#if RGB_PIXELSIZE == 3
+
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    mmG = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mmA = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmH = _mm_unpacklo_pi8(mmE, mmB);         /* (20 01 22 03 24 05 26 07) */
+    mmE = _mm_unpackhi_pi8(mmE, mmB);         /* (28 09 2A 0B 2C 0D 2E 0F) */
+    mmC = _mm_unpacklo_pi8(mmD, mmF);         /* (11 21 13 23 15 25 17 27) */
+    mmD = _mm_unpackhi_pi8(mmD, mmF);         /* (19 29 1B 2B 1D 2D 1F 2F) */
+
+    mmB = _mm_unpacklo_pi16(mmG, mmA);        /* (00 10 08 18 02 12 0A 1A) */
+    mmA = _mm_unpackhi_pi16(mmG, mmA);        /* (04 14 0C 1C 06 16 0E 1E) */
+    mmF = _mm_unpacklo_pi16(mmH, mmE);        /* (20 01 28 09 22 03 2A 0B) */
+    mmE = _mm_unpackhi_pi16(mmH, mmE);        /* (24 05 2C 0D 26 07 2E 0F) */
+    mmH = _mm_unpacklo_pi16(mmC, mmD);        /* (11 21 19 29 13 23 1B 2B) */
+    mmG = _mm_unpackhi_pi16(mmC, mmD);        /* (15 25 1D 2D 17 27 1F 2F) */
+
+    mmC = _mm_unpacklo_pi16(mmB, mmF);        /* (00 10 20 01 08 18 28 09) */
+    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
+    mmB = _mm_unpacklo_pi16(mmH, mmB);        /* (11 21 02 12 19 29 0A 1A) */
+    mmD = _mm_unpackhi_pi16(mmF, mmH);        /* (22 03 13 23 2A 0B 1B 2B) */
+    mmF = _mm_unpacklo_pi16(mmA, mmE);        /* (04 14 24 05 0C 1C 2C 0D) */
+    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
+    mmH = _mm_unpacklo_pi16(mmG, mmA);        /* (15 25 06 16 1D 2D 0E 1E) */
+    mmG = _mm_unpackhi_pi16(mmE, mmG);        /* (26 07 17 27 2E 0F 1F 2F) */
+
+    mmA = _mm_unpacklo_pi32(mmC, mmB);        /* (00 10 20 01 11 21 02 12) */
+    mmE = _mm_unpackhi_pi32(mmC, mmB);        /* (08 18 28 09 19 29 0A 1A) */
+    mmB = _mm_unpacklo_pi32(mmD, mmF);        /* (22 03 13 23 04 14 24 05) */
+    mmF = _mm_unpackhi_pi32(mmD, mmF);        /* (2A 0B 1B 2B 0C 1C 2C 0D) */
+    mmC = _mm_unpacklo_pi32(mmH, mmG);        /* (15 25 06 16 26 07 17 27) */
+    mmG = _mm_unpackhi_pi32(mmH, mmG);        /* (1D 2D 0E 1E 2E 0F 1F 2F) */
+
+    if (num_cols >= 8) {  /* full block: 16 pixels, 48 bytes */
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmE);
+        _mm_store_si64((__m64 *)(outptr + 32), mmF);
+        _mm_store_si64((__m64 *)(outptr + 40), mmG);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {  /* tail: fewer than 8 chroma pairs left */
+      if (output_width & 1)
+        col = num_cols * 6 + 3;  /* 6 bytes per pair + 3 for the odd pixel */
+      else
+        col = num_cols * 6;
+      /* Write exactly col bytes in chunks of 24, 16, 8, 4, 2, then 1. */
+      asm(".set noreorder\r\n"                /* st24 */
+
+          "li       $8, 24\r\n"
+          "move     $9, %7\r\n"
+          "mov.s    $f4, %1\r\n"
+          "mov.s    $f6, %2\r\n"
+          "mov.s    $f8, %3\r\n"
+          "move     $10, %8\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "mov.s    $f4, %4\r\n"
+          "mov.s    $f6, %5\r\n"
+          "mov.s    $f8, %6\r\n"
+          "subu     $9, $9, 24\r\n"
+          PTR_ADDU  "$10, $10, 24\r\n"
+
+          "1:       \r\n"
+          "li       $8, 16\r\n"               /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "subu     $9, $9, 16\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8,  8\r\n"               /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8,  4\r\n"               /* st4 */
+          "mfc1     $11, $f4\r\n"
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "swl      $11, 3($10)\r\n"
+          "swr      $11, 0($10)\r\n"
+          "li       $8, 32\r\n"
+          "mtc1     $8, $f6\r\n"
+          "dsrl     $f4, $f4, $f6\r\n"
+          "mfc1     $11, $f4\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 4\r\n"
+
+          "4:       \r\n"
+          "li       $8, 2\r\n"                /* st2 */
+          "bltu     $9, $8, 5f\r\n"
+          "nop      \r\n"
+          "ush      $11, 0($10)\r\n"
+          "srl      $11, 16\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 2\r\n"
+
+          "5:       \r\n"
+          "li       $8, 1\r\n"                /* st1 */
+          "bltu     $9, $8, 6f\r\n"
+          "nop      \r\n"
+          "sb       $11, 0($10)\r\n"
+
+          "6:       \r\n"
+          "nop      \r\n"                     /* end */
+          : "=m" (*outptr)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
+            "f" (mmG), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
+         );
+    }
+
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+    xe = _mm_cmpeq_pi8(xe, xe);
+    xo = _mm_cmpeq_pi8(xo, xo);
+#else
+    xe = _mm_xor_si64(xe, xe);
+    xo = _mm_xor_si64(xo, xo);
+#endif
+    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
+    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
+    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
+    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
+
+    mm8 = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
+    mm9 = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
+    mmA = _mm_unpacklo_pi8(mmE, mmG);         /* (20 30 22 32 24 34 26 36) */
+    mmE = _mm_unpackhi_pi8(mmE, mmG);         /* (28 38 2A 3A 2C 3C 2E 3E) */
+
+    mmG = _mm_unpacklo_pi8(mmB, mmD);         /* (01 11 03 13 05 15 07 17) */
+    mmB = _mm_unpackhi_pi8(mmB, mmD);         /* (09 19 0B 1B 0D 1D 0F 1F) */
+    mmD = _mm_unpacklo_pi8(mmF, mmH);         /* (21 31 23 33 25 35 27 37) */
+    mmF = _mm_unpackhi_pi8(mmF, mmH);         /* (29 39 2B 3B 2D 3D 2F 3F) */
+
+    mmH = _mm_unpacklo_pi16(mm8, mmA);        /* (00 10 20 30 02 12 22 32) */
+    mm8 = _mm_unpackhi_pi16(mm8, mmA);        /* (04 14 24 34 06 16 26 36) */
+    mmA = _mm_unpacklo_pi16(mmG, mmD);        /* (01 11 21 31 03 13 23 33) */
+    mmD = _mm_unpackhi_pi16(mmG, mmD);        /* (05 15 25 35 07 17 27 37) */
+
+    mmG = _mm_unpackhi_pi16(mm9, mmE);        /* (0C 1C 2C 3C 0E 1E 2E 3E) */
+    mm9 = _mm_unpacklo_pi16(mm9, mmE);        /* (08 18 28 38 0A 1A 2A 3A) */
+    mmE = _mm_unpacklo_pi16(mmB, mmF);        /* (09 19 29 39 0B 1B 2B 3B) */
+    mmF = _mm_unpackhi_pi16(mmB, mmF);        /* (0D 1D 2D 3D 0F 1F 2F 3F) */
+
+    mmB = _mm_unpackhi_pi32(mmH, mmA);        /* (02 12 22 32 03 13 23 33) */
+    mmA = _mm_unpacklo_pi32(mmH, mmA);        /* (00 10 20 30 01 11 21 31) */
+    mmC = _mm_unpacklo_pi32(mm8, mmD);        /* (04 14 24 34 05 15 25 35) */
+    mmD = _mm_unpackhi_pi32(mm8, mmD);        /* (06 16 26 36 07 17 27 37) */
+
+    mmH = _mm_unpackhi_pi32(mmG, mmF);        /* (0E 1E 2E 3E 0F 1F 2F 3F) */
+    mmG = _mm_unpacklo_pi32(mmG, mmF);        /* (0C 1C 2C 3C 0D 1D 2D 3D) */
+    mmF = _mm_unpackhi_pi32(mm9, mmE);        /* (0A 1A 2A 3A 0B 1B 2B 3B) */
+    mmE = _mm_unpacklo_pi32(mm9, mmE);        /* (08 18 28 38 09 19 29 39) */
+
+    if (num_cols >= 8) {  /* full block: 16 pixels, 64 bytes */
+      if (!(((long)outptr) & 7)) {
+        _mm_store_si64((__m64 *)outptr, mmA);
+        _mm_store_si64((__m64 *)(outptr + 8), mmB);
+        _mm_store_si64((__m64 *)(outptr + 16), mmC);
+        _mm_store_si64((__m64 *)(outptr + 24), mmD);
+        _mm_store_si64((__m64 *)(outptr + 32), mmE);
+        _mm_store_si64((__m64 *)(outptr + 40), mmF);
+        _mm_store_si64((__m64 *)(outptr + 48), mmG);
+        _mm_store_si64((__m64 *)(outptr + 56), mmH);
+      } else {
+        _mm_storeu_si64((__m64 *)outptr, mmA);
+        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
+        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
+        _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
+        _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
+        _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
+        _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
+        _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
+      }
+      outptr += RGB_PIXELSIZE * 16;
+    } else {  /* tail: fewer than 8 chroma pairs left */
+      if (output_width & 1)
+        col = num_cols * 2 + 1;  /* pixels left, counting the odd one */
+      else
+        col = num_cols * 2;
+      asm(".set noreorder\r\n"                /* st32 */
+
+          "li       $8, 8\r\n"
+          "move     $9, %10\r\n"
+          "move     $10, %11\r\n"
+          "mov.s    $f4, %2\r\n"
+          "mov.s    $f6, %3\r\n"
+          "mov.s    $f8, %4\r\n"
+          "mov.s    $f10, %5\r\n"
+          "bltu     $9, $8, 1f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "gssdlc1  $f8, 7+16($10)\r\n"
+          "gssdrc1  $f8, 16($10)\r\n"
+          "gssdlc1  $f10, 7+24($10)\r\n"
+          "gssdrc1  $f10, 24($10)\r\n"
+          "mov.s    $f4, %6\r\n"
+          "mov.s    $f6, %7\r\n"
+          "mov.s    $f8, %8\r\n"
+          "mov.s    $f10, %9\r\n"
+          "subu     $9, $9, 8\r\n"
+          PTR_ADDU  "$10, $10, 32\r\n"
+
+          "1:       \r\n"
+          "li       $8, 4\r\n"                /* st16 */
+          "bltu     $9, $8, 2f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "gssdlc1  $f6, 7+8($10)\r\n"
+          "gssdrc1  $f6, 8($10)\r\n"
+          "mov.s    $f4, $f8\r\n"
+          "mov.s    $f6, $f10\r\n"
+          "subu     $9, $9, 4\r\n"
+          PTR_ADDU  "$10, $10, 16\r\n"
+
+          "2:       \r\n"
+          "li       $8, 2\r\n"                /* st8 */
+          "bltu     $9, $8, 3f\r\n"
+          "nop      \r\n"
+          "gssdlc1  $f4, 7($10)\r\n"
+          "gssdrc1  $f4, 0($10)\r\n"
+          "mov.s    $f4, $f6\r\n"
+          "subu     $9, $9, 2\r\n"
+          PTR_ADDU  "$10, $10, 8\r\n"
+
+          "3:       \r\n"
+          "li       $8, 1\r\n"                /* st4 */
+          "bltu     $9, $8, 4f\r\n"
+          "nop      \r\n"
+          "gsswlc1  $f4, 3($10)\r\n"
+          "gsswrc1  $f4, 0($10)\r\n"
+
+          "4:       \r\n"
+          "li       %1, 0\r\n"                /* end */
+          : "=m" (*outptr), "=r" (col)
+          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
+            "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
+          : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
+         );
+    }
+
+#endif
+
+  }
+  /* Odd width with a pair count divisible by 8: last pixel is still pending. */
+  if (!((output_width >> 1) & 7)) {
+    if (output_width & 1) {
+      cb = _mm_load_si64((__m64 *)inptr1);
+      cr = _mm_load_si64((__m64 *)inptr2);
+      y = _mm_load_si64((__m64 *)inptr0);
+
+      decenter = 0.0;
+      decenter = _mm_cmpeq_pi16(decenter, decenter);
+      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
+
+      cbl = _mm_unpacklo_pi8(cb, zero);       /* Cb(0123) */
+      crl = _mm_unpacklo_pi8(cr, zero);       /* Cr(0123) */
+      cbl = _mm_add_pi16(cbl, decenter);
+      crl = _mm_add_pi16(crl, decenter);
+
+      cbl2 = _mm_add_pi16(cbl, cbl);          /* 2*CbL */
+      crl2 = _mm_add_pi16(crl, crl);          /* 2*CrL */
+      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);   /* (2*CbL * -FIX(0.22800)) */
+      rl = _mm_mulhi_pi16(crl2, PW_F0402);    /* (2*CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, PW_ONE);
+      bl = _mm_srai_pi16(bl, 1);              /* (CbL * -FIX(0.22800)) */
+      rl = _mm_add_pi16(rl, PW_ONE);
+      rl = _mm_srai_pi16(rl, 1);              /* (CrL * FIX(0.40200)) */
+
+      bl = _mm_add_pi16(bl, cbl);
+      bl = _mm_add_pi16(bl, cbl);             /* (CbL * FIX(1.77200))=(B-Y)L */
+      rl = _mm_add_pi16(rl, crl);             /* (CrL * FIX(1.40200))=(R-Y)L */
+
+      gl = _mm_unpacklo_pi16(cbl, crl);
+      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
+      gl = _mm_add_pi32(gl, PD_ONEHALF);
+      gl = _mm_srai_pi32(gl, SCALEBITS);
+      gl = _mm_packs_pi32(gl, zero);       /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
+      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
+
+      yl = _mm_unpacklo_pi8(y, zero);         /* Y(0123) */
+      rl = _mm_add_pi16(rl, yl);              /* (R0 R1 R2 R3) */
+      gl = _mm_add_pi16(gl, yl);              /* (G0 G1 G2 G3) */
+      bl = _mm_add_pi16(bl, yl);              /* (B0 B1 B2 B3) */
+      re = _mm_packs_pu16(rl, rl);
+      ge = _mm_packs_pu16(gl, gl);
+      be = _mm_packs_pu16(bl, bl);
+#if RGB_PIXELSIZE == 3
+      mmA = _mm_unpacklo_pi8(mmA, mmC);  /* low 2 bytes = bytes 0,1 of pixel 0 */
+      mmA = _mm_unpacklo_pi16(mmA, mmE);  /* low 3 bytes = pixel 0 */
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "mfc1    $9, $f4\r\n"
+          "ush     $9, 0($8)\r\n"
+          "srl     $9, 16\r\n"
+          "sb      $9, 2($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "$9", "memory"
+         );
+#else  /* RGB_PIXELSIZE == 4 */
+
+#ifdef RGBX_FILLER_0XFF
+      xe = _mm_cmpeq_pi8(xe, xe);
+#else
+      xe = _mm_xor_si64(xe, xe);
+#endif
+      mmA = _mm_unpacklo_pi8(mmA, mmC);
+      mmE = _mm_unpacklo_pi8(mmE, mmG);
+      mmA = _mm_unpacklo_pi16(mmA, mmE);  /* low 4 bytes = pixel 0 */
+      asm(".set noreorder\r\n"
+
+          "move    $8, %2\r\n"
+          "mov.s   $f4, %1\r\n"
+          "gsswlc1 $f4, 3($8)\r\n"
+          "gsswrc1 $f4, 0($8)\r\n"
+          : "=m" (*outptr)
+          : "f" (mmA), "r" (outptr)
+          : "$f4", "$8", "memory"
+         );
+#endif
+    }
+  }
+}
+
+/* h2v2 merged upsample: run the h2v1 kernel once per luma row of the pair. */
+void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
+                                    JSAMPIMAGE input_buf,
+                                    JDIMENSION in_row_group_ctr,
+                                    JSAMPARRAY output_buf)
+{
+  JSAMPARRAY y_rows = input_buf[0];
+  JSAMPROW saved_y_row = y_rows[in_row_group_ctr];
+  JSAMPROW saved_out_row = output_buf[0];
+
+  /* Pass 1: luma row 2 * ctr, written through output_buf[0]. */
+  y_rows[in_row_group_ctr] = y_rows[in_row_group_ctr * 2];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+  /* Pass 2: luma row 2 * ctr + 1, with output_buf[1] aliased into slot 0. */
+  y_rows[in_row_group_ctr] = y_rows[in_row_group_ctr * 2 + 1];
+  output_buf[0] = output_buf[1];
+  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
+                                 output_buf);
+  /* Restore both borrowed slots so the caller's arrays are unchanged. */
+  y_rows[in_row_group_ctr] = saved_y_row;
+  output_buf[0] = saved_out_row;
+}
+
+/* Drop the per-layout register aliases so a later inclusion can redefine them. */
+#undef mmA
+#undef mmB
+#undef mmC
+#undef mmD
+#undef mmE
+#undef mmF
+#undef mmG
+#undef mmH
diff --git a/external/jpeg/simd/mips64/jdsample-mmi.c b/external/jpeg/simd/mips64/jdsample-mmi.c
new file mode 100644
index 000000000000..8ae94e7dcf9e
--- /dev/null
+++ b/external/jpeg/simd/mips64/jdsample-mmi.c
@@ -0,0 +1,304 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           ZhangLixia  <zhanglixia-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* CHROMA UPSAMPLING */
+
+#include "jsimd_mmi.h"
+
+/* Word constants; the PW_* macros fetch them via get_const_value(index_*). */
+enum const_index {
+  index_PW_ONE,
+  index_PW_TWO,
+  index_PW_THREE,
+  index_PW_SEVEN,
+  index_PW_EIGHT,
+};
+
+static uint64_t const_value[] = {  /* entry order must match enum const_index */
+  _uint64_set_pi16(1, 1, 1, 1),  /* index_PW_ONE */
+  _uint64_set_pi16(2, 2, 2, 2),  /* index_PW_TWO */
+  _uint64_set_pi16(3, 3, 3, 3),  /* index_PW_THREE */
+  _uint64_set_pi16(7, 7, 7, 7),  /* index_PW_SEVEN */
+  _uint64_set_pi16(8, 8, 8, 8),  /* index_PW_EIGHT */
+};
+
+#define PW_ONE    get_const_value(index_PW_ONE)
+#define PW_TWO    get_const_value(index_PW_TWO)
+#define PW_THREE  get_const_value(index_PW_THREE)
+#define PW_SEVEN  get_const_value(index_PW_SEVEN)
+#define PW_EIGHT  get_const_value(index_PW_EIGHT)
+
+/* Horizontal pass: 8 column sums (samp0123, samp4567) -> 16 output bytes. */
+#define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
+  __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
+  __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
+  __m64 outle, outhe, outlo, outho, outl, outh; \
+  /* Shifted copies that line each sample up with its left/right neighbor. */ \
+  samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT);  /* ( 1 2 3 -) */ \
+  sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 4) */ \
+  samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 3 - - -) */ \
+  sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT);  /* ( - 4 5 6) */ \
+  \
+  samp1234 = _mm_or_si64(samp123X, sampXXX4);  /* ( 1 2 3 4) */ \
+  samp3456 = _mm_or_si64(samp3XXX, sampX456);  /* ( 3 4 5 6) */ \
+  \
+  sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT);  /* ( - 0 1 2) */ \
+  samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT);  /* ( 5 6 7 -) */ \
+  samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 7 - - -) */ \
+  /* wk[] supplies the two neighbors (-1 and 8) that lie outside this block. */ \
+  samp_1012 = _mm_or_si64(sampX012, wk[row]);            /* (-1 0 1 2) */ \
+  samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]);  /* ( 5 6 7 8) */ \
+  /* Save sample 7; wk[row] is what the line above reads as the -1 neighbor. */ \
+  wk[row] = samp7XXX; \
+  /* out = (3 * center + neighbor + bias) >> shift */ \
+  samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
+  samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
+  samp_1012 = _mm_add_pi16(samp_1012, bias1); \
+  samp3456 = _mm_add_pi16(samp3456, bias1); \
+  samp1234 = _mm_add_pi16(samp1234, bias2); \
+  samp5678 = _mm_add_pi16(samp5678, bias2); \
+  \
+  outle = _mm_add_pi16(samp_1012, samp0123); \
+  outhe = _mm_add_pi16(samp3456, samp4567); \
+  outle = _mm_srli_pi16(outle, shift);        /* ( 0  2  4  6) */ \
+  outhe = _mm_srli_pi16(outhe, shift);        /* ( 8 10 12 14) */ \
+  outlo = _mm_add_pi16(samp1234, samp0123); \
+  outho = _mm_add_pi16(samp5678, samp4567); \
+  outlo = _mm_srli_pi16(outlo, shift);        /* ( 1  3  5  7) */ \
+  outho = _mm_srli_pi16(outho, shift);        /* ( 9 11 13 15) */ \
+  /* Interleave: even results in the low bytes, odd results in the high bytes. */ \
+  outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
+  outho = _mm_slli_pi16(outho, BYTE_BIT); \
+  outl = _mm_or_si64(outle, outlo);           /* ( 0  1  2  3  4  5  6  7) */ \
+  outh = _mm_or_si64(outhe, outho);           /* ( 8  9 10 11 12 13 14 15) */ \
+  \
+  _mm_store_si64((__m64 *)outptr##row, outl); \
+  _mm_store_si64((__m64 *)outptr##row + 1, outh); \
+}
+
+void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
+  int inrow, outrow, incol, tmp, tmp1;
+  __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
+  __m64 this0l, this0h, this0;
+  __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
+  __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
+  __m64 next0l, next0h, next0;
+  __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
+
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+
+  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
+
+    inptr_1 = input_data[inrow - 1];
+    inptr0 = input_data[inrow];
+    inptr1 = input_data[inrow + 1];
+    outptr0 = output_data[outrow++];
+    outptr1 = output_data[outrow++];
+
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %3, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %3, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %4, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %6\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %5, %7\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
+          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this0 = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    this_1 = _mm_load_si64((__m64 *)inptr_1);  /* row[-1][0] */
+    this1 = _mm_load_si64((__m64 *)inptr1);    /* row[ 1][0] */
+
+    this0l = _mm_unpacklo_pi8(this0, zero);    /* row[ 0][0]( 0 1 2 3) */
+    this0h = _mm_unpackhi_pi8(this0, zero);    /* row[ 0][0]( 4 5 6 7) */
+    this_1l = _mm_unpacklo_pi8(this_1, zero);  /* row[-1][0]( 0 1 2 3) */
+    this_1h = _mm_unpackhi_pi8(this_1, zero);  /* row[-1][0]( 4 5 6 7) */
+    this1l = _mm_unpacklo_pi8(this1, zero);    /* row[+1][0]( 0 1 2 3) */
+    this1h = _mm_unpackhi_pi8(this1, zero);    /* row[+1][0]( 4 5 6 7) */
+
+    this0l = _mm_mullo_pi16(this0l, PW_THREE);
+    this0h = _mm_mullo_pi16(this0h, PW_THREE);
+
+    thiscolsum_1l = _mm_add_pi16(this_1l, this0l);  /* ( 0 1 2 3) */
+    thiscolsum_1h = _mm_add_pi16(this_1h, this0h);  /* ( 4 5 6 7) */
+    thiscolsum1l = _mm_add_pi16(this0l, this1l);    /* ( 0 1 2 3) */
+    thiscolsum1h = _mm_add_pi16(this0h, this1h);    /* ( 4 5 6 7) */
+
+    /* temporarily save the intermediate data */
+    _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
+    _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
+    _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
+    _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
+
+    wk[0] = _mm_and_si64(thiscolsum_1l, mask0);  /* ( 0 - - -) */
+    wk[1] = _mm_and_si64(thiscolsum1l, mask0);   /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
+         outptr0 += 16, outptr1 += 16) {
+
+      if (incol > 8) {
+        /* process the next column block */
+        next0 = _mm_load_si64((__m64 *)inptr0 + 1);    /* row[ 0][1] */
+        next_1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* row[-1][1] */
+        next1 = _mm_load_si64((__m64 *)inptr1 + 1);    /* row[+1][1] */
+
+        next0l = _mm_unpacklo_pi8(next0, zero);    /* row[ 0][1]( 0 1 2 3) */
+        next0h = _mm_unpackhi_pi8(next0, zero);    /* row[ 0][1]( 4 5 6 7) */
+        next_1l = _mm_unpacklo_pi8(next_1, zero);  /* row[-1][1]( 0 1 2 3) */
+        next_1h = _mm_unpackhi_pi8(next_1, zero);  /* row[-1][1]( 4 5 6 7) */
+        next1l = _mm_unpacklo_pi8(next1, zero);    /* row[+1][1]( 0 1 2 3) */
+        next1h = _mm_unpackhi_pi8(next1, zero);    /* row[+1][1]( 4 5 6 7) */
+
+        next0l = _mm_mullo_pi16(next0l, PW_THREE);
+        next0h = _mm_mullo_pi16(next0h, PW_THREE);
+
+        nextcolsum_1l = _mm_add_pi16(next_1l, next0l);  /* ( 0 1 2 3) */
+        nextcolsum_1h = _mm_add_pi16(next_1h, next0h);  /* ( 4 5 6 7) */
+        nextcolsum1l = _mm_add_pi16(next0l, next1l);    /* ( 0 1 2 3) */
+        nextcolsum1h = _mm_add_pi16(next0h, next1h);    /* ( 4 5 6 7) */
+
+        /* temporarily save the intermediate data */
+        _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
+        _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
+        _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
+        _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
+
+        wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+        wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);   /* ( - - - 0) */
+      } else {
+        __m64 tmp;
+
+        /* process the last column block */
+        tmp = _mm_load_si64((__m64 *)outptr0 + 1);
+        wk[2] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+        tmp = _mm_load_si64((__m64 *)outptr1 + 1);
+        wk[3] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
+      }
+
+      /* process the upper row */
+      samp0123 = _mm_load_si64((__m64 *)outptr0);      /* ( 0 1 2 3) */ \
+      samp4567 = _mm_load_si64((__m64 *)outptr0 + 1);  /* ( 4 5 6 7) */ \
+      PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
+
+      /* process the lower row */
+      samp0123 = _mm_load_si64((__m64 *)outptr1);      /* ( 0 1 2 3) */ \
+      samp4567 = _mm_load_si64((__m64 *)outptr1 + 1);  /* ( 4 5 6 7) */ \
+      PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
+    }
+  }
+}
+
+/* h2v1 fancy upsampling: per row, 8 input samples are consumed for every 16 bytes the output pointer advances. */
+void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
+                                   JDIMENSION downsampled_width,
+                                   JSAMPARRAY input_data,
+                                   JSAMPARRAY *output_data_ptr)
+{
+  JSAMPARRAY output_data = *output_data_ptr;
+  JSAMPROW inptr0, outptr0;
+  int inrow, incol, tmp, tmp1;
+  __m64 thisl, this, nextl, next;
+  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
+  /* Edge masks from all-ones: mask0 keeps the lowest lane, masklast the highest (presumably 16-bit lanes of an 8-byte word). */
+  mask0 = _mm_cmpeq_pi8(mask0, mask0);
+  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
+  /* Row inrow of input_data is the source and row inrow of *output_data_ptr the destination. */
+  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
+
+    inptr0 = input_data[inrow];
+    outptr0 = output_data[inrow];
+    /* Width not a multiple of 8: copy the byte at offset tmp to offset tmp1, i.e. duplicate the last sample just past the row. NOTE(review): assumes the row buffer has a spare byte - confirm. */
+    if (downsampled_width & 7) {
+      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
+      tmp1 = downsampled_width * sizeof(JSAMPLE);
+      asm(PTR_ADDU  "$8, %1, %2\r\n"
+          "lb       $9, ($8)\r\n"
+          PTR_ADDU  "$8, %1, %3\r\n"
+          "sb       $9, ($8)\r\n"
+          : "=m" (*inptr0)
+          : "r" (inptr0), "r" (tmp), "r" (tmp1)
+          : "$8", "$9"
+         );
+    }
+
+    /* process the first column block */
+    this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+    thisl = _mm_unpacklo_pi8(this, zero);     /* row[ 0][0]( 0 1 2 3) */
+    wk[0] = _mm_and_si64(thisl, mask0);       /* ( 0 - - -) */
+
+    for (incol = downsampled_width; incol > 0;
+         incol -= 8, inptr0 += 8, outptr0 += 16) {
+      /* incol > 8 means another 8-sample block follows the current one. */
+      if (incol > 8) {
+        /* process the next column block */
+        next = _mm_load_si64((__m64 *)inptr0 + 1);  /* row[ 0][1] */
+        nextl = _mm_unpacklo_pi8(next, zero);       /* row[ 0][1]( 0 1 2 3) */
+        wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
+      } else {
+        __m64 thish;
+
+        /* process the last column block */
+        this = _mm_load_si64((__m64 *)inptr0);  /* row[ 0][0] */
+        thish = _mm_unpackhi_pi8(this, zero);   /* row[ 0][0]( 4 5 6 7) */
+        wk[1] = _mm_and_si64(masklast, thish);  /* ( - - - 7) */
+      }
+
+      /* process the row; PROCESS_ROW is defined earlier in this file and presumably reads samp0123, samp4567 and wk[] - confirm against its definition */
+      this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
+      samp0123 = _mm_unpacklo_pi8(this, zero);  /* ( 0 1 2 3) */
+      samp4567 = _mm_unpackhi_pi8(this, zero);  /* ( 4 5 6 7) */
+      PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
+    }
+  }
+}
diff --git a/external/jpeg/simd/mips64/jfdctfst-mmi.c b/external/jpeg/simd/mips64/jfdctfst-mmi.c
new file mode 100644
index 000000000000..f7caf09a8860
--- /dev/null
+++ b/external/jpeg/simd/mips64/jfdctfst-mmi.c
@@ -0,0 +1,255 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER FORWARD DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+/* The F_* multipliers are fixed-point values scaled by 2^CONST_BITS (e.g. 0.707106781 * 256 ~= 181). */
+#define F_0_382  ((short)98)   /* FIX(0.382683433) */
+#define F_0_541  ((short)139)  /* FIX(0.541196100) */
+#define F_0_707  ((short)181)  /* FIX(0.707106781) */
+#define F_1_306  ((short)334)  /* FIX(1.306562965) */
+/* NOTE: PRE_MULTIPLY_SCALE_BITS and CONST_SHIFT are not referenced by any other line of this file. */
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+/* Indices into const_value[]; the enumerator order must match the initializer order below. */
+enum const_index {
+  index_PW_F0707,
+  index_PW_F0382,
+  index_PW_F0541,
+  index_PW_F1306
+};
+/* One 64-bit word per constant; _uint64_set1_pi16 presumably replicates the value into each 16-bit lane (defined outside this file). */
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(F_0_707),
+  _uint64_set1_pi16(F_0_382),
+  _uint64_set1_pi16(F_0_541),
+  _uint64_set1_pi16(F_1_306)
+};
+/* get_const_value() is not defined in this file (presumably in jsimd_mmi.h); it is expected to read const_value[index]. */
+#define PW_F0707  get_const_value(index_PW_F0707)
+#define PW_F0382  get_const_value(index_PW_F0382)
+#define PW_F0541  get_const_value(index_PW_F0541)
+#define PW_F1306  get_const_value(index_PW_F1306)
+
+
+#define DO_FDCT_MULTIPLY(out, in, multiplier) { \
+  __m64 mulhi, mullo, mul12, mul34; \
+  /* out = (in * multiplier) >> CONST_BITS per 16-bit lane: low/high product halves are interleaved into 32-bit values, shifted, then packed back to 16 bits. */ \
+  mullo = _mm_mullo_pi16(in, multiplier); \
+  mulhi = _mm_mulhi_pi16(in, multiplier); \
+  mul12 = _mm_unpacklo_pi16(mullo, mulhi); \
+  mul34 = _mm_unpackhi_pi16(mullo, mulhi); \
+  mul12 = _mm_srai_pi32(mul12, CONST_BITS); \
+  mul34 = _mm_srai_pi32(mul34, CONST_BITS); \
+  out = _mm_packs_pi32(mul12, mul34); \
+}
+
+#define DO_FDCT_COMMON() { \
+  /* Shared 1-D stage: reads tmp0..tmp7 and writes out0..out7; tmp10..tmp13, z1..z5, z11 and z13 of the invoking scope are scratch. */ \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp3); \
+  tmp13 = _mm_sub_pi16(tmp0, tmp3); \
+  tmp11 = _mm_add_pi16(tmp1, tmp2); \
+  tmp12 = _mm_sub_pi16(tmp1, tmp2); \
+  \
+  out0 = _mm_add_pi16(tmp10, tmp11); \
+  out4 = _mm_sub_pi16(tmp10, tmp11); \
+  \
+  z1 = _mm_add_pi16(tmp12, tmp13); \
+  DO_FDCT_MULTIPLY(z1, z1, PW_F0707) \
+  \
+  out2 = _mm_add_pi16(tmp13, z1); \
+  out6 = _mm_sub_pi16(tmp13, z1); \
+  \
+  /* Odd part */ \
+  \
+  tmp10 = _mm_add_pi16(tmp4, tmp5); \
+  tmp11 = _mm_add_pi16(tmp5, tmp6); \
+  tmp12 = _mm_add_pi16(tmp6, tmp7); \
+  \
+  z5 = _mm_sub_pi16(tmp10, tmp12); \
+  DO_FDCT_MULTIPLY(z5, z5, PW_F0382) \
+  \
+  DO_FDCT_MULTIPLY(z2, tmp10, PW_F0541) \
+  z2 = _mm_add_pi16(z2, z5); \
+  \
+  DO_FDCT_MULTIPLY(z4, tmp12, PW_F1306) \
+  z4 = _mm_add_pi16(z4, z5); \
+  \
+  DO_FDCT_MULTIPLY(z3, tmp11, PW_F0707) \
+  \
+  z11 = _mm_add_pi16(tmp7, z3); \
+  z13 = _mm_sub_pi16(tmp7, z3); \
+  \
+  out5 = _mm_add_pi16(z13, z2); \
+  out3 = _mm_sub_pi16(z13, z2); \
+  out1 = _mm_add_pi16(z11, z4); \
+  out7 = _mm_sub_pi16(z11, z4); \
+}
+
+#define DO_FDCT_PASS1() { \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 col0, col1, col2, col3, col4, col5, col6, col7; \
+  /* Loads four rows at dataptr, transposes them into col0..col7, runs DO_FDCT_COMMON and stores back in place: outN goes to row N for N < 4, and to row N - 4 at element offset 4 otherwise. */ \
+  row0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);     /* (00 01 02 03) */ \
+  row0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4]); /* (04 05 06 07) */ \
+  row1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);     /* (10 11 12 13) */ \
+  row1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4]); /* (14 15 16 17) */ \
+  row2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);     /* (20 21 22 23) */ \
+  row2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4]); /* (24 25 26 27) */ \
+  row3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);     /* (30 31 32 33) */ \
+  row3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4]); /* (34 35 36 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row23a = _mm_unpacklo_pi16(row2l, row3l);   /* row23a=(20 30 21 31) */ \
+  row23b = _mm_unpackhi_pi16(row2l, row3l);   /* row23b=(22 32 23 33) */ \
+  row23c = _mm_unpacklo_pi16(row2h, row3h);   /* row23c=(24 34 25 35) */ \
+  row23d = _mm_unpackhi_pi16(row2h, row3h);   /* row23d=(26 36 27 37) */ \
+  \
+  row01a = _mm_unpacklo_pi16(row0l, row1l);   /* row01a=(00 10 01 11) */ \
+  row01b = _mm_unpackhi_pi16(row0l, row1l);   /* row01b=(02 12 03 13) */ \
+  row01c = _mm_unpacklo_pi16(row0h, row1h);   /* row01c=(04 14 05 15) */ \
+  row01d = _mm_unpackhi_pi16(row0h, row1h);   /* row01d=(06 16 07 17) */ \
+  \
+  col0 = _mm_unpacklo_pi32(row01a, row23a);   /* col0=(00 10 20 30) */ \
+  col1 = _mm_unpackhi_pi32(row01a, row23a);   /* col1=(01 11 21 31) */ \
+  col6 = _mm_unpacklo_pi32(row01d, row23d);   /* col6=(06 16 26 36) */ \
+  col7 = _mm_unpackhi_pi32(row01d, row23d);   /* col7=(07 17 27 37) */ \
+  \
+  tmp6 = _mm_sub_pi16(col1, col6);            /* tmp6=col1-col6 */ \
+  tmp7 = _mm_sub_pi16(col0, col7);            /* tmp7=col0-col7 */ \
+  tmp1 = _mm_add_pi16(col1, col6);            /* tmp1=col1+col6 */ \
+  tmp0 = _mm_add_pi16(col0, col7);            /* tmp0=col0+col7 */ \
+  \
+  col2 = _mm_unpacklo_pi32(row01b, row23b);   /* col2=(02 12 22 32) */ \
+  col3 = _mm_unpackhi_pi32(row01b, row23b);   /* col3=(03 13 23 33) */ \
+  col4 = _mm_unpacklo_pi32(row01c, row23c);   /* col4=(04 14 24 34) */ \
+  col5 = _mm_unpackhi_pi32(row01c, row23c);   /* col5=(05 15 25 35) */ \
+  \
+  tmp3 = _mm_add_pi16(col3, col4);            /* tmp3=col3+col4 */ \
+  tmp2 = _mm_add_pi16(col2, col5);            /* tmp2=col2+col5 */ \
+  tmp4 = _mm_sub_pi16(col3, col4);            /* tmp4=col3-col4 */ \
+  tmp5 = _mm_sub_pi16(col2, col5);            /* tmp5=col2-col5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0 + 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1 + 4], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2 + 4], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3 + 4], out7); \
+}
+
+#define DO_FDCT_PASS2() { \
+  __m64 col0l, col0h, col1l, col1h, col2l, col2h, col3l, col3h; \
+  __m64 col01a, col01b, col01c, col01d, col23a, col23b, col23c, col23d; \
+  __m64 row0, row1, row2, row3, row4, row5, row6, row7; \
+  /* Loads four elements from each of rows 0..7 at dataptr, transposes them into row0..row7, runs DO_FDCT_COMMON and stores outN to row N. */ \
+  col0l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col1l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col2l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col3l = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col0h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 4]);  /* (40 50 60 70) */ \
+  col1h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 5]);  /* (41 51 61 71) */ \
+  col2h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 6]);  /* (42 52 62 72) */ \
+  col3h = _mm_load_si64((__m64 *)&dataptr[DCTSIZE * 7]);  /* (43 53 63 73) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  col23a = _mm_unpacklo_pi16(col2l, col3l);   /* col23a=(02 03 12 13) */ \
+  col23b = _mm_unpackhi_pi16(col2l, col3l);   /* col23b=(22 23 32 33) */ \
+  col23c = _mm_unpacklo_pi16(col2h, col3h);   /* col23c=(42 43 52 53) */ \
+  col23d = _mm_unpackhi_pi16(col2h, col3h);   /* col23d=(62 63 72 73) */ \
+  \
+  col01a = _mm_unpacklo_pi16(col0l, col1l);   /* col01a=(00 01 10 11) */ \
+  col01b = _mm_unpackhi_pi16(col0l, col1l);   /* col01b=(20 21 30 31) */ \
+  col01c = _mm_unpacklo_pi16(col0h, col1h);   /* col01c=(40 41 50 51) */ \
+  col01d = _mm_unpackhi_pi16(col0h, col1h);   /* col01d=(60 61 70 71) */ \
+  \
+  row0 = _mm_unpacklo_pi32(col01a, col23a);   /* row0=(00 01 02 03) */ \
+  row1 = _mm_unpackhi_pi32(col01a, col23a);   /* row1=(10 11 12 13) */ \
+  row6 = _mm_unpacklo_pi32(col01d, col23d);   /* row6=(60 61 62 63) */ \
+  row7 = _mm_unpackhi_pi32(col01d, col23d);   /* row7=(70 71 72 73) */ \
+  \
+  tmp6 = _mm_sub_pi16(row1, row6);            /* tmp6=row1-row6 */ \
+  tmp7 = _mm_sub_pi16(row0, row7);            /* tmp7=row0-row7 */ \
+  tmp1 = _mm_add_pi16(row1, row6);            /* tmp1=row1+row6 */ \
+  tmp0 = _mm_add_pi16(row0, row7);            /* tmp0=row0+row7 */ \
+  \
+  row2 = _mm_unpacklo_pi32(col01b, col23b);   /* row2=(20 21 22 23) */ \
+  row3 = _mm_unpackhi_pi32(col01b, col23b);   /* row3=(30 31 32 33) */ \
+  row4 = _mm_unpacklo_pi32(col01c, col23c);   /* row4=(40 41 42 43) */ \
+  row5 = _mm_unpackhi_pi32(col01c, col23c);   /* row5=(50 51 52 53) */ \
+  \
+  tmp3 = _mm_add_pi16(row3, row4);            /* tmp3=row3+row4 */ \
+  tmp2 = _mm_add_pi16(row2, row5);            /* tmp2=row2+row5 */ \
+  tmp4 = _mm_sub_pi16(row3, row4);            /* tmp4=row3-row4 */ \
+  tmp5 = _mm_sub_pi16(row2, row5);            /* tmp5=row2-row5 */ \
+  \
+  DO_FDCT_COMMON() \
+  \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 0], out0); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 1], out1); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 2], out2); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 3], out3); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 4], out4); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 5], out5); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 6], out6); \
+  _mm_store_si64((__m64 *)&dataptr[DCTSIZE * 7], out7); \
+}
+/* In-place forward DCT on data: two DO_FDCT_PASS1 runs (first four rows, then the next four), then two DO_FDCT_PASS2 runs (element offsets 0 and 4). */
+void jsimd_fdct_ifast_mmi(DCTELEM *data)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 tmp10, tmp11, tmp12, tmp13, z1, z2, z3, z4, z5, z11, z13;
+  DCTELEM *dataptr = data;
+  /* The locals above are used by name inside the DO_FDCT_* macros; dataptr selects the sub-block each invocation works on. */
+  /* Pass 1: process rows. */
+
+  DO_FDCT_PASS1()
+  dataptr += DCTSIZE * 4;
+  DO_FDCT_PASS1()
+
+  /* Pass 2: process columns. */
+
+  dataptr = data;
+  DO_FDCT_PASS2()
+  dataptr += 4;
+  DO_FDCT_PASS2()
+}
diff --git a/external/jpeg/simd/loongson/jfdctint-mmi.c b/external/jpeg/simd/mips64/jfdctint-mmi.c
similarity index 100%
rename from external/jpeg/simd/loongson/jfdctint-mmi.c
rename to external/jpeg/simd/mips64/jfdctint-mmi.c
diff --git a/external/jpeg/simd/mips64/jidctfst-mmi.c b/external/jpeg/simd/mips64/jidctfst-mmi.c
new file mode 100644
index 000000000000..503bb35a3cc5
--- /dev/null
+++ b/external/jpeg/simd/mips64/jidctfst-mmi.c
@@ -0,0 +1,395 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2014-2015, 2018-2019, D. R. Commander.  All Rights Reserved.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  LiuQingfa <liuqingfa-hf@loongson.cn>
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* FAST INTEGER INVERSE DCT */
+
+#include "jsimd_mmi.h"
+
+
+#define CONST_BITS  8
+#define PASS1_BITS  2
+/* The FIX_* multipliers are fixed-point values scaled by 2^CONST_BITS, so FIX(1) == 256. */
+#define FIX_1_082  ((short)277)                   /* FIX(1.082392200) */
+#define FIX_1_414  ((short)362)                   /* FIX(1.414213562) */
+#define FIX_1_847  ((short)473)                   /* FIX(1.847759065) */
+#define FIX_2_613  ((short)669)                   /* FIX(2.613125930) */
+#define FIX_1_613  ((short)(FIX_2_613 - 256 * 3)) /* FIX(2.613125930) - 3 * FIX(1), a negative value despite the name */
+/* Table constants are pre-shifted by CONST_SHIFT so that (x << PRE_MULTIPLY_SCALE_BITS) followed by _mm_mulhi_pi16 yields x * constant. */
+#define PRE_MULTIPLY_SCALE_BITS  2
+#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
+/* Indices into const_value[]; the enumerator order must match the initializer order below. */
+enum const_index {
+  index_PW_F1082,
+  index_PW_F1414,
+  index_PW_F1847,
+  index_PW_MF1613,
+  index_PB_CENTERJSAMP
+};
+/* One 64-bit word per constant; the _uint64_set1_* helpers are defined outside this file and presumably replicate the value into every lane. */
+static uint64_t const_value[] = {
+  _uint64_set1_pi16(FIX_1_082 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_414 << CONST_SHIFT),
+  _uint64_set1_pi16(FIX_1_847 << CONST_SHIFT),
+  _uint64_set1_pi16(-FIX_1_613 << CONST_SHIFT),
+  _uint64_set1_pi8(CENTERJSAMPLE)
+};
+/* get_const_value() is not defined in this file (presumably in jsimd_mmi.h); it is expected to read const_value[index]. */
+#define PW_F1414        get_const_value(index_PW_F1414)
+#define PW_F1847        get_const_value(index_PW_F1847)
+#define PW_MF1613       get_const_value(index_PW_MF1613)
+#define PW_F1082        get_const_value(index_PW_F1082)
+#define PB_CENTERJSAMP  get_const_value(index_PB_CENTERJSAMP)
+
+/* True when the first 32 / 64 bits of the argument are all zero; the argument's address is taken, so it must be an lvalue. */
+#define test_m32_zero(mm32)  (!(*(uint32_t *)&mm32))
+#define test_m64_zero(mm64)  (!(*(uint64_t *)&mm64))
+
+
+#define DO_IDCT_COMMON() {  /* reads z10..z13 and tmp0..tmp3, writes out0..out7; tmp4..tmp7, tmp10..tmp12 and z5 are scratch */ \
+  tmp7 = _mm_add_pi16(z11, z13); \
+  \
+  tmp11 = _mm_sub_pi16(z11, z13); \
+  tmp11 = _mm_slli_pi16(tmp11, PRE_MULTIPLY_SCALE_BITS); \
+  tmp11 = _mm_mulhi_pi16(tmp11, PW_F1414); \
+  \
+  tmp10 = _mm_slli_pi16(z12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_slli_pi16(z10, PRE_MULTIPLY_SCALE_BITS); \
+  \
+  /* To avoid overflow... \
+   * \
+   * (Original) \
+   * tmp12 = -2.613125930 * z10 + z5; \
+   * \
+   * (This implementation; PW_MF1613 holds 3 - 2.613125930) \
+   * tmp12 = (0.386874070 - 3) * z10 + z5; \
+   *       = 0.386874070 * z10 - z10 - z10 - z10 + z5; \
+   */ \
+  \
+  z5 = _mm_add_pi16(tmp10, tmp12); \
+  z5 = _mm_mulhi_pi16(z5, PW_F1847); \
+  \
+  tmp10 = _mm_mulhi_pi16(tmp10, PW_F1082); \
+  tmp10 = _mm_sub_pi16(tmp10, z5); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_MF1613); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_sub_pi16(tmp12, z10); \
+  tmp12 = _mm_add_pi16(tmp12, z5); \
+  \
+  /* Final output stage */ \
+  \
+  tmp6 = _mm_sub_pi16(tmp12, tmp7); \
+  tmp5 = _mm_sub_pi16(tmp11, tmp6); \
+  tmp4 = _mm_add_pi16(tmp10, tmp5); \
+  \
+  out0 = _mm_add_pi16(tmp0, tmp7); \
+  out7 = _mm_sub_pi16(tmp0, tmp7); \
+  out1 = _mm_add_pi16(tmp1, tmp6); \
+  out6 = _mm_sub_pi16(tmp1, tmp6); \
+  \
+  out2 = _mm_add_pi16(tmp2, tmp5); \
+  out5 = _mm_sub_pi16(tmp2, tmp5); \
+  out4 = _mm_add_pi16(tmp3, tmp4); \
+  out3 = _mm_sub_pi16(tmp3, tmp4); \
+}
+
+#define DO_IDCT_PASS1(iter) { \
+  __m64 col0l, col1l, col2l, col3l, col4l, col5l, col6l, col7l; \
+  __m64 quant0l, quant1l, quant2l, quant3l; \
+  __m64 quant4l, quant5l, quant6l, quant7l; \
+  __m64 row01a, row01b, row01c, row01d, row23a, row23b, row23c, row23d; \
+  __m64 row0l, row0h, row1l, row1h, row2l, row2h, row3l, row3h; \
+  __m32 col0a, col1a, mm0; \
+  /* Handles four columns: multiplies inptr rows 0..7 by quantptr, runs the 1-D IDCT and stores the transposed result as four 8-element rows at wsptr. */ \
+  col0a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 1]); \
+  col1a = _mm_load_si32((__m32 *)&inptr[DCTSIZE * 2]); \
+  mm0 = _mm_or_si32(col0a, col1a); \
+  /* Cheap pre-check: only if the first 32 bits of rows 1 and 2 are zero is the full AC-terms-zero test below attempted. */ \
+  if (test_m32_zero(mm0)) { \
+    __m64 mm1, mm2; \
+    \
+    col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]); \
+    col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]); \
+    col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]); \
+    col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]); \
+    col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]); \
+    col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]); \
+    col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]); \
+    col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]); \
+    \
+    mm1 = _mm_or_si64(col1l, col3l); \
+    mm2 = _mm_or_si64(col2l, col4l); \
+    mm1 = _mm_or_si64(mm1, col5l); \
+    mm2 = _mm_or_si64(mm2, col6l); \
+    mm1 = _mm_or_si64(mm1, col7l); \
+    mm1 = _mm_or_si64(mm1, mm2); \
+    /* mm1 is now the OR of the 64-bit loads from rows 1..7. */ \
+    if (test_m64_zero(mm1)) { \
+      __m64 dcval, dcvall, dcvalh, row0, row1, row2, row3; \
+      \
+      /* AC terms all zero */ \
+      \
+      quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+      \
+      dcval = _mm_mullo_pi16(col0l, quant0l);    /* dcval=(00 10 20 30) */ \
+      \
+      dcvall = _mm_unpacklo_pi16(dcval, dcval);  /* dcvall=(00 00 10 10) */ \
+      dcvalh = _mm_unpackhi_pi16(dcval, dcval);  /* dcvalh=(20 20 30 30) */ \
+      \
+      row0 = _mm_unpacklo_pi32(dcvall, dcvall);  /* row0=(00 00 00 00) */ \
+      row1 = _mm_unpackhi_pi32(dcvall, dcvall);  /* row1=(10 10 10 10) */ \
+      row2 = _mm_unpacklo_pi32(dcvalh, dcvalh);  /* row2=(20 20 20 20) */ \
+      row3 = _mm_unpackhi_pi32(dcvalh, dcvalh);  /* row3=(30 30 30 30) */ \
+      \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3); \
+      _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3); \
+      /* The invoker must define the label nextcolumn<iter> right after this macro invocation. */ \
+      goto nextcolumn##iter; \
+    } \
+  } \
+  \
+  /* Even part */ \
+  \
+  col0l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 0]);  /* (00 10 20 30) */ \
+  col2l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 2]);  /* (02 12 22 32) */ \
+  col4l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 4]);  /* (04 14 24 34) */ \
+  col6l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 6]);  /* (06 16 26 36) */ \
+  \
+  quant0l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 0]); \
+  quant2l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 2]); \
+  quant4l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 4]); \
+  quant6l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 6]); \
+  \
+  tmp0 = _mm_mullo_pi16(col0l, quant0l); \
+  tmp1 = _mm_mullo_pi16(col2l, quant2l); \
+  tmp2 = _mm_mullo_pi16(col4l, quant4l); \
+  tmp3 = _mm_mullo_pi16(col6l, quant6l); \
+  \
+  tmp10 = _mm_add_pi16(tmp0, tmp2); \
+  tmp11 = _mm_sub_pi16(tmp0, tmp2); \
+  tmp13 = _mm_add_pi16(tmp1, tmp3); \
+  \
+  tmp12 = _mm_sub_pi16(tmp1, tmp3); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  col1l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 1]);  /* (01 11 21 31) */ \
+  col3l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 3]);  /* (03 13 23 33) */ \
+  col5l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 5]);  /* (05 15 25 35) */ \
+  col7l = _mm_load_si64((__m64 *)&inptr[DCTSIZE * 7]);  /* (07 17 27 37) */ \
+  \
+  quant1l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 1]); \
+  quant3l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 3]); \
+  quant5l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 5]); \
+  quant7l = _mm_load_si64((__m64 *)&quantptr[DCTSIZE * 7]); \
+  \
+  tmp4 = _mm_mullo_pi16(col1l, quant1l); \
+  tmp5 = _mm_mullo_pi16(col3l, quant3l); \
+  tmp6 = _mm_mullo_pi16(col5l, quant5l); \
+  tmp7 = _mm_mullo_pi16(col7l, quant7l); \
+  \
+  z13 = _mm_add_pi16(tmp6, tmp5); \
+  z10 = _mm_sub_pi16(tmp6, tmp5); \
+  z11 = _mm_add_pi16(tmp4, tmp7); \
+  z12 = _mm_sub_pi16(tmp4, tmp7); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 10 20 30), out1=(01 11 21 31) */ \
+  /* out2=(02 12 22 32), out3=(03 13 23 33) */ \
+  /* out4=(04 14 24 34), out5=(05 15 25 35) */ \
+  /* out6=(06 16 26 36), out7=(07 17 27 37) */ \
+  \
+  /* Transpose coefficients */ \
+  \
+  row01a = _mm_unpacklo_pi16(out0, out1);     /* row01a=(00 01 10 11) */ \
+  row23a = _mm_unpackhi_pi16(out0, out1);     /* row23a=(20 21 30 31) */ \
+  row01d = _mm_unpacklo_pi16(out6, out7);     /* row01d=(06 07 16 17) */ \
+  row23d = _mm_unpackhi_pi16(out6, out7);     /* row23d=(26 27 36 37) */ \
+  \
+  row01b = _mm_unpacklo_pi16(out2, out3);     /* row01b=(02 03 12 13) */ \
+  row23b = _mm_unpackhi_pi16(out2, out3);     /* row23b=(22 23 32 33) */ \
+  row01c = _mm_unpacklo_pi16(out4, out5);     /* row01c=(04 05 14 15) */ \
+  row23c = _mm_unpackhi_pi16(out4, out5);     /* row23c=(24 25 34 35) */ \
+  \
+  row0l = _mm_unpacklo_pi32(row01a, row01b);  /* row0l=(00 01 02 03) */ \
+  row1l = _mm_unpackhi_pi32(row01a, row01b);  /* row1l=(10 11 12 13) */ \
+  row2l = _mm_unpacklo_pi32(row23a, row23b);  /* row2l=(20 21 22 23) */ \
+  row3l = _mm_unpackhi_pi32(row23a, row23b);  /* row3l=(30 31 32 33) */ \
+  \
+  row0h = _mm_unpacklo_pi32(row01c, row01d);  /* row0h=(04 05 06 07) */ \
+  row1h = _mm_unpackhi_pi32(row01c, row01d);  /* row1h=(14 15 16 17) */ \
+  row2h = _mm_unpacklo_pi32(row23c, row23d);  /* row2h=(24 25 26 27) */ \
+  row3h = _mm_unpackhi_pi32(row23c, row23d);  /* row3h=(34 35 36 37) */ \
+  \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0], row0l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 0 + 4], row0h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1], row1l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 1 + 4], row1h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2], row2l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 2 + 4], row2h); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3], row3l); \
+  _mm_store_si64((__m64 *)&wsptr[DCTSIZE * 3 + 4], row3h); \
+}
+
+#define DO_IDCT_PASS2(ctr) { \
+  __m64 row0l, row1l, row2l, row3l, row4l, row5l, row6l, row7l; \
+  __m64 col0123a, col0123b, col0123c, col0123d; \
+  __m64 col01l, col01h, col23l, col23h; \
+  __m64 col0, col1, col2, col3; \
+  __m64 row06, row17, row24, row35; \
+  /* Reads four elements from each of the eight workspace rows at wsptr, runs the 1-D IDCT, and stores 8 bytes to each of output_buf[ctr + 0..3] at output_col. */ \
+  row0l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 0]);  /* (00 01 02 03) */ \
+  row1l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 1]);  /* (10 11 12 13) */ \
+  row2l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 2]);  /* (20 21 22 23) */ \
+  row3l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 3]);  /* (30 31 32 33) */ \
+  row4l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 4]);  /* (40 41 42 43) */ \
+  row5l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 5]);  /* (50 51 52 53) */ \
+  row6l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 6]);  /* (60 61 62 63) */ \
+  row7l = _mm_load_si64((__m64 *)&wsptr[DCTSIZE * 7]);  /* (70 71 72 73) */ \
+  \
+  /* Even part */ \
+  \
+  tmp10 = _mm_add_pi16(row0l, row4l); \
+  tmp11 = _mm_sub_pi16(row0l, row4l); \
+  tmp13 = _mm_add_pi16(row2l, row6l); \
+  \
+  tmp12 = _mm_sub_pi16(row2l, row6l); \
+  tmp12 = _mm_slli_pi16(tmp12, PRE_MULTIPLY_SCALE_BITS); \
+  tmp12 = _mm_mulhi_pi16(tmp12, PW_F1414); \
+  tmp12 = _mm_sub_pi16(tmp12, tmp13); \
+  \
+  tmp0 = _mm_add_pi16(tmp10, tmp13); \
+  tmp3 = _mm_sub_pi16(tmp10, tmp13); \
+  tmp1 = _mm_add_pi16(tmp11, tmp12); \
+  tmp2 = _mm_sub_pi16(tmp11, tmp12); \
+  \
+  /* Odd part */ \
+  \
+  z13 = _mm_add_pi16(row5l, row3l); \
+  z10 = _mm_sub_pi16(row5l, row3l); \
+  z11 = _mm_add_pi16(row1l, row7l); \
+  z12 = _mm_sub_pi16(row1l, row7l); \
+  \
+  DO_IDCT_COMMON() \
+  \
+  /* out0=(00 01 02 03), out1=(10 11 12 13) */ \
+  /* out2=(20 21 22 23), out3=(30 31 32 33) */ \
+  /* out4=(40 41 42 43), out5=(50 51 52 53) */ \
+  /* out6=(60 61 62 63), out7=(70 71 72 73) */ \
+  \
+  out0 = _mm_srai_pi16(out0, PASS1_BITS + 3); \
+  out1 = _mm_srai_pi16(out1, PASS1_BITS + 3); \
+  out2 = _mm_srai_pi16(out2, PASS1_BITS + 3); \
+  out3 = _mm_srai_pi16(out3, PASS1_BITS + 3); \
+  out4 = _mm_srai_pi16(out4, PASS1_BITS + 3); \
+  out5 = _mm_srai_pi16(out5, PASS1_BITS + 3); \
+  out6 = _mm_srai_pi16(out6, PASS1_BITS + 3); \
+  out7 = _mm_srai_pi16(out7, PASS1_BITS + 3); \
+  /* Pack two 4-lane results into one 8-byte vector, then add CENTERJSAMPLE to every byte. */ \
+  row06 = _mm_packs_pi16(out0, out6);  /* row06=(00 01 02 03 60 61 62 63) */ \
+  row17 = _mm_packs_pi16(out1, out7);  /* row17=(10 11 12 13 70 71 72 73) */ \
+  row24 = _mm_packs_pi16(out2, out4);  /* row24=(20 21 22 23 40 41 42 43) */ \
+  row35 = _mm_packs_pi16(out3, out5);  /* row35=(30 31 32 33 50 51 52 53) */ \
+  \
+  row06 = _mm_add_pi8(row06, PB_CENTERJSAMP); \
+  row17 = _mm_add_pi8(row17, PB_CENTERJSAMP); \
+  row24 = _mm_add_pi8(row24, PB_CENTERJSAMP); \
+  row35 = _mm_add_pi8(row35, PB_CENTERJSAMP); \
+  \
+  /* Transpose coefficients */ \
+  \
+  col0123a = _mm_unpacklo_pi8(row06, row17);  /* col0123a=(00 10 01 11 02 12 03 13) */ \
+  col0123d = _mm_unpackhi_pi8(row06, row17);  /* col0123d=(60 70 61 71 62 72 63 73) */ \
+  col0123b = _mm_unpacklo_pi8(row24, row35);  /* col0123b=(20 30 21 31 22 32 23 33) */ \
+  col0123c = _mm_unpackhi_pi8(row24, row35);  /* col0123c=(40 50 41 51 42 52 43 53) */ \
+  \
+  col01l = _mm_unpacklo_pi16(col0123a, col0123b);  /* col01l=(00 10 20 30 01 11 21 31) */ \
+  col23l = _mm_unpackhi_pi16(col0123a, col0123b);  /* col23l=(02 12 22 32 03 13 23 33) */ \
+  col01h = _mm_unpacklo_pi16(col0123c, col0123d);  /* col01h=(40 50 60 70 41 51 61 71) */ \
+  col23h = _mm_unpackhi_pi16(col0123c, col0123d);  /* col23h=(42 52 62 72 43 53 63 73) */ \
+  \
+  col0 = _mm_unpacklo_pi32(col01l, col01h);   /* col0=(00 10 20 30 40 50 60 70) */ \
+  col1 = _mm_unpackhi_pi32(col01l, col01h);   /* col1=(01 11 21 31 41 51 61 71) */ \
+  col2 = _mm_unpacklo_pi32(col23l, col23h);   /* col2=(02 12 22 32 42 52 62 72) */ \
+  col3 = _mm_unpackhi_pi32(col23l, col23h);   /* col3=(03 13 23 33 43 53 63 73) */ \
+  \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 0] + output_col), col0); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 1] + output_col), col1); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 2] + output_col), col2); \
+  _mm_store_si64((__m64 *)(output_buf[ctr + 3] + output_col), col3); \
+}
+/* Inverse DCT of one block: multiplies coef_block by dct_table, transforms in two passes, and stores 8 bytes to each of output_buf[0..7] at column output_col. */
+void jsimd_idct_ifast_mmi(void *dct_table, JCOEFPTR coef_block,
+                          JSAMPARRAY output_buf, JDIMENSION output_col)
+{
+  __m64 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m64 tmp10, tmp11, tmp12, tmp13;
+  __m64 out0, out1, out2, out3, out4, out5, out6, out7;
+  __m64 z5, z10, z11, z12, z13;
+  JCOEFPTR inptr;
+  ISLOW_MULT_TYPE *quantptr;
+  JCOEF *wsptr;
+  JCOEF workspace[DCTSIZE2];  /* buffers data between passes */
+
+  /* Pass 1: process columns. */
+  /* Each DO_IDCT_PASS1 invocation handles four columns and fills four workspace rows. */
+  inptr = coef_block;
+  quantptr = (ISLOW_MULT_TYPE *)dct_table;
+  wsptr = workspace;
+  /* NOTE(review): dct_table is read as ISLOW_MULT_TYPE in this ifast routine - confirm it matches the table's element type. */
+  DO_IDCT_PASS1(1)
+nextcolumn1:  /* jump target of the all-AC-zero shortcut inside DO_IDCT_PASS1(1) */
+  inptr += 4;
+  quantptr += 4;
+  wsptr += DCTSIZE * 4;
+  DO_IDCT_PASS1(2)
+nextcolumn2:  /* jump target of the all-AC-zero shortcut inside DO_IDCT_PASS1(2) */
+
+  /* Pass 2: process rows. */
+
+  wsptr = workspace;
+
+  DO_IDCT_PASS2(0)
+  wsptr += 4;
+  DO_IDCT_PASS2(4)
+}
diff --git a/external/jpeg/simd/loongson/jidctint-mmi.c b/external/jpeg/simd/mips64/jidctint-mmi.c
similarity index 100%
rename from external/jpeg/simd/loongson/jidctint-mmi.c
rename to external/jpeg/simd/mips64/jidctint-mmi.c
diff --git a/external/jpeg/simd/mips64/jquanti-mmi.c b/external/jpeg/simd/mips64/jquanti-mmi.c
new file mode 100644
index 000000000000..339002fd804e
--- /dev/null
+++ b/external/jpeg/simd/mips64/jquanti-mmi.c
@@ -0,0 +1,124 @@
+/*
+ * Loongson MMI optimizations for libjpeg-turbo
+ *
+ * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ *                          All Rights Reserved.
+ * Authors:  ZhuChen     <zhuchen@loongson.cn>
+ *           CaiWanwei   <caiwanwei@loongson.cn>
+ *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ * Copyright (C) 2018-2019, D. R. Commander.  All Rights Reserved.
+ *
+ * Based on the x86 SIMD extension for IJG JPEG library
+ * Copyright (C) 1999-2006, MIYASAKA Masaru.
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty.  In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ *    claim that you wrote the original software. If you use this software
+ *    in a product, an acknowledgment in the product documentation would be
+ *    appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ *    misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ */
+
+/* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
+
+#include "jsimd_mmi.h"
+
+/* Quantize one row; reads and then advances workspace, divisors, output_ptr. */
+#define DO_QUANT() { \
+  __m64 rowl, rowh, rowls, rowhs, rowlsave, rowhsave; \
+  __m64 corrl, corrh, recipl, reciph, scalel, scaleh; \
+  \
+  rowl = _mm_load_si64((__m64 *)&workspace[0]); \
+  rowh = _mm_load_si64((__m64 *)&workspace[4]); \
+  \
+  /* Branch-less absolute value */ \
+  rowls = _mm_srai_pi16(rowl, (WORD_BIT - 1));  /* -1 if value < 0, */ \
+                                                /* 0 otherwise */ \
+  rowhs = _mm_srai_pi16(rowh, (WORD_BIT - 1)); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* (val ^ sign) - sign = |val| */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  corrl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1]);  /* correction */ \
+  corrh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 1 + 4]); \
+  \
+  rowlsave = rowl = _mm_add_pi16(rowl, corrl);  /* correction + roundfactor */ \
+  rowhsave = rowh = _mm_add_pi16(rowh, corrh); \
+  \
+  recipl = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0]);  /* reciprocal */ \
+  reciph = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 0 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, recipl); \
+  rowh = _mm_mulhi_pi16(rowh, reciph); \
+  \
+  /* reciprocal is always negative (MSB=1), so we always need to add the */ \
+  /* initial value (input value is never negative as we inverted it at the */ \
+  /* start of this routine) */ \
+  rowlsave = rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowhsave = rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  scalel = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2]);  /* scale */ \
+  scaleh = _mm_load_si64((__m64 *)&divisors[DCTSIZE2 * 2 + 4]); \
+  \
+  rowl = _mm_mulhi_pi16(rowl, scalel); \
+  rowh = _mm_mulhi_pi16(rowh, scaleh); \
+  \
+  /* determine if scale is negative */ \
+  scalel = _mm_srai_pi16(scalel, (WORD_BIT - 1)); \
+  scaleh = _mm_srai_pi16(scaleh, (WORD_BIT - 1)); \
+  \
+  /* and add input if it is */ \
+  scalel = _mm_and_si64(scalel, rowlsave); \
+  scaleh = _mm_and_si64(scaleh, rowhsave); \
+  rowl = _mm_add_pi16(rowl, scalel); \
+  rowh = _mm_add_pi16(rowh, scaleh); \
+  \
+  /* then check if negative input */ \
+  rowlsave = _mm_srai_pi16(rowlsave, (WORD_BIT - 1)); \
+  rowhsave = _mm_srai_pi16(rowhsave, (WORD_BIT - 1)); \
+  /* NOTE(review): scalel/scaleh now hold (scale sign mask & input), not the */ \
+  /* scale itself, so the "add scale" below adds that masked value - confirm */ \
+  rowlsave = _mm_and_si64(rowlsave, scalel); \
+  rowhsave = _mm_and_si64(rowhsave, scaleh); \
+  rowl = _mm_add_pi16(rowl, rowlsave); \
+  rowh = _mm_add_pi16(rowh, rowhsave); \
+  \
+  rowl = _mm_xor_si64(rowl, rowls);           /* reapply the saved input sign */ \
+  rowh = _mm_xor_si64(rowh, rowhs); \
+  rowl = _mm_sub_pi16(rowl, rowls); \
+  rowh = _mm_sub_pi16(rowh, rowhs); \
+  \
+  _mm_store_si64((__m64 *)&output_ptr[0], rowl); \
+  _mm_store_si64((__m64 *)&output_ptr[4], rowh); \
+  \
+  workspace += DCTSIZE; \
+  divisors += DCTSIZE; \
+  output_ptr += DCTSIZE; \
+}
+
+/* Quantize 'workspace' into 'coef_block' using the tables in 'divisors'. */
+void jsimd_quantize_mmi(JCOEFPTR coef_block, DCTELEM *divisors,
+                        DCTELEM *workspace)
+{
+  JCOEFPTR output_ptr = coef_block;
+  /* Each expansion handles one row and advances all three pointers by DCTSIZE. */
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+  DO_QUANT()
+}
diff --git a/external/jpeg/simd/loongson/jsimd.c b/external/jpeg/simd/mips64/jsimd.c
similarity index 66%
rename from external/jpeg/simd/loongson/jsimd.c
rename to external/jpeg/simd/mips64/jsimd.c
index e8b183225373..e8f1af562bab 100644
--- a/external/jpeg/simd/loongson/jsimd.c
+++ b/external/jpeg/simd/mips64/jsimd.c
@@ -1,11 +1,11 @@
 /*
- * jsimd_loongson.c
+ * jsimd_mips64.c
  *
  * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
  * Copyright (C) 2009-2011, 2014, 2016, 2018, D. R. Commander.
  * Copyright (C) 2013-2014, MIPS Technologies, Inc., California.
  * Copyright (C) 2015, 2018, Matthieu Darbois.
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *
  * Based on the x86 SIMD extension for IJG JPEG library,
  * Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -13,7 +13,7 @@
  *
  * This file contains the interface between the "normal" portions
  * of the library and the SIMD implementations when running on a
- * Loongson architecture.
+ * 64-bit MIPS architecture.
  */
 
 #define JPEG_INTERNALS
@@ -24,8 +24,76 @@
 #include "../../jsimddct.h"
 #include "../jsimd.h"
 
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
 static unsigned int simd_support = ~0;
 
+#if defined(__linux__)
+
+#define SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT  (1024 * 1024)
+
+/* Return 1 if 'buffer' is an "ASEs implemented" line that lists 'feature'
+ * as a whole whitespace-delimited word, 0 otherwise (including when
+ * 'feature' is the empty string). */
+LOCAL(int)
+check_feature(char *buffer, char *feature)
+{
+  char *p, *start;
+  size_t len = strlen(feature);
+
+  if (len == 0)
+    return 0;
+  if (strncmp(buffer, "ASEs implemented", 16) != 0)
+    return 0;
+  buffer += 16;
+  while (isspace((unsigned char)*buffer))  /* cast: plain char may be < 0 */
+    buffer++;
+  start = buffer;
+
+  /* Word boundaries are judged against 'start', never against the moving
+   * search position, so the tail of a longer name is not accepted. */
+  while ((p = strstr(buffer, feature))) {
+    if ((p == start || isspace((unsigned char)p[-1])) &&
+        (p[len] == 0 || isspace((unsigned char)p[len])))
+      return 1;
+    buffer = p + 1;  /* resume just past the rejected occurrence */
+  }
+  return 0;
+}
+/* Scan /proc/cpuinfo line by line; returns 0 to request a larger 'bufsize'. */
+LOCAL(int)
+parse_proc_cpuinfo(int bufsize)
+{
+  char *buffer = (char *)malloc(bufsize);
+  FILE *fd;
+  /* Reset on every attempt so a retried scan starts from "no SIMD". */
+  simd_support = 0;
+  /* Allocation failure is reported the same way as a too-small buffer. */
+  if (!buffer)
+    return 0;
+  /* If the file cannot be opened, fall through and return 1 with no flags. */
+  fd = fopen("/proc/cpuinfo", "r");
+  if (fd) {
+    while (fgets(buffer, bufsize, fd)) {
+      if (!strchr(buffer, '\n') && !feof(fd)) {
+        /* "impossible" happened - insufficient size of the buffer! */
+        fclose(fd);
+        free(buffer);
+        return 0;
+      }
+      if (check_feature(buffer, "loongson-mmi"))
+        simd_support |= JSIMD_MMI;
+    }
+    fclose(fd);
+  }
+  free(buffer);
+  return 1;
+}
+
+#endif
+
 /*
  * Check what SIMD accelerations are supported.
  *
@@ -37,14 +105,32 @@ init_simd(void)
 #ifndef NO_GETENV
   char *env = NULL;
 #endif
+#if defined(__linux__)
+  int bufsize = 1024; /* an initial guess for the line buffer size limit */
+#endif
 
   if (simd_support != ~0U)
     return;
 
+  simd_support = 0;
+
+#if defined(__linux__)
+  while (!parse_proc_cpuinfo(bufsize)) {
+    bufsize *= 2;
+    if (bufsize > SOMEWHAT_SANE_PROC_CPUINFO_SIZE_LIMIT)
+      break;
+  }
+#elif defined(__mips_loongson_vector_rev)
+  /* Only enable MMI by default on non-Linux platforms when the compiler flags
+   * support it. */
   simd_support |= JSIMD_MMI;
+#endif
 
 #ifndef NO_GETENV
   /* Force different settings through environment variables */
+  env = getenv("JSIMD_FORCEMMI");
+  if ((env != NULL) && (strcmp(env, "1") == 0))
+    simd_support = JSIMD_MMI;
   env = getenv("JSIMD_FORCENONE");
   if ((env != NULL) && (strcmp(env, "1") == 0))
     simd_support = 0;
@@ -73,6 +159,19 @@ jsimd_can_rgb_ycc(void)
 GLOBAL(int)
 jsimd_can_rgb_gray(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if ((RGB_PIXELSIZE != 3) && (RGB_PIXELSIZE != 4))
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -150,6 +249,37 @@ jsimd_rgb_gray_convert(j_compress_ptr cinfo, JSAMPARRAY input_buf,
                        JSAMPIMAGE output_buf, JDIMENSION output_row,
                        int num_rows)
 {
+  void (*mmifct) (JDIMENSION, JSAMPARRAY, JSAMPIMAGE, JDIMENSION, int);
+
+  switch (cinfo->in_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_extrgb_gray_convert_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_extrgbx_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_extbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_extbgrx_gray_convert_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_extxbgr_gray_convert_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_extxrgb_gray_convert_mmi;
+    break;
+  default:
+    mmifct = jsimd_rgb_gray_convert_mmi;
+    break;
+  }
+
+  mmifct(cinfo->image_width, input_buf, output_buf, output_row, num_rows);
 }
 
 GLOBAL(void)
@@ -311,6 +441,17 @@ jsimd_can_h2v2_fancy_upsample(void)
 GLOBAL(int)
 jsimd_can_h2v1_fancy_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -327,17 +468,42 @@ GLOBAL(void)
 jsimd_h2v1_fancy_upsample(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                           JSAMPARRAY input_data, JSAMPARRAY *output_data_ptr)
 {
+  jsimd_h2v1_fancy_upsample_mmi(cinfo->max_v_samp_factor,
+                                compptr->downsampled_width, input_data,
+                                output_data_ptr);
 }
 
 GLOBAL(int)
 jsimd_can_h2v2_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
 GLOBAL(int)
 jsimd_can_h2v1_merged_upsample(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -345,12 +511,74 @@ GLOBAL(void)
 jsimd_h2v2_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v2_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v2_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v2_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v2_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v2_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v2_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v2_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(void)
 jsimd_h2v1_merged_upsample(j_decompress_ptr cinfo, JSAMPIMAGE input_buf,
                            JDIMENSION in_row_group_ctr, JSAMPARRAY output_buf)
 {
+  void (*mmifct) (JDIMENSION, JSAMPIMAGE, JDIMENSION, JSAMPARRAY);
+
+  switch (cinfo->out_color_space) {
+  case JCS_EXT_RGB:
+    mmifct = jsimd_h2v1_extrgb_merged_upsample_mmi;
+    break;
+  case JCS_EXT_RGBX:
+  case JCS_EXT_RGBA:
+    mmifct = jsimd_h2v1_extrgbx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGR:
+    mmifct = jsimd_h2v1_extbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_BGRX:
+  case JCS_EXT_BGRA:
+    mmifct = jsimd_h2v1_extbgrx_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XBGR:
+  case JCS_EXT_ABGR:
+    mmifct = jsimd_h2v1_extxbgr_merged_upsample_mmi;
+    break;
+  case JCS_EXT_XRGB:
+  case JCS_EXT_ARGB:
+    mmifct = jsimd_h2v1_extxrgb_merged_upsample_mmi;
+    break;
+  default:
+    mmifct = jsimd_h2v1_merged_upsample_mmi;
+    break;
+  }
+
+  mmifct(cinfo->output_width, input_buf, in_row_group_ctr, output_buf);
 }
 
 GLOBAL(int)
@@ -397,6 +625,17 @@ jsimd_can_fdct_islow(void)
 GLOBAL(int)
 jsimd_can_fdct_ifast(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(DCTELEM) != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -415,6 +654,7 @@ jsimd_fdct_islow(DCTELEM *data)
 GLOBAL(void)
 jsimd_fdct_ifast(DCTELEM *data)
 {
+  jsimd_fdct_ifast_mmi(data);
 }
 
 GLOBAL(void)
@@ -537,6 +777,25 @@ jsimd_can_idct_islow(void)
 GLOBAL(int)
 jsimd_can_idct_ifast(void)
 {
+  init_simd();
+
+  /* The code is optimised for these values only */
+  if (DCTSIZE != 8)
+    return 0;
+  if (sizeof(JCOEF) != 2)
+    return 0;
+  if (BITS_IN_JSAMPLE != 8)
+    return 0;
+  if (sizeof(JDIMENSION) != 4)
+    return 0;
+  if (sizeof(IFAST_MULT_TYPE) != 2)
+    return 0;
+  if (IFAST_SCALE_BITS != 2)
+    return 0;
+
+  if (simd_support & JSIMD_MMI)
+    return 1;
+
   return 0;
 }
 
@@ -559,6 +818,7 @@ jsimd_idct_ifast(j_decompress_ptr cinfo, jpeg_component_info *compptr,
                  JCOEFPTR coef_block, JSAMPARRAY output_buf,
                  JDIMENSION output_col)
 {
+  jsimd_idct_ifast_mmi(compptr->dct_table, coef_block, output_buf, output_col);
 }
 
 GLOBAL(void)
diff --git a/external/jpeg/simd/loongson/jsimd_mmi.h b/external/jpeg/simd/mips64/jsimd_mmi.h
similarity index 83%
rename from external/jpeg/simd/loongson/jsimd_mmi.h
rename to external/jpeg/simd/mips64/jsimd_mmi.h
index 59b2ee0b7a1e..5e4261c9d98f 100644
--- a/external/jpeg/simd/loongson/jsimd_mmi.h
+++ b/external/jpeg/simd/mips64/jsimd_mmi.h
@@ -1,11 +1,12 @@
 /*
  * Loongson MMI optimizations for libjpeg-turbo
  *
- * Copyright (C) 2016-2017, Loongson Technology Corporation Limited, BeiJing.
+ * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
  *                          All Rights Reserved.
  * Authors:  ZhuChen     <zhuchen@loongson.cn>
  *           CaiWanwei   <caiwanwei@loongson.cn>
  *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
+ *           QingfaLiu   <liuqingfa-hf@loongson.cn>
  *
  * This software is provided 'as-is', without any express or implied
  * warranty.  In no event will the authors be held liable for any damages
@@ -32,6 +33,13 @@
 
 
 /* Common code */
+#if defined(_ABI64) && _MIPS_SIM == _ABI64
+# define PTR_ADDU  "daddu "
+# define PTR_SLL   "dsll "
+#else
+# define PTR_ADDU  "addu "
+# define PTR_SLL   "sll "
+#endif
 
 #define SIZEOF_MMWORD  8
 #define BYTE_BIT  8
@@ -47,11 +55,13 @@
    ((uint64_t)(uint8_t)f << 16) | \
    ((uint64_t)(uint8_t)g << 8)  | \
    ((uint64_t)(uint8_t)h))
+#define _uint64_set1_pi8(a)  _uint64_set_pi8(a, a, a, a, a, a, a, a)
 #define _uint64_set_pi16(a, b, c, d) \
   (((uint64_t)(uint16_t)a << 48) | \
    ((uint64_t)(uint16_t)b << 32) | \
    ((uint64_t)(uint16_t)c << 16) | \
    ((uint64_t)(uint16_t)d))
+#define _uint64_set1_pi16(a)  _uint64_set_pi16(a, a, a, a)
 #define _uint64_set_pi32(a, b) \
   (((uint64_t)(uint32_t)a << 32) | \
    ((uint64_t)(uint32_t)b))
diff --git a/external/jpeg/simd/loongson/loongson-mmintrin.h b/external/jpeg/simd/mips64/loongson-mmintrin.h
similarity index 98%
rename from external/jpeg/simd/loongson/loongson-mmintrin.h
rename to external/jpeg/simd/mips64/loongson-mmintrin.h
index 50d166b75325..db9b35ab6064 100644
--- a/external/jpeg/simd/loongson/loongson-mmintrin.h
+++ b/external/jpeg/simd/mips64/loongson-mmintrin.h
@@ -1217,14 +1217,24 @@ _mm_store_pi32(__m32 *dest, __m64 src)
 extern __inline void FUNCTION_ATTRIBS
 _mm_store_si64(__m64 *dest, __m64 src)
 {
-  asm("gssdlc1 %1, 7+%0\n\t"
-      "gssdrc1 %1, %0\n\t"
+  asm("sdc1 %1, %0 \n\t"
       : "=m" (*dest)
       : "f" (src)
       : "memory"
      );
 }
 
+extern __inline void FUNCTION_ATTRIBS
+_mm_storeu_si64(__m64 *dest, __m64 src)  /* presumably the unaligned store */
+{
+  asm("gssdlc1 %1, 7(%0) \n\t"  /* addressed at byte offset 7 from dest */
+      "gssdrc1 %1, 0(%0) \n\t"  /* addressed at byte offset 0 from dest */
+      :                         /* no output operand; the write is declared */
+      : "r" (dest), "f" (src)   /* through the "memory" clobber below */
+      : "memory"
+     );
+}
+
 extern __inline __m64 FUNCTION_ATTRIBS
 _mm_load_si32(const __m32 *src)
 {
diff --git a/external/jpeg/simd/nasm/jpeg_nbits_table.inc b/external/jpeg/simd/nasm/jpeg_nbits_table.inc
deleted file mode 100644
index 2ce6c284d9fa..000000000000
--- a/external/jpeg/simd/nasm/jpeg_nbits_table.inc
+++ /dev/null
@@ -1,4097 +0,0 @@
-jpeg_nbits_table db \
-   0,  1,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  4,  4,  4,  4, \
-   5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, \
-   6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-   9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, \
-  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
diff --git a/external/jpeg/simd/nasm/jsimdext.inc b/external/jpeg/simd/nasm/jsimdext.inc
index 9930d80c2ab3..e8d50b034973 100644
--- a/external/jpeg/simd/nasm/jsimdext.inc
+++ b/external/jpeg/simd/nasm/jsimdext.inc
@@ -2,8 +2,9 @@
 ; jsimdext.inc - common declarations
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright (C) 2010, 2016, 2019, D. R. Commander.
+; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
 ; Copyright (C) 2018, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
 ;
@@ -130,13 +131,53 @@ section .note.GNU-stack noalloc noexec nowrite progbits
 ;  Common types
 ;
 %ifdef __x86_64__
+%ifnidn __OUTPUT_FORMAT__, elfx32
 %define POINTER         qword           ; general pointer type
 %define SIZEOF_POINTER  SIZEOF_QWORD    ; sizeof(POINTER)
 %define POINTER_BIT     QWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
-%else
+%define resp            resq
+%define dp              dq
+%define raxp            rax
+%define rbxp            rbx
+%define rcxp            rcx
+%define rdxp            rdx
+%define rsip            rsi
+%define rdip            rdi
+%define rbpp            rbp
+%define rspp            rsp
+%define r8p             r8
+%define r9p             r9
+%define r10p            r10
+%define r11p            r11
+%define r12p            r12
+%define r13p            r13
+%define r14p            r14
+%define r15p            r15
+%endif
+%endif
+%ifndef raxp
 %define POINTER         dword           ; general pointer type
 %define SIZEOF_POINTER  SIZEOF_DWORD    ; sizeof(POINTER)
 %define POINTER_BIT     DWORD_BIT       ; sizeof(POINTER)*BYTE_BIT
+%define resp            resd
+%define dp              dd
+; x86_64 ILP32 ABI (x32)
+%define raxp            eax
+%define rbxp            ebx
+%define rcxp            ecx
+%define rdxp            edx
+%define rsip            esi
+%define rdip            edi
+%define rbpp            ebp
+%define rspp            esp
+%define r8p             r8d
+%define r9p             r9d
+%define r10p            r10d
+%define r11p            r11d
+%define r12p            r12d
+%define r13p            r13d
+%define r14p            r14d
+%define r15p            r15d
 %endif
 
 %define INT             dword           ; signed integer type
diff --git a/external/jpeg/simd/powerpc/jcsample.h b/external/jpeg/simd/powerpc/jcsample.h
index 2ac48167fc20..bd07fcc4ed4a 100644
--- a/external/jpeg/simd/powerpc/jcsample.h
+++ b/external/jpeg/simd/powerpc/jcsample.h
@@ -20,7 +20,7 @@ expand_right_edge(JSAMPARRAY image_data, int num_rows, JDIMENSION input_cols,
   if (numcols > 0) {
     for (row = 0; row < num_rows; row++) {
       ptr = image_data[row] + input_cols;
-      pixval = ptr[-1];         /* don't need GETJSAMPLE() here */
+      pixval = ptr[-1];
       for (count = numcols; count > 0; count--)
         *ptr++ = pixval;
     }
diff --git a/external/jpeg/simd/x86_64/jccolext-avx2.asm b/external/jpeg/simd/x86_64/jccolext-avx2.asm
index 10d28348a96f..ffb527db00e1 100644
--- a/external/jpeg/simd/x86_64/jccolext-avx2.asm
+++ b/external/jpeg/simd/x86_64/jccolext-avx2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright (C) 2009, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,9 +58,9 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
 
     mov         rsi, r12
     mov         ecx, r13d
-    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -77,10 +78,10 @@ EXTN(jsimd_rgb_ycc_convert_avx2):
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr0
-    mov         rbx, JSAMPROW [rbx]     ; outptr1
-    mov         rdx, JSAMPROW [rdx]     ; outptr2
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
 
     cmp         rcx, byte SIZEOF_YMMWORD
     jae         near .columnloop
diff --git a/external/jpeg/simd/x86_64/jccolext-sse2.asm b/external/jpeg/simd/x86_64/jccolext-sse2.asm
index 2c914d318382..af70ed6010f6 100644
--- a/external/jpeg/simd/x86_64/jccolext-sse2.asm
+++ b/external/jpeg/simd/x86_64/jccolext-sse2.asm
@@ -2,6 +2,7 @@
 ; jccolext.asm - colorspace conversion (64-bit SSE2)
 ;
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,9 +57,9 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
 
     mov         rsi, r12
     mov         ecx, r13d
-    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -76,10 +77,10 @@ EXTN(jsimd_rgb_ycc_convert_sse2):
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr0
-    mov         rbx, JSAMPROW [rbx]     ; outptr1
-    mov         rdx, JSAMPROW [rdx]     ; outptr2
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
+    mov         rbxp, JSAMPROW [rbx]    ; outptr1
+    mov         rdxp, JSAMPROW [rdx]    ; outptr2
 
     cmp         rcx, byte SIZEOF_XMMWORD
     jae         near .columnloop
diff --git a/external/jpeg/simd/x86_64/jcgryext-avx2.asm b/external/jpeg/simd/x86_64/jcgryext-avx2.asm
index 175b60de613e..ddcc2c0a2fe4 100644
--- a/external/jpeg/simd/x86_64/jcgryext-avx2.asm
+++ b/external/jpeg/simd/x86_64/jcgryext-avx2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright (C) 2011, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,7 +58,7 @@ EXTN(jsimd_rgb_gray_convert_avx2):
 
     mov         rsi, r12
     mov         ecx, r13d
-    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
 
     pop         rcx
@@ -71,8 +72,8 @@ EXTN(jsimd_rgb_gray_convert_avx2):
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr0
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
 
     cmp         rcx, byte SIZEOF_YMMWORD
     jae         near .columnloop
diff --git a/external/jpeg/simd/x86_64/jcgryext-sse2.asm b/external/jpeg/simd/x86_64/jcgryext-sse2.asm
index 873be80564a6..f1d399a63b85 100644
--- a/external/jpeg/simd/x86_64/jcgryext-sse2.asm
+++ b/external/jpeg/simd/x86_64/jcgryext-sse2.asm
@@ -2,6 +2,7 @@
 ; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
 ;
 ; Copyright (C) 2011, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -56,7 +57,7 @@ EXTN(jsimd_rgb_gray_convert_sse2):
 
     mov         rsi, r12
     mov         ecx, r13d
-    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
+    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
     lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
 
     pop         rcx
@@ -70,8 +71,8 @@ EXTN(jsimd_rgb_gray_convert_sse2):
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr0
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr0
 
     cmp         rcx, byte SIZEOF_XMMWORD
     jae         near .columnloop
diff --git a/external/jpeg/simd/x86_64/jchuff-sse2.asm b/external/jpeg/simd/x86_64/jchuff-sse2.asm
index aa78fd5cd5e0..00720283ec59 100644
--- a/external/jpeg/simd/x86_64/jchuff-sse2.asm
+++ b/external/jpeg/simd/x86_64/jchuff-sse2.asm
@@ -1,8 +1,9 @@
 ;
 ; jchuff-sse2.asm - Huffman entropy encoding (64-bit SSE2)
 ;
-; Copyright (C) 2009-2011, 2014-2016, D. R. Commander.
+; Copyright (C) 2009-2011, 2014-2016, 2019, D. R. Commander.
 ; Copyright (C) 2015, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -15,11 +16,25 @@
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; This file contains an SSE2 implementation for Huffman coding of one block.
-; The following code is based directly on jchuff.c; see jchuff.c for more
-; details.
+; The following code is based on jchuff.c; see jchuff.c for more details.
 
 %include "jsimdext.inc"
 
+struc working_state
+.next_output_byte:   resp 1     ; pointer-sized: => next byte to write in buffer
+.free_in_buffer:     resp 1     ; pointer-sized: # of byte spaces remaining in buffer
+.cur.put_buffer.simd resq 1     ; current bit accumulation buffer (one qword)
+.cur.free_bits       resd 1     ; # of free bits available in put_buffer
+.cur.last_dc_val     resd 4     ; last DC coef for each component (4 dwords)
+.cinfo:              resp 1     ; pointer-sized: dump_buffer needs access to this
+endstruc
+
+struc c_derived_tbl
+.ehufco:             resd 256   ; code for each symbol (one dword per symbol)
+.ehufsi:             resb 256   ; length of code for each symbol (one byte each)
+; If no code has been allocated for a symbol S, ehufsi[S] contains 0
+endstruc
+
 ; --------------------------------------------------------------------------
     SECTION     SEG_CONST
 
@@ -28,134 +43,137 @@
 
 EXTN(jconst_huff_encode_one_block):
 
-%include "jpeg_nbits_table.inc"
+jpeg_mask_bits dd 0x0000, 0x0001, 0x0003, 0x0007
+               dd 0x000f, 0x001f, 0x003f, 0x007f
+               dd 0x00ff, 0x01ff, 0x03ff, 0x07ff
+               dd 0x0fff, 0x1fff, 0x3fff, 0x7fff
 
     alignz      32
 
-; --------------------------------------------------------------------------
-    SECTION     SEG_TEXT
-    BITS        64
-
-; These macros perform the same task as the emit_bits() function in the
-; original libjpeg code.  In addition to reducing overhead by explicitly
-; inlining the code, additional performance is achieved by taking into
-; account the size of the bit buffer and waiting until it is almost full
-; before emptying it.  This mostly benefits 64-bit platforms, since 6
-; bytes can be stored in a 64-bit bit buffer before it has to be emptied.
-
-%macro EMIT_BYTE 0
-    sub         put_bits, 8             ; put_bits -= 8;
-    mov         rdx, put_buffer
-    mov         ecx, put_bits
-    shr         rdx, cl                 ; c = (JOCTET)GETJOCTET(put_buffer >> put_bits);
-    mov         byte [buffer], dl       ; *buffer++ = c;
-    add         buffer, 1
-    cmp         dl, 0xFF                ; need to stuff a zero byte?
-    jne         %%.EMIT_BYTE_END
-    mov         byte [buffer], 0        ; *buffer++ = 0;
-    add         buffer, 1
-%%.EMIT_BYTE_END:
-%endmacro
+times 1 << 14 db 15
+times 1 << 13 db 14
+times 1 << 12 db 13
+times 1 << 11 db 12
+times 1 << 10 db 11
+times 1 <<  9 db 10
+times 1 <<  8 db  9
+times 1 <<  7 db  8
+times 1 <<  6 db  7
+times 1 <<  5 db  6
+times 1 <<  4 db  5
+times 1 <<  3 db  4
+times 1 <<  2 db  3
+times 1 <<  1 db  2
+times 1 <<  0 db  1
+times 1       db  0
+jpeg_nbits_table:
+times 1       db  0
+times 1 <<  0 db  1
+times 1 <<  1 db  2
+times 1 <<  2 db  3
+times 1 <<  3 db  4
+times 1 <<  4 db  5
+times 1 <<  5 db  6
+times 1 <<  6 db  7
+times 1 <<  7 db  8
+times 1 <<  8 db  9
+times 1 <<  9 db 10
+times 1 << 10 db 11
+times 1 << 11 db 12
+times 1 << 12 db 13
+times 1 << 13 db 14
+times 1 << 14 db 15
 
-%macro PUT_BITS 1
-    add         put_bits, ecx           ; put_bits += size;
-    shl         put_buffer, cl          ; put_buffer = (put_buffer << size);
-    or          put_buffer, %1
-%endmacro
+    alignz      32
 
-%macro CHECKBUF31 0
-    cmp         put_bits, 32            ; if (put_bits > 31) {
-    jl          %%.CHECKBUF31_END
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-%%.CHECKBUF31_END:
-%endmacro
+%define NBITS(x)      nbits_base + x
+%define MASK_BITS(x)  NBITS((x) * 4) + (jpeg_mask_bits - jpeg_nbits_table)
 
-%macro CHECKBUF47 0
-    cmp         put_bits, 48            ; if (put_bits > 47) {
-    jl          %%.CHECKBUF47_END
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-    EMIT_BYTE
-%%.CHECKBUF47_END:
-%endmacro
+; --------------------------------------------------------------------------
+    SECTION     SEG_TEXT
+    BITS        64
 
-%macro EMIT_BITS 2
-    CHECKBUF47
-    mov         ecx, %2
-    PUT_BITS    %1
-%endmacro
+; Shorthand used to describe SIMD operations:
+; wN:  xmmN treated as eight signed 16-bit values
+; wN[i]:  perform the same operation on all eight signed 16-bit values, i=0..7
+; bN:  xmmN treated as 16 unsigned 8-bit values
+; bN[i]:  perform the same operation on all 16 unsigned 8-bit values, i=0..15
+; Contents of SIMD registers are shown in memory order.
 
-%macro kloop_prepare 37                 ;(ko, jno0, ..., jno31, xmm0, xmm1, xmm2, xmm3)
-    pxor        xmm8, xmm8              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm9, xmm9              ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm10, xmm10            ; __m128i neg = _mm_setzero_si128();
-    pxor        xmm11, xmm11            ; __m128i neg = _mm_setzero_si128();
-    pinsrw      %34, word [r12 + %2  * SIZEOF_WORD], 0  ; xmm_shadow[0] = block[jno0];
-    pinsrw      %35, word [r12 + %10 * SIZEOF_WORD], 0  ; xmm_shadow[8] = block[jno8];
-    pinsrw      %36, word [r12 + %18 * SIZEOF_WORD], 0  ; xmm_shadow[16] = block[jno16];
-    pinsrw      %37, word [r12 + %26 * SIZEOF_WORD], 0  ; xmm_shadow[24] = block[jno24];
-    pinsrw      %34, word [r12 + %3  * SIZEOF_WORD], 1  ; xmm_shadow[1] = block[jno1];
-    pinsrw      %35, word [r12 + %11 * SIZEOF_WORD], 1  ; xmm_shadow[9] = block[jno9];
-    pinsrw      %36, word [r12 + %19 * SIZEOF_WORD], 1  ; xmm_shadow[17] = block[jno17];
-    pinsrw      %37, word [r12 + %27 * SIZEOF_WORD], 1  ; xmm_shadow[25] = block[jno25];
-    pinsrw      %34, word [r12 + %4  * SIZEOF_WORD], 2  ; xmm_shadow[2] = block[jno2];
-    pinsrw      %35, word [r12 + %12 * SIZEOF_WORD], 2  ; xmm_shadow[10] = block[jno10];
-    pinsrw      %36, word [r12 + %20 * SIZEOF_WORD], 2  ; xmm_shadow[18] = block[jno18];
-    pinsrw      %37, word [r12 + %28 * SIZEOF_WORD], 2  ; xmm_shadow[26] = block[jno26];
-    pinsrw      %34, word [r12 + %5  * SIZEOF_WORD], 3  ; xmm_shadow[3] = block[jno3];
-    pinsrw      %35, word [r12 + %13 * SIZEOF_WORD], 3  ; xmm_shadow[11] = block[jno11];
-    pinsrw      %36, word [r12 + %21 * SIZEOF_WORD], 3  ; xmm_shadow[19] = block[jno19];
-    pinsrw      %37, word [r12 + %29 * SIZEOF_WORD], 3  ; xmm_shadow[27] = block[jno27];
-    pinsrw      %34, word [r12 + %6  * SIZEOF_WORD], 4  ; xmm_shadow[4] = block[jno4];
-    pinsrw      %35, word [r12 + %14 * SIZEOF_WORD], 4  ; xmm_shadow[12] = block[jno12];
-    pinsrw      %36, word [r12 + %22 * SIZEOF_WORD], 4  ; xmm_shadow[20] = block[jno20];
-    pinsrw      %37, word [r12 + %30 * SIZEOF_WORD], 4  ; xmm_shadow[28] = block[jno28];
-    pinsrw      %34, word [r12 + %7  * SIZEOF_WORD], 5  ; xmm_shadow[5] = block[jno5];
-    pinsrw      %35, word [r12 + %15 * SIZEOF_WORD], 5  ; xmm_shadow[13] = block[jno13];
-    pinsrw      %36, word [r12 + %23 * SIZEOF_WORD], 5  ; xmm_shadow[21] = block[jno21];
-    pinsrw      %37, word [r12 + %31 * SIZEOF_WORD], 5  ; xmm_shadow[29] = block[jno29];
-    pinsrw      %34, word [r12 + %8  * SIZEOF_WORD], 6  ; xmm_shadow[6] = block[jno6];
-    pinsrw      %35, word [r12 + %16 * SIZEOF_WORD], 6  ; xmm_shadow[14] = block[jno14];
-    pinsrw      %36, word [r12 + %24 * SIZEOF_WORD], 6  ; xmm_shadow[22] = block[jno22];
-    pinsrw      %37, word [r12 + %32 * SIZEOF_WORD], 6  ; xmm_shadow[30] = block[jno30];
-    pinsrw      %34, word [r12 + %9  * SIZEOF_WORD], 7  ; xmm_shadow[7] = block[jno7];
-    pinsrw      %35, word [r12 + %17 * SIZEOF_WORD], 7  ; xmm_shadow[15] = block[jno15];
-    pinsrw      %36, word [r12 + %25 * SIZEOF_WORD], 7  ; xmm_shadow[23] = block[jno23];
-%if %1 != 32
-    pinsrw      %37, word [r12 + %33 * SIZEOF_WORD], 7  ; xmm_shadow[31] = block[jno31];
-%else
-    pinsrw      %37, ebx, 7             ; xmm_shadow[31] = block[jno31];
-%endif
-    pcmpgtw     xmm8, %34               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm9, %35               ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm10, %36              ; neg = _mm_cmpgt_epi16(neg, x1);
-    pcmpgtw     xmm11, %37              ; neg = _mm_cmpgt_epi16(neg, x1);
-    paddw       %34, xmm8               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %35, xmm9               ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %36, xmm10              ; x1 = _mm_add_epi16(x1, neg);
-    paddw       %37, xmm11              ; x1 = _mm_add_epi16(x1, neg);
-    pxor        %34, xmm8               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %35, xmm9               ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %36, xmm10              ; x1 = _mm_xor_si128(x1, neg);
-    pxor        %37, xmm11              ; x1 = _mm_xor_si128(x1, neg);
-    pxor        xmm8, %34               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm9, %35               ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm10, %36              ; neg = _mm_xor_si128(neg, x1);
-    pxor        xmm11, %37              ; neg = _mm_xor_si128(neg, x1);
-    movdqa      XMMWORD [t1 + %1 * SIZEOF_WORD], %34           ; _mm_storeu_si128((__m128i *)(t1 + ko), x1);
-    movdqa      XMMWORD [t1 + (%1 + 8) * SIZEOF_WORD], %35     ; _mm_storeu_si128((__m128i *)(t1 + ko + 8), x1);
-    movdqa      XMMWORD [t1 + (%1 + 16) * SIZEOF_WORD], %36    ; _mm_storeu_si128((__m128i *)(t1 + ko + 16), x1);
-    movdqa      XMMWORD [t1 + (%1 + 24) * SIZEOF_WORD], %37    ; _mm_storeu_si128((__m128i *)(t1 + ko + 24), x1);
-    movdqa      XMMWORD [t2 + %1 * SIZEOF_WORD], xmm8          ; _mm_storeu_si128((__m128i *)(t2 + ko), neg);
-    movdqa      XMMWORD [t2 + (%1 + 8) * SIZEOF_WORD], xmm9    ; _mm_storeu_si128((__m128i *)(t2 + ko + 8), neg);
-    movdqa      XMMWORD [t2 + (%1 + 16) * SIZEOF_WORD], xmm10  ; _mm_storeu_si128((__m128i *)(t2 + ko + 16), neg);
-    movdqa      XMMWORD [t2 + (%1 + 24) * SIZEOF_WORD], xmm11  ; _mm_storeu_si128((__m128i *)(t2 + ko + 24), neg);
+; Fill the bit buffer to capacity with the leading bits from code, then output
+; the bit buffer and put the remaining bits from code into the bit buffer.
+;
+; Usage:
+; code - contains the bits to shift into the bit buffer (LSB-aligned)
+; %1 - the label to which to jump when the macro completes
+; %2 (optional) - extra instructions to execute after nbits has been set
+;
+; Upon completion, free_bits will be 64 minus the number of remaining bits
+; from code (i.e. its value on entry plus 64), and put_buffer will contain
+; those remaining bits.  temp, code, nbits, and xmm0 will be clobbered.
+;
+; This macro encodes any 0xFF bytes as 0xFF 0x00, as does the EMIT_BYTE()
+; macro in jchuff.c.
+
+%macro EMIT_QWORD 1-2
+    add         nbitsb, free_bitsb      ; nbits += free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    mov         tempd, code             ; temp = code;
+    shl         put_buffer, nbitsb      ; put_buffer <<= nbits;
+    mov         nbitsb, free_bitsb      ; nbits = free_bits;
+    neg         free_bitsb              ; free_bits = -free_bits;
+    shr         tempd, nbitsb           ; temp >>= nbits;
+    or          tempq, put_buffer       ; temp |= put_buffer;
+    movq        xmm0, tempq             ; xmm0.u64 = { temp, 0 };
+    bswap       tempq                   ; temp = bswap64(temp);  (reverse bytes)
+    mov         put_buffer, codeq       ; put_buffer = code;
+    pcmpeqb     xmm0, xmm1              ; b0[i] = (b0[i] == 0xFF ? 0xFF : 0);
+    %2
+    pmovmskb    code, xmm0              ; code = 0;  code |= ((b0[i] >> 7) << i);
+    mov         qword [buffer], tempq   ; memcpy(buffer, &temp, 8);
+                                        ; (speculative; will be overwritten if
+                                        ; the qword contains any 0xFF bytes)
+    add         free_bitsb, 64          ; free_bits += 64;
+    add         bufferp, 8              ; buffer += 8;
+    test        code, code              ; if (code == 0)  /* No 0xFF bytes */
+    jz          %1                      ;   return;
+    ; Execute the equivalent of the EMIT_BYTE() macro in jchuff.c for all 8
+    ; bytes in the qword.
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer-7], 0      ; buffer[-7] = 0;
+    sbb         bufferp, 6              ; buffer -= (6 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempq, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    shr         tempd, 16               ; temp >>= 16;
+    mov         byte [buffer], tempb    ; buffer[0] = temp[0];
+    cmp         tempb, 0xFF             ; Set CF if temp[0] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[0] < 0xFF ? 1 : 0));
+    mov         byte [buffer], temph    ; buffer[0] = temp[1];
+    cmp         temph, 0xFF             ; Set CF if temp[1] < 0xFF
+    mov         byte [buffer+1], 0      ; buffer[1] = 0;
+    sbb         bufferp, -2             ; buffer -= (-2 + (temp[1] < 0xFF ? 1 : 0));
+    jmp         %1                      ; return;
+%endmacro
 
 ;
@@ -166,181 +184,399 @@ EXTN(jconst_huff_encode_one_block):
 ;                                  JCOEFPTR block, int last_dc_val,
 ;                                  c_derived_tbl *dctbl, c_derived_tbl *actbl)
 ;
-
-; r10 = working_state *state
-; r11 = JOCTET *buffer
-; r12 = JCOEFPTR block
-; r13d = int last_dc_val
-; r14 = c_derived_tbl *dctbl
-; r15 = c_derived_tbl *actbl
-
-%define t1          rbp - (DCTSIZE2 * SIZEOF_WORD)
-%define t2          t1 - (DCTSIZE2 * SIZEOF_WORD)
-%define put_buffer  r8
-%define put_bits    r9d
-%define buffer      rax
+; NOTES:
+; When shuffling data, we try to avoid pinsrw as much as possible, since it is
+; slow on many CPUs.  Its reciprocal throughput (issue latency) is 1 even on
+; modern CPUs, so chains of pinsrw instructions (even with different outputs)
+; can limit performance.  pinsrw is a VectorPath instruction on AMD K8 and
+; requires 2 µops (with memory operand) on Intel.  In either case, only one
+; pinsrw instruction can be decoded per cycle (and nothing else if they are
+; back-to-back), so out-of-order execution cannot be used to work around long
+; pinsrw chains (though for Sandy Bridge and later, this may be less of a
+; problem if the code runs from the µop cache.)
+;
+; We use tzcnt instead of bsf without checking for support.  The instruction is
+; executed as bsf on CPUs that don't support tzcnt (encoding is equivalent to
+; rep bsf.)  The destination (first) operand of bsf (and tzcnt on some CPUs) is
+; an input dependency (although the behavior is not formally defined, Intel
+; CPUs usually leave the destination unmodified if the source is zero.)  This
+; can prevent out-of-order execution, so we clear the destination before
+; invoking tzcnt.
+;
+; Initial register allocation
+; rax - buffer
+; rbx - temp
+; rcx - nbits
+; rdx - block --> free_bits
+; rsi - nbits_base
+; rdi - t
+; rbp - code
+; r8  - dctbl --> code_temp
+; r9  - actbl
+; r10 - state
+; r11 - index
+; r12 - put_buffer
+
+%define buffer       rax
+%ifdef WIN64
+%define bufferp      rax
+%else
+%define bufferp      raxp
+%endif
+%define tempq        rbx
+%define tempd        ebx
+%define tempb        bl
+%define temph        bh
+%define nbitsq       rcx
+%define nbits        ecx
+%define nbitsb       cl
+%define block        rdx
+%define nbits_base   rsi
+%define t            rdi
+%define td           edi
+%define codeq        rbp
+%define code         ebp
+%define dctbl        r8
+%define actbl        r9
+%define state        r10
+%define index        r11
+%define indexd       r11d
+%define put_buffer   r12
+%define put_bufferd  r12d
+
+; Step 1: Re-arrange input data according to jpeg_natural_order
+; xx 01 02 03 04 05 06 07      xx 01 08 16 09 02 03 10
+; 08 09 10 11 12 13 14 15      17 24 32 25 18 11 04 05
+; 16 17 18 19 20 21 22 23      12 19 26 33 40 48 41 34
+; 24 25 26 27 28 29 30 31 ==>  27 20 13 06 07 14 21 28
+; 32 33 34 35 36 37 38 39      35 42 49 56 57 50 43 36
+; 40 41 42 43 44 45 46 47      29 22 15 23 30 37 44 51
+; 48 49 50 51 52 53 54 55      58 59 52 45 38 31 39 46
+; 56 57 58 59 60 61 62 63      53 60 61 54 47 55 62 63
 
     align       32
     GLOBAL_FUNCTION(jsimd_huff_encode_one_block_sse2)
 
 EXTN(jsimd_huff_encode_one_block_sse2):
+
+%ifdef WIN64
+
+; rcx = working_state *state
+; rdx = JOCTET *buffer
+; r8 = JCOEFPTR block
+; r9 = int last_dc_val
+; [rsp+40] = c_derived_tbl *dctbl  (stack argument; offset at function entry)
+; [rsp+48] = c_derived_tbl *actbl  (stack argument; offset at function entry)
+
+                                                          ;X: X = code stream
+    mov         buffer, rdx
+    mov         block, r8
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
+    push        rbx
     push        rbp
-    mov         rax, rsp                     ; rax = original rbp
-    sub         rsp, byte 4
-    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
-    mov         [rsp], rax
-    mov         rbp, rsp                     ; rbp = aligned rbp
-    lea         rsp, [t2]
-    push_xmm    4
-    collect_args 6
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        rsi
+    push        rdi
+    push        r12
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    mov         state, rcx
+    movsx       code, word [block]                        ;Z:     code = block[0];
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         code, r9d                                 ;Z:     code -= last_dc_val;
+    mov         dctbl, POINTER [rsp+6*8+4*8]
+    mov         actbl, POINTER [rsp+6*8+5*8]
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         nbits_base, [rel jpeg_nbits_table]
+    add         rsp, -DCTSIZE2 * SIZEOF_WORD
+    mov         t, rsp
+
+%else
+
+; rdi = working_state *state
+; rsi = JOCTET *buffer
+; rdx = JCOEFPTR block
+; rcx = int last_dc_val
+; r8 = c_derived_tbl *dctbl
+; r9 = c_derived_tbl *actbl
+
+                                                          ;X: X = code stream
+    movups      xmm3, XMMWORD [block + 0 * SIZEOF_WORD]   ;D: w3 = xx 01 02 03 04 05 06 07
     push        rbx
+    push        rbp
+    movdqa      xmm0, xmm3                                ;A: w0 = xx 01 02 03 04 05 06 07
+    push        r12
+    mov         state, rdi
+    mov         buffer, rsi
+    movups      xmm1, XMMWORD [block + 8 * SIZEOF_WORD]   ;B: w1 = 08 09 10 11 12 13 14 15
+    movsx       codeq, word [block]                       ;Z:     code = block[0];
+    lea         nbits_base, [rel jpeg_nbits_table]
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    sub         codeq, rcx                                ;Z:     code -= last_dc_val;
+    punpckldq   xmm0, xmm1                                ;A: w0 = xx 01 08 09 02 03 10 11
+    lea         t, [rsp - DCTSIZE2 * SIZEOF_WORD]         ;   use red zone for t_
 
-    mov         buffer, r11                  ; r11 is now sratch
-
-    mov         put_buffer, MMWORD [r10+16]  ; put_buffer = state->cur.put_buffer;
-    mov         put_bits,    dword [r10+24]  ; put_bits = state->cur.put_bits;
-    push        r10                          ; r10 is now scratch
-
-    ; Encode the DC coefficient difference per section F.1.2.1
-    movsx       edi, word [r12]         ; temp = temp2 = block[0] - last_dc_val;
-    sub         edi, r13d               ; r13 is not used anymore
-    mov         ebx, edi
-
-    ; This is a well-known technique for obtaining the absolute value
-    ; without a branch.  It is derived from an assembly language technique
-    ; presented in "How to Optimize for the Pentium Processors",
-    ; Copyright (c) 1996, 1997 by Agner Fog.
-    mov         esi, edi
-    sar         esi, 31                 ; temp3 = temp >> (CHAR_BIT * sizeof(int) - 1);
-    xor         edi, esi                ; temp ^= temp3;
-    sub         edi, esi                ; temp -= temp3;
-
-    ; For a negative input, want temp2 = bitwise complement of abs(input)
-    ; This code assumes we are on a two's complement machine
-    add         ebx, esi                ; temp2 += temp3;
-
-    ; Find the number of bits needed for the magnitude of the coefficient
-    lea         r11, [rel jpeg_nbits_table]
-    movzx       rdi, byte [r11 + rdi]         ; nbits = JPEG_NBITS(temp);
-    ; Emit the Huffman-coded symbol for the number of bits
-    mov         r11d,  INT [r14 + rdi * 4]    ; code = dctbl->ehufco[nbits];
-    movzx       esi, byte [r14 + rdi + 1024]  ; size = dctbl->ehufsi[nbits];
-    EMIT_BITS   r11, esi                      ; EMIT_BITS(code, size)
-
-    ; Mask off any extra bits in code
-    mov         esi, 1
-    mov         ecx, edi
-    shl         esi, cl
-    dec         esi
-    and         ebx, esi                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-
-    ; Emit that number of bits of the value, if positive,
-    ; or the complement of its magnitude, if negative.
-    EMIT_BITS   rbx, edi                ; EMIT_BITS(temp2, nbits)
-
-    ; Prepare data
-    xor         ebx, ebx
-    kloop_prepare  0,  1,  8,  16, 9,  2,  3,  10, 17, 24, 32, 25, \
-                   18, 11, 4,  5,  12, 19, 26, 33, 40, 48, 41, 34, \
-                   27, 20, 13, 6,  7,  14, 21, 28, 35, \
-                   xmm0, xmm1, xmm2, xmm3
-    kloop_prepare  32, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, \
-                   30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, \
-                   53, 60, 61, 54, 47, 55, 62, 63, 63, \
-                   xmm4, xmm5, xmm6, xmm7
-
-    pxor        xmm8, xmm8
-    pcmpeqw     xmm0, xmm8              ; tmp0 = _mm_cmpeq_epi16(tmp0, zero);
-    pcmpeqw     xmm1, xmm8              ; tmp1 = _mm_cmpeq_epi16(tmp1, zero);
-    pcmpeqw     xmm2, xmm8              ; tmp2 = _mm_cmpeq_epi16(tmp2, zero);
-    pcmpeqw     xmm3, xmm8              ; tmp3 = _mm_cmpeq_epi16(tmp3, zero);
-    pcmpeqw     xmm4, xmm8              ; tmp4 = _mm_cmpeq_epi16(tmp4, zero);
-    pcmpeqw     xmm5, xmm8              ; tmp5 = _mm_cmpeq_epi16(tmp5, zero);
-    pcmpeqw     xmm6, xmm8              ; tmp6 = _mm_cmpeq_epi16(tmp6, zero);
-    pcmpeqw     xmm7, xmm8              ; tmp7 = _mm_cmpeq_epi16(tmp7, zero);
-    packsswb    xmm0, xmm1              ; tmp0 = _mm_packs_epi16(tmp0, tmp1);
-    packsswb    xmm2, xmm3              ; tmp2 = _mm_packs_epi16(tmp2, tmp3);
-    packsswb    xmm4, xmm5              ; tmp4 = _mm_packs_epi16(tmp4, tmp5);
-    packsswb    xmm6, xmm7              ; tmp6 = _mm_packs_epi16(tmp6, tmp7);
-    pmovmskb    r11d, xmm0              ; index  = ((uint64_t)_mm_movemask_epi8(tmp0)) << 0;
-    pmovmskb    r12d, xmm2              ; index  = ((uint64_t)_mm_movemask_epi8(tmp2)) << 16;
-    pmovmskb    r13d, xmm4              ; index  = ((uint64_t)_mm_movemask_epi8(tmp4)) << 32;
-    pmovmskb    r14d, xmm6              ; index  = ((uint64_t)_mm_movemask_epi8(tmp6)) << 48;
-    shl         r12, 16
-    shl         r14, 16
-    or          r11, r12
-    or          r13, r14
-    shl         r13, 32
-    or          r11, r13
-    not         r11                     ; index = ~index;
-
-    ;mov MMWORD [ t1 + DCTSIZE2 * SIZEOF_WORD ], r11
-    ;jmp .EFN
-
-    mov         r13d,  INT [r15 + 240 * 4]     ; code_0xf0 = actbl->ehufco[0xf0];
-    movzx       r14d, byte [r15 + 1024 + 240]  ; size_0xf0 = actbl->ehufsi[0xf0];
-    lea         rsi, [t1]
-.BLOOP:
-    bsf         r12, r11                     ; r = __builtin_ctzl(index);
-    jz          .ELOOP
-    mov         rcx, r12
-    lea         rsi, [rsi+r12*2]             ; k += r;
-    shr         r11, cl                      ; index >>= r;
-    movzx       rdi, word [rsi]              ; temp = t1[k];
-    lea         rbx, [rel jpeg_nbits_table]
-    movzx       rdi, byte [rbx + rdi]        ; nbits = JPEG_NBITS(temp);
-.BRLOOP:
-    cmp         r12, 16                 ; while (r > 15) {
-    jl          .ERLOOP
-    EMIT_BITS   r13, r14d               ; EMIT_BITS(code_0xf0, size_0xf0)
-    sub         r12, 16                 ; r -= 16;
-    jmp         .BRLOOP
-.ERLOOP:
-    ; Emit Huffman symbol for run length / number of bits
-    CHECKBUF31  ; uses rcx, rdx
-
-    shl         r12, 4                        ; temp3 = (r << 4) + nbits;
-    add         r12, rdi
-    mov         ebx,  INT [r15 + r12 * 4]     ; code = actbl->ehufco[temp3];
-    movzx       ecx, byte [r15 + r12 + 1024]  ; size = actbl->ehufsi[temp3];
-    PUT_BITS    rbx
-
-    ;EMIT_CODE(code, size)
-
-    movsx       ebx, word [rsi-DCTSIZE2*2]    ; temp2 = t2[k];
-    ; Mask off any extra bits in code
-    mov         rcx, rdi
-    mov         rdx, 1
-    shl         rdx, cl
-    dec         rdx
-    and         rbx, rdx                ; temp2 &= (((JLONG)1)<<nbits) - 1;
-    PUT_BITS    rbx                     ; PUT_BITS(temp2, nbits)
-
-    shr         r11, 1                  ; index >>= 1;
-    add         rsi, 2                  ; ++k;
-    jmp         .BLOOP
-.ELOOP:
-    ; If the last coef(s) were zero, emit an end-of-block code
-    lea         rdi, [t1 + (DCTSIZE2-1) * 2]  ; r = DCTSIZE2-1-k;
-    cmp         rdi, rsi                      ; if (r > 0) {
-    je          .EFN
-    mov         ebx,  INT [r15]               ; code = actbl->ehufco[0];
-    movzx       r12d, byte [r15 + 1024]       ; size = actbl->ehufsi[0];
-    EMIT_BITS   rbx, r12d
-.EFN:
-    pop         r10
-    ; Save put_buffer & put_bits
-    mov         MMWORD [r10+16], put_buffer  ; state->cur.put_buffer = put_buffer;
-    mov         dword  [r10+24], put_bits    ; state->cur.put_bits = put_bits;
+%endif
 
+    pshuflw     xmm0, xmm0, 11001001b                     ;A: w0 = 01 08 xx 09 02 03 10 11
+    pinsrw      xmm0, word [block + 16 * SIZEOF_WORD], 2  ;A: w0 = 01 08 16 09 02 03 10 11
+    punpckhdq   xmm3, xmm1                                ;D: w3 = 04 05 12 13 06 07 14 15
+    punpcklqdq  xmm1, xmm3                                ;B: w1 = 08 09 10 11 04 05 12 13
+    pinsrw      xmm0, word [block + 17 * SIZEOF_WORD], 7  ;A: w0 = 01 08 16 09 02 03 10 17
+                                                          ;A:      (Row 0, offset 1)
+    pcmpgtw     xmm4, xmm0                                ;A: w4[i] = (w0[i] < 0 ? -1 : 0);
+    paddw       xmm0, xmm4                                ;A: w0[i] += w4[i];
+    movaps      XMMWORD [t + 0 * SIZEOF_WORD], xmm0       ;A: t[i] = w0[i];
+
+    movq        xmm2, qword [block + 24 * SIZEOF_WORD]    ;B: w2 = 24 25 26 27 -- -- -- --
+    pshuflw     xmm2, xmm2, 11011000b                     ;B: w2 = 24 26 25 27 -- -- -- --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;B: w1 = -- 08 09 10 11 04 05 12
+    movups      xmm5, XMMWORD [block + 48 * SIZEOF_WORD]  ;H: w5 = 48 49 50 51 52 53 54 55
+    movsd       xmm1, xmm2                                ;B: w1 = 24 26 25 27 11 04 05 12
+    punpcklqdq  xmm2, xmm5                                ;C: w2 = 24 26 25 27 48 49 50 51
+    pinsrw      xmm1, word [block + 32 * SIZEOF_WORD], 1  ;B: w1 = 24 32 25 27 11 04 05 12
+    pxor        xmm4, xmm4                                ;A: w4[i] = 0;
+    psrldq      xmm3, 2 * SIZEOF_WORD                     ;D: w3 = 12 13 06 07 14 15 -- --
+    pcmpeqw     xmm0, xmm4                                ;A: w0[i] = (w0[i] == 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 18 * SIZEOF_WORD], 3  ;B: w1 = 24 32 25 18 11 04 05 12
+                                                          ;        (Row 1, offset 1)
+    pcmpgtw     xmm4, xmm1                                ;B: w4[i] = (w1[i] < 0 ? -1 : 0);
+    paddw       xmm1, xmm4                                ;B: w1[i] += w4[i];
+    movaps      XMMWORD [t + 8 * SIZEOF_WORD], xmm1       ;B: t[i+8] = w1[i];
+    pxor        xmm4, xmm4                                ;B: w4[i] = 0;
+    pcmpeqw     xmm1, xmm4                                ;B: w1[i] = (w1[i] == 0 ? -1 : 0);
+
+    packsswb    xmm0, xmm1                                ;AB: b0[i] = w0[i], b0[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    pinsrw      xmm3, word [block + 20 * SIZEOF_WORD], 0  ;D: w3 = 20 13 06 07 14 15 -- --
+    pinsrw      xmm3, word [block + 21 * SIZEOF_WORD], 5  ;D: w3 = 20 13 06 07 14 21 -- --
+    pinsrw      xmm3, word [block + 28 * SIZEOF_WORD], 6  ;D: w3 = 20 13 06 07 14 21 28 --
+    pinsrw      xmm3, word [block + 35 * SIZEOF_WORD], 7  ;D: w3 = 20 13 06 07 14 21 28 35
+                                                          ;        (Row 3, offset 1)
+    pcmpgtw     xmm4, xmm3                                ;D: w4[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm4                                ;D: w3[i] += w4[i];
+    movaps      XMMWORD [t + 24 * SIZEOF_WORD], xmm3      ;D: t[i+24] = w3[i];
+    pxor        xmm4, xmm4                                ;D: w4[i] = 0;
+    pcmpeqw     xmm3, xmm4                                ;D: w3[i] = (w3[i] == 0 ? -1 : 0);
+
+    pinsrw      xmm2, word [block + 19 * SIZEOF_WORD], 0  ;C: w2 = 19 26 25 27 48 49 50 51
+    cmp         code, 1 << 31                             ;Z:     Set CF if code < 0x80000000,
+                                                          ;Z:     i.e. if code is non-negative
+    pinsrw      xmm2, word [block + 33 * SIZEOF_WORD], 2  ;C: w2 = 19 26 33 27 48 49 50 51
+    pinsrw      xmm2, word [block + 40 * SIZEOF_WORD], 3  ;C: w2 = 19 26 33 40 48 49 50 51
+    adc         code, -1                                  ;Z:     code += -1 + (code >= 0 ? 1 : 0);
+    pinsrw      xmm2, word [block + 41 * SIZEOF_WORD], 5  ;C: w2 = 19 26 33 40 48 41 50 51
+    pinsrw      xmm2, word [block + 34 * SIZEOF_WORD], 6  ;C: w2 = 19 26 33 40 48 41 34 51
+    movsxd      codeq, code                               ;Z:     sign extend code
+    pinsrw      xmm2, word [block + 27 * SIZEOF_WORD], 7  ;C: w2 = 19 26 33 40 48 41 34 27
+                                                          ;        (Row 2, offset 1)
+    pcmpgtw     xmm4, xmm2                                ;C: w4[i] = (w2[i] < 0 ? -1 : 0);
+    paddw       xmm2, xmm4                                ;C: w2[i] += w4[i];
+    movaps      XMMWORD [t + 16 * SIZEOF_WORD], xmm2      ;C: t[i+16] = w2[i];
+    pxor        xmm4, xmm4                                ;C: w4[i] = 0;
+    pcmpeqw     xmm2, xmm4                                ;C: w2[i] = (w2[i] == 0 ? -1 : 0);
+
+    packsswb    xmm2, xmm3                                ;CD: b2[i] = w2[i], b2[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    movzx       nbitsq, byte [NBITS(codeq)]               ;Z:     nbits = JPEG_NBITS(code);
+    movdqa      xmm3, xmm5                                ;H: w3 = 48 49 50 51 52 53 54 55
+    pmovmskb    tempd, xmm2                               ;Z:     temp = 0;  temp |= ((b2[i] >> 7) << i);
+    pmovmskb    put_bufferd, xmm0                         ;Z:     put_buffer = 0;  put_buffer |= ((b0[i] >> 7) << i);
+    movups      xmm0, XMMWORD [block + 56 * SIZEOF_WORD]  ;H: w0 = 56 57 58 59 60 61 62 63
+    punpckhdq   xmm3, xmm0                                ;H: w3 = 52 53 60 61 54 55 62 63
+    shl         tempd, 16                                 ;Z:     temp <<= 16;
+    psrldq      xmm3, 1 * SIZEOF_WORD                     ;H: w3 = 53 60 61 54 55 62 63 --
+    pxor        xmm2, xmm2                                ;H: w2[i] = 0;
+    or          put_bufferd, tempd                        ;Z:     put_buffer |= temp;
+    pshuflw     xmm3, xmm3, 00111001b                     ;H: w3 = 60 61 54 53 55 62 63 --
+    movq        xmm1, qword [block + 44 * SIZEOF_WORD]    ;G: w1 = 44 45 46 47 -- -- -- --
+    unpcklps    xmm5, xmm0                                ;E: w5 = 48 49 56 57 50 51 58 59
+    pxor        xmm0, xmm0                                ;H: w0[i] = 0;
+    pinsrw      xmm3, word [block + 47 * SIZEOF_WORD], 3  ;H: w3 = 60 61 54 47 55 62 63 --
+                                                          ;        (Row 7, offset 1)
+    pcmpgtw     xmm2, xmm3                                ;H: w2[i] = (w3[i] < 0 ? -1 : 0);
+    paddw       xmm3, xmm2                                ;H: w3[i] += w2[i];
+    movaps      XMMWORD [t + 56 * SIZEOF_WORD], xmm3      ;H: t[i+56] = w3[i];
+    movq        xmm4, qword [block + 36 * SIZEOF_WORD]    ;G: w4 = 36 37 38 39 -- -- -- --
+    pcmpeqw     xmm3, xmm0                                ;H: w3[i] = (w3[i] == 0 ? -1 : 0);
+    punpckldq   xmm4, xmm1                                ;G: w4 = 36 37 44 45 38 39 46 47
+    mov         tempd, [dctbl + c_derived_tbl.ehufco + nbitsq * 4]
+                                                          ;Z:     temp = dctbl->ehufco[nbits];
+    movdqa      xmm1, xmm4                                ;F: w1 = 36 37 44 45 38 39 46 47
+    psrldq      xmm4, 1 * SIZEOF_WORD                     ;G: w4 = 37 44 45 38 39 46 47 --
+    shufpd      xmm1, xmm5, 10b                           ;F: w1 = 36 37 44 45 50 51 58 59
+    and         code, dword [MASK_BITS(nbitsq)]           ;Z:     code &= (1 << nbits) - 1;
+    pshufhw     xmm4, xmm4, 11010011b                     ;G: w4 = 37 44 45 38 -- 39 46 --
+    pslldq      xmm1, 1 * SIZEOF_WORD                     ;F: w1 = -- 36 37 44 45 50 51 58
+    shl         tempq, nbitsb                             ;Z:     temp <<= nbits;
+    pinsrw      xmm4, word [block + 59 * SIZEOF_WORD], 0  ;G: w4 = 59 44 45 38 -- 39 46 --
+    pshufd      xmm1, xmm1, 11011000b                     ;F: w1 = -- 36 45 50 37 44 51 58
+    pinsrw      xmm4, word [block + 52 * SIZEOF_WORD], 1  ;G: w4 = 59 52 45 38 -- 39 46 --
+    or          code, tempd                               ;Z:     code |= temp;
+    movlps      xmm1, qword [block + 20 * SIZEOF_WORD]    ;F: w1 = 20 21 22 23 37 44 51 58
+    pinsrw      xmm4, word [block + 31 * SIZEOF_WORD], 4  ;G: w4 = 59 52 45 38 31 39 46 --
+    pshuflw     xmm1, xmm1, 01110010b                     ;F: w1 = 22 20 23 21 37 44 51 58
+    pinsrw      xmm4, word [block + 53 * SIZEOF_WORD], 7  ;G: w4 = 59 52 45 38 31 39 46 53
+                                                          ;        (Row 6, offset 1)
+    pxor        xmm2, xmm2                                ;G: w2[i] = 0;
+    pcmpgtw     xmm0, xmm4                                ;G: w0[i] = (w4[i] < 0 ? -1 : 0);
+    pinsrw      xmm1, word [block + 15 * SIZEOF_WORD], 1  ;F: w1 = 22 15 23 21 37 44 51 58
+    paddw       xmm4, xmm0                                ;G: w4[i] += w0[i];
+    movaps      XMMWORD [t + 48 * SIZEOF_WORD], xmm4      ;G: t[48+i] = w4[i];
+    pinsrw      xmm1, word [block + 30 * SIZEOF_WORD], 3  ;F: w1 = 22 15 23 30 37 44 51 58
+                                                          ;        (Row 5, offset 1)
+    pcmpeqw     xmm4, xmm2                                ;G: w4[i] = (w4[i] == 0 ? -1 : 0);
+    pinsrw      xmm5, word [block + 42 * SIZEOF_WORD], 0  ;E: w5 = 42 49 56 57 50 51 58 59
+
+    packsswb    xmm4, xmm3                                ;GH: b4[i] = w4[i], b4[i+8] = w3[i]
+                                                          ;    w/ signed saturation
+
+    pxor        xmm0, xmm0                                ;F: w0[i] = 0;
+    pinsrw      xmm5, word [block + 43 * SIZEOF_WORD], 5  ;E: w5 = 42 49 56 57 50 43 58 59
+    pcmpgtw     xmm2, xmm1                                ;F: w2[i] = (w1[i] < 0 ? -1 : 0);
+    pmovmskb    tempd, xmm4                               ;Z:     temp = 0;  temp |= ((b4[i] >> 7) << i);
+    pinsrw      xmm5, word [block + 36 * SIZEOF_WORD], 6  ;E: w5 = 42 49 56 57 50 43 36 59
+    paddw       xmm1, xmm2                                ;F: w1[i] += w2[i];
+    movaps      XMMWORD [t + 40 * SIZEOF_WORD], xmm1      ;F: t[40+i] = w1[i];
+    pinsrw      xmm5, word [block + 29 * SIZEOF_WORD], 7  ;E: w5 = 42 49 56 57 50 43 36 29
+                                                          ;        (Row 4, offset 1)
+%undef block
+%define free_bitsq  rdx
+%define free_bitsd  edx
+%define free_bitsb  dl
+    pcmpeqw     xmm1, xmm0                                ;F: w1[i] = (w1[i] == 0 ? -1 : 0);
+    shl         tempq, 48                                 ;Z:     temp <<= 48;
+    pxor        xmm2, xmm2                                ;E: w2[i] = 0;
+    pcmpgtw     xmm0, xmm5                                ;E: w0[i] = (w5[i] < 0 ? -1 : 0);
+    paddw       xmm5, xmm0                                ;E: w5[i] += w0[i];
+    or          tempq, put_buffer                         ;Z:     temp |= put_buffer;
+    movaps      XMMWORD [t + 32 * SIZEOF_WORD], xmm5      ;E: t[32+i] = w5[i];
+    lea         t, [dword t - 2]                          ;Z:     t = &t[-1];
+    pcmpeqw     xmm5, xmm2                                ;E: w5[i] = (w5[i] == 0 ? -1 : 0);
+
+    packsswb    xmm5, xmm1                                ;EF: b5[i] = w5[i], b5[i+8] = w1[i]
+                                                          ;    w/ signed saturation
+
+    add         nbitsb, byte [dctbl + c_derived_tbl.ehufsi + nbitsq]
+                                                          ;Z:     nbits += dctbl->ehufsi[nbits];
+%undef dctbl
+%define code_temp  r8d
+    pmovmskb    indexd, xmm5                              ;Z:     index = 0;  index |= ((b5[i] >> 7) << i);
+    mov         free_bitsd, [state+working_state.cur.free_bits]
+                                                          ;Z:     free_bits = state->cur.free_bits;
+    pcmpeqw     xmm1, xmm1                                ;Z:     b1[i] = 0xFF;
+    shl         index, 32                                 ;Z:     index <<= 32;
+    mov         put_buffer, [state+working_state.cur.put_buffer.simd]
+                                                          ;Z:     put_buffer = state->cur.put_buffer.simd;
+    or          index, tempq                              ;Z:     index |= temp;
+    not         index                                     ;Z:     index = ~index;
+    sub         free_bitsb, nbitsb                        ;Z:     if ((free_bits -= nbits) >= 0)
+    jnl         .ENTRY_SKIP_EMIT_CODE                     ;Z:       goto .ENTRY_SKIP_EMIT_CODE;
+    align       16
+.EMIT_CODE:                                               ;Z:     .EMIT_CODE:
+    EMIT_QWORD  .BLOOP_COND                               ;Z:     insert code, flush buffer, goto .BLOOP_COND
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.BRLOOP:                                                  ; do {
+    lea         code_temp, [nbitsq - 16]                  ;   code_temp = nbits - 16;
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0xf0]
+                                                          ;   nbits = actbl->ehufsi[0xf0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0xf0 * 4]
+                                                          ;   code = actbl->ehufco[0xf0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_BRLOOP_CODE                         ;     goto .EMIT_BRLOOP_CODE;
+    shl         put_buffer, nbitsb                        ;   put_buffer <<= nbits;
+    mov         nbits, code_temp                          ;   nbits = code_temp;
+    or          put_buffer, codeq                         ;   put_buffer |= code;
+    cmp         nbits, 16                                 ;   if (nbits <= 16)
+    jle         .ERLOOP                                   ;     break;
+    jmp         .BRLOOP                                   ; } while (1);
+
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+    times 5     nop
+.ENTRY_SKIP_EMIT_CODE:                                    ; .ENTRY_SKIP_EMIT_CODE:
+    shl         put_buffer, nbitsb                        ; put_buffer <<= nbits;
+    or          put_buffer, codeq                         ; put_buffer |= code;
+.BLOOP_COND:                                              ; .BLOOP_COND:
+    test        index, index                              ; if (index != 0)
+    jz          .ELOOP                                    ; {
+.BLOOP:                                                   ;   do {
+    xor         nbits, nbits                              ;     nbits = 0;  /* kill tzcnt input dependency */
+    tzcnt       nbitsq, index                             ;     nbits = # of trailing 0 bits in index
+    inc         nbits                                     ;     ++nbits;
+    lea         t, [t + nbitsq * 2]                       ;     t = &t[nbits];
+    shr         index, nbitsb                             ;     index >>= nbits;
+.EMIT_BRLOOP_CODE_END:                                    ; .EMIT_BRLOOP_CODE_END:
+    cmp         nbits, 16                                 ;     if (nbits > 16)
+    jg          .BRLOOP                                   ;       goto .BRLOOP;
+.ERLOOP:                                                  ; .ERLOOP:
+    movsx       codeq, word [t]                           ;     code = *t;
+    lea         tempd, [nbitsq * 2]                       ;     temp = nbits * 2;
+    movzx       nbits, byte [NBITS(codeq)]                ;     nbits = JPEG_NBITS(code);
+    lea         tempd, [nbitsq + tempq * 8]               ;     temp = temp * 8 + nbits;
+    mov         code_temp, [actbl + c_derived_tbl.ehufco + (tempq - 16) * 4]
+                                                          ;     code_temp = actbl->ehufco[temp-16];
+    shl         code_temp, nbitsb                         ;     code_temp <<= nbits;
+    and         code, dword [MASK_BITS(nbitsq)]           ;     code &= (1 << nbits) - 1;
+    add         nbitsb, [actbl + c_derived_tbl.ehufsi + (tempq - 16)]
+                                                          ;     nbits += actbl->ehufsi[temp-16];
+    or          code, code_temp                           ;     code |= code_temp;
+    sub         free_bitsb, nbitsb                        ;     if ((free_bits -= nbits) <= 0)
+    jle         .EMIT_CODE                                ;       goto .EMIT_CODE;
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+    test        index, index
+    jnz         .BLOOP                                    ;   } while (index != 0);
+.ELOOP:                                                   ; }  /* index != 0 */
+    sub         td, esp                                   ; t -= (WIN64: &t_[0], UNIX: &t_[64]);
+%ifdef WIN64
+    cmp         td, (DCTSIZE2 - 2) * SIZEOF_WORD          ; if (t != 62)
+%else
+    cmp         td, -2 * SIZEOF_WORD                      ; if (t != -2)
+%endif
+    je          .EFN                                      ; {
+    movzx       nbits, byte [actbl + c_derived_tbl.ehufsi + 0]
+                                                          ;   nbits = actbl->ehufsi[0];
+    mov         code, [actbl + c_derived_tbl.ehufco + 0]  ;   code = actbl->ehufco[0];
+    sub         free_bitsb, nbitsb                        ;   if ((free_bits -= nbits) <= 0)
+    jg          .EFN_SKIP_EMIT_CODE                       ;   {
+    EMIT_QWORD  .EFN                                      ;     insert code, flush buffer
+    align       16
+.EFN_SKIP_EMIT_CODE:                                      ;   } else {
+    shl         put_buffer, nbitsb                        ;     put_buffer <<= nbits;
+    or          put_buffer, codeq                         ;     put_buffer |= code;
+.EFN:                                                     ; } }
+    mov         [state + working_state.cur.put_buffer.simd], put_buffer
+                                                          ; state->cur.put_buffer.simd = put_buffer;
+    mov         byte [state + working_state.cur.free_bits], free_bitsb
+                                                          ; state->cur.free_bits = free_bits;
+%ifdef WIN64
+    sub         rsp, -DCTSIZE2 * SIZEOF_WORD
+    pop         r12
+    pop         rdi
+    pop         rsi
+    pop         rbp
     pop         rbx
-    uncollect_args 6
-    pop_xmm     4
-    mov         rsp, rbp                ; rsp <- aligned rbp
-    pop         rsp                     ; rsp <- original rbp
+%else
+    pop         r12
     pop         rbp
+    pop         rbx
+%endif
     ret
 
+; ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+    align       16
+.EMIT_BRLOOP_CODE:
+    EMIT_QWORD  .EMIT_BRLOOP_CODE_END, { mov nbits, code_temp }
+                                                          ; insert code, flush buffer,
+                                                          ; nbits = code_temp, goto .EMIT_BRLOOP_CODE_END
+
 ; For some reason, the OS X linker does not honor the request to align the
 ; segment unless we do this.
     align       32
diff --git a/external/jpeg/simd/x86_64/jcphuff-sse2.asm b/external/jpeg/simd/x86_64/jcphuff-sse2.asm
index 8ed44728fed6..01b5c0235faf 100644
--- a/external/jpeg/simd/x86_64/jcphuff-sse2.asm
+++ b/external/jpeg/simd/x86_64/jcphuff-sse2.asm
@@ -504,6 +504,8 @@ EXTN(jsimd_encode_mcu_AC_refine_prepare_sse2):
     add         KK, 16
     dec         K
     jnz         .BLOOPR16
+    test        LEN, 15
+    je          .PADDINGR
 .ELOOPR16:
     test        LEN, 8
     jz          .TRYR7
diff --git a/external/jpeg/simd/x86_64/jcsample-avx2.asm b/external/jpeg/simd/x86_64/jcsample-avx2.asm
index d9922bb4cbf9..b32527aebeaa 100644
--- a/external/jpeg/simd/x86_64/jcsample-avx2.asm
+++ b/external/jpeg/simd/x86_64/jcsample-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -71,7 +72,7 @@ EXTN(jsimd_h2v1_downsample_avx2):
     push        rax
     push        rcx
 
-    mov         rdi, JSAMPROW [rsi]
+    mov         rdip, JSAMPROW [rsi]
     add         rdi, rdx
     mov         al, JSAMPLE [rdi-1]
 
@@ -107,8 +108,8 @@ EXTN(jsimd_h2v1_downsample_avx2):
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 
     cmp         rcx, byte SIZEOF_YMMWORD
     jae         short .columnloop
@@ -233,7 +234,7 @@ EXTN(jsimd_h2v2_downsample_avx2):
     push        rax
     push        rcx
 
-    mov         rdi, JSAMPROW [rsi]
+    mov         rdip, JSAMPROW [rsi]
     add         rdi, rdx
     mov         al, JSAMPLE [rdi-1]
 
@@ -269,9 +270,9 @@ EXTN(jsimd_h2v2_downsample_avx2):
     push        rdi
     push        rsi
 
-    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
-    mov         rdi, JSAMPROW [rdi]                    ; outptr
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
 
     cmp         rcx, byte SIZEOF_YMMWORD
     jae         short .columnloop
diff --git a/external/jpeg/simd/x86_64/jcsample-sse2.asm b/external/jpeg/simd/x86_64/jcsample-sse2.asm
index 0f107e9a07ff..2fcfe4567ab9 100644
--- a/external/jpeg/simd/x86_64/jcsample-sse2.asm
+++ b/external/jpeg/simd/x86_64/jcsample-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -70,7 +71,7 @@ EXTN(jsimd_h2v1_downsample_sse2):
     push        rax
     push        rcx
 
-    mov         rdi, JSAMPROW [rsi]
+    mov         rdip, JSAMPROW [rsi]
     add         rdi, rdx
     mov         al, JSAMPLE [rdi-1]
 
@@ -105,8 +106,8 @@ EXTN(jsimd_h2v1_downsample_sse2):
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 
     cmp         rcx, byte SIZEOF_XMMWORD
     jae         short .columnloop
@@ -215,7 +216,7 @@ EXTN(jsimd_h2v2_downsample_sse2):
     push        rax
     push        rcx
 
-    mov         rdi, JSAMPROW [rsi]
+    mov         rdip, JSAMPROW [rsi]
     add         rdi, rdx
     mov         al, JSAMPLE [rdi-1]
 
@@ -250,9 +251,9 @@ EXTN(jsimd_h2v2_downsample_sse2):
     push        rdi
     push        rsi
 
-    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
-    mov         rdi, JSAMPROW [rdi]                    ; outptr
+    mov         rdxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdip, JSAMPROW [rdi]                    ; outptr
 
     cmp         rcx, byte SIZEOF_XMMWORD
     jae         short .columnloop
diff --git a/external/jpeg/simd/x86_64/jdcolext-avx2.asm b/external/jpeg/simd/x86_64/jdcolext-avx2.asm
index 677b8ed84e45..2370fda64249 100644
--- a/external/jpeg/simd/x86_64/jdcolext-avx2.asm
+++ b/external/jpeg/simd/x86_64/jdcolext-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -58,9 +59,9 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -79,10 +80,10 @@ EXTN(jsimd_ycc_rgb_convert_avx2):
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr0
-    mov         rbx, JSAMPROW [rbx]     ; inptr1
-    mov         rdx, JSAMPROW [rdx]     ; inptr2
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 .columnloop:
 
     vmovdqu     ymm5, YMMWORD [rbx]     ; ymm5=Cb(0123456789ABCDEFGHIJKLMNOPQRSTUV)
diff --git a/external/jpeg/simd/x86_64/jdcolext-sse2.asm b/external/jpeg/simd/x86_64/jdcolext-sse2.asm
index 071aa629133d..e07c8d75188c 100644
--- a/external/jpeg/simd/x86_64/jdcolext-sse2.asm
+++ b/external/jpeg/simd/x86_64/jdcolext-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,9 +58,9 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
     lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
     lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]
@@ -78,10 +79,10 @@ EXTN(jsimd_ycc_rgb_convert_sse2):
     push        rsi
     push        rcx                     ; col
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr0
-    mov         rbx, JSAMPROW [rbx]     ; inptr1
-    mov         rdx, JSAMPROW [rdx]     ; inptr2
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr0
+    mov         rbxp, JSAMPROW [rbx]    ; inptr1
+    mov         rdxp, JSAMPROW [rdx]    ; inptr2
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 .columnloop:
 
     movdqa      xmm5, XMMWORD [rbx]     ; xmm5=Cb(0123456789ABCDEF)
diff --git a/external/jpeg/simd/x86_64/jdmrgext-avx2.asm b/external/jpeg/simd/x86_64/jdmrgext-avx2.asm
index bb733c587a49..8b264b4f039f 100644
--- a/external/jpeg/simd/x86_64/jdmrgext-avx2.asm
+++ b/external/jpeg/simd/x86_64/jdmrgext-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -58,14 +59,14 @@ EXTN(jsimd_h2v1_merged_upsample_avx2):
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     mov         rdi, r13
-    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
-    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
-    mov         rdi, JSAMPROW [rdi]                      ; outptr
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
 
     pop         rcx                     ; col
 
@@ -514,15 +515,16 @@ EXTN(jsimd_h2v2_merged_upsample_avx2):
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     mov         rdi, r13
     lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
 
-    push        rdx                     ; inptr2
-    push        rbx                     ; inptr1
-    push        rsi                     ; inptr00
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
     mov         rbx, rsp
 
     push        rdi
@@ -546,16 +548,16 @@ EXTN(jsimd_h2v2_merged_upsample_avx2):
     pop         rax
     pop         rcx
     pop         rdi
-    pop         rsi
-    pop         rbx
-    pop         rdx
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
 
     add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
     add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
 
-    push        rdx                     ; inptr2
-    push        rbx                     ; inptr1
-    push        rsi                     ; inptr00
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr01
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
     mov         rbx, rsp
 
     push        rdi
@@ -579,9 +581,10 @@ EXTN(jsimd_h2v2_merged_upsample_avx2):
     pop         rax
     pop         rcx
     pop         rdi
-    pop         rsi
-    pop         rbx
-    pop         rdx
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
 
     pop         rbx
     uncollect_args 4
diff --git a/external/jpeg/simd/x86_64/jdmrgext-sse2.asm b/external/jpeg/simd/x86_64/jdmrgext-sse2.asm
index b176a4cd4f91..eb3ab9dbd945 100644
--- a/external/jpeg/simd/x86_64/jdmrgext-sse2.asm
+++ b/external/jpeg/simd/x86_64/jdmrgext-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2012, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -57,14 +58,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     mov         rdi, r13
-    mov         rsi, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rbx, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
-    mov         rdx, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
-    mov         rdi, JSAMPROW [rdi]                      ; outptr
+    mov         rsip, JSAMPROW [rsi+rcx*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rbxp, JSAMPROW [rbx+rcx*SIZEOF_JSAMPROW]  ; inptr1
+    mov         rdxp, JSAMPROW [rdx+rcx*SIZEOF_JSAMPROW]  ; inptr2
+    mov         rdip, JSAMPROW [rdi]                      ; outptr
 
     pop         rcx                     ; col
 
@@ -456,15 +457,16 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
 
     mov         rdi, r11
     mov         ecx, r12d
-    mov         rsi, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
-    mov         rbx, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
-    mov         rdx, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
+    mov         rsip, JSAMPARRAY [rdi+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rdi+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rdi+2*SIZEOF_JSAMPARRAY]
     mov         rdi, r13
     lea         rsi, [rsi+rcx*SIZEOF_JSAMPROW]
 
-    push        rdx                     ; inptr2
-    push        rbx                     ; inptr1
-    push        rsi                     ; inptr00
+    sub         rsp, SIZEOF_JSAMPARRAY*4
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr00
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
     mov         rbx, rsp
 
     push        rdi
@@ -488,16 +490,16 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
     pop         rax
     pop         rcx
     pop         rdi
-    pop         rsi
-    pop         rbx
-    pop         rdx
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
 
     add         rdi, byte SIZEOF_JSAMPROW  ; outptr1
     add         rsi, byte SIZEOF_JSAMPROW  ; inptr01
 
-    push        rdx                     ; inptr2
-    push        rbx                     ; inptr1
-    push        rsi                     ; inptr00
+    mov         JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY], rsip  ; inptr01
+    mov         JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY], rbxp  ; inptr1
+    mov         JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY], rdxp  ; inptr2
     mov         rbx, rsp
 
     push        rdi
@@ -521,9 +523,10 @@ EXTN(jsimd_h2v2_merged_upsample_sse2):
     pop         rax
     pop         rcx
     pop         rdi
-    pop         rsi
-    pop         rbx
-    pop         rdx
+    mov         rsip, JSAMPARRAY [rsp+0*SIZEOF_JSAMPARRAY]
+    mov         rbxp, JSAMPARRAY [rsp+1*SIZEOF_JSAMPARRAY]
+    mov         rdxp, JSAMPARRAY [rsp+2*SIZEOF_JSAMPARRAY]
+    add         rsp, SIZEOF_JSAMPARRAY*4
 
     pop         rbx
     uncollect_args 4
diff --git a/external/jpeg/simd/x86_64/jdsample-avx2.asm b/external/jpeg/simd/x86_64/jdsample-avx2.asm
index fc274a95ea30..1e4979f933e4 100644
--- a/external/jpeg/simd/x86_64/jdsample-avx2.asm
+++ b/external/jpeg/simd/x86_64/jdsample-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
 ; Copyright (C) 2015, Intel Corporation.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -76,7 +77,7 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 
     vpxor       ymm0, ymm0, ymm0                 ; ymm0=(all 0's)
     vpcmpeqb    xmm9, xmm9, xmm9
@@ -90,8 +91,8 @@ EXTN(jsimd_h2v1_fancy_upsample_avx2):
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 
     test        rax, SIZEOF_YMMWORD-1
     jz          short .skip
@@ -235,18 +236,18 @@ EXTN(jsimd_h2v2_fancy_upsample_avx2):
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rax                     ; colctr
     push        rcx
     push        rdi
     push        rsi
 
-    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
-    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
-    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
 
     vpxor       ymm8, ymm8, ymm8                 ; ymm8=(all 0's)
     vpcmpeqb    xmm9, xmm9, xmm9
@@ -539,13 +540,13 @@ EXTN(jsimd_h2v1_upsample_avx2):
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
     mov         rax, rdx                ; colctr
 .columnloop:
 
@@ -629,14 +630,14 @@ EXTN(jsimd_h2v2_upsample_avx2):
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]                    ; inptr
-    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
-    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
     mov         rax, rdx                               ; colctr
 .columnloop:
 
diff --git a/external/jpeg/simd/x86_64/jdsample-sse2.asm b/external/jpeg/simd/x86_64/jdsample-sse2.asm
index 20e07670e919..38dbceec269d 100644
--- a/external/jpeg/simd/x86_64/jdsample-sse2.asm
+++ b/external/jpeg/simd/x86_64/jdsample-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -74,14 +75,14 @@ EXTN(jsimd_h2v1_fancy_upsample_sse2):
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rax                     ; colctr
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
 
     test        rax, SIZEOF_XMMWORD-1
     jz          short .skip
@@ -221,18 +222,18 @@ EXTN(jsimd_h2v2_fancy_upsample_sse2):
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rax                     ; colctr
     push        rcx
     push        rdi
     push        rsi
 
-    mov         rcx, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
-    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
-    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
-    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rcxp, JSAMPROW [rsi-1*SIZEOF_JSAMPROW]  ; inptr1(above)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
+    mov         rsip, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1(below)
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
 
     test        rax, SIZEOF_XMMWORD-1
     jz          short .skip
@@ -512,13 +513,13 @@ EXTN(jsimd_h2v1_upsample_sse2):
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]     ; inptr
-    mov         rdi, JSAMPROW [rdi]     ; outptr
+    mov         rsip, JSAMPROW [rsi]    ; inptr
+    mov         rdip, JSAMPROW [rdi]    ; outptr
     mov         rax, rdx                ; colctr
 .columnloop:
 
@@ -600,14 +601,14 @@ EXTN(jsimd_h2v2_upsample_sse2):
 
     mov         rsi, r12                ; input_data
     mov         rdi, r13
-    mov         rdi, JSAMPARRAY [rdi]   ; output_data
+    mov         rdip, JSAMPARRAY [rdi]  ; output_data
 .rowloop:
     push        rdi
     push        rsi
 
-    mov         rsi, JSAMPROW [rsi]                    ; inptr
-    mov         rbx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]  ; outptr0
-    mov         rdi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]  ; outptr1
+    mov         rsip, JSAMPROW [rsi]                   ; inptr
+    mov         rbxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW] ; outptr0
+    mov         rdip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW] ; outptr1
     mov         rax, rdx                               ; colctr
 .columnloop:
 
diff --git a/external/jpeg/simd/x86_64/jidctflt-sse2.asm b/external/jpeg/simd/x86_64/jidctflt-sse2.asm
index ab95e1a6d665..60bf96189613 100644
--- a/external/jpeg/simd/x86_64/jidctflt-sse2.asm
+++ b/external/jpeg/simd/x86_64/jidctflt-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -455,12 +456,12 @@ EXTN(jsimd_idct_float_sse2):
     pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
     pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
-    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rbxp, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
     movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
 
diff --git a/external/jpeg/simd/x86_64/jidctfst-sse2.asm b/external/jpeg/simd/x86_64/jidctfst-sse2.asm
index a66a6811e9dd..cb97fdfbb246 100644
--- a/external/jpeg/simd/x86_64/jidctfst-sse2.asm
+++ b/external/jpeg/simd/x86_64/jidctfst-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -460,21 +461,21 @@ EXTN(jsimd_idct_ifast_sse2):
     pshufd      xmm6, xmm4, 0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
     pshufd      xmm2, xmm7, 0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
-    mov         rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
 
-    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-    mov         rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
 
diff --git a/external/jpeg/simd/x86_64/jidctint-avx2.asm b/external/jpeg/simd/x86_64/jidctint-avx2.asm
index 9c38f9e39069..ca7e317f6e1b 100644
--- a/external/jpeg/simd/x86_64/jidctint-avx2.asm
+++ b/external/jpeg/simd/x86_64/jidctint-avx2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, 2018, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -387,23 +388,23 @@ EXTN(jsimd_idct_islow_avx2):
 
     mov         eax, r13d
 
-    mov         rdx, JSAMPROW [r12+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rsi, JSAMPROW [r12+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [r12+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm0
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
 
-    mov         rdx, JSAMPROW [r12+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rsi, JSAMPROW [r12+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [r12+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
 
-    mov         rdx, JSAMPROW [r12+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rsi, JSAMPROW [r12+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [r12+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
 
-    mov         rdx, JSAMPROW [r12+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rsi, JSAMPROW [r12+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [r12+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r12+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm7
 
diff --git a/external/jpeg/simd/x86_64/jidctint-sse2.asm b/external/jpeg/simd/x86_64/jidctint-sse2.asm
index 8983bf099c03..7aa869bc0b51 100644
--- a/external/jpeg/simd/x86_64/jidctint-sse2.asm
+++ b/external/jpeg/simd/x86_64/jidctint-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, 2020, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -817,21 +818,21 @@ EXTN(jsimd_idct_islow_sse2):
     pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
     pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
-    mov         rdx, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
 
-    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
-    mov         rdx, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
     movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
     movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
 
diff --git a/external/jpeg/simd/x86_64/jidctred-sse2.asm b/external/jpeg/simd/x86_64/jidctred-sse2.asm
index 7fbfcc519dda..4ece9d891cbd 100644
--- a/external/jpeg/simd/x86_64/jidctred-sse2.asm
+++ b/external/jpeg/simd/x86_64/jidctred-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -379,12 +380,12 @@ EXTN(jsimd_idct_4x4_sse2):
     pshufd      xmm1, xmm4, 0x4E        ; xmm1=(20 21 22 23 30 31 32 33 00 ..)
     pshufd      xmm3, xmm4, 0x93        ; xmm3=(30 31 32 33 00 01 02 03 10 ..)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
     movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
     movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm2
-    mov         rdx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
     movd        XMM_DWORD [rdx+rax*SIZEOF_JSAMPLE], xmm1
     movd        XMM_DWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
 
@@ -558,8 +559,8 @@ EXTN(jsimd_idct_2x2_sse2):
     pextrw      ebx, xmm6, 0x00         ; ebx=(C0 D0 -- --)
     pextrw      ecx, xmm6, 0x01         ; ecx=(C1 D1 -- --)
 
-    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
-    mov         rsi, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
+    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
+    mov         rsip, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
     mov         word [rdx+rax*SIZEOF_JSAMPLE], bx
     mov         word [rsi+rax*SIZEOF_JSAMPLE], cx
 
diff --git a/external/jpeg/simd/x86_64/jquantf-sse2.asm b/external/jpeg/simd/x86_64/jquantf-sse2.asm
index 83596a915b0b..ab2e3954f633 100644
--- a/external/jpeg/simd/x86_64/jquantf-sse2.asm
+++ b/external/jpeg/simd/x86_64/jquantf-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -51,8 +52,8 @@ EXTN(jsimd_convsamp_float_sse2):
     mov         rdi, r12
     mov         rcx, DCTSIZE/2
 .convloop:
-    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
 
     movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
     movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
diff --git a/external/jpeg/simd/x86_64/jquanti-avx2.asm b/external/jpeg/simd/x86_64/jquanti-avx2.asm
index 5f04d223305d..70fe81139cc2 100644
--- a/external/jpeg/simd/x86_64/jquanti-avx2.asm
+++ b/external/jpeg/simd/x86_64/jquanti-avx2.asm
@@ -4,6 +4,7 @@
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, 2018, D. R. Commander.
 ; Copyright (C) 2016, Matthieu Darbois.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -44,23 +45,23 @@ EXTN(jsimd_convsamp_avx2):
 
     mov         eax, r11d
 
-    mov         rsi, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdi, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r10+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        xmm0, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
     pinsrq      xmm0, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
 
-    mov         rsi, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdi, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r10+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        xmm1, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
     pinsrq      xmm1, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
 
-    mov         rsi, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdi, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r10+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        xmm2, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
     pinsrq      xmm2, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
 
-    mov         rsi, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdi, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rsip, JSAMPROW [r10+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdip, JSAMPROW [r10+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
     movq        xmm3, XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE]
     pinsrq      xmm3, XMM_MMWORD [rdi+rax*SIZEOF_JSAMPLE], 1
 
diff --git a/external/jpeg/simd/x86_64/jquanti-sse2.asm b/external/jpeg/simd/x86_64/jquanti-sse2.asm
index bb6fa69ea3c1..3ee442027a5a 100644
--- a/external/jpeg/simd/x86_64/jquanti-sse2.asm
+++ b/external/jpeg/simd/x86_64/jquanti-sse2.asm
@@ -3,6 +3,7 @@
 ;
 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
 ; Copyright (C) 2009, 2016, D. R. Commander.
+; Copyright (C) 2018, Matthias Räncker.
 ;
 ; Based on the x86 SIMD extension for IJG JPEG library
 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
@@ -51,14 +52,14 @@ EXTN(jsimd_convsamp_sse2):
     mov         rdi, r12
     mov         rcx, DCTSIZE/4
 .convloop:
-    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
 
     movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
     movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
 
-    mov         rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
-    mov         rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
+    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
 
     movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
     movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
diff --git a/external/jpeg/simd/x86_64/jsimd.c b/external/jpeg/simd/x86_64/jsimd.c
index 1e5698b3a4b9..eb7667999bb6 100644
--- a/external/jpeg/simd/x86_64/jsimd.c
+++ b/external/jpeg/simd/x86_64/jsimd.c
@@ -1031,8 +1031,6 @@ jsimd_can_encode_mcu_AC_first_prepare(void)
     return 0;
   if (sizeof(JCOEF) != 2)
     return 0;
-  if (SIZEOF_SIZE_T != 8)
-    return 0;
   if (simd_support & JSIMD_SSE2)
     return 1;
 
@@ -1057,8 +1055,6 @@ jsimd_can_encode_mcu_AC_refine_prepare(void)
     return 0;
   if (sizeof(JCOEF) != 2)
     return 0;
-  if (SIZEOF_SIZE_T != 8)
-    return 0;
   if (simd_support & JSIMD_SSE2)
     return 1;
 
diff --git a/external/jpeg/structure.txt b/external/jpeg/structure.txt
new file mode 100644
index 000000000000..15b8d378564a
--- /dev/null
+++ b/external/jpeg/structure.txt
@@ -0,0 +1,900 @@
+IJG JPEG LIBRARY:  SYSTEM ARCHITECTURE
+
+This file was part of the Independent JPEG Group's software:
+Copyright (C) 1991-2012, Thomas G. Lane, Guido Vollbeding.
+It was modified by The libjpeg-turbo Project to include only information
+relevant to libjpeg-turbo.
+For conditions of distribution and use, see the accompanying README.ijg file.
+
+
+This file provides an overview of the architecture of the IJG JPEG software;
+that is, the functions of the various modules in the system and the interfaces
+between modules.  For more precise details about any data structure or calling
+convention, see the include files and comments in the source code.
+
+We assume that the reader is already somewhat familiar with the JPEG standard.
+The README.ijg file includes references for learning about JPEG.  The file
+libjpeg.txt describes the library from the viewpoint of an application
+programmer using the library; it's best to read that file before this one.
+Also, the file coderules.txt describes the coding style conventions we use.
+
+In this document, JPEG-specific terminology follows the JPEG standard:
+  A "component" means a color channel, e.g., Red or Luminance.
+  A "sample" is a single component value (i.e., one number in the image data).
+  A "coefficient" is a frequency coefficient (a DCT transform output number).
+  A "block" is an 8x8 group of samples or coefficients.
+  An "MCU" (minimum coded unit) is an interleaved set of blocks of size
+        determined by the sampling factors, or a single block in a
+        noninterleaved scan.
+We do not use the terms "pixel" and "sample" interchangeably.  When we say
+pixel, we mean an element of the full-size image, while a sample is an element
+of the downsampled image.  Thus the number of samples may vary across
+components while the number of pixels does not.  (This terminology is not used
+rigorously throughout the code, but it is used in places where confusion would
+otherwise result.)
+
+
+*** System features ***
+
+The IJG distribution contains two parts:
+  * A subroutine library for JPEG compression and decompression.
+  * cjpeg/djpeg, two sample applications that use the library to transform
+    JFIF JPEG files to and from several other image formats.
+cjpeg/djpeg are of no great intellectual complexity: they merely add a simple
+command-line user interface and I/O routines for several uncompressed image
+formats.  This document concentrates on the library itself.
+
+We desire the library to be capable of supporting all JPEG baseline, extended
+sequential, and progressive DCT processes.  Hierarchical processes are not
+supported.
+
+The library does not support the lossless (spatial) JPEG process.  Lossless
+JPEG shares little or no code with lossy JPEG, and would normally be used
+without the extensive pre- and post-processing provided by this library.
+We feel that lossless JPEG is better handled by a separate library.
+
+Within these limits, any set of compression parameters allowed by the JPEG
+spec should be readable for decompression.  (We can be more restrictive about
+what formats we can generate.)  Although the system design allows for all
+parameter values, some uncommon settings are not yet implemented and may
+never be; nonintegral sampling ratios are the prime example.  Furthermore,
+we treat 8-bit vs. 12-bit data precision as a compile-time switch, not a
+run-time option, because most machines can store 8-bit pixels much more
+compactly than 12-bit.
+
+By itself, the library handles only interchange JPEG datastreams --- in
+particular the widely used JFIF file format.  The library can be used by
+surrounding code to process interchange or abbreviated JPEG datastreams that
+are embedded in more complex file formats.  (For example, libtiff uses this
+library to implement JPEG compression within the TIFF file format.)
+
+The library includes a substantial amount of code that is not covered by the
+JPEG standard but is necessary for typical applications of JPEG.  These
+functions preprocess the image before JPEG compression or postprocess it after
+decompression.  They include colorspace conversion, downsampling/upsampling,
+and color quantization.  This code can be omitted if not needed.
+
+A wide range of quality vs. speed tradeoffs is possible in JPEG processing,
+and even more so in decompression postprocessing.  The decompression library
+provides multiple implementations that cover most of the useful tradeoffs,
+ranging from very-high-quality down to fast-preview operation.  On the
+compression side we have generally not provided low-quality choices, since
+compression is normally less time-critical.  It should be understood that the
+low-quality modes may not meet the JPEG standard's accuracy requirements;
+nonetheless, they are useful for viewers.
+
+
+*** System overview ***
+
+The compressor and decompressor are each divided into two main sections:
+the JPEG compressor or decompressor proper, and the preprocessing or
+postprocessing functions.  The interface between these two sections is the
+image data that Rec. ITU-T T.81 | ISO/IEC 10918-1 regards as its input or
+output: this data is in the colorspace to be used for compression, and it is
+downsampled to the sampling factors to be used.  The preprocessing and
+postprocessing steps are responsible for converting a normal image
+representation to or from this form.  (Those few applications that want to deal
+with YCbCr downsampled data can skip the preprocessing or postprocessing step.)
+
+Looking more closely, the compressor library contains the following main
+elements:
+
+  Preprocessing:
+    * Color space conversion (e.g., RGB to YCbCr).
+    * Edge expansion and downsampling.  Optionally, this step can do simple
+      smoothing --- this is often helpful for low-quality source data.
+  JPEG proper:
+    * MCU assembly, DCT, quantization.
+    * Entropy coding (sequential or progressive, Huffman or arithmetic).
+
+In addition to these modules we need overall control, marker generation,
+and support code (memory management & error handling).  There is also a
+module responsible for physically writing the output data --- typically
+this is just an interface to fwrite(), but some applications may need to
+do something else with the data.
+
+The decompressor library contains the following main elements:
+
+  JPEG proper:
+    * Entropy decoding (sequential or progressive, Huffman or arithmetic).
+    * Dequantization, inverse DCT, MCU disassembly.
+  Postprocessing:
+    * Upsampling.  Optionally, this step may be able to do more general
+      rescaling of the image.
+    * Color space conversion (e.g., YCbCr to RGB).  This step may also
+      provide gamma adjustment [currently it does not].
+    * Optional color quantization (e.g., reduction to 256 colors).
+    * Optional color precision reduction (e.g., 24-bit to 15-bit color).
+      [This feature is not currently implemented.]
+
+We also need overall control, marker parsing, and a data source module.
+The support code (memory management & error handling) can be shared with
+the compression half of the library.
+
+There may be several implementations of each of these elements, particularly
+in the decompressor, where a wide range of speed/quality tradeoffs is very
+useful.  It must be understood that some of the best speedups involve
+merging adjacent steps in the pipeline.  For example, upsampling, color space
+conversion, and color quantization might all be done at once when using a
+low-quality ordered-dither technique.  The system architecture is designed to
+allow such merging where appropriate.
+
+
+Note: it is convenient to regard edge expansion (padding to block boundaries)
+as a preprocessing/postprocessing function, even though
+Rec. ITU-T T.81 | ISO/IEC 10918-1 includes it in compression/decompression.  We
+do this because downsampling/upsampling can be simplified a little if they work
+on padded data: it's not necessary to have special cases at the right and
+bottom edges.  Therefore the interface buffer is always an integral number of
+blocks wide and high, and we expect compression preprocessing to pad the source
+data properly.  Padding will occur only to the next block (8-sample) boundary.
+In an interleaved-scan situation, additional dummy blocks may be used to fill
+out MCUs, but the MCU assembly and disassembly logic will create or discard
+these blocks internally.  (This is advantageous for speed reasons, since we
+avoid DCTing the dummy blocks.  It also permits a small reduction in file size,
+because the compressor can choose dummy block contents so as to minimize their
+size in compressed form.  Finally, it makes the interface buffer specification
+independent of whether the file is actually interleaved or not.)  Applications
+that wish to deal directly with the downsampled data must provide similar
+buffering and padding for odd-sized images.
+
+
+*** Poor man's object-oriented programming ***
+
+It should be clear by now that we have a lot of quasi-independent processing
+steps, many of which have several possible behaviors.  To avoid cluttering the
+code with lots of switch statements, we use a simple form of object-style
+programming to separate out the different possibilities.
+
+For example, two different color quantization algorithms could be implemented
+as two separate modules that present the same external interface; at runtime,
+the calling code will access the proper module indirectly through an "object".
+
+We can get the limited features we need while staying within portable C.
+The basic tool is a function pointer.  An "object" is just a struct
+containing one or more function pointer fields, each of which corresponds to
+a method name in real object-oriented languages.  During initialization we
+fill in the function pointers with references to whichever module we have
+determined we need to use in this run.  Then invocation of the module is done
+by indirecting through a function pointer; on most machines this is no more
+expensive than a switch statement, which would be the only other way of
+making the required run-time choice.  The really significant benefit, of
+course, is keeping the source code clean and well structured.
+
+We can also arrange to have private storage that varies between different
+implementations of the same kind of object.  We do this by making all the
+module-specific object structs be separately allocated entities, which will
+be accessed via pointers in the master compression or decompression struct.
+The "public" fields or methods for a given kind of object are specified by
+a commonly known struct.  But a module's initialization code can allocate
+a larger struct that contains the common struct as its first member, plus
+additional private fields.  With appropriate pointer casting, the module's
+internal functions can access these private fields.  (For a simple example,
+see jdatadst.c, which implements the external interface specified by struct
+jpeg_destination_mgr, but adds extra fields.)
+
+(Of course this would all be a lot easier if we were using C++, but we are
+not yet prepared to assume that everyone has a C++ compiler.)
+
+An important benefit of this scheme is that it is easy to provide multiple
+versions of any method, each tuned to a particular case.  While a lot of
+precalculation might be done to select an optimal implementation of a method,
+the cost per invocation is constant.  For example, the upsampling step might
+have a "generic" method, plus one or more "hardwired" methods for the most
+popular sampling factors; the hardwired methods would be faster because they'd
+use straight-line code instead of for-loops.  The cost to determine which
+method to use is paid only once, at startup, and the selection criteria are
+hidden from the callers of the method.
+
+This plan differs a little bit from usual object-oriented structures, in that
+only one instance of each object class will exist during execution.  The
+reason for having the class structure is that on different runs we may create
+different instances (choose to execute different modules).  You can think of
+the term "method" as denoting the common interface presented by a particular
+set of interchangeable functions, and "object" as denoting a group of related
+methods, or the total shared interface behavior of a group of modules.
+
+
+*** Overall control structure ***
+
+We previously mentioned the need for overall control logic in the compression
+and decompression libraries.  In IJG implementations prior to v5, overall
+control was mostly provided by "pipeline control" modules, which proved to be
+large, unwieldy, and hard to understand.  To improve the situation, the
+control logic has been subdivided into multiple modules.  The control modules
+consist of:
+
+1. Master control for module selection and initialization.  This has two
+responsibilities:
+
+   1A.  Startup initialization at the beginning of image processing.
+        The individual processing modules to be used in this run are selected
+        and given initialization calls.
+
+   1B.  Per-pass control.  This determines how many passes will be performed
+        and calls each active processing module to configure itself
+        appropriately at the beginning of each pass.  End-of-pass processing,
+        where necessary, is also invoked from the master control module.
+
+   Method selection is partially distributed, in that a particular processing
+   module may contain several possible implementations of a particular method,
+   which it will select among when given its initialization call.  The master
+   control code need only be concerned with decisions that affect more than
+   one module.
+
+2. Data buffering control.  A separate control module exists for each
+   inter-processing-step data buffer.  This module is responsible for
+   invoking the processing steps that write or read that data buffer.
+
+Each buffer controller sees the world as follows:
+
+input data => processing step A => buffer => processing step B => output data
+                      |              |               |
+              ------------------ controller ------------------
+
+The controller knows the dataflow requirements of steps A and B: how much data
+they want to accept in one chunk and how much they output in one chunk.  Its
+function is to manage its buffer and call A and B at the proper times.
+
+A data buffer control module may itself be viewed as a processing step by a
+higher-level control module; thus the control modules form a binary tree with
+elementary processing steps at the leaves of the tree.
+
+The control modules are objects.  A considerable amount of flexibility can
+be had by replacing implementations of a control module.  For example:
+* Merging of adjacent steps in the pipeline is done by replacing a control
+  module and its pair of processing-step modules with a single processing-
+  step module.  (Hence the possible merges are determined by the tree of
+  control modules.)
+* In some processing modes, a given interstep buffer need only be a "strip"
+  buffer large enough to accommodate the desired data chunk sizes.  In other
+  modes, a full-image buffer is needed and several passes are required.
+  The control module determines which kind of buffer is used and manipulates
+  virtual array buffers as needed.  One or both processing steps may be
+  unaware of the multi-pass behavior.
+
+In theory, we might be able to make all of the data buffer controllers
+interchangeable and provide just one set of implementations for all.  In
+practice, each one contains considerable special-case processing for its
+particular job.  The buffer controller concept should be regarded as an
+overall system structuring principle, not as a complete description of the
+task performed by any one controller.
+
+
+*** Compression object structure ***
+
+Here is a sketch of the logical structure of the JPEG compression library:
+
+                                                 |-- Colorspace conversion
+                  |-- Preprocessing controller --|
+                  |                              |-- Downsampling
+Main controller --|
+                  |                            |-- Forward DCT, quantize
+                  |-- Coefficient controller --|
+                                               |-- Entropy encoding
+
+This sketch also describes the flow of control (subroutine calls) during
+typical image data processing.  Each of the components shown in the diagram is
+an "object" which may have several different implementations available.  One
+or more source code files contain the actual implementation(s) of each object.
+
+The objects shown above are:
+
+* Main controller: buffer controller for the subsampled-data buffer, which
+  holds the preprocessed input data.  This controller invokes preprocessing to
+  fill the subsampled-data buffer, and JPEG compression to empty it.  There is
+  usually no need for a full-image buffer here; a strip buffer is adequate.
+
+* Preprocessing controller: buffer controller for the downsampling input data
+  buffer, which lies between colorspace conversion and downsampling.  Note
+  that a unified conversion/downsampling module would probably replace this
+  controller entirely.
+
+* Colorspace conversion: converts application image data into the desired
+  JPEG color space; also changes the data from pixel-interleaved layout to
+  separate component planes.  Processes one pixel row at a time.
+
+* Downsampling: performs reduction of chroma components as required.
+  Optionally may perform pixel-level smoothing as well.  Processes a "row
+  group" at a time, where a row group is defined as Vmax pixel rows of each
+  component before downsampling, and Vk sample rows afterwards (remember Vk
+  differs across components).  Some downsampling or smoothing algorithms may
+  require context rows above and below the current row group; the
+  preprocessing controller is responsible for supplying these rows via proper
+  buffering.  The downsampler is responsible for edge expansion at the right
+  edge (i.e., extending each sample row to a multiple of 8 samples); but the
+  preprocessing controller is responsible for vertical edge expansion (i.e.,
+  duplicating the bottom sample row as needed to make a multiple of 8 rows).
+
+* Coefficient controller: buffer controller for the DCT-coefficient data.
+  This controller handles MCU assembly, including insertion of dummy DCT
+  blocks when needed at the right or bottom edge.  When performing
+  Huffman-code optimization or emitting a multiscan JPEG file, this
+  controller is responsible for buffering the full image.  The equivalent of
+  one fully interleaved MCU row of subsampled data is processed per call,
+  even when the JPEG file is noninterleaved.
+
+* Forward DCT and quantization: Perform DCT, quantize, and emit coefficients.
+  Works on one or more DCT blocks at a time.  (Note: the coefficients are now
+  emitted in normal array order, which the entropy encoder is expected to
+  convert to zigzag order as necessary.  Prior versions of the IJG code did
+  the conversion to zigzag order within the quantization step.)
+
+* Entropy encoding: Perform Huffman or arithmetic entropy coding and emit the
+  coded data to the data destination module.  Works on one MCU per call.
+  For progressive JPEG, the same DCT blocks are fed to the entropy coder
+  during each pass, and the coder must emit the appropriate subset of
+  coefficients.
+
+In addition to the above objects, the compression library includes these
+objects:
+
+* Master control: determines the number of passes required, controls overall
+  and per-pass initialization of the other modules.
+
+* Marker writing: generates JPEG markers (except for RSTn, which is emitted
+  by the entropy encoder when needed).
+
+* Data destination manager: writes the output JPEG datastream to its final
+  destination (e.g., a file).  The destination manager supplied with the
+  library knows how to write to a stdio stream or to a memory buffer;
+  for other behaviors, the surrounding application may provide its own
+  destination manager.
+
+* Memory manager: allocates and releases memory, controls virtual arrays
+  (with backing store management, where required).
+
+* Error handler: performs formatting and output of error and trace messages;
+  determines handling of nonfatal errors.  The surrounding application may
+  override some or all of this object's methods to change error handling.
+
+* Progress monitor: supports output of "percent-done" progress reports.
+  This object represents an optional callback to the surrounding application:
+  if wanted, it must be supplied by the application.
+
+The error handler, destination manager, and progress monitor objects are
+defined as separate objects in order to simplify application-specific
+customization of the JPEG library.  A surrounding application may override
+individual methods or supply its own all-new implementation of one of these
+objects.  The object interfaces for these objects are therefore treated as
+part of the application interface of the library, whereas the other objects
+are internal to the library.
+
+The error handler and memory manager are shared by JPEG compression and
+decompression; the progress monitor, if used, may be shared as well.
+
+
+*** Decompression object structure ***
+
+Here is a sketch of the logical structure of the JPEG decompression library:
+
+                                               |-- Entropy decoding
+                  |-- Coefficient controller --|
+                  |                            |-- Dequantize, Inverse DCT
+Main controller --|
+                  |                               |-- Upsampling
+                  |-- Postprocessing controller --|   |-- Colorspace conversion
+                                                  |-- Color quantization
+                                                  |-- Color precision reduction
+
+As before, this diagram also represents typical control flow.  The objects
+shown are:
+
+* Main controller: buffer controller for the subsampled-data buffer, which
+  holds the output of JPEG decompression proper.  This controller's primary
+  task is to feed the postprocessing procedure.  Some upsampling algorithms
+  may require context rows above and below the current row group; when this
+  is true, the main controller is responsible for managing its buffer so as
+  to make context rows available.  In the current design, the main buffer is
+  always a strip buffer; a full-image buffer is never required.
+
+* Coefficient controller: buffer controller for the DCT-coefficient data.
+  This controller handles MCU disassembly, including deletion of any dummy
+  DCT blocks at the right or bottom edge.  When reading a multiscan JPEG
+  file, this controller is responsible for buffering the full image.
+  (Buffering DCT coefficients, rather than samples, is necessary to support
+  progressive JPEG.)  The equivalent of one fully interleaved MCU row of
+  subsampled data is processed per call, even when the source JPEG file is
+  noninterleaved.
+
+* Entropy decoding: Read coded data from the data source module and perform
+  Huffman or arithmetic entropy decoding.  Works on one MCU per call.
+  For progressive JPEG decoding, the coefficient controller supplies the prior
+  coefficients of each MCU (initially all zeroes), which the entropy decoder
+  modifies in each scan.
+
+* Dequantization and inverse DCT: like it says.  Note that the coefficients
+  buffered by the coefficient controller have NOT been dequantized; we
+  merge dequantization and inverse DCT into a single step for speed reasons.
+  When scaled-down output is asked for, simplified DCT algorithms may be used
+  that emit fewer samples per DCT block, not the full 8x8.  Works on one DCT
+  block at a time.
+
+* Postprocessing controller: buffer controller for the color quantization
+  input buffer, when quantization is in use.  (Without quantization, this
+  controller just calls the upsampler.)  For two-pass quantization, this
+  controller is responsible for buffering the full-image data.
+
+* Upsampling: restores chroma components to full size.  (May support more
+  general output rescaling, too.  Note that if undersized DCT outputs have
+  been emitted by the DCT module, this module must adjust so that properly
+  sized outputs are created.)  Works on one row group at a time.  This module
+  also calls the color conversion module, so its top level is effectively a
+  buffer controller for the upsampling->color conversion buffer.  However, in
+  all but the highest-quality operating modes, upsampling and color
+  conversion are likely to be merged into a single step.
+
+* Colorspace conversion: convert from JPEG color space to output color space,
+  and change data layout from separate component planes to pixel-interleaved.
+  Works on one pixel row at a time.
+
+* Color quantization: reduce the data to colormapped form, using either an
+  externally specified colormap or an internally generated one.  This module
+  is not used for full-color output.  Works on one pixel row at a time; may
+  require two passes to generate a color map.  Note that the output will
+  always be a single component representing colormap indexes.  In the current
+  design, the output values are JSAMPLEs, so an 8-bit compilation cannot
+  quantize to more than 256 colors.  This is unlikely to be a problem in
+  practice.
+
+* Color reduction: this module handles color precision reduction, e.g.,
+  generating 15-bit color (5 bits/primary) from JPEG's 24-bit output.
+  It is not yet clear how this should be handled; one possibility is to merge
+  it with colorspace conversion.
+
+Note that some high-speed operating modes might condense the entire
+postprocessing sequence to a single module (upsample, color convert, and
+quantize in one step).
+
+In addition to the above objects, the decompression library includes these
+objects:
+
+* Master control: determines the number of passes required, controls overall
+  and per-pass initialization of the other modules.  This is subdivided into
+  input and output control: jdinput.c controls only input-side processing,
+  while jdmaster.c handles overall initialization and output-side control.
+
+* Marker reading: decodes JPEG markers (except for RSTn).
+
+* Data source manager: supplies the input JPEG datastream.  The source
+  manager supplied with the library knows how to read from a stdio stream
+  or from a memory buffer;  for other behaviors, the surrounding application
+  may provide its own source manager.
+
+* Memory manager: same as for compression library.
+
+* Error handler: same as for compression library.
+
+* Progress monitor: same as for compression library.
+
+As with compression, the data source manager, error handler, and progress
+monitor are candidates for replacement by a surrounding application.
+
+
+*** Decompression input and output separation ***
+
+To support efficient incremental display of progressive JPEG files, the
+decompressor is divided into two sections that can run independently:
+
+1. Data input includes marker parsing, entropy decoding, and input into the
+   coefficient controller's DCT coefficient buffer.  Note that this
+   processing is relatively cheap and fast.
+
+2. Data output reads from the DCT coefficient buffer and performs the IDCT
+   and all postprocessing steps.
+
+For a progressive JPEG file, the data input processing is allowed to get
+arbitrarily far ahead of the data output processing.  (This occurs only
+if the application calls jpeg_consume_input(); otherwise input and output
+run in lockstep, since the input section is called only when the output
+section needs more data.)  In this way the application can avoid making
+extra display passes when data is arriving faster than the display pass
+can run.  Furthermore, it is possible to abort an output pass without
+losing anything, since the coefficient buffer is read-only as far as the
+output section is concerned.  See libjpeg.txt for more detail.
+
+A full-image coefficient array is only created if the JPEG file has multiple
+scans (or if the application specifies buffered-image mode anyway).  When
+reading a single-scan file, the coefficient controller normally creates only
+a one-MCU buffer, so input and output processing must run in lockstep in this
+case.  jpeg_consume_input() is effectively a no-op in this situation.
+
+The main impact of dividing the decompressor in this fashion is that we must
+be very careful with shared variables in the cinfo data structure.  Each
+variable that can change during the course of decompression must be
+classified as belonging to data input or data output, and each section must
+look only at its own variables.  For example, the data output section may not
+depend on any of the variables that describe the current scan in the JPEG
+file, because these may change as the data input section advances into a new
+scan.
+
+The progress monitor is (somewhat arbitrarily) defined to treat input of the
+file as one pass when buffered-image mode is not used, and to ignore data
+input work completely when buffered-image mode is used.  Note that the
+library has no reliable way to predict the number of passes when dealing
+with a progressive JPEG file, nor can it predict the number of output passes
+in buffered-image mode.  So the work estimate is inherently bogus anyway.
+
+No comparable division is currently made in the compression library, because
+there isn't any real need for it.
+
+
+*** Data formats ***
+
+Arrays of pixel sample values use the following data structure:
+
+    typedef something JSAMPLE;          a pixel component value, 0..MAXJSAMPLE
+    typedef JSAMPLE *JSAMPROW;          ptr to a row of samples
+    typedef JSAMPROW *JSAMPARRAY;       ptr to a list of rows
+    typedef JSAMPARRAY *JSAMPIMAGE;     ptr to a list of color-component arrays
+
+The basic element type JSAMPLE will be one of unsigned char or short.  Short
+will be used if samples wider than 8 bits are to be supported (this is a
+compile-time option).  Otherwise, unsigned char is used.
+
+With these conventions, JSAMPLE values can be assumed to be >= 0.  This helps
+simplify correct rounding during downsampling, etc.  The JPEG standard's
+specification that sample values run from -128..127 is accommodated by
+subtracting 128 from the sample value in the DCT step.  Similarly, during
+decompression the output of the IDCT step will be immediately shifted back to
+0..255.  (NB: different values are required when 12-bit samples are in use.
+The code is written in terms of MAXJSAMPLE and CENTERJSAMPLE, which will be
+defined as 255 and 128 respectively in an 8-bit implementation, and as 4095
+and 2048 in a 12-bit implementation.)
+
+We use a pointer per row, rather than a two-dimensional JSAMPLE array.  This
+choice costs only a small amount of memory and has several benefits:
+* Code using the data structure doesn't need to know the allocated width of
+  the rows.  This simplifies edge expansion/compression, since we can work
+  in an array that's wider than the logical picture width.
+* Indexing doesn't require multiplication; this is a performance win on many
+  machines.
+* Arrays with more than 64K total elements can be supported even on machines
+  where malloc() cannot allocate chunks larger than 64K.
+* The rows forming a component array may be allocated at different times
+  without extra copying.  This trick allows some speedups in smoothing steps
+  that need access to the previous and next rows.
+
+Note that each color component is stored in a separate array; we don't use the
+traditional layout in which the components of a pixel are stored together.
+This simplifies coding of modules that work on each component independently,
+because they don't need to know how many components there are.  Furthermore,
+we can read or write each component to a temporary file independently, which
+is helpful when dealing with noninterleaved JPEG files.
+
+In general, a specific sample value is accessed by code such as
+        image[colorcomponent][row][col]
+where col is measured from the image left edge, but row is measured from the
+first sample row currently in memory.  Either of the first two indexings can
+be precomputed by copying the relevant pointer.
+
+
+Since most image-processing applications prefer to work on images in which
+the components of a pixel are stored together, the data passed to or from the
+surrounding application uses the traditional convention: a single pixel is
+represented by N consecutive JSAMPLE values, and an image row is an array of
+(# of color components)*(image width) JSAMPLEs.  One or more rows of data can
+be represented by a pointer of type JSAMPARRAY in this scheme.  This scheme is
+converted to component-wise storage inside the JPEG library.  (Applications
+that want to skip JPEG preprocessing or postprocessing will have to contend
+with component-wise storage.)
+
+
+Arrays of DCT-coefficient values use the following data structure:
+
+    typedef short JCOEF;                a 16-bit signed integer
+    typedef JCOEF JBLOCK[DCTSIZE2];     an 8x8 block of coefficients
+    typedef JBLOCK *JBLOCKROW;          ptr to one horizontal row of 8x8 blocks
+    typedef JBLOCKROW *JBLOCKARRAY;     ptr to a list of such rows
+    typedef JBLOCKARRAY *JBLOCKIMAGE;   ptr to a list of color component arrays
+
+The underlying type is at least a 16-bit signed integer; while "short" is big
+enough on all machines of interest, on some machines it is preferable to use
+"int" for speed reasons, despite the storage cost.  Coefficients are grouped
+into 8x8 blocks (but we always use #defines DCTSIZE and DCTSIZE2 rather than
+"8" and "64").
+
+The contents of a coefficient block may be in either "natural" or zigzagged
+order, and may be true values or divided by the quantization coefficients,
+depending on where the block is in the processing pipeline.  In the current
+library, coefficient blocks are kept in natural order everywhere; the entropy
+codecs zigzag or dezigzag the data as it is written or read.  The blocks
+contain quantized coefficients everywhere outside the DCT/IDCT subsystems.
+(This latter decision may need to be revisited to support variable
+quantization a la JPEG Part 3.)
+
+Notice that the allocation unit is now a row of 8x8 blocks, corresponding to
+eight rows of samples.  Otherwise the structure is much the same as for
+samples, and for the same reasons.
+
+
+*** Suspendable processing ***
+
+In some applications it is desirable to use the JPEG library as an
+incremental, memory-to-memory filter.  In this situation the data source or
+destination may be a limited-size buffer, and we can't rely on being able to
+empty or refill the buffer at arbitrary times.  Instead the application would
+like to have control return from the library at buffer overflow/underrun, and
+then resume compression or decompression at a later time.
+
+This scenario is supported for simple cases.  (For anything more complex, we
+recommend that the application "bite the bullet" and develop real multitasking
+capability.)  The libjpeg.txt file goes into more detail about the usage and
+limitations of this capability; here we address the implications for library
+structure.
+
+The essence of the problem is that the entropy codec (coder or decoder) must
+be prepared to stop at arbitrary times.  In turn, the controllers that call
+the entropy codec must be able to stop before having produced or consumed all
+the data that they normally would handle in one call.  That part is reasonably
+straightforward: we make the controller call interfaces include "progress
+counters" which indicate the number of data chunks successfully processed, and
+we require callers to test the counter rather than just assume all of the data
+was processed.
+
+Rather than trying to restart at an arbitrary point, the current Huffman
+codecs are designed to restart at the beginning of the current MCU after a
+suspension due to buffer overflow/underrun.  At the start of each call, the
+codec's internal state is loaded from permanent storage (in the JPEG object
+structures) into local variables.  On successful completion of the MCU, the
+permanent state is updated.  (This copying is not very expensive, and may even
+lead to *improved* performance if the local variables can be registerized.)
+If a suspension occurs, the codec simply returns without updating the state,
+thus effectively reverting to the start of the MCU.  Note that this implies
+leaving some data unprocessed in the source/destination buffer (ie, the
+compressed partial MCU).  The data source/destination module interfaces are
+specified so as to make this possible.  This also implies that the data buffer
+must be large enough to hold a worst-case compressed MCU; a couple thousand
+bytes should be enough.
+
+In a successive-approximation AC refinement scan, the progressive Huffman
+decoder has to be able to undo assignments of newly nonzero coefficients if it
+suspends before the MCU is complete, since decoding requires distinguishing
+previously-zero and previously-nonzero coefficients.  This is a bit tedious
+but probably won't have much effect on performance.  Other variants of Huffman
+decoding need not worry about this, since they will just store the same values
+again if forced to repeat the MCU.
+
+This approach would probably not work for an arithmetic codec, since its
+modifiable state is quite large and couldn't be copied cheaply.  Instead it
+would have to suspend and resume exactly at the point of the buffer end.
+
+The JPEG marker reader is designed to cope with suspension at an arbitrary
+point.  It does so by backing up to the start of the marker parameter segment,
+so the data buffer must be big enough to hold the largest marker of interest.
+Again, a couple KB should be adequate.  (A special "skip" convention is used
+to bypass COM and APPn markers, so these can be larger than the buffer size
+without causing problems; otherwise a 64K buffer would be needed in the worst
+case.)
+
+The JPEG marker writer currently does *not* cope with suspension.
+We feel that this is not necessary; it is much easier simply to require
+the application to ensure there is enough buffer space before starting.  (An
+empty 2K buffer is more than sufficient for the header markers; and ensuring
+there are a dozen or two bytes available before calling jpeg_finish_compress()
+will suffice for the trailer.)  This would not work for writing multi-scan
+JPEG files, but we simply do not intend to support that capability with
+suspension.
+
+
+*** Memory manager services ***
+
+The JPEG library's memory manager controls allocation and deallocation of
+memory, and it manages large "virtual" data arrays on machines where the
+operating system does not provide virtual memory.  Note that the same
+memory manager serves both compression and decompression operations.
+
+In all cases, allocated objects are tied to a particular compression or
+decompression master record, and they will be released when that master
+record is destroyed.
+
+The memory manager does not provide explicit deallocation of objects.
+Instead, objects are created in "pools" of free storage, and a whole pool
+can be freed at once.  This approach helps prevent storage-leak bugs, and
+it speeds up operations whenever malloc/free are slow (as they often are).
+The pools can be regarded as lifetime identifiers for objects.  Two
+pools/lifetimes are defined:
+  * JPOOL_PERMANENT     lasts until master record is destroyed
+  * JPOOL_IMAGE         lasts until done with image (JPEG datastream)
+Permanent lifetime is used for parameters and tables that should be carried
+across from one datastream to another; this includes all application-visible
+parameters.  Image lifetime is used for everything else.  (A third lifetime,
+JPOOL_PASS = one processing pass, was originally planned.  However it was
+dropped as not being worthwhile.  The actual usage patterns are such that the
+peak memory usage would be about the same anyway; and having per-pass storage
+substantially complicates the virtual memory allocation rules --- see below.)
+
+The memory manager deals with three kinds of object:
+1. "Small" objects.  Typically these require no more than 10K-20K total.
+2. "Large" objects.  These may require tens to hundreds of K depending on
+   image size.  Semantically they behave the same as small objects, but we
+   distinguish them because pool allocation heuristics may differ for large and
+   small objects (historically, large objects were also referenced by far
+   pointers on MS-DOS machines).  Note that individual "large" objects cannot
+   exceed the size allowed by type size_t, which may be 64K or less on some
+   machines.
+3. "Virtual" objects.  These are large 2-D arrays of JSAMPLEs or JBLOCKs
+   (typically large enough for the entire image being processed).  The
+   memory manager provides stripwise access to these arrays.  On machines
+   without virtual memory, the rest of the array may be swapped out to a
+   temporary file.
+
+(Note: JSAMPARRAY and JBLOCKARRAY data structures are a combination of large
+objects for the data proper and small objects for the row pointers.  For
+convenience and speed, the memory manager provides single routines to create
+these structures.  Similarly, virtual arrays include a small control block
+and a JSAMPARRAY or JBLOCKARRAY working buffer, all created with one call.)
+
+In the present implementation, virtual arrays are only permitted to have image
+lifespan.  (Permanent lifespan would not be reasonable, and pass lifespan is
+not very useful since a virtual array's raison d'etre is to store data for
+multiple passes through the image.)  We also expect that only "small" objects
+will be given permanent lifespan, though this restriction is not required by
+the memory manager.
+
+In a non-virtual-memory machine, some performance benefit can be gained by
+making the in-memory buffers for virtual arrays be as large as possible.
+(For small images, the buffers might fit entirely in memory, so blind
+swapping would be very wasteful.)  The memory manager will adjust the height
+of the buffers to fit within a prespecified maximum memory usage.  In order
+to do this in a reasonably optimal fashion, the manager needs to allocate all
+of the virtual arrays at once.  Therefore, there isn't a one-step allocation
+routine for virtual arrays; instead, there is a "request" routine that simply
+allocates the control block, and a "realize" routine (called just once) that
+determines space allocation and creates all of the actual buffers.  The
+realize routine must allow for space occupied by non-virtual large objects.
+(We don't bother to factor in the space needed for small objects, on the
+grounds that it isn't worth the trouble.)
+
+To support all this, we establish the following protocol for doing business
+with the memory manager:
+  1. Modules must request virtual arrays (which may have only image lifespan)
+     during the initial setup phase, i.e., in their jinit_xxx routines.
+  2. All "large" objects (including JSAMPARRAYs and JBLOCKARRAYs) must also be
+     allocated during initial setup.
+  3. realize_virt_arrays will be called at the completion of initial setup.
+     The above conventions ensure that sufficient information is available
+     for it to choose a good size for virtual array buffers.
+Small objects of any lifespan may be allocated at any time.  We expect that
+the total space used for small objects will be small enough to be negligible
+in the realize_virt_arrays computation.
+
+In a virtual-memory machine, we simply pretend that the available space is
+infinite, thus causing realize_virt_arrays to decide that it can allocate all
+the virtual arrays as full-size in-memory buffers.  The overhead of the
+virtual-array access protocol is very small when no swapping occurs.
+
+A virtual array can be specified to be "pre-zeroed"; when this flag is set,
+never-yet-written sections of the array are set to zero before being made
+available to the caller.  If this flag is not set, never-written sections
+of the array contain garbage.  (This feature exists primarily because the
+equivalent logic would otherwise be needed in jdcoefct.c for progressive
+JPEG mode; we may as well make it available for possible other uses.)
+
+The first write pass on a virtual array is required to occur in top-to-bottom
+order; read passes, as well as any write passes after the first one, may
+access the array in any order.  This restriction exists partly to simplify
+the virtual array control logic, and partly because some file systems may not
+support seeking beyond the current end-of-file in a temporary file.  The main
+implication of this restriction is that rearrangement of rows (such as
+converting top-to-bottom data order to bottom-to-top) must be handled while
+reading data out of the virtual array, not while putting it in.
+
+
+*** Memory manager internal structure ***
+
+To isolate system dependencies as much as possible, we have broken the
+memory manager into two parts.  There is a reasonably system-independent
+"front end" (jmemmgr.c) and a "back end" that contains only the code
+likely to change across systems.  All of the memory management methods
+outlined above are implemented by the front end.  The back end provides
+the following routines for use by the front end (none of these routines
+are known to the rest of the JPEG code):
+
+jpeg_mem_init, jpeg_mem_term    system-dependent initialization/shutdown
+
+jpeg_get_small, jpeg_free_small interface to malloc and free library routines
+                                (or their equivalents)
+
+jpeg_get_large, jpeg_free_large historically were used to interface with
+                                FAR malloc/free on MS-DOS machines; now the
+                                same as jpeg_get_small/jpeg_free_small
+
+jpeg_mem_available              estimate available memory
+
+jpeg_open_backing_store         create a backing-store object
+
+read_backing_store,             manipulate a backing-store object
+write_backing_store,
+close_backing_store
+
+On some systems there will be more than one type of backing-store object.
+jpeg_open_backing_store is responsible for choosing how to implement a given
+object.  The read/write/close routines are method pointers in the structure
+that describes a given object; this lets them be different for different object
+types.
+
+It may be necessary to ensure that backing store objects are explicitly
+released upon abnormal program termination.  To support this, we will expect
+the main program or surrounding application to arrange to call self_destruct
+(typically via jpeg_destroy) upon abnormal termination.  This may require a
+SIGINT signal handler or equivalent.  We don't want to have the back end module
+install its own signal handler, because that would pre-empt the surrounding
+application's ability to control signal handling.
+
+The IJG distribution includes several memory manager back end implementations.
+Usually the same back end should be suitable for all applications on a given
+system, but it is possible for an application to supply its own back end at
+need.
+
+
+*** Implications of DNL marker ***
+
+Some JPEG files may use a DNL marker to postpone definition of the image
+height (this would be useful for a fax-like scanner's output, for instance).
+In these files the SOF marker claims the image height is 0, and you only
+find out the true image height at the end of the first scan.
+
+We could read these files as follows:
+1. Upon seeing zero image height, replace it by 65535 (the maximum allowed).
+2. When the DNL is found, update the image height in the global image
+   descriptor.
+This implies that control modules must avoid making copies of the image
+height, and must re-test for termination after each MCU row.  This would
+be easy enough to do.
+
+In cases where image-size data structures are allocated, this approach will
+result in very inefficient use of virtual memory or much-larger-than-necessary
+temporary files.  This seems acceptable for something that probably won't be a
+mainstream usage.  People might have to forgo use of memory-hogging options
+(such as two-pass color quantization or noninterleaved JPEG files) if they
+want efficient conversion of such files.  (One could improve efficiency by
+demanding a user-supplied upper bound for the height, less than 65536; in most
+cases it could be much less.)
+
+The standard also permits the SOF marker to overestimate the image height,
+with a DNL to give the true, smaller height at the end of the first scan.
+This would solve the space problems if the overestimate wasn't too great.
+However, it implies that you don't even know whether DNL will be used.
+
+This leads to a couple of very serious objections:
+1. Testing for a DNL marker must occur in the inner loop of the decompressor's
+   Huffman decoder; this implies a speed penalty whether the feature is used
+   or not.
+2. There is no way to hide the last-minute change in image height from an
+   application using the decoder.  Thus *every* application using the IJG
+   library would suffer a complexity penalty whether it cared about DNL or
+   not.
+We currently do not support DNL because of these problems.
+
+A different approach is to insist that DNL-using files be preprocessed by a
+separate program that reads ahead to the DNL, then goes back and fixes the SOF
+marker.  This is a much simpler solution and is probably far more efficient.
+Even if one wants piped input, buffering the first scan of the JPEG file needs
+a lot smaller temp file than is implied by the maximum-height method.  For
+this approach we'd simply treat DNL as a no-op in the decompressor (at most,
+check that it matches the SOF image height).
+
+We will not worry about making the compressor capable of outputting DNL.
+Something similar to the first scheme above could be applied if anyone ever
+wants to make that work.
diff --git a/external/jpeg/tjbench.c b/external/jpeg/tjbench.c
index faad9784b82a..156c9061dc44 100644
--- a/external/jpeg/tjbench.c
+++ b/external/jpeg/tjbench.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2019 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2019, 2021 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -800,6 +800,8 @@ static void usage(char *progName)
   printf("-componly = Stop after running compression tests.  Do not test decompression.\n");
   printf("-nowrite = Do not write reference or output images (improves consistency of\n");
   printf("     performance measurements.)\n");
+  printf("-limitscans = Refuse to decompress or transform progressive JPEG images that\n");
+  printf("     have an unreasonably large number of scans\n");
   printf("-stoponwarning = Immediately discontinue the current\n");
   printf("     compression/decompression/transform operation if the underlying codec\n");
   printf("     throws a warning (non-fatal error)\n\n");
@@ -955,6 +957,8 @@ int main(int argc, char *argv[])
         compOnly = 1;
       else if (!strcasecmp(argv[i], "-nowrite"))
         doWrite = 0;
+      else if (!strcasecmp(argv[i], "-limitscans"))
+        flags |= TJFLAG_LIMITSCANS;
       else if (!strcasecmp(argv[i], "-stoponwarning"))
         flags |= TJFLAG_STOPONWARNING;
       else usage(argv[0]);
diff --git a/external/jpeg/tjexample.c b/external/jpeg/tjexample.c
index ef32c939a91e..a9cd865b3ea7 100644
--- a/external/jpeg/tjexample.c
+++ b/external/jpeg/tjexample.c
@@ -1,6 +1,6 @@
 /*
- * Copyright (C)2011-2012, 2014-2015, 2017, 2019 D. R. Commander.
- *                                               All Rights Reserved.
+ * Copyright (C)2011-2012, 2014-2015, 2017, 2019, 2021 D. R. Commander.
+ *                                                     All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -288,8 +288,10 @@ int main(int argc, char **argv)
         THROW_TJ("initializing transformer");
       xform.options |= TJXOPT_TRIM;
       if (tjTransform(tjInstance, jpegBuf, jpegSize, 1, &dstBuf, &dstSize,
-                      &xform, flags) < 0)
+                      &xform, flags) < 0) {
+        tjFree(dstBuf);
         THROW_TJ("transforming input image");
+      }
       tjFree(jpegBuf);
       jpegBuf = dstBuf;
       jpegSize = dstSize;
diff --git a/external/jpeg/tjutil.h b/external/jpeg/tjutil.h
index f72840ceeba5..8542bab982ef 100644
--- a/external/jpeg/tjutil.h
+++ b/external/jpeg/tjutil.h
@@ -30,7 +30,7 @@
 #ifndef __MINGW32__
 #include <stdio.h>
 #define snprintf(str, n, format, ...) \
-  _snprintf_s(str, n, _TRUNCATE, format, __VA_ARGS__)
+  _snprintf_s(str, n, _TRUNCATE, format, ##__VA_ARGS__)
 #endif
 #define strcasecmp  stricmp
 #define strncasecmp  strnicmp
diff --git a/external/jpeg/transupp.c b/external/jpeg/transupp.c
index 77b33c46b88a..6e860778908f 100644
--- a/external/jpeg/transupp.c
+++ b/external/jpeg/transupp.c
@@ -2,7 +2,7 @@
  * transupp.c
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1997-2019, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2010, 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -88,6 +88,189 @@
  */
 
 
+LOCAL(void)
+dequant_comp(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             jvirt_barray_ptr coef_array, JQUANT_TBL *qtblptr1)
+{ /* Rescale coef_array in place from compptr->quant_table to qtblptr1. */
+  JDIMENSION blk_x, blk_y;
+  int offset_y, k;
+  JQUANT_TBL *qtblptr;
+  JBLOCKARRAY buffer;
+  JBLOCKROW block;
+  JCOEFPTR ptr;
+
+  qtblptr = compptr->quant_table;  /* the component's current quant table */
+  for (blk_y = 0; blk_y < compptr->height_in_blocks;
+       blk_y += compptr->v_samp_factor) {  /* v_samp_factor block rows/pass */
+    buffer = (*cinfo->mem->access_virt_barray)
+      ((j_common_ptr)cinfo, coef_array, blk_y,
+       (JDIMENSION)compptr->v_samp_factor, TRUE);  /* TRUE: writable access */
+    for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+      block = buffer[offset_y];
+      for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
+        ptr = block[blk_x];  /* one block of DCTSIZE2 coefficients */
+        for (k = 0; k < DCTSIZE2; k++)  /* touch only entries that differ */
+          if (qtblptr->quantval[k] != qtblptr1->quantval[k])  /* int ratio: */
+            ptr[k] *= qtblptr->quantval[k] / qtblptr1->quantval[k]; /* exact only if new step divides old */
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+requant_comp(j_decompress_ptr cinfo, jpeg_component_info *compptr,
+             jvirt_barray_ptr coef_array, JQUANT_TBL *qtblptr1)
+{ /* Requantize coef_array in place from compptr->quant_table to qtblptr1. */
+  JDIMENSION blk_x, blk_y;
+  int offset_y, k;
+  JQUANT_TBL *qtblptr;
+  JBLOCKARRAY buffer;
+  JBLOCKROW block;
+  JCOEFPTR ptr;
+  JCOEF temp, qval;  /* NOTE(review): JCOEF may be 16-bit; confirm step*coef below cannot overflow */
+
+  qtblptr = compptr->quant_table;  /* the component's current quant table */
+  for (blk_y = 0; blk_y < compptr->height_in_blocks;
+       blk_y += compptr->v_samp_factor) {  /* v_samp_factor block rows/pass */
+    buffer = (*cinfo->mem->access_virt_barray)
+      ((j_common_ptr)cinfo, coef_array, blk_y,
+       (JDIMENSION)compptr->v_samp_factor, TRUE);  /* TRUE: writable access */
+    for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+      block = buffer[offset_y];
+      for (blk_x = 0; blk_x < compptr->width_in_blocks; blk_x++) {
+        ptr = block[blk_x];
+        for (k = 0; k < DCTSIZE2; k++) {
+          temp = qtblptr->quantval[k];   /* old step size */
+          qval = qtblptr1->quantval[k];  /* new step size */
+          if (temp != qval) {  /* entries with equal steps are left untouched */
+            temp *= ptr[k];  /* old step * coefficient = dequantized value */
+            /* The following quantization code is copied from jcdctmgr.c */
+#ifdef FAST_DIVIDE
+#define DIVIDE_BY(a, b)  a /= b
+#else
+#define DIVIDE_BY(a, b)  if (a >= b) a /= b;  else a = 0
+#endif
+            if (temp < 0) {  /* divide the magnitude, then restore the sign */
+              temp = -temp;
+              temp += qval >> 1; /* for rounding */
+              DIVIDE_BY(temp, qval);
+              temp = -temp;
+            } else {
+              temp += qval >> 1; /* for rounding */
+              DIVIDE_BY(temp, qval);
+            }
+            ptr[k] = temp;  /* coefficient now expressed in the new step size */
+          }
+        }
+      }
+    }
+  }
+}
+
+
+/*
+ * Greatest common divisor of a and b (Euclid's algorithm); b must be nonzero.
+ */
+LOCAL(JCOEF)
+largest_common_denominator(JCOEF a, JCOEF b)
+{
+  JCOEF c;  /* remainder of the current division step */
+
+  do {
+    c = a % b;  /* runs at least once, so b == 0 would divide by zero */
+    a = b;
+    b = c;
+  } while (c);  /* stop once the remainder reaches zero */
+
+  return a;  /* last nonzero divisor */
+}
+
+
+LOCAL(void)
+adjust_quant(j_decompress_ptr srcinfo, jvirt_barray_ptr *src_coef_arrays,
+             j_decompress_ptr dropinfo, jvirt_barray_ptr *drop_coef_arrays,
+             boolean trim, j_compress_ptr dstinfo)
+{ /* Reconcile source and drop coefficients when their quant tables differ. */
+  jpeg_component_info *compptr1, *compptr2;    /* 1 = source, 2 = drop */
+  JQUANT_TBL *qtblptr1, *qtblptr2, *qtblptr3;  /* 3 = destination's table */
+  int ci, k;
+
+  for (ci = 0; ci < dstinfo->num_components && ci < dropinfo->num_components;
+       ci++) {  /* only components present in both destination and drop */
+    compptr1 = srcinfo->comp_info + ci;
+    compptr2 = dropinfo->comp_info + ci;
+    qtblptr1 = compptr1->quant_table;
+    qtblptr2 = compptr2->quant_table;
+    for (k = 0; k < DCTSIZE2; k++) {  /* look for the first differing entry */
+      if (qtblptr1->quantval[k] != qtblptr2->quantval[k]) {
+        if (trim)  /* keep the source table; requantize only the drop data */
+          requant_comp(dropinfo, compptr2, drop_coef_arrays[ci], qtblptr1);
+        else {  /* NOTE(review): equal entries of qtblptr3 are not rewritten; presumably they already match the source — confirm */
+          qtblptr3 = dstinfo->quant_tbl_ptrs[compptr1->quant_tbl_no];
+          for (k = 0; k < DCTSIZE2; k++)  /* reuses k: safe only due to break below */
+            if (qtblptr1->quantval[k] != qtblptr2->quantval[k])
+              qtblptr3->quantval[k] =  /* gcd divides both steps evenly */
+                largest_common_denominator(qtblptr1->quantval[k],
+                                           qtblptr2->quantval[k]);
+          dequant_comp(srcinfo, compptr1, src_coef_arrays[ci], qtblptr3);
+          dequant_comp(dropinfo, compptr2, drop_coef_arrays[ci], qtblptr3);
+        }
+        break;  /* whole component handled above; stop scanning entries */
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_drop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+        JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+        jvirt_barray_ptr *src_coef_arrays,
+        j_decompress_ptr dropinfo, jvirt_barray_ptr *drop_coef_arrays,
+        JDIMENSION drop_width, JDIMENSION drop_height)
+/* Drop (insert) the contents of another image into the source image.  If the
+ * number of components in the drop image is smaller than the number of
+ * components in the destination image, then we fill in the remaining
+ * components with zero.  This allows for dropping the contents of grayscale
+ * images into (arbitrarily sampled) color images.
+ */
+{
+  JDIMENSION comp_width, comp_height;  /* drop region size for this component */
+  JDIMENSION blk_y, x_drop_blocks, y_drop_blocks;  /* region's top-left offset */
+  int ci, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = drop_width * compptr->h_samp_factor;  /* presumably iMCUs -> blocks; TODO confirm units with caller */
+    comp_height = drop_height * compptr->v_samp_factor;
+    x_drop_blocks = x_crop_offset * compptr->h_samp_factor;
+    y_drop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (blk_y = 0; blk_y < comp_height; blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)  /* "dst" is the source image's own array, overwritten in place */
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], blk_y + y_drop_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);  /* TRUE: writable access */
+      if (ci < dropinfo->num_components) {  /* drop image has this component */
+        src_buffer = (*dropinfo->mem->access_virt_barray)
+          ((j_common_ptr)dropinfo, drop_coef_arrays[ci], blk_y,
+           (JDIMENSION)compptr->v_samp_factor, FALSE);  /* FALSE: read-only */
+        for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+          jcopy_block_row(src_buffer[offset_y],  /* copy comp_width blocks */
+                          dst_buffer[offset_y] + x_drop_blocks, comp_width);
+        }
+      } else {  /* component missing from the drop image: zero the region */
+        for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+          MEMZERO(dst_buffer[offset_y] + x_drop_blocks,
+                  comp_width * sizeof(JBLOCK));
+        }
+      }
+    }
+  }
+}
+
+
 LOCAL(void)
 do_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
         JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
@@ -124,6 +307,417 @@ do_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
 }
 
 
+LOCAL(void)
+do_crop_ext_zero(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                 JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                 jvirt_barray_ptr *src_coef_arrays,
+                 jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: If the destination size is larger than the source, the source
+ * blocks are placed at the crop offset inside the destination and the expanded
+ * region is filled with zero (neutral gray).  Partial iMCUs at the right and
+ * bottom edge of the source image area are zeroed as well in this case.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height;
+  JDIMENSION dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /   /* truncates: partial iMCU excluded */
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /  /* truncates: partial iMCU excluded */
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;   /* source area, in blocks */
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor; /* offset, in blocks */
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) { /* vertical extension */
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          /* Row group lies above or below the source area: all zero. */
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            MEMZERO(dst_buffer[offset_y],
+                    compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;                     /* no source rows to copy */
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {                          /* ordinary vertical crop */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (dstinfo->_jpeg_width > srcinfo->output_width) { /* horiz. extension */
+          if (x_crop_blocks > 0) {      /* zero the blocks left of the source */
+            MEMZERO(dst_buffer[offset_y], x_crop_blocks * sizeof(JBLOCK));
+          }
+          jcopy_block_row(src_buffer[offset_y],
+                          dst_buffer[offset_y] + x_crop_blocks, comp_width);
+          if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+            /* Zero the blocks right of the copied source area. */
+            MEMZERO(dst_buffer[offset_y] + x_crop_blocks + comp_width,
+                    (compptr->width_in_blocks - x_crop_blocks - comp_width) *
+                    sizeof(JBLOCK));
+          }
+        } else {                        /* ordinary horizontal crop */
+          jcopy_block_row(src_buffer[offset_y] + x_crop_blocks,
+                          dst_buffer[offset_y], compptr->width_in_blocks);
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_crop_ext_flat(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                 JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                 jvirt_barray_ptr *src_coef_arrays,
+                 jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: The destination width is larger than the source; blocks left and
+ * right of the source area (including partial iMCUs at the right edge) get
+ * only the DC coefficient of the nearest source block in the same row.  If the
+ * destination is also taller, rows above/below the source area are zeroed.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height;
+  JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, offset_y;
+  JCOEF dc;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /   /* truncates: partial iMCU excluded */
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /  /* truncates: partial iMCU excluded */
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;   /* source area, in blocks */
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor; /* offset, in blocks */
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) { /* vertical extension */
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          /* Row group lies above or below the source area: all zero. */
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            MEMZERO(dst_buffer[offset_y],
+                    compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;                     /* no source rows to copy */
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {                          /* ordinary vertical crop */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+          FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (x_crop_blocks > 0) {        /* flat-fill the blocks left of source */
+          MEMZERO(dst_buffer[offset_y], x_crop_blocks * sizeof(JBLOCK));
+          dc = src_buffer[offset_y][0][0];   /* DC of leftmost source block */
+          for (dst_blk_x = 0; dst_blk_x < x_crop_blocks; dst_blk_x++) {
+            dst_buffer[offset_y][dst_blk_x][0] = dc;
+          }
+        }
+        jcopy_block_row(src_buffer[offset_y],
+                        dst_buffer[offset_y] + x_crop_blocks, comp_width);
+        if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+          /* Flat-fill the blocks right of the copied source area. */
+          MEMZERO(dst_buffer[offset_y] + x_crop_blocks + comp_width,
+                  (compptr->width_in_blocks - x_crop_blocks - comp_width) *
+                  sizeof(JBLOCK));
+          dc = src_buffer[offset_y][comp_width - 1][0]; /* last copied block's DC */
+          for (dst_blk_x = x_crop_blocks + comp_width;
+               dst_blk_x < compptr->width_in_blocks; dst_blk_x++) {
+            dst_buffer[offset_y][dst_blk_x][0] = dc;
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_crop_ext_reflect(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+                    JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+                    jvirt_barray_ptr *src_coef_arrays,
+                    jvirt_barray_ptr *dst_coef_arrays)
+/* Crop.  This is only used when no rotate/flip is requested with the crop.
+ * Extension: The destination width is larger than the source; the region left
+ * and right of the source area (including partial iMCUs at the right edge) is
+ * filled with repeated horizontal reflections of the source blocks.  If the
+ * destination is also taller, rows above/below the source area are zeroed.
+ */
+{
+  JDIMENSION MCU_cols, MCU_rows, comp_width, comp_height, src_blk_x;
+  JDIMENSION dst_blk_x, dst_blk_y, x_crop_blocks, y_crop_blocks;
+  int ci, k, offset_y;
+  JBLOCKARRAY src_buffer, dst_buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  MCU_cols = srcinfo->output_width /   /* truncates: partial iMCU excluded */
+             (dstinfo->max_h_samp_factor * dstinfo_min_DCT_h_scaled_size);
+  MCU_rows = srcinfo->output_height /  /* truncates: partial iMCU excluded */
+             (dstinfo->max_v_samp_factor * dstinfo_min_DCT_v_scaled_size);
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    comp_width = MCU_cols * compptr->h_samp_factor;   /* source area, in blocks */
+    comp_height = MCU_rows * compptr->v_samp_factor;
+    x_crop_blocks = x_crop_offset * compptr->h_samp_factor; /* offset, in blocks */
+    y_crop_blocks = y_crop_offset * compptr->v_samp_factor;
+    for (dst_blk_y = 0; dst_blk_y < compptr->height_in_blocks;
+         dst_blk_y += compptr->v_samp_factor) {
+      dst_buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, dst_coef_arrays[ci], dst_blk_y,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      if (dstinfo->_jpeg_height > srcinfo->output_height) { /* vertical extension */
+        if (dst_blk_y < y_crop_blocks ||
+            dst_blk_y >= y_crop_blocks + comp_height) {
+          /* Row group lies above or below the source area: all zero. */
+          for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+            MEMZERO(dst_buffer[offset_y],
+                    compptr->width_in_blocks * sizeof(JBLOCK));
+          }
+          continue;                     /* no source rows to copy */
+        }
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y - y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      } else {                          /* ordinary vertical crop */
+        src_buffer = (*srcinfo->mem->access_virt_barray)
+          ((j_common_ptr)srcinfo, src_coef_arrays[ci],
+           dst_blk_y + y_crop_blocks, (JDIMENSION)compptr->v_samp_factor,
+           FALSE);
+      }
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        /* Copy source region */
+        jcopy_block_row(src_buffer[offset_y],
+                        dst_buffer[offset_y] + x_crop_blocks, comp_width);
+        if (x_crop_blocks > 0) {
+          /* Reflect to left */
+          dst_row_ptr = dst_buffer[offset_y] + x_crop_blocks;
+          for (dst_blk_x = x_crop_blocks; dst_blk_x > 0;) { /* blocks left to fill */
+            src_row_ptr = dst_row_ptr;      /* (re)set axis of reflection */
+            /* One pass mirrors at most comp_width blocks about the axis. */
+            for (src_blk_x = comp_width; src_blk_x > 0 && dst_blk_x > 0;
+                 src_blk_x--, dst_blk_x--) {
+              dst_ptr = *(--dst_row_ptr);   /* destination goes left */
+              src_ptr = *src_row_ptr++;     /* source goes right */
+              /* This unrolled loop doesn't need to know which row it's on. */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;    /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign
+                                               change */
+              }
+            }
+          }
+        }
+        if (compptr->width_in_blocks > x_crop_blocks + comp_width) {
+          /* Reflect to right */
+          dst_row_ptr = dst_buffer[offset_y] + x_crop_blocks + comp_width;
+          for (dst_blk_x = compptr->width_in_blocks - x_crop_blocks - comp_width;
+               dst_blk_x > 0;) {            /* blocks left to fill */
+            src_row_ptr = dst_row_ptr;      /* (re)set axis of reflection */
+            /* One pass mirrors at most comp_width blocks about the axis. */
+            for (src_blk_x = comp_width; src_blk_x > 0 && dst_blk_x > 0;
+                 src_blk_x--, dst_blk_x--) {
+              dst_ptr = *dst_row_ptr++;     /* destination goes right */
+              src_ptr = *(--src_row_ptr);   /* source goes left */
+              /* This unrolled loop doesn't need to know which row it's on. */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;    /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign
+                                               change */
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_wipe(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+        JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+        jvirt_barray_ptr *src_coef_arrays,
+        JDIMENSION drop_width, JDIMENSION drop_height)
+/* Wipe - discard image contents of specified region and fill with zero
+ * (neutral gray).  The region is modified in place in src_coef_arrays.
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width;
+  JDIMENSION y_wipe_blocks, wipe_bottom;
+  int ci, offset_y;
+  JBLOCKARRAY buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    /* Scale the region's offset and size by this component's sampling factors. */
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    y_wipe_blocks = y_crop_offset * compptr->v_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor + y_wipe_blocks;
+    for (; y_wipe_blocks < wipe_bottom;   /* y_wipe_blocks doubles as row cursor */
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        MEMZERO(buffer[offset_y] + x_wipe_blocks, wipe_width * sizeof(JBLOCK));
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_flatten(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, JDIMENSION y_crop_offset,
+           jvirt_barray_ptr *src_coef_arrays,
+           JDIMENSION drop_width, JDIMENSION drop_height)
+/* Flatten - discard image contents of specified region, similarly to wipe, but
+ * fill with the average DC of the blocks left and right of it in the same row.
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width, wipe_right;
+  JDIMENSION y_wipe_blocks, wipe_bottom, blk_x;
+  int ci, offset_y, dc_left_value, dc_right_value, average;
+  JBLOCKARRAY buffer;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    /* Scale the region's offset and size by this component's sampling factors. */
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    wipe_right = wipe_width + x_wipe_blocks;   /* first block right of region */
+    y_wipe_blocks = y_crop_offset * compptr->v_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor + y_wipe_blocks;
+    for (; y_wipe_blocks < wipe_bottom;   /* y_wipe_blocks doubles as row cursor */
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        MEMZERO(buffer[offset_y] + x_wipe_blocks, wipe_width * sizeof(JBLOCK));
+        if (x_wipe_blocks > 0) {          /* a left neighbor exists */
+          dc_left_value = buffer[offset_y][x_wipe_blocks - 1][0];
+          if (wipe_right < compptr->width_in_blocks) { /* both neighbors exist */
+            dc_right_value = buffer[offset_y][wipe_right][0];
+            average = (dc_left_value + dc_right_value) >> 1;
+          } else {
+            average = dc_left_value;      /* left neighbor only */
+          }
+        } else if (wipe_right < compptr->width_in_blocks) {
+          average = buffer[offset_y][wipe_right][0];   /* right neighbor only */
+        } else continue;                  /* no neighbor: row stays zeroed */
+        for (blk_x = x_wipe_blocks; blk_x < wipe_right; blk_x++) {
+          buffer[offset_y][blk_x][0] = (JCOEF)average; /* only DC is set */
+        }
+      }
+    }
+  }
+}
+
+
+LOCAL(void)
+do_reflect(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
+           JDIMENSION x_crop_offset, jvirt_barray_ptr *src_coef_arrays,
+           JDIMENSION drop_width, JDIMENSION drop_height)
+/* Reflect - discard image contents of specified region, similarly to wipe,
+ * but fill with repeated reflections of the outside region (left side if any,
+ * else right side, else zero).  NB: y_crop_offset is assumed to be zero.
+ */
+{
+  JDIMENSION x_wipe_blocks, wipe_width;
+  JDIMENSION y_wipe_blocks, wipe_bottom;
+  JDIMENSION src_blk_x, dst_blk_x;
+  int ci, k, offset_y;
+  JBLOCKARRAY buffer;
+  JBLOCKROW src_row_ptr, dst_row_ptr;
+  JCOEFPTR src_ptr, dst_ptr;
+  jpeg_component_info *compptr;
+
+  for (ci = 0; ci < dstinfo->num_components; ci++) {
+    compptr = dstinfo->comp_info + ci;
+    /* Scale the region's offset and size by this component's sampling factors. */
+    x_wipe_blocks = x_crop_offset * compptr->h_samp_factor;
+    wipe_width = drop_width * compptr->h_samp_factor;
+    wipe_bottom = drop_height * compptr->v_samp_factor;
+    for (y_wipe_blocks = 0; y_wipe_blocks < wipe_bottom; /* rows start at 0 */
+         y_wipe_blocks += compptr->v_samp_factor) {
+      buffer = (*srcinfo->mem->access_virt_barray)
+        ((j_common_ptr)srcinfo, src_coef_arrays[ci], y_wipe_blocks,
+         (JDIMENSION)compptr->v_samp_factor, TRUE);
+      for (offset_y = 0; offset_y < compptr->v_samp_factor; offset_y++) {
+        if (x_wipe_blocks > 0) {
+          /* Reflect from left */
+          dst_row_ptr = buffer[offset_y] + x_wipe_blocks;
+          for (dst_blk_x = wipe_width; dst_blk_x > 0;) { /* blocks left to fill */
+            src_row_ptr = dst_row_ptr;     /* (re)set axis of reflection */
+            for (src_blk_x = x_wipe_blocks; /* at most this many blocks per pass */
+                 src_blk_x > 0 && dst_blk_x > 0; src_blk_x--, dst_blk_x--) {
+              dst_ptr = *dst_row_ptr++;    /* destination goes right */
+              src_ptr = *(--src_row_ptr);  /* source goes left */
+              /* this unrolled loop doesn't need to know which row it's on... */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;   /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign change */
+              }
+            }
+          }
+        } else if (compptr->width_in_blocks > x_wipe_blocks + wipe_width) {
+          /* Reflect from right */
+          dst_row_ptr = buffer[offset_y] + x_wipe_blocks + wipe_width;
+          for (dst_blk_x = wipe_width; dst_blk_x > 0;) { /* blocks left to fill */
+            src_row_ptr = dst_row_ptr;     /* (re)set axis of reflection */
+            src_blk_x = compptr->width_in_blocks - x_wipe_blocks - wipe_width;
+            for (; src_blk_x > 0 && dst_blk_x > 0; src_blk_x--, dst_blk_x--) {
+              dst_ptr = *(--dst_row_ptr);  /* destination goes left */
+              src_ptr = *src_row_ptr++;    /* source goes right */
+              /* this unrolled loop doesn't need to know which row it's on... */
+              for (k = 0; k < DCTSIZE2; k += 2) {
+                *dst_ptr++ = *src_ptr++;   /* copy even column */
+                *dst_ptr++ = -(*src_ptr++); /* copy odd column with sign change */
+              }
+            }
+          }
+        } else {                           /* nothing outside the region: zero */
+          MEMZERO(buffer[offset_y] + x_wipe_blocks,
+                  wipe_width * sizeof(JBLOCK));
+        }
+      }
+    }
+  }
+}
+
+
 LOCAL(void)
 do_flip_h_no_crop(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
                   JDIMENSION x_crop_offset, jvirt_barray_ptr *src_coef_arrays)
@@ -783,7 +1377,7 @@ jt_read_integer(const char **strptr, JDIMENSION *result)
  * The routine returns TRUE if the spec string is valid, FALSE if not.
  *
  * The crop spec string should have the format
- *      <width>[f]x<height>[f]{+-}<xoffset>{+-}<yoffset>
+ *      <width>[{fr}]x<height>[{fr}]{+-}<xoffset>{+-}<yoffset>
  * where width, height, xoffset, and yoffset are unsigned integers.
  * Each of the elements can be omitted to indicate a default value.
  * (A weakness of this style is that it is not possible to omit xoffset
@@ -808,6 +1402,9 @@ jtransform_parse_crop_spec(jpeg_transform_info *info, const char *spec)
     if (*spec == 'f' || *spec == 'F') {
       spec++;
       info->crop_width_set = JCROP_FORCE;
+    } else if (*spec == 'r' || *spec == 'R') {
+      spec++;
+      info->crop_width_set = JCROP_REFLECT;
     } else
       info->crop_width_set = JCROP_POS;
   }
@@ -819,6 +1416,9 @@ jtransform_parse_crop_spec(jpeg_transform_info *info, const char *spec)
     if (*spec == 'f' || *spec == 'F') {
       spec++;
       info->crop_height_set = JCROP_FORCE;
+    } else if (*spec == 'r' || *spec == 'R') {
+      spec++;
+      info->crop_height_set = JCROP_REFLECT;
     } else
       info->crop_height_set = JCROP_POS;
   }
@@ -893,10 +1493,10 @@ jtransform_request_workspace(j_decompress_ptr srcinfo,
   jvirt_barray_ptr *coef_arrays;
   boolean need_workspace, transpose_it;
   jpeg_component_info *compptr;
-  JDIMENSION xoffset, yoffset;
+  JDIMENSION xoffset, yoffset, dtemp;
   JDIMENSION width_in_iMCUs, height_in_iMCUs;
   JDIMENSION width_in_blocks, height_in_blocks;
-  int ci, h_samp_factor, v_samp_factor;
+  int itemp, ci, h_samp_factor, v_samp_factor;
 
   /* Determine number of components in output image */
   if (info->force_grayscale &&
@@ -982,39 +1582,129 @@ jtransform_request_workspace(j_decompress_ptr srcinfo,
       info->crop_xoffset = 0;   /* default to +0 */
     if (info->crop_yoffset_set == JCROP_UNSET)
       info->crop_yoffset = 0;   /* default to +0 */
-    if (info->crop_xoffset >= info->output_width ||
-        info->crop_yoffset >= info->output_height)
-      ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
-    if (info->crop_width_set == JCROP_UNSET)
+    if (info->crop_width_set == JCROP_UNSET) {
+      if (info->crop_xoffset >= info->output_width)
+        ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
       info->crop_width = info->output_width - info->crop_xoffset;
-    if (info->crop_height_set == JCROP_UNSET)
+    } else {
+      /* Check for crop extension */
+      if (info->crop_width > info->output_width) {
+        /* Crop extension does not work when transforming! */
+        if (info->transform != JXFORM_NONE ||
+            info->crop_xoffset >= info->crop_width ||
+            info->crop_xoffset > info->crop_width - info->output_width)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      } else {
+        if (info->crop_xoffset >= info->output_width ||
+            info->crop_width <= 0 ||
+            info->crop_xoffset > info->output_width - info->crop_width)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      }
+    }
+    if (info->crop_height_set == JCROP_UNSET) {
+      if (info->crop_yoffset >= info->output_height)
+        ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
       info->crop_height = info->output_height - info->crop_yoffset;
-    /* Ensure parameters are valid */
-    if (info->crop_width <= 0 || info->crop_width > info->output_width ||
-        info->crop_height <= 0 || info->crop_height > info->output_height ||
-        info->crop_xoffset > info->output_width - info->crop_width ||
-        info->crop_yoffset > info->output_height - info->crop_height)
-      ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+    } else {
+      /* Check for crop extension */
+      if (info->crop_height > info->output_height) {
+        /* Crop extension does not work when transforming! */
+        if (info->transform != JXFORM_NONE ||
+            info->crop_yoffset >= info->crop_height ||
+            info->crop_yoffset > info->crop_height - info->output_height)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      } else {
+        if (info->crop_yoffset >= info->output_height ||
+            info->crop_height <= 0 ||
+            info->crop_yoffset > info->output_height - info->crop_height)
+          ERREXIT(srcinfo, JERR_BAD_CROP_SPEC);
+      }
+    }
     /* Convert negative crop offsets into regular offsets */
-    if (info->crop_xoffset_set == JCROP_NEG)
-      xoffset = info->output_width - info->crop_width - info->crop_xoffset;
-    else
+    if (info->crop_xoffset_set != JCROP_NEG)
       xoffset = info->crop_xoffset;
-    if (info->crop_yoffset_set == JCROP_NEG)
-      yoffset = info->output_height - info->crop_height - info->crop_yoffset;
+    else if (info->crop_width > info->output_width) /* crop extension */
+      xoffset = info->crop_width - info->output_width - info->crop_xoffset;
     else
+      xoffset = info->output_width - info->crop_width - info->crop_xoffset;
+    if (info->crop_yoffset_set != JCROP_NEG)
       yoffset = info->crop_yoffset;
-    /* Now adjust so that upper left corner falls at an iMCU boundary */
-    if (info->crop_width_set == JCROP_FORCE)
-      info->output_width = info->crop_width;
+    else if (info->crop_height > info->output_height) /* crop extension */
+      yoffset = info->crop_height - info->output_height - info->crop_yoffset;
     else
-      info->output_width =
-        info->crop_width + (xoffset % info->iMCU_sample_width);
-    if (info->crop_height_set == JCROP_FORCE)
-      info->output_height = info->crop_height;
-    else
-      info->output_height =
-        info->crop_height + (yoffset % info->iMCU_sample_height);
+      yoffset = info->output_height - info->crop_height - info->crop_yoffset;
+    /* Now adjust so that upper left corner falls at an iMCU boundary */
+    switch (info->transform) {
+    case JXFORM_DROP:
+      /* Ensure the effective drop region will not exceed the requested */
+      itemp = info->iMCU_sample_width;
+      dtemp = itemp - 1 - ((xoffset + itemp - 1) % itemp);
+      xoffset += dtemp;
+      if (info->crop_width <= dtemp)
+        info->drop_width = 0;
+      else if (xoffset + info->crop_width - dtemp == info->output_width)
+        /* Matching right edge: include partial iMCU */
+        info->drop_width = (info->crop_width - dtemp + itemp - 1) / itemp;
+      else
+        info->drop_width = (info->crop_width - dtemp) / itemp;
+      itemp = info->iMCU_sample_height;
+      dtemp = itemp - 1 - ((yoffset + itemp - 1) % itemp);
+      yoffset += dtemp;
+      if (info->crop_height <= dtemp)
+        info->drop_height = 0;
+      else if (yoffset + info->crop_height - dtemp == info->output_height)
+        /* Matching bottom edge: include partial iMCU */
+        info->drop_height = (info->crop_height - dtemp + itemp - 1) / itemp;
+      else
+        info->drop_height = (info->crop_height - dtemp) / itemp;
+      /* Check if sampling factors match for dropping */
+      if (info->drop_width != 0 && info->drop_height != 0)
+        for (ci = 0; ci < info->num_components &&
+                     ci < info->drop_ptr->num_components; ci++) {
+          if (info->drop_ptr->comp_info[ci].h_samp_factor *
+              srcinfo->max_h_samp_factor !=
+              srcinfo->comp_info[ci].h_samp_factor *
+              info->drop_ptr->max_h_samp_factor)
+            ERREXIT6(srcinfo, JERR_BAD_DROP_SAMPLING, ci,
+              info->drop_ptr->comp_info[ci].h_samp_factor,
+              info->drop_ptr->max_h_samp_factor,
+              srcinfo->comp_info[ci].h_samp_factor,
+              srcinfo->max_h_samp_factor, 'h');
+          if (info->drop_ptr->comp_info[ci].v_samp_factor *
+              srcinfo->max_v_samp_factor !=
+              srcinfo->comp_info[ci].v_samp_factor *
+              info->drop_ptr->max_v_samp_factor)
+            ERREXIT6(srcinfo, JERR_BAD_DROP_SAMPLING, ci,
+              info->drop_ptr->comp_info[ci].v_samp_factor,
+              info->drop_ptr->max_v_samp_factor,
+              srcinfo->comp_info[ci].v_samp_factor,
+              srcinfo->max_v_samp_factor, 'v');
+        }
+      break;
+    case JXFORM_WIPE:
+      /* Ensure the effective wipe region will cover the requested */
+      info->drop_width = (JDIMENSION)jdiv_round_up
+        ((long)(info->crop_width + (xoffset % info->iMCU_sample_width)),
+         (long)info->iMCU_sample_width);
+      info->drop_height = (JDIMENSION)jdiv_round_up
+        ((long)(info->crop_height + (yoffset % info->iMCU_sample_height)),
+         (long)info->iMCU_sample_height);
+      break;
+    default:
+      /* Ensure the effective crop region will cover the requested */
+      if (info->crop_width_set == JCROP_FORCE ||
+          info->crop_width > info->output_width)
+        info->output_width = info->crop_width;
+      else
+        info->output_width =
+          info->crop_width + (xoffset % info->iMCU_sample_width);
+      if (info->crop_height_set == JCROP_FORCE ||
+          info->crop_height > info->output_height)
+        info->output_height = info->crop_height;
+      else
+        info->output_height =
+          info->crop_height + (yoffset % info->iMCU_sample_height);
+    }
     /* Save x/y offsets measured in iMCUs */
     info->x_crop_offset = xoffset / info->iMCU_sample_width;
     info->y_crop_offset = yoffset / info->iMCU_sample_height;
@@ -1030,7 +1720,9 @@ jtransform_request_workspace(j_decompress_ptr srcinfo,
   transpose_it = FALSE;
   switch (info->transform) {
   case JXFORM_NONE:
-    if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
+    if (info->x_crop_offset != 0 || info->y_crop_offset != 0 ||
+        info->output_width > srcinfo->output_width ||
+        info->output_height > srcinfo->output_height)
       need_workspace = TRUE;
     /* No workspace needed if neither cropping nor transforming */
     break;
@@ -1084,6 +1776,10 @@ jtransform_request_workspace(j_decompress_ptr srcinfo,
     need_workspace = TRUE;
     transpose_it = TRUE;
     break;
+  case JXFORM_WIPE:
+    break;
+  case JXFORM_DROP:
+    break;
   }
 
   /* Allocate workspace if needed.
@@ -1187,47 +1883,47 @@ adjust_exif_parameters(JOCTET *data, unsigned int length, JDIMENSION new_width,
   if (length < 12) return; /* Length of an IFD entry */
 
   /* Discover byte order */
-  if (GETJOCTET(data[0]) == 0x49 && GETJOCTET(data[1]) == 0x49)
+  if (data[0] == 0x49 && data[1] == 0x49)
     is_motorola = FALSE;
-  else if (GETJOCTET(data[0]) == 0x4D && GETJOCTET(data[1]) == 0x4D)
+  else if (data[0] == 0x4D && data[1] == 0x4D)
     is_motorola = TRUE;
   else
     return;
 
   /* Check Tag Mark */
   if (is_motorola) {
-    if (GETJOCTET(data[2]) != 0) return;
-    if (GETJOCTET(data[3]) != 0x2A) return;
+    if (data[2] != 0) return;
+    if (data[3] != 0x2A) return;
   } else {
-    if (GETJOCTET(data[3]) != 0) return;
-    if (GETJOCTET(data[2]) != 0x2A) return;
+    if (data[3] != 0) return;
+    if (data[2] != 0x2A) return;
   }
 
   /* Get first IFD offset (offset to IFD0) */
   if (is_motorola) {
-    if (GETJOCTET(data[4]) != 0) return;
-    if (GETJOCTET(data[5]) != 0) return;
-    firstoffset = GETJOCTET(data[6]);
+    if (data[4] != 0) return;
+    if (data[5] != 0) return;
+    firstoffset = data[6];
     firstoffset <<= 8;
-    firstoffset += GETJOCTET(data[7]);
+    firstoffset += data[7];
   } else {
-    if (GETJOCTET(data[7]) != 0) return;
-    if (GETJOCTET(data[6]) != 0) return;
-    firstoffset = GETJOCTET(data[5]);
+    if (data[7] != 0) return;
+    if (data[6] != 0) return;
+    firstoffset = data[5];
     firstoffset <<= 8;
-    firstoffset += GETJOCTET(data[4]);
+    firstoffset += data[4];
   }
   if (firstoffset > length - 2) return; /* check end of data segment */
 
   /* Get the number of directory entries contained in this IFD */
   if (is_motorola) {
-    number_of_tags = GETJOCTET(data[firstoffset]);
+    number_of_tags = data[firstoffset];
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[firstoffset + 1]);
+    number_of_tags += data[firstoffset + 1];
   } else {
-    number_of_tags = GETJOCTET(data[firstoffset + 1]);
+    number_of_tags = data[firstoffset + 1];
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[firstoffset]);
+    number_of_tags += data[firstoffset];
   }
   if (number_of_tags == 0) return;
   firstoffset += 2;
@@ -1237,13 +1933,13 @@ adjust_exif_parameters(JOCTET *data, unsigned int length, JDIMENSION new_width,
     if (firstoffset > length - 12) return; /* check end of data segment */
     /* Get Tag number */
     if (is_motorola) {
-      tagnum = GETJOCTET(data[firstoffset]);
+      tagnum = data[firstoffset];
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[firstoffset + 1]);
+      tagnum += data[firstoffset + 1];
     } else {
-      tagnum = GETJOCTET(data[firstoffset + 1]);
+      tagnum = data[firstoffset + 1];
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[firstoffset]);
+      tagnum += data[firstoffset];
     }
     if (tagnum == 0x8769) break; /* found ExifSubIFD offset Tag */
     if (--number_of_tags == 0) return;
@@ -1252,29 +1948,29 @@ adjust_exif_parameters(JOCTET *data, unsigned int length, JDIMENSION new_width,
 
   /* Get the ExifSubIFD offset */
   if (is_motorola) {
-    if (GETJOCTET(data[firstoffset + 8]) != 0) return;
-    if (GETJOCTET(data[firstoffset + 9]) != 0) return;
-    offset = GETJOCTET(data[firstoffset + 10]);
+    if (data[firstoffset + 8] != 0) return;
+    if (data[firstoffset + 9] != 0) return;
+    offset = data[firstoffset + 10];
     offset <<= 8;
-    offset += GETJOCTET(data[firstoffset + 11]);
+    offset += data[firstoffset + 11];
   } else {
-    if (GETJOCTET(data[firstoffset + 11]) != 0) return;
-    if (GETJOCTET(data[firstoffset + 10]) != 0) return;
-    offset = GETJOCTET(data[firstoffset + 9]);
+    if (data[firstoffset + 11] != 0) return;
+    if (data[firstoffset + 10] != 0) return;
+    offset = data[firstoffset + 9];
     offset <<= 8;
-    offset += GETJOCTET(data[firstoffset + 8]);
+    offset += data[firstoffset + 8];
   }
   if (offset > length - 2) return; /* check end of data segment */
 
   /* Get the number of directory entries contained in this SubIFD */
   if (is_motorola) {
-    number_of_tags = GETJOCTET(data[offset]);
+    number_of_tags = data[offset];
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[offset + 1]);
+    number_of_tags += data[offset + 1];
   } else {
-    number_of_tags = GETJOCTET(data[offset + 1]);
+    number_of_tags = data[offset + 1];
     number_of_tags <<= 8;
-    number_of_tags += GETJOCTET(data[offset]);
+    number_of_tags += data[offset];
   }
   if (number_of_tags < 2) return;
   offset += 2;
@@ -1284,13 +1980,13 @@ adjust_exif_parameters(JOCTET *data, unsigned int length, JDIMENSION new_width,
     if (offset > length - 12) return; /* check end of data segment */
     /* Get Tag number */
     if (is_motorola) {
-      tagnum = GETJOCTET(data[offset]);
+      tagnum = data[offset];
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[offset + 1]);
+      tagnum += data[offset + 1];
     } else {
-      tagnum = GETJOCTET(data[offset + 1]);
+      tagnum = data[offset + 1];
       tagnum <<= 8;
-      tagnum += GETJOCTET(data[offset]);
+      tagnum += data[offset];
     }
     if (tagnum == 0xA002 || tagnum == 0xA003) {
       if (tagnum == 0xA002)
@@ -1384,7 +2080,7 @@ jtransform_adjust_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
   dstinfo->jpeg_height = info->output_height;
 #endif
 
-  /* Transpose destination image parameters */
+  /* Transpose destination image parameters, adjust quantization */
   switch (info->transform) {
   case JXFORM_TRANSPOSE:
   case JXFORM_TRANSVERSE:
@@ -1396,6 +2092,12 @@ jtransform_adjust_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
 #endif
     transpose_critical_parameters(dstinfo);
     break;
+  case JXFORM_DROP:
+    if (info->drop_width != 0 && info->drop_height != 0)
+      adjust_quant(srcinfo, src_coef_arrays,
+                   info->drop_ptr, info->drop_coef_arrays,
+                   info->trim, dstinfo);
+    break;
   default:
 #if JPEG_LIB_VERSION < 80
     dstinfo->image_width = info->output_width;
@@ -1408,12 +2110,12 @@ jtransform_adjust_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
   if (srcinfo->marker_list != NULL &&
       srcinfo->marker_list->marker == JPEG_APP0 + 1 &&
       srcinfo->marker_list->data_length >= 6 &&
-      GETJOCTET(srcinfo->marker_list->data[0]) == 0x45 &&
-      GETJOCTET(srcinfo->marker_list->data[1]) == 0x78 &&
-      GETJOCTET(srcinfo->marker_list->data[2]) == 0x69 &&
-      GETJOCTET(srcinfo->marker_list->data[3]) == 0x66 &&
-      GETJOCTET(srcinfo->marker_list->data[4]) == 0 &&
-      GETJOCTET(srcinfo->marker_list->data[5]) == 0) {
+      srcinfo->marker_list->data[0] == 0x45 &&
+      srcinfo->marker_list->data[1] == 0x78 &&
+      srcinfo->marker_list->data[2] == 0x69 &&
+      srcinfo->marker_list->data[3] == 0x66 &&
+      srcinfo->marker_list->data[4] == 0 &&
+      srcinfo->marker_list->data[5] == 0) {
     /* Suppress output of JFIF marker */
     dstinfo->write_JFIF_header = FALSE;
     /* Adjust Exif image parameters */
@@ -1462,7 +2164,23 @@ jtransform_execute_transform(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
    */
   switch (info->transform) {
   case JXFORM_NONE:
-    if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
+    if (info->output_width > srcinfo->output_width ||
+        info->output_height > srcinfo->output_height) {
+      if (info->output_width > srcinfo->output_width &&
+          info->crop_width_set == JCROP_REFLECT)
+        do_crop_ext_reflect(srcinfo, dstinfo,
+                            info->x_crop_offset, info->y_crop_offset,
+                            src_coef_arrays, dst_coef_arrays);
+      else if (info->output_width > srcinfo->output_width &&
+               info->crop_width_set == JCROP_FORCE)
+        do_crop_ext_flat(srcinfo, dstinfo,
+                         info->x_crop_offset, info->y_crop_offset,
+                         src_coef_arrays, dst_coef_arrays);
+      else
+        do_crop_ext_zero(srcinfo, dstinfo,
+                         info->x_crop_offset, info->y_crop_offset,
+                         src_coef_arrays, dst_coef_arrays);
+    } else if (info->x_crop_offset != 0 || info->y_crop_offset != 0)
       do_crop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
               src_coef_arrays, dst_coef_arrays);
     break;
@@ -1498,6 +2216,30 @@ jtransform_execute_transform(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
     do_rot_270(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
                src_coef_arrays, dst_coef_arrays);
     break;
+  case JXFORM_WIPE:
+    if (info->crop_width_set == JCROP_REFLECT &&
+        info->y_crop_offset == 0 && info->drop_height ==
+        (JDIMENSION)jdiv_round_up
+          ((long)info->output_height, (long)info->iMCU_sample_height) &&
+        (info->x_crop_offset == 0 ||
+         info->x_crop_offset + info->drop_width ==
+         (JDIMENSION)jdiv_round_up
+           ((long)info->output_width, (long)info->iMCU_sample_width)))
+      do_reflect(srcinfo, dstinfo, info->x_crop_offset,
+                 src_coef_arrays, info->drop_width, info->drop_height);
+    else if (info->crop_width_set == JCROP_FORCE)
+      do_flatten(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+                 src_coef_arrays, info->drop_width, info->drop_height);
+    else
+      do_wipe(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, info->drop_width, info->drop_height);
+    break;
+  case JXFORM_DROP:
+    if (info->drop_width != 0 && info->drop_height != 0)
+      do_drop(srcinfo, dstinfo, info->x_crop_offset, info->y_crop_offset,
+              src_coef_arrays, info->drop_ptr, info->drop_coef_arrays,
+              info->drop_width, info->drop_height);
+    break;
   }
 }
 
@@ -1604,20 +2346,20 @@ jcopy_markers_execute(j_decompress_ptr srcinfo, j_compress_ptr dstinfo,
     if (dstinfo->write_JFIF_header &&
         marker->marker == JPEG_APP0 &&
         marker->data_length >= 5 &&
-        GETJOCTET(marker->data[0]) == 0x4A &&
-        GETJOCTET(marker->data[1]) == 0x46 &&
-        GETJOCTET(marker->data[2]) == 0x49 &&
-        GETJOCTET(marker->data[3]) == 0x46 &&
-        GETJOCTET(marker->data[4]) == 0)
+        marker->data[0] == 0x4A &&
+        marker->data[1] == 0x46 &&
+        marker->data[2] == 0x49 &&
+        marker->data[3] == 0x46 &&
+        marker->data[4] == 0)
       continue;                 /* reject duplicate JFIF */
     if (dstinfo->write_Adobe_marker &&
         marker->marker == JPEG_APP0 + 14 &&
         marker->data_length >= 5 &&
-        GETJOCTET(marker->data[0]) == 0x41 &&
-        GETJOCTET(marker->data[1]) == 0x64 &&
-        GETJOCTET(marker->data[2]) == 0x6F &&
-        GETJOCTET(marker->data[3]) == 0x62 &&
-        GETJOCTET(marker->data[4]) == 0x65)
+        marker->data[0] == 0x41 &&
+        marker->data[1] == 0x64 &&
+        marker->data[2] == 0x6F &&
+        marker->data[3] == 0x62 &&
+        marker->data[4] == 0x65)
       continue;                 /* reject duplicate Adobe */
     jpeg_write_marker(dstinfo, marker->marker,
                       marker->data, marker->data_length);
diff --git a/external/jpeg/transupp.h b/external/jpeg/transupp.h
index 80264cc2ffa7..ea6be1fc3058 100644
--- a/external/jpeg/transupp.h
+++ b/external/jpeg/transupp.h
@@ -2,7 +2,7 @@
  * transupp.h
  *
  * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1997-2011, Thomas G. Lane, Guido Vollbeding.
+ * Copyright (C) 1997-2019, Thomas G. Lane, Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -62,6 +62,17 @@
  * output image covers at least the requested region, but may cover more.)
  * The adjustment of the region dimensions may be optionally disabled.
  *
+ * A complementary lossless wipe option is provided to discard (gray out) data
+ * inside a given image region while losslessly preserving what is outside.
+ * A lossless drop option is also provided, which allows another JPEG image to
+ * be inserted ("dropped") into the source image data at a given position,
+ * replacing the existing image data at that position.  Both the source image
+ * and the drop image must have the same subsampling level.  It is best if they
+ * also have the same quantization (quality).  Otherwise, the quantization of
+ * the output image will be adapted to accommodate the higher of the source
+ * image quality and the drop image quality.  The trim option can be used with
+ * the drop option to requantize the drop image to match the source image.
+ *
  * We also provide a lossless-resize option, which is kind of a lossless-crop
  * operation in the DCT coefficient block domain - it discards higher-order
  * coefficients and losslessly preserves lower-order coefficients of a
@@ -92,20 +103,23 @@ typedef enum {
   JXFORM_TRANSVERSE,      /* transpose across UR-to-LL axis */
   JXFORM_ROT_90,          /* 90-degree clockwise rotation */
   JXFORM_ROT_180,         /* 180-degree rotation */
-  JXFORM_ROT_270          /* 270-degree clockwise (or 90 ccw) */
+  JXFORM_ROT_270,         /* 270-degree clockwise (or 90 ccw) */
+  JXFORM_WIPE,            /* wipe */
+  JXFORM_DROP             /* drop */
 } JXFORM_CODE;
 
 /*
  * Codes for crop parameters, which can individually be unspecified,
  * positive or negative for xoffset or yoffset,
- * positive or forced for width or height.
+ * positive or force or reflect for width or height.
  */
 
 typedef enum {
   JCROP_UNSET,
   JCROP_POS,
   JCROP_NEG,
-  JCROP_FORCE
+  JCROP_FORCE,
+  JCROP_REFLECT
 } JCROP_CODE;
 
 /*
@@ -120,7 +134,7 @@ typedef struct {
   boolean perfect;              /* if TRUE, fail if partial MCUs are requested */
   boolean trim;                 /* if TRUE, trim partial MCUs as needed */
   boolean force_grayscale;      /* if TRUE, convert color image to grayscale */
-  boolean crop;                 /* if TRUE, crop source image */
+  boolean crop;                 /* if TRUE, crop or wipe source image, or drop */
   boolean slow_hflip;  /* For best performance, the JXFORM_FLIP_H transform
                           normally modifies the source coefficients in place.
                           Setting this to TRUE will instead use a slower,
@@ -133,14 +147,18 @@ typedef struct {
    * These can be filled in by jtransform_parse_crop_spec().
    */
   JDIMENSION crop_width;        /* Width of selected region */
-  JCROP_CODE crop_width_set;    /* (forced disables adjustment) */
+  JCROP_CODE crop_width_set;    /* (force-disables adjustment) */
   JDIMENSION crop_height;       /* Height of selected region */
-  JCROP_CODE crop_height_set;   /* (forced disables adjustment) */
+  JCROP_CODE crop_height_set;   /* (force-disables adjustment) */
   JDIMENSION crop_xoffset;      /* X offset of selected region */
   JCROP_CODE crop_xoffset_set;  /* (negative measures from right edge) */
   JDIMENSION crop_yoffset;      /* Y offset of selected region */
   JCROP_CODE crop_yoffset_set;  /* (negative measures from bottom edge) */
 
+  /* Drop parameters: set by caller for drop request */
+  j_decompress_ptr drop_ptr;
+  jvirt_barray_ptr *drop_coef_arrays;
+
   /* Internal workspace: caller should not touch these */
   int num_components;           /* # of components in workspace */
   jvirt_barray_ptr *workspace_coef_arrays; /* workspace for transformations */
@@ -148,6 +166,8 @@ typedef struct {
   JDIMENSION output_height;
   JDIMENSION x_crop_offset;     /* destination crop offsets measured in iMCUs */
   JDIMENSION y_crop_offset;
+  JDIMENSION drop_width;        /* drop/wipe dimensions measured in iMCUs */
+  JDIMENSION drop_height;
   int iMCU_sample_width;        /* destination iMCU size */
   int iMCU_sample_height;
 } jpeg_transform_info;
diff --git a/external/jpeg/turbojpeg-jni.c b/external/jpeg/turbojpeg-jni.c
new file mode 100644
index 000000000000..1b728e3a6ca2
--- /dev/null
+++ b/external/jpeg/turbojpeg-jni.c
@@ -0,0 +1,1228 @@
+/*
+ * Copyright (C)2011-2020 D. R. Commander.  All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice,
+ *   this list of conditions and the following disclaimer.
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ * - Neither the name of the libjpeg-turbo Project nor the names of its
+ *   contributors may be used to endorse or promote products derived from this
+ *   software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS",
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include "turbojpeg.h"
+#ifdef WIN32
+#include "tjutil.h"
+#endif
+#include <jni.h>
+#include "java/org_libjpegturbo_turbojpeg_TJCompressor.h"
+#include "java/org_libjpegturbo_turbojpeg_TJDecompressor.h"
+#include "java/org_libjpegturbo_turbojpeg_TJTransformer.h"
+#include "java/org_libjpegturbo_turbojpeg_TJ.h"
+/* Go to the bailout label if f is zero/NULL or a Java exception is pending */
+#define BAILIF0(f) { \
+  if (!(f) || (*env)->ExceptionCheck(env)) { \
+    goto bailout; \
+  } \
+}
+/* Throw Java exception exceptionClass(msg), then go to bailout */
+#define THROW(msg, exceptionClass) { \
+  jclass _exccls = (*env)->FindClass(env, exceptionClass); \
+  \
+  BAILIF0(_exccls); \
+  (*env)->ThrowNew(env, _exccls, msg); \
+  goto bailout; \
+}
+/* Throw a TJException with handle's error string and code, then bail out */
+#define THROW_TJ() { \
+  jclass _exccls; \
+  jmethodID _excid; \
+  jobject _excobj; \
+  jstring _errstr; \
+  \
+  BAILIF0(_errstr = (*env)->NewStringUTF(env, tjGetErrorStr2(handle))); \
+  BAILIF0(_exccls = (*env)->FindClass(env, \
+    "org/libjpegturbo/turbojpeg/TJException")); \
+  BAILIF0(_excid = (*env)->GetMethodID(env, _exccls, "<init>", \
+                                       "(Ljava/lang/String;I)V")); \
+  BAILIF0(_excobj = (*env)->NewObject(env, _exccls, _excid, _errstr, \
+                                      tjGetErrorCode(handle))); \
+  (*env)->Throw(env, _excobj); \
+  goto bailout; \
+}
+/* Throw java.lang.IllegalArgumentException(msg), then go to bailout */
+#define THROW_ARG(msg)  THROW(msg, "java/lang/IllegalArgumentException")
+/* Throw java.lang.OutOfMemoryError, then go to bailout */
+#define THROW_MEM() \
+  THROW("Memory allocation failure", "java/lang/OutOfMemoryError");
+/* Load obj's long "handle" field into local `handle`; bail out on failure */
+#define GET_HANDLE() \
+  jclass _cls = (*env)->GetObjectClass(env, obj); \
+  jfieldID _fid; \
+  \
+  BAILIF0(_cls); \
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "handle", "J")); \
+  handle = (tjhandle)(size_t)(*env)->GetLongField(env, obj, _fid);
+/* Windows: emulate setenv() with _putenv_s(); the third argument is ignored */
+#ifdef _WIN32
+#define setenv(envvar, value, dummy)  _putenv_s(envvar, value)
+#endif
+/* Copy Java system property `property`, when set, into env variable envvar */
+#define PROP2ENV(property, envvar) { \
+  if ((jName = (*env)->NewStringUTF(env, property)) != NULL && \
+      (jValue = (*env)->CallStaticObjectMethod(env, cls, mid, \
+                                               jName)) != NULL) { \
+    if ((value = (*env)->GetStringUTFChars(env, jValue, 0)) != NULL) { \
+      setenv(envvar, value, 1); \
+      (*env)->ReleaseStringUTFChars(env, jValue, value); \
+    } \
+  } \
+}
+/* Release a pinned array (mode 0) when both args are non-NULL; NULLs cArray */
+#define SAFE_RELEASE(javaArray, cArray) { \
+  if (javaArray && cArray) \
+    (*env)->ReleasePrimitiveArrayCritical(env, javaArray, (void *)cArray, 0); \
+  cArray = NULL; \
+}
+/* Export the turbojpeg.* Java system properties as TJ_* environment variables */
+static int ProcessSystemProperties(JNIEnv *env)
+{
+  jclass cls;
+  jmethodID mid;
+  jstring jName, jValue;
+  const char *value;  /* these locals are referenced by name in PROP2ENV */
+
+  BAILIF0(cls = (*env)->FindClass(env, "java/lang/System"));
+  BAILIF0(mid = (*env)->GetStaticMethodID(env, cls, "getProperty",
+    "(Ljava/lang/String;)Ljava/lang/String;"));
+
+  PROP2ENV("turbojpeg.optimize", "TJ_OPTIMIZE");
+  PROP2ENV("turbojpeg.arithmetic", "TJ_ARITHMETIC");
+  PROP2ENV("turbojpeg.restart", "TJ_RESTART");
+  PROP2ENV("turbojpeg.progressive", "TJ_PROGRESSIVE");
+  return 0;
+
+bailout:
+  return -1;  /* the System.getProperty lookup failed */
+}
+
+/* TJ.bufSize() entry point (TurboJPEG 1.2.x API); wraps tjBufSize() */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSize
+  (JNIEnv *env, jclass cls, jint width, jint height, jint jpegSubsamp)
+{
+  /* -1 marks failure and is reported to Java as IllegalArgumentException */
+  const jint bufSize = (jint)tjBufSize(width, height, jpegSubsamp);
+  if (bufSize == -1)
+    THROW_ARG(tjGetErrorStr());
+bailout:
+  return bufSize;
+}
+
+/* TJ.bufSizeYUV(width, pad, height, subsamp) entry point, TurboJPEG 1.4.x API */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII
+  (JNIEnv *env, jclass cls, jint width, jint pad, jint height, jint subsamp)
+{
+  /* -1 from tjBufSizeYUV2() is reported as an IllegalArgumentException */
+  const jint yuvSize = (jint)tjBufSizeYUV2(width, pad, height, subsamp);
+  if (yuvSize == -1)
+    THROW_ARG(tjGetErrorStr());
+bailout:
+  return yuvSize;
+}
+
+/* TurboJPEG 1.2.x: TJ::bufSizeYUV() -- same as the 1.4.x variant with pad = 4 */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__III
+  (JNIEnv *env, jclass cls, jint width, jint height, jint subsamp)
+{
+  return Java_org_libjpegturbo_turbojpeg_TJ_bufSizeYUV__IIII(env, cls, width,
+                                                             4, height,
+                                                             subsamp);
+}
+
+/* TJ.planeSizeYUV() entry point (TurboJPEG 1.4.x API) */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeSizeYUV__IIIII
+  (JNIEnv *env, jclass cls, jint componentID, jint width, jint stride,
+   jint height, jint subsamp)
+{
+  /* -1 from tjPlaneSizeYUV() is reported as an IllegalArgumentException */
+  const jint planeSize =
+    (jint)tjPlaneSizeYUV(componentID, width, stride, height, subsamp);
+  if (planeSize == -1)
+    THROW_ARG(tjGetErrorStr());
+bailout:
+  return planeSize;
+}
+
+/* TJ.planeWidth() entry point (TurboJPEG 1.4.x API) */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeWidth__III
+  (JNIEnv *env, jclass cls, jint componentID, jint width, jint subsamp)
+{
+  /* -1 from tjPlaneWidth() is reported as an IllegalArgumentException */
+  const jint planeWidth = (jint)tjPlaneWidth(componentID, width, subsamp);
+  if (planeWidth == -1)
+    THROW_ARG(tjGetErrorStr());
+bailout:
+  return planeWidth;
+}
+
+/* TJ.planeHeight() entry point (TurboJPEG 1.4.x API) */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJ_planeHeight__III
+  (JNIEnv *env, jclass cls, jint componentID, jint height, jint subsamp)
+{
+  /* -1 from tjPlaneHeight() is reported as an IllegalArgumentException */
+  const jint planeHeight = (jint)tjPlaneHeight(componentID, height, subsamp);
+  if (planeHeight == -1)
+    THROW_ARG(tjGetErrorStr());
+bailout:
+  return planeHeight;
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::init() -- create a compressor instance and
+ * store it in the object's "handle" field */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_init
+  (JNIEnv *env, jobject obj)
+{
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;
+
+  /* Resolve the field first so a failed lookup cannot leak a new instance. */
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  if ((handle = tjInitCompress()) == NULL)
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);
+bailout:
+  return;
+}
+/* Shared compression path for the byte[] and int[] compress() entry points */
+static jint TJCompressor_compress
+  (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
+   jint width, jint pitch, jint height, jint pf, jbyteArray dst,
+   jint jpegSubsamp, jint jpegQual, jint flags)
+{
+  tjhandle handle = 0;
+  unsigned long jpegSize = 0;
+  jsize arraySize = 0, actualPitch;
+  unsigned char *srcBuf = NULL, *jpegBuf = NULL;
+
+  GET_HANDLE();
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
+      height < 1 || pitch < 0)
+    THROW_ARG("Invalid argument in compress()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF)
+    THROW_ARG("Mismatch between Java and C API");
+  /* pitch 0 means packed rows; arraySize = bytes needed through the last row */
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
+    THROW_ARG("Source buffer is not large enough");
+  jpegSize = tjBufSize(width, height, jpegSubsamp);
+  if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
+    THROW_ARG("Destination buffer is not large enough");
+
+  if (ProcessSystemProperties(env) < 0) goto bailout;
+  /* Pin both arrays; each exit path below releases them before returning */
+  BAILIF0(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  if (tjCompress2(handle, &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
+                  width, pitch, height, pf, &jpegBuf, &jpegSize, jpegSubsamp,
+                  jpegQual, flags | TJFLAG_NOREALLOC) == -1) {
+    SAFE_RELEASE(dst, jpegBuf);
+    SAFE_RELEASE(src, srcBuf);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(dst, jpegBuf);
+  SAFE_RELEASE(src, srcBuf);
+  return (jint)jpegSize;
+}
+
+/* TurboJPEG 1.3.x: TJCompressor::compress() byte source (1 byte per element) */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIIIII_3BIII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
+   jint jpegQual, jint flags)
+{
+  return TJCompressor_compress(env, obj, src, 1, x, y, width, pitch, height,
+                               pf, dst, jpegSubsamp, jpegQual, flags);
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::compress() byte source, region at x = y = 0 */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3BIIII_3BIII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
+   jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
+   jint flags)
+{
+  return TJCompressor_compress(env, obj, src, 1, 0, 0, width, pitch, height,
+                               pf, dst, jpegSubsamp, jpegQual, flags);
+}
+
+/* TurboJPEG 1.3.x: TJCompressor::compress() int source (stride is in ints) */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIIIII_3BIII
+  (JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
+   jint stride, jint height, jint pf, jbyteArray dst, jint jpegSubsamp,
+   jint jpegQual, jint flags)
+{
+  jint jpegSize = 0;  /* stays 0 when argument validation throws */
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in compress()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when compressing from an integer buffer.");
+  jpegSize = TJCompressor_compress(env, obj, src, sizeof(jint), x, y, width,
+                                   stride * sizeof(jint), height, pf, dst,
+                                   jpegSubsamp, jpegQual, flags);
+bailout:
+  return jpegSize;
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::compress() int source (stride is in ints) */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compress___3IIIII_3BIII
+  (JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+   jint height, jint pf, jbyteArray dst, jint jpegSubsamp, jint jpegQual,
+   jint flags)
+{
+  jint jpegSize = 0;  /* stays 0 when argument validation throws */
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in compress()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when compressing from an integer buffer.");
+  jpegSize = TJCompressor_compress(env, obj, src, sizeof(jint), 0, 0, width,
+                                   stride * sizeof(jint), height, pf, dst,
+                                   jpegSubsamp, jpegQual, flags);
+bailout:
+  return jpegSize;
+}
+
+/* TurboJPEG 1.4.x: TJCompressor::compressFromYUV() -- compress the YUV planes
+ * in srcobjs into dst and return the size reported by the compressor */
+JNIEXPORT jint JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_compressFromYUV___3_3B_3II_3III_3BII
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jint width, jintArray jSrcStrides, jint height, jint subsamp,
+   jbyteArray dst, jint jpegQual, jint flags)
+{
+  tjhandle handle = 0;
+  unsigned long jpegSize = 0;
+  jbyteArray jSrcPlanes[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanesTmp[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanes[3] = { NULL, NULL, NULL };
+  int *srcOffsetsTmp = NULL, srcOffsets[3] = { 0, 0, 0 };
+  int *srcStridesTmp = NULL, srcStrides[3] = { 0, 0, 0 };
+  unsigned char *jpegBuf = NULL;
+  int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
+
+  GET_HANDLE();
+
+  if (subsamp < 0 || subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+    THROW_ARG("Invalid argument in compressFromYUV()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    THROW_ARG("Mismatch between Java and C API");
+  if ((*env)->GetArrayLength(env, srcobjs) < nc)
+    THROW_ARG("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
+    THROW_ARG("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
+    THROW_ARG("Strides array is too small for the subsampling type");
+  jpegSize = tjBufSize(width, height, subsamp);
+  if ((*env)->GetArrayLength(env, dst) < (jsize)jpegSize)
+    THROW_ARG("Destination buffer is not large enough");
+  if (ProcessSystemProperties(env) < 0) goto bailout;
+
+  BAILIF0(srcOffsetsTmp =
+          (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+  for (i = 0; i < nc; i++) srcOffsets[i] = srcOffsetsTmp[i];
+  SAFE_RELEASE(jSrcOffsets, srcOffsetsTmp);
+  BAILIF0(srcStridesTmp =
+          (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+  for (i = 0; i < nc; i++) srcStrides[i] = srcStridesTmp[i];
+  SAFE_RELEASE(jSrcStrides, srcStridesTmp);
+
+  /* Validate all planes first: JNI calls are off-limits once arrays are pinned */
+  for (i = 0; i < nc; i++) {
+    int planeSize = tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
+    int pw = tjPlaneWidth(i, width, subsamp);
+
+    if (planeSize < 0 || pw < 0) THROW_ARG(tjGetErrorStr());
+    if (srcOffsets[i] < 0) THROW_ARG("Invalid argument in compressFromYUV()");
+    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0)
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    BAILIF0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) <
+        srcOffsets[i] + planeSize)
+      THROW_ARG("Source plane is not large enough");
+  }
+
+  /* Pin the planes; srcPlanes[] is only valid while they stay pinned. */
+  for (i = 0; i < nc; i++) {
+    if (!(srcPlanesTmp[i] =
+          (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0)))
+      goto bailout;
+    srcPlanes[i] = &srcPlanesTmp[i][srcOffsets[i]];
+  }
+  if (!(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0)))
+    goto bailout;
+
+  if (tjCompressFromYUVPlanes(handle, srcPlanes, width, srcStrides, height,
+                              subsamp, &jpegBuf, &jpegSize, jpegQual,
+                              flags | TJFLAG_NOREALLOC) == -1) {
+    SAFE_RELEASE(dst, jpegBuf);  /* THROW_TJ() below makes JNI calls */
+    for (i = 0; i < nc; i++) SAFE_RELEASE(jSrcPlanes[i], srcPlanesTmp[i]);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(dst, jpegBuf);
+  for (i = 0; i < nc; i++) SAFE_RELEASE(jSrcPlanes[i], srcPlanesTmp[i]);
+  return (jint)jpegSize;
+}
+/* Common encodeYUV() implementation for byte[] and int[] sources: encodes the
+ * source region into the caller's YUV planes (dstobjs) */
+static void TJCompressor_encodeYUV
+  (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint x, jint y,
+   jint width, jint pitch, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
+{
+  tjhandle handle = 0;
+  jsize arraySize = 0, actualPitch;
+  unsigned char *srcBuf = NULL;
+  jbyteArray jDstPlanes[3] = { NULL, NULL, NULL };
+  unsigned char *dstPlanesTmp[3] = { NULL, NULL, NULL };
+  unsigned char *dstPlanes[3] = { NULL, NULL, NULL };
+  int *dstOffsetsTmp = NULL, dstOffsets[3] = { 0, 0, 0 };
+  int *dstStridesTmp = NULL, dstStrides[3] = { 0, 0, 0 };
+  int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
+
+  GET_HANDLE();
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || width < 1 ||
+      height < 1 || pitch < 0 || subsamp < 0 ||
+      subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+    THROW_ARG("Invalid argument in encodeYUV()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
+      org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    THROW_ARG("Mismatch between Java and C API");
+  if ((*env)->GetArrayLength(env, dstobjs) < nc)
+    THROW_ARG("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jDstOffsets) < nc)
+    THROW_ARG("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jDstStrides) < nc)
+    THROW_ARG("Strides array is too small for the subsampling type");
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, src) * srcElementSize < arraySize)
+    THROW_ARG("Source buffer is not large enough");
+
+  BAILIF0(dstOffsetsTmp =
+          (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+  for (i = 0; i < nc; i++) dstOffsets[i] = dstOffsetsTmp[i];
+  SAFE_RELEASE(jDstOffsets, dstOffsetsTmp);
+  BAILIF0(dstStridesTmp =
+          (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+  for (i = 0; i < nc; i++) dstStrides[i] = dstStridesTmp[i];
+  SAFE_RELEASE(jDstStrides, dstStridesTmp);
+
+  /* Validate all planes first: JNI calls are off-limits once arrays are pinned */
+  for (i = 0; i < nc; i++) {
+    int planeSize = tjPlaneSizeYUV(i, width, dstStrides[i], height, subsamp);
+    int pw = tjPlaneWidth(i, width, subsamp);
+
+    if (planeSize < 0 || pw < 0) THROW_ARG(tjGetErrorStr());
+    if (dstOffsets[i] < 0) THROW_ARG("Invalid argument in encodeYUV()");
+    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0)
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+    BAILIF0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) <
+        dstOffsets[i] + planeSize)
+      THROW_ARG("Destination plane is not large enough");
+  }
+
+  /* Pin the planes; the encoder writes through dstPlanes[] while pinned. */
+  for (i = 0; i < nc; i++) {
+    if (!(dstPlanesTmp[i] =
+          (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0)))
+      goto bailout;
+    dstPlanes[i] = &dstPlanesTmp[i][dstOffsets[i]];
+  }
+  if (!(srcBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0))) goto bailout;
+
+  if (tjEncodeYUVPlanes(handle, &srcBuf[y * actualPitch + x * tjPixelSize[pf]],
+                        width, pitch, height, pf, dstPlanes, dstStrides,
+                        subsamp, flags) == -1) {
+    SAFE_RELEASE(src, srcBuf);  /* THROW_TJ() below makes JNI calls */
+    for (i = 0; i < nc; i++) SAFE_RELEASE(jDstPlanes[i], dstPlanesTmp[i]);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(src, srcBuf);
+  for (i = 0; i < nc; i++) SAFE_RELEASE(jDstPlanes[i], dstPlanesTmp[i]);
+}
+
+/* TurboJPEG 1.4.x: TJCompressor::encodeYUV() byte source (1 byte per element) */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIIIII_3_3B_3I_3III
+  (JNIEnv *env, jobject obj, jbyteArray src, jint x, jint y, jint width,
+   jint pitch, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
+{
+  TJCompressor_encodeYUV(env, obj, src, 1, x, y, width, pitch, height, pf,
+                         dstobjs, jDstOffsets, jDstStrides, subsamp, flags);
+}
+
+/* TurboJPEG 1.4.x: TJCompressor::encodeYUV() int source
+ *
+ * JNI entry point for an int[] source image.  The pixel format is range
+ * checked first because it is used to index tjPixelSize[], and it must
+ * describe pixels exactly sizeof(jint) bytes wide.  The call is then forwarded
+ * to TJCompressor_encodeYUV() with sizeof(jint) as the fourth argument and the
+ * stride multiplied by sizeof(jint).
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIIIII_3_3B_3I_3III
+  (JNIEnv *env, jobject obj, jintArray src, jint x, jint y, jint width,
+   jint stride, jint height, jint pf, jobjectArray dstobjs,
+   jintArray jDstOffsets, jintArray jDstStrides, jint subsamp, jint flags)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in encodeYUV()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when encoding from an integer buffer.");
+
+  TJCompressor_encodeYUV(env, obj, src, sizeof(jint), x, y, width,
+                         stride * sizeof(jint), height, pf, dstobjs,
+                         jDstOffsets, jDstStrides, subsamp, flags);
+
+bailout:  /* NOTE(review): only reachable by fall-through or via THROW_ARG,
+             which presumably jumps here after raising the exception */
+  return;
+}
+
+/*
+ * Shared implementation of the TurboJPEG 1.2.x encodeYUV() entry points.
+ *
+ * src is the Java source array, whose elements are srcElementSize bytes wide,
+ * and dst is the byte[] that receives the YUV image.  The arguments and both
+ * buffer sizes are validated, the two arrays are pinned, and the work is
+ * handed to tjEncodeYUV2().  Errors are reported through the THROW_* macros;
+ * both arrays are released on every path.
+ */
+static void JNICALL TJCompressor_encodeYUV_12
+  (JNIEnv *env, jobject obj, jarray src, jint srcElementSize, jint width,
+   jint pitch, jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+{
+  tjhandle handle = 0;
+  jsize requiredBytes = 0;
+  unsigned char *pixels = NULL, *yuvOut = NULL;
+  int status;
+
+  GET_HANDLE();
+
+  /* pf must be validated before it is used to index tjPixelSize[] below. */
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF) {
+    THROW_ARG("Invalid argument in encodeYUV()");
+  }
+  if (width < 1 || height < 1 || pitch < 0) {
+    THROW_ARG("Invalid argument in encodeYUV()");
+  }
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF) {
+    THROW_ARG("Mismatch between Java and C API");
+  }
+
+  /* A pitch of 0 means rows are tightly packed. */
+  if (pitch == 0)
+    requiredBytes = width * tjPixelSize[pf] * height;
+  else
+    requiredBytes = pitch * height;
+
+  if ((*env)->GetArrayLength(env, src) * srcElementSize < requiredBytes) {
+    THROW_ARG("Source buffer is not large enough");
+  }
+  if ((*env)->GetArrayLength(env, dst) <
+      (jsize)tjBufSizeYUV(width, height, subsamp)) {
+    THROW_ARG("Destination buffer is not large enough");
+  }
+
+  BAILIF0(pixels = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(yuvOut = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  status = tjEncodeYUV2(handle, pixels, width, pitch, height, pf, yuvOut,
+                        subsamp, flags);
+  if (status == -1) {
+    /* Give the arrays back before the exception is constructed. */
+    SAFE_RELEASE(dst, yuvOut);
+    SAFE_RELEASE(src, pixels);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(dst, yuvOut);
+  SAFE_RELEASE(src, pixels);
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::encodeYUV() byte source
+ *
+ * JNI entry point for a byte[] source image.  Forwards to
+ * TJCompressor_encodeYUV_12() with a source element size of 1.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3BIIII_3BII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint width, jint pitch,
+   jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+{
+  TJCompressor_encodeYUV_12(env, obj, src, 1, width, pitch, height, pf, dst,
+                            subsamp, flags);
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::encodeYUV() int source
+ *
+ * JNI entry point for an int[] source image.  The pixel format is range
+ * checked (it indexes tjPixelSize[]) and must be sizeof(jint) bytes per pixel.
+ * The call is then forwarded to TJCompressor_encodeYUV_12() with a source
+ * element size of sizeof(jint) and the stride multiplied by sizeof(jint) to
+ * form the pitch argument.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_encodeYUV___3IIIII_3BII
+  (JNIEnv *env, jobject obj, jintArray src, jint width, jint stride,
+   jint height, jint pf, jbyteArray dst, jint subsamp, jint flags)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in encodeYUV()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when encoding from an integer buffer.");
+
+  TJCompressor_encodeYUV_12(env, obj, src, sizeof(jint), width,
+                            stride * sizeof(jint), height, pf, dst, subsamp,
+                            flags);
+
+bailout:  /* NOTE(review): presumably the jump target of THROW_ARG */
+  return;
+}
+
+/* TurboJPEG 1.2.x: TJCompressor::destroy()
+ *
+ * Destroys the native TurboJPEG instance obtained through GET_HANDLE() and
+ * then writes 0 into the Java long field identified by _fid.  When
+ * tjDestroy() returns -1, THROW_TJ() is invoked instead (presumably jumping
+ * to bailout, which would leave the field untouched).
+ * NOTE(review): _fid is not declared in this block; it is presumably the
+ * "handle" field ID introduced by GET_HANDLE() -- confirm.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy
+  (JNIEnv *env, jobject obj)
+{
+  tjhandle handle = 0;
+
+  GET_HANDLE();
+
+  if (tjDestroy(handle) == -1) THROW_TJ();
+  (*env)->SetLongField(env, obj, _fid, 0);
+
+bailout:
+  return;
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::init()
+ *
+ * Creates a TurboJPEG decompressor instance with tjInitDecompress() and
+ * stores it in the Java object's long "handle" field.  A TJException carrying
+ * tjGetErrorStr() is thrown when the instance cannot be created.  If the
+ * class or field lookup fails afterwards, the freshly created instance is
+ * destroyed instead of being leaked, because no Java object would own it.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_init
+  (JNIEnv *env, jobject obj)
+{
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;
+
+  if ((handle = tjInitDecompress()) == NULL)
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);
+  handle = NULL;  /* ownership has passed to the Java object */
+
+bailout:
+  /* Still non-NULL only when a lookup above bailed out after creation. */
+  if (handle) tjDestroy(handle);
+  return;
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::getScalingFactors()
+ *
+ * Builds a Java TJScalingFactor[] mirroring the table returned by
+ * tjGetScalingFactors().  Each element is allocated with AllocObject() and
+ * its "num" and "denom" int fields are filled in directly.
+ *
+ * Returns the array, or whatever has been built so far (possibly NULL) when a
+ * step bails out.
+ */
+JNIEXPORT jobjectArray JNICALL Java_org_libjpegturbo_turbojpeg_TJ_getScalingFactors
+  (JNIEnv *env, jclass cls)
+{
+  jclass sfcls = NULL;
+  jfieldID numFid = 0, denomFid = 0;
+  tjscalingfactor *sf = NULL;
+  int n = 0, i;
+  jobject sfobj = NULL;
+  jobjectArray sfjava = NULL;
+
+  if ((sf = tjGetScalingFactors(&n)) == NULL || n == 0)
+    THROW_ARG(tjGetErrorStr());
+
+  BAILIF0(sfcls = (*env)->FindClass(env,
+    "org/libjpegturbo/turbojpeg/TJScalingFactor"));
+  BAILIF0(sfjava = (jobjectArray)(*env)->NewObjectArray(env, n, sfcls, 0));
+
+  /* The field IDs depend only on the class, so resolve them once instead of
+     on every loop iteration. */
+  BAILIF0(numFid = (*env)->GetFieldID(env, sfcls, "num", "I"));
+  BAILIF0(denomFid = (*env)->GetFieldID(env, sfcls, "denom", "I"));
+
+  for (i = 0; i < n; i++) {
+    BAILIF0(sfobj = (*env)->AllocObject(env, sfcls));
+    (*env)->SetIntField(env, sfobj, numFid, sf[i].num);
+    (*env)->SetIntField(env, sfobj, denomFid, sf[i].denom);
+    (*env)->SetObjectArrayElement(env, sfjava, i, sfobj);
+    /* The array now references the element; drop the local reference so the
+       number of live local references does not grow with n. */
+    (*env)->DeleteLocalRef(env, sfobj);
+    sfobj = NULL;
+  }
+
+bailout:
+  return sfjava;
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::decompressHeader()
+ *
+ * Parses the header of the JPEG image held in the first jpegSize bytes of src
+ * with tjDecompressHeader3() and stores the resulting subsampling,
+ * colorspace, width, and height in the Java object's jpegSubsamp,
+ * jpegColorspace, jpegWidth, and jpegHeight int fields.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressHeader
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize)
+{
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL;
+  int width = 0, height = 0, jpegSubsamp = -1, jpegColorspace = -1;
+
+  GET_HANDLE();
+
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    THROW_ARG("Source buffer is not large enough");
+
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+
+  if (tjDecompressHeader3(handle, jpegBuf, (unsigned long)jpegSize, &width,
+                          &height, &jpegSubsamp, &jpegColorspace) == -1) {
+    SAFE_RELEASE(src, jpegBuf);  /* give the array back before throwing */
+    THROW_TJ();
+  }
+
+  SAFE_RELEASE(src, jpegBuf);  /* the header values are now held in locals */
+
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  (*env)->SetIntField(env, obj, _fid, jpegSubsamp);
+  if ((_fid = (*env)->GetFieldID(env, _cls, "jpegColorspace", "I")) == 0)
+    (*env)->ExceptionClear(env);  /* a missing jpegColorspace field is
+                                     tolerated: the lookup failure is cleared
+                                     and the value is simply not stored */
+  else
+    (*env)->SetIntField(env, obj, _fid, jpegColorspace);
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  (*env)->SetIntField(env, obj, _fid, width);
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  (*env)->SetIntField(env, obj, _fid, height);
+
+bailout:
+  SAFE_RELEASE(src, jpegBuf);
+}
+
+/*
+ * Shared implementation of the TJDecompressor::decompress() entry points.
+ *
+ * Decompresses the first jpegSize bytes of src into dst, whose elements are
+ * dstElementSize bytes wide, placing the image at pixel offset (x, y).  The
+ * pixel format and both buffer sizes are validated, the arrays are pinned,
+ * and the work is handed to tjDecompress2().  Errors are reported through the
+ * THROW_* macros; both arrays are released on every path.
+ */
+static void TJDecompressor_decompress
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jarray dst,
+   jint dstElementSize, jint x, jint y, jint width, jint pitch, jint height,
+   jint pf, jint flags)
+{
+  tjhandle handle = 0;
+  jsize requiredBytes = 0, rowBytes;
+  unsigned char *jpegData = NULL, *pixels = NULL;
+  int status;
+
+  GET_HANDLE();
+
+  /* pf must be validated before it is used to index tjPixelSize[] below. */
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF) {
+    THROW_ARG("Invalid argument in decompress()");
+  }
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF) {
+    THROW_ARG("Mismatch between Java and C API");
+  }
+
+  if ((*env)->GetArrayLength(env, src) < jpegSize) {
+    THROW_ARG("Source buffer is not large enough");
+  }
+
+  /* A pitch of 0 means rows are tightly packed. */
+  if (pitch == 0)
+    rowBytes = width * tjPixelSize[pf];
+  else
+    rowBytes = pitch;
+  requiredBytes = (y + height - 1) * rowBytes + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, dst) * dstElementSize < requiredBytes) {
+    THROW_ARG("Destination buffer is not large enough");
+  }
+
+  BAILIF0(jpegData = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(pixels = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  status = tjDecompress2(handle, jpegData, (unsigned long)jpegSize,
+                         pixels + (y * rowBytes + x * tjPixelSize[pf]), width,
+                         pitch, height, pf, flags);
+  if (status == -1) {
+    /* Give the arrays back before the exception is constructed. */
+    SAFE_RELEASE(dst, pixels);
+    SAFE_RELEASE(src, jpegData);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(dst, pixels);
+  SAFE_RELEASE(src, jpegData);
+}
+
+/* TurboJPEG 1.3.x: TJDecompressor::decompress() byte destination
+ *
+ * Forwards to TJDecompressor_decompress() with a destination element size of
+ * 1 and the caller-supplied (x, y) offset.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIIIII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+   jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
+{
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, x, y, width,
+                            pitch, height, pf, flags);
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::decompress() byte destination
+ *
+ * Variant without an offset: forwards to TJDecompressor_decompress() with a
+ * destination element size of 1 and x = y = 0.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3BIIIII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+   jint width, jint pitch, jint height, jint pf, jint flags)
+{
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, 1, 0, 0, width,
+                            pitch, height, pf, flags);
+}
+
+/* TurboJPEG 1.3.x: TJDecompressor::decompress() int destination
+ *
+ * The pixel format is range checked (it indexes tjPixelSize[]) and must be
+ * sizeof(jint) bytes per pixel.  The call is then forwarded to
+ * TJDecompressor_decompress() with a destination element size of
+ * sizeof(jint), the caller-supplied (x, y) offset, and the stride multiplied
+ * by sizeof(jint) to form the pitch argument.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIIIII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
+   jint x, jint y, jint width, jint stride, jint height, jint pf, jint flags)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in decompress()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when decompressing to an integer buffer.");
+
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), x, y,
+                            width, stride * sizeof(jint), height, pf, flags);
+
+bailout:  /* NOTE(review): presumably the jump target of THROW_ARG */
+  return;
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::decompress() int destination
+ *
+ * Variant without an offset.  The pixel format is range checked (it indexes
+ * tjPixelSize[]) and must be sizeof(jint) bytes per pixel.  The call is then
+ * forwarded to TJDecompressor_decompress() with a destination element size of
+ * sizeof(jint), x = y = 0, and the stride multiplied by sizeof(jint) to form
+ * the pitch argument.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompress___3BI_3IIIIII
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jintArray dst,
+   jint width, jint stride, jint height, jint pf, jint flags)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in decompress()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when decompressing to an integer buffer.");
+
+  TJDecompressor_decompress(env, obj, src, jpegSize, dst, sizeof(jint), 0, 0,
+                            width, stride * sizeof(jint), height, pf, flags);
+
+bailout:  /* NOTE(review): presumably the jump target of THROW_ARG */
+  return;
+}
+
+/* TurboJPEG 1.4.x: TJDecompressor::decompressToYUV()
+ *
+ * Decompresses the first jpegSize bytes of src into separate Y/U/V planes.
+ * dstobjs holds one byte[] per plane; jDstOffsets and jDstStrides give the
+ * starting offset and row stride of each plane.  One plane is used for
+ * grayscale images and three otherwise, based on the jpegSubsamp field of the
+ * Java object.  A desiredWidth/desiredHeight of 0 selects the JPEG image's
+ * own dimension.
+ *
+ * All validation, and every JNI call it requires, happens before any plane
+ * array is pinned.  The pinned plane pointers stay valid for the whole
+ * tjDecompressToYUVPlanes() call and are released only afterwards, because a
+ * pointer obtained from GetPrimitiveArrayCritical() must not be used once the
+ * array has been released.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3_3B_3II_3III
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize,
+   jobjectArray dstobjs, jintArray jDstOffsets, jint desiredWidth,
+   jintArray jDstStrides, jint desiredHeight, jint flags)
+{
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL;
+  jbyteArray jDstPlanes[3] = { NULL, NULL, NULL };
+  unsigned char *dstPlanesTmp[3] = { NULL, NULL, NULL };
+  unsigned char *dstPlanes[3] = { NULL, NULL, NULL };
+  int *dstOffsetsTmp = NULL, dstOffsets[3] = { 0, 0, 0 };
+  int *dstStridesTmp = NULL, dstStrides[3] = { 0, 0, 0 };
+  int jpegSubsamp = -1, jpegWidth = 0, jpegHeight = 0;
+  int nc = 0, i, width, height, scaledWidth, scaledHeight, nsf = 0;
+  tjscalingfactor *sf;
+
+  GET_HANDLE();
+
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
+
+  nc = (jpegSubsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3);
+
+  /* The per-plane arrays are indexed up to nc - 1 below, so make sure they
+     are long enough (same checks as TJDecompressor_decodeYUV()). */
+  if ((*env)->GetArrayLength(env, dstobjs) < nc)
+    THROW_ARG("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jDstOffsets) < nc)
+    THROW_ARG("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jDstStrides) < nc)
+    THROW_ARG("Strides array is too small for the subsampling type");
+
+  width = desiredWidth;
+  height = desiredHeight;
+  if (width == 0) width = jpegWidth;
+  if (height == 0) height = jpegHeight;
+  sf = tjGetScalingFactors(&nsf);
+  if (!sf || nsf < 1)
+    THROW_ARG(tjGetErrorStr());
+  /* Pick the first scaling factor that fits within the requested size. */
+  for (i = 0; i < nsf; i++) {
+    scaledWidth = TJSCALED(jpegWidth, sf[i]);
+    scaledHeight = TJSCALED(jpegHeight, sf[i]);
+    if (scaledWidth <= width && scaledHeight <= height)
+      break;
+  }
+  if (i >= nsf)
+    THROW_ARG("Could not scale down to desired image dimensions");
+
+  /* Copy offsets and strides into locals so their arrays can be released
+     immediately. */
+  BAILIF0(dstOffsetsTmp =
+          (*env)->GetPrimitiveArrayCritical(env, jDstOffsets, 0));
+  for (i = 0; i < nc; i++) dstOffsets[i] = dstOffsetsTmp[i];
+  SAFE_RELEASE(jDstOffsets, dstOffsetsTmp);
+
+  BAILIF0(dstStridesTmp =
+          (*env)->GetPrimitiveArrayCritical(env, jDstStrides, 0));
+  for (i = 0; i < nc; i++) dstStrides[i] = dstStridesTmp[i];
+  SAFE_RELEASE(jDstStrides, dstStridesTmp);
+
+  /* Pass 1: validate every plane while no plane array is pinned. */
+  for (i = 0; i < nc; i++) {
+    int planeSize = tjPlaneSizeYUV(i, scaledWidth, dstStrides[i], scaledHeight,
+                                   jpegSubsamp);
+    int pw = tjPlaneWidth(i, scaledWidth, jpegSubsamp);
+
+    if (planeSize < 0 || pw < 0)
+      THROW_ARG(tjGetErrorStr());
+
+    if (dstOffsets[i] < 0)
+      THROW_ARG("Invalid argument in decompressToYUV()");
+    if (dstStrides[i] < 0 && dstOffsets[i] - planeSize + pw < 0)
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+
+    BAILIF0(jDstPlanes[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((*env)->GetArrayLength(env, jDstPlanes[i]) <
+        dstOffsets[i] + planeSize)
+      THROW_ARG("Destination plane is not large enough");
+  }
+
+  /* Pass 2: pin the planes and keep them pinned until decompression ends. */
+  for (i = 0; i < nc; i++) {
+    BAILIF0(dstPlanesTmp[i] =
+            (*env)->GetPrimitiveArrayCritical(env, jDstPlanes[i], 0));
+    dstPlanes[i] = &dstPlanesTmp[i][dstOffsets[i]];
+  }
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+
+  if (tjDecompressToYUVPlanes(handle, jpegBuf, (unsigned long)jpegSize,
+                              dstPlanes, desiredWidth, dstStrides,
+                              desiredHeight, flags) == -1) {
+    /* Release everything before the exception is constructed. */
+    SAFE_RELEASE(src, jpegBuf);
+    for (i = 0; i < nc; i++)
+      SAFE_RELEASE(jDstPlanes[i], dstPlanesTmp[i]);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(src, jpegBuf);
+  for (i = 0; i < nc; i++)
+    SAFE_RELEASE(jDstPlanes[i], dstPlanesTmp[i]);
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::decompressToYUV()
+ *
+ * Decompresses the first jpegSize bytes of src into the single byte[] dst
+ * with tjDecompressToYUV().  The required destination size is derived from
+ * the jpegSubsamp, jpegWidth, and jpegHeight int fields of the Java object
+ * via tjBufSizeYUV().
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decompressToYUV___3BI_3BI
+  (JNIEnv *env, jobject obj, jbyteArray src, jint jpegSize, jbyteArray dst,
+   jint flags)
+{
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL, *dstBuf = NULL;
+  int jpegSubsamp = -1, jpegWidth = 0, jpegHeight = 0;
+
+  GET_HANDLE();
+
+  if ((*env)->GetArrayLength(env, src) < jpegSize)
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
+  if ((*env)->GetArrayLength(env, dst) <
+      (jsize)tjBufSizeYUV(jpegWidth, jpegHeight, jpegSubsamp))
+    THROW_ARG("Destination buffer is not large enough");
+
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, src, 0));
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  if (tjDecompressToYUV(handle, jpegBuf, (unsigned long)jpegSize, dstBuf,
+                        flags) == -1) {
+    SAFE_RELEASE(dst, dstBuf);  /* give both arrays back before throwing */
+    SAFE_RELEASE(src, jpegBuf);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(dst, dstBuf);
+  SAFE_RELEASE(src, jpegBuf);
+}
+
+/*
+ * Shared implementation of the TJDecompressor::decodeYUV() entry points.
+ *
+ * Converts the Y/U/V planes in srcobjs (one byte[] per plane, located by
+ * jSrcOffsets and jSrcStrides) into packed pixels of format pf, written to
+ * dst at pixel offset (x, y).  dst elements are dstElementSize bytes wide.
+ * One plane is used for grayscale subsampling and three otherwise.
+ *
+ * All validation, and every JNI call it requires, happens before any plane
+ * array is pinned.  The pinned plane pointers stay valid for the whole
+ * tjDecodeYUVPlanes() call and are released only afterwards, because a
+ * pointer obtained from GetPrimitiveArrayCritical() must not be used once the
+ * array has been released.
+ */
+static void TJDecompressor_decodeYUV
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jint subsamp, jarray dst, jint dstElementSize,
+   jint x, jint y, jint width, jint pitch, jint height, jint pf, jint flags)
+{
+  tjhandle handle = 0;
+  jsize arraySize = 0, actualPitch;
+  jbyteArray jSrcPlanes[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanesTmp[3] = { NULL, NULL, NULL };
+  const unsigned char *srcPlanes[3] = { NULL, NULL, NULL };
+  int *srcOffsetsTmp = NULL, srcOffsets[3] = { 0, 0, 0 };
+  int *srcStridesTmp = NULL, srcStrides[3] = { 0, 0, 0 };
+  unsigned char *dstBuf = NULL;
+  int nc = (subsamp == org_libjpegturbo_turbojpeg_TJ_SAMP_GRAY ? 1 : 3), i;
+
+  GET_HANDLE();
+
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF || subsamp < 0 ||
+      subsamp >= org_libjpegturbo_turbojpeg_TJ_NUMSAMP)
+    THROW_ARG("Invalid argument in decodeYUV()");
+  if (org_libjpegturbo_turbojpeg_TJ_NUMPF != TJ_NUMPF ||
+      org_libjpegturbo_turbojpeg_TJ_NUMSAMP != TJ_NUMSAMP)
+    THROW_ARG("Mismatch between Java and C API");
+
+  if ((*env)->GetArrayLength(env, srcobjs) < nc)
+    THROW_ARG("Planes array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcOffsets) < nc)
+    THROW_ARG("Offsets array is too small for the subsampling type");
+  if ((*env)->GetArrayLength(env, jSrcStrides) < nc)
+    THROW_ARG("Strides array is too small for the subsampling type");
+
+  /* A pitch of 0 means rows are tightly packed. */
+  actualPitch = (pitch == 0) ? width * tjPixelSize[pf] : pitch;
+  arraySize = (y + height - 1) * actualPitch + (x + width) * tjPixelSize[pf];
+  if ((*env)->GetArrayLength(env, dst) * dstElementSize < arraySize)
+    THROW_ARG("Destination buffer is not large enough");
+
+  /* Copy offsets and strides into locals so their arrays can be released
+     immediately. */
+  BAILIF0(srcOffsetsTmp =
+          (*env)->GetPrimitiveArrayCritical(env, jSrcOffsets, 0));
+  for (i = 0; i < nc; i++) srcOffsets[i] = srcOffsetsTmp[i];
+  SAFE_RELEASE(jSrcOffsets, srcOffsetsTmp);
+
+  BAILIF0(srcStridesTmp =
+          (*env)->GetPrimitiveArrayCritical(env, jSrcStrides, 0));
+  for (i = 0; i < nc; i++) srcStrides[i] = srcStridesTmp[i];
+  SAFE_RELEASE(jSrcStrides, srcStridesTmp);
+
+  /* Pass 1: validate every plane while no plane array is pinned. */
+  for (i = 0; i < nc; i++) {
+    int planeSize = tjPlaneSizeYUV(i, width, srcStrides[i], height, subsamp);
+    int pw = tjPlaneWidth(i, width, subsamp);
+
+    if (planeSize < 0 || pw < 0)
+      THROW_ARG(tjGetErrorStr());
+
+    if (srcOffsets[i] < 0)
+      THROW_ARG("Invalid argument in decodeYUV()");
+    if (srcStrides[i] < 0 && srcOffsets[i] - planeSize + pw < 0)
+      THROW_ARG("Negative plane stride would cause memory to be accessed below plane boundary");
+
+    BAILIF0(jSrcPlanes[i] = (*env)->GetObjectArrayElement(env, srcobjs, i));
+    if ((*env)->GetArrayLength(env, jSrcPlanes[i]) <
+        srcOffsets[i] + planeSize)
+      THROW_ARG("Source plane is not large enough");
+  }
+
+  /* Pass 2: pin the planes and keep them pinned until decoding ends. */
+  for (i = 0; i < nc; i++) {
+    BAILIF0(srcPlanesTmp[i] =
+            (*env)->GetPrimitiveArrayCritical(env, jSrcPlanes[i], 0));
+    srcPlanes[i] = &srcPlanesTmp[i][srcOffsets[i]];
+  }
+  BAILIF0(dstBuf = (*env)->GetPrimitiveArrayCritical(env, dst, 0));
+
+  if (tjDecodeYUVPlanes(handle, srcPlanes, srcStrides, subsamp,
+                        &dstBuf[y * actualPitch + x * tjPixelSize[pf]], width,
+                        pitch, height, pf, flags) == -1) {
+    /* Release everything before the exception is constructed. */
+    SAFE_RELEASE(dst, dstBuf);
+    for (i = 0; i < nc; i++)
+      SAFE_RELEASE(jSrcPlanes[i], srcPlanesTmp[i]);
+    THROW_TJ();
+  }
+
+bailout:
+  SAFE_RELEASE(dst, dstBuf);
+  for (i = 0; i < nc; i++)
+    SAFE_RELEASE(jSrcPlanes[i], srcPlanesTmp[i]);
+}
+
+/* TurboJPEG 1.4.x: TJDecompressor::decodeYUV() byte destination
+ *
+ * Forwards every argument to TJDecompressor_decodeYUV() with a destination
+ * element size of 1.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3BIIIIIII
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jint subsamp, jbyteArray dst, jint x, jint y,
+   jint width, jint pitch, jint height, jint pf, jint flags)
+{
+  TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
+                           subsamp, dst, 1, x, y, width, pitch, height, pf,
+                           flags);
+}
+
+/* TurboJPEG 1.4.x: TJDecompressor::decodeYUV() int destination
+ *
+ * The pixel format is range checked (it indexes tjPixelSize[]) and must be
+ * sizeof(jint) bytes per pixel.  The call is then forwarded to
+ * TJDecompressor_decodeYUV() with a destination element size of sizeof(jint)
+ * and the stride multiplied by sizeof(jint) to form the pitch argument.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_decodeYUV___3_3B_3I_3II_3IIIIIIII
+  (JNIEnv *env, jobject obj, jobjectArray srcobjs, jintArray jSrcOffsets,
+   jintArray jSrcStrides, jint subsamp, jintArray dst, jint x, jint y,
+   jint width, jint stride, jint height, jint pf, jint flags)
+{
+  if (pf < 0 || pf >= org_libjpegturbo_turbojpeg_TJ_NUMPF)
+    THROW_ARG("Invalid argument in decodeYUV()");
+  if (tjPixelSize[pf] != sizeof(jint))
+    THROW_ARG("Pixel format must be 32-bit when decoding to an integer buffer.");
+
+  TJDecompressor_decodeYUV(env, obj, srcobjs, jSrcOffsets, jSrcStrides,
+                           subsamp, dst, sizeof(jint), x, y, width,
+                           stride * sizeof(jint), height, pf, flags);
+
+bailout:  /* NOTE(review): presumably the jump target of THROW_ARG */
+  return;
+}
+
+/* TurboJPEG 1.2.x: TJTransformer::init()
+ *
+ * Creates a TurboJPEG transformer instance with tjInitTransform() and stores
+ * it in the Java object's long "handle" field.  A TJException carrying
+ * tjGetErrorStr() is thrown when the instance cannot be created.  If the
+ * class or field lookup fails afterwards, the freshly created instance is
+ * destroyed instead of being leaked, because no Java object would own it.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_init
+  (JNIEnv *env, jobject obj)
+{
+  jclass cls;
+  jfieldID fid;
+  tjhandle handle;
+
+  if ((handle = tjInitTransform()) == NULL)
+    THROW(tjGetErrorStr(), "org/libjpegturbo/turbojpeg/TJException");
+
+  BAILIF0(cls = (*env)->GetObjectClass(env, obj));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "handle", "J"));
+  (*env)->SetLongField(env, obj, fid, (size_t)handle);
+  handle = NULL;  /* ownership has passed to the Java object */
+
+bailout:
+  /* Still non-NULL only when a lookup above bailed out after creation. */
+  if (handle) tjDestroy(handle);
+  return;
+}
+
+typedef struct _JNICustomFilterParams {  /* per-transform context handed to
+                                            JNICustomFilter() through
+                                            tjtransform.data */
+  JNIEnv *env;    /* JNI environment used to call back into Java */
+  jobject tobj;   /* the TJTransform object, passed on to customFilter() */
+  jobject cfobj;  /* the object whose customFilter() method is invoked */
+} JNICustomFilterParams;
+/* Native callback that relays a custom-filter invocation to Java.
+ *
+ * transform->data is interpreted as a JNICustomFilterParams.  The coefficient
+ * block coeffs is exposed to Java as a ShortBuffer view of a direct
+ * ByteBuffer covering sizeof(short) * arrayRegion.w * arrayRegion.h bytes in
+ * native byte order.  arrayRegion and planeRegion are each copied into a
+ * java.awt.Rectangle created with AllocObject(), whose x, y, width, and
+ * height fields are set directly.  The customFilter() method of cfobj is
+ * then called with those objects, componentIndex, transformIndex, and tobj.
+ *
+ * Returns 0 once customFilter() has been invoked, or -1 when any BAILIF0 step
+ * bails out.
+ * NOTE(review): nothing visible here checks for a Java exception raised by
+ * customFilter() itself -- confirm how the caller deals with that.
+ */
+static int JNICustomFilter(short *coeffs, tjregion arrayRegion,
+                           tjregion planeRegion, int componentIndex,
+                           int transformIndex, tjtransform *transform)
+{
+  JNICustomFilterParams *params = (JNICustomFilterParams *)transform->data;
+  JNIEnv *env = params->env;
+  jobject tobj = params->tobj, cfobj = params->cfobj;
+  jobject arrayRegionObj, planeRegionObj, bufobj, borobj;
+  jclass cls;
+  jmethodID mid;
+  jfieldID fid;
+
+  BAILIF0(bufobj = (*env)->NewDirectByteBuffer(env, coeffs,
+    sizeof(short) * arrayRegion.w * arrayRegion.h));
+  BAILIF0(cls = (*env)->FindClass(env, "java/nio/ByteOrder"));
+  BAILIF0(mid = (*env)->GetStaticMethodID(env, cls, "nativeOrder",
+                                          "()Ljava/nio/ByteOrder;"));
+  BAILIF0(borobj = (*env)->CallStaticObjectMethod(env, cls, mid));
+  BAILIF0(cls = (*env)->GetObjectClass(env, bufobj));
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "order",
+    "(Ljava/nio/ByteOrder;)Ljava/nio/ByteBuffer;"));
+  (*env)->CallObjectMethod(env, bufobj, mid, borobj);  /* returned reference
+                                                          is discarded */
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "asShortBuffer",
+                                    "()Ljava/nio/ShortBuffer;"));
+  BAILIF0(bufobj = (*env)->CallObjectMethod(env, bufobj, mid));  /* bufobj now
+                                                  refers to the ShortBuffer */
+
+  BAILIF0(cls = (*env)->FindClass(env, "java/awt/Rectangle"));
+  BAILIF0(arrayRegionObj = (*env)->AllocObject(env, cls));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.x);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.y);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.w);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  (*env)->SetIntField(env, arrayRegionObj, fid, arrayRegion.h);
+
+  BAILIF0(planeRegionObj = (*env)->AllocObject(env, cls));
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "x", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.x);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "y", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.y);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "width", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.w);
+  BAILIF0(fid = (*env)->GetFieldID(env, cls, "height", "I"));
+  (*env)->SetIntField(env, planeRegionObj, fid, planeRegion.h);
+
+  BAILIF0(cls = (*env)->GetObjectClass(env, cfobj));
+  BAILIF0(mid = (*env)->GetMethodID(env, cls, "customFilter",
+    "(Ljava/nio/ShortBuffer;Ljava/awt/Rectangle;Ljava/awt/Rectangle;IILorg/libjpegturbo/turbojpeg/TJTransform;)V"));
+  (*env)->CallVoidMethod(env, cfobj, mid, bufobj, arrayRegionObj,
+                         planeRegionObj, componentIndex, transformIndex, tobj);
+
+  return 0;
+
+bailout:
+  return -1;
+}
+
+/* TurboJPEG 1.2.x: TJTransformer::transform()
+ *
+ * Applies the transforms described by tobjs to the JPEG image held in the
+ * first jpegSize bytes of jsrcBuf, writing the i-th result into the i-th
+ * byte[] of dstobjs.  tobjs and dstobjs must have the same length.  Each
+ * TJTransform's op, options, x, y, width, height, and cf fields are copied
+ * into a native tjtransform; a non-null cf installs JNICustomFilter() as the
+ * custom filter.  tjTransform() is always called with TJFLAG_NOREALLOC
+ * because the destination buffers are Java arrays.
+ *
+ * Returns an int[] holding the size of each transformed image.  When an
+ * error occurs, the value returned is whatever jdstSizes held at that point
+ * (0 unless the array had already been created).
+ */
+JNIEXPORT jintArray JNICALL Java_org_libjpegturbo_turbojpeg_TJTransformer_transform
+  (JNIEnv *env, jobject obj, jbyteArray jsrcBuf, jint jpegSize,
+   jobjectArray dstobjs, jobjectArray tobjs, jint flags)
+{
+  tjhandle handle = 0;
+  unsigned char *jpegBuf = NULL, **dstBufs = NULL;
+  jsize n = 0;
+  unsigned long *dstSizes = NULL;
+  tjtransform *t = NULL;
+  jbyteArray *jdstBufs = NULL;
+  int i, jpegWidth = 0, jpegHeight = 0, jpegSubsamp;
+  jintArray jdstSizes = 0;
+  jint *dstSizesi = NULL;
+  JNICustomFilterParams *params = NULL;
+
+  GET_HANDLE();
+
+  if ((*env)->GetArrayLength(env, jsrcBuf) < jpegSize)
+    THROW_ARG("Source buffer is not large enough");
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegWidth", "I"));
+  jpegWidth = (int)(*env)->GetIntField(env, obj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegHeight", "I"));
+  jpegHeight = (int)(*env)->GetIntField(env, obj, _fid);
+  BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "jpegSubsamp", "I"));
+  jpegSubsamp = (int)(*env)->GetIntField(env, obj, _fid);
+
+  n = (*env)->GetArrayLength(env, dstobjs);
+  if (n != (*env)->GetArrayLength(env, tobjs))
+    THROW_ARG("Mismatch between size of transforms array and destination buffers array");
+
+  /* dstBufs and jdstBufs are inspected element by element at bailout, so
+     each is cleared as soon as it exists.  Otherwise a later allocation
+     failure would make the cleanup code read uninitialized pointers. */
+  if ((dstBufs =
+       (unsigned char **)malloc(sizeof(unsigned char *) * n)) == NULL)
+    THROW_MEM();
+  for (i = 0; i < n; i++) dstBufs[i] = NULL;
+  if ((jdstBufs = (jbyteArray *)malloc(sizeof(jbyteArray) * n)) == NULL)
+    THROW_MEM();
+  for (i = 0; i < n; i++) jdstBufs[i] = NULL;
+  if ((dstSizes = (unsigned long *)malloc(sizeof(unsigned long) * n)) == NULL)
+    THROW_MEM();
+  if ((t = (tjtransform *)malloc(sizeof(tjtransform) * n)) == NULL)
+    THROW_MEM();
+  if ((params = (JNICustomFilterParams *)malloc(sizeof(JNICustomFilterParams) *
+                                                n)) == NULL)
+    THROW_MEM();
+  for (i = 0; i < n; i++) {
+    dstSizes[i] = 0;
+    memset(&t[i], 0, sizeof(tjtransform));
+    memset(&params[i], 0, sizeof(JNICustomFilterParams));
+  }
+
+  /* Mirror every Java TJTransform into its native tjtransform. */
+  for (i = 0; i < n; i++) {
+    jobject tobj, cfobj;
+
+    BAILIF0(tobj = (*env)->GetObjectArrayElement(env, tobjs, i));
+    BAILIF0(_cls = (*env)->GetObjectClass(env, tobj));
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "op", "I"));
+    t[i].op = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "options", "I"));
+    t[i].options = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "x", "I"));
+    t[i].r.x = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "y", "I"));
+    t[i].r.y = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "width", "I"));
+    t[i].r.w = (*env)->GetIntField(env, tobj, _fid);
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "height", "I"));
+    t[i].r.h = (*env)->GetIntField(env, tobj, _fid);
+
+    BAILIF0(_fid = (*env)->GetFieldID(env, _cls, "cf",
+      "Lorg/libjpegturbo/turbojpeg/TJCustomFilter;"));
+    cfobj = (*env)->GetObjectField(env, tobj, _fid);
+    if (cfobj) {
+      params[i].env = env;
+      params[i].tobj = tobj;
+      params[i].cfobj = cfobj;
+      t[i].customFilter = JNICustomFilter;
+      t[i].data = (void *)&params[i];
+    }
+  }
+
+  /* Check every destination buffer against the worst-case size of its
+     (possibly cropped) output before any array is pinned. */
+  for (i = 0; i < n; i++) {
+    int w = jpegWidth, h = jpegHeight;
+
+    if (t[i].r.w != 0) w = t[i].r.w;
+    if (t[i].r.h != 0) h = t[i].r.h;
+    BAILIF0(jdstBufs[i] = (*env)->GetObjectArrayElement(env, dstobjs, i));
+    if ((unsigned long)(*env)->GetArrayLength(env, jdstBufs[i]) <
+        tjBufSize(w, h, jpegSubsamp))
+      THROW_ARG("Destination buffer is not large enough");
+  }
+  BAILIF0(jpegBuf = (*env)->GetPrimitiveArrayCritical(env, jsrcBuf, 0));
+  for (i = 0; i < n; i++)
+    BAILIF0(dstBufs[i] =
+            (*env)->GetPrimitiveArrayCritical(env, jdstBufs[i], 0));
+
+  if (tjTransform(handle, jpegBuf, jpegSize, n, dstBufs, dstSizes, t,
+                  flags | TJFLAG_NOREALLOC) == -1) {
+    /* Release everything before the exception is constructed. */
+    for (i = 0; i < n; i++)
+      SAFE_RELEASE(jdstBufs[i], dstBufs[i]);
+    SAFE_RELEASE(jsrcBuf, jpegBuf);
+    THROW_TJ();
+  }
+
+  for (i = 0; i < n; i++)
+    SAFE_RELEASE(jdstBufs[i], dstBufs[i]);
+  SAFE_RELEASE(jsrcBuf, jpegBuf);
+
+  /* NewIntArray() can fail; do not hand a null array to
+     GetIntArrayElements(). */
+  BAILIF0(jdstSizes = (*env)->NewIntArray(env, n));
+  BAILIF0(dstSizesi = (*env)->GetIntArrayElements(env, jdstSizes, 0));
+  for (i = 0; i < n; i++) dstSizesi[i] = (int)dstSizes[i];
+
+bailout:
+  if (dstSizesi) (*env)->ReleaseIntArrayElements(env, jdstSizes, dstSizesi, 0);
+  if (dstBufs) {
+    for (i = 0; i < n; i++) {
+      if (dstBufs[i] && jdstBufs && jdstBufs[i])
+        (*env)->ReleasePrimitiveArrayCritical(env, jdstBufs[i], dstBufs[i], 0);
+    }
+    free(dstBufs);
+  }
+  SAFE_RELEASE(jsrcBuf, jpegBuf);
+  free(jdstBufs);
+  free(dstSizes);
+  free(t);
+  free(params);  /* only referenced through t[i].data, which is done with */
+  return jdstSizes;
+}
+
+/* TurboJPEG 1.2.x: TJDecompressor::destroy()
+ *
+ * Delegates to the TJCompressor::destroy() JNI entry point with the same
+ * env and obj.
+ */
+JNIEXPORT void JNICALL Java_org_libjpegturbo_turbojpeg_TJDecompressor_destroy
+  (JNIEnv *env, jobject obj)
+{
+  Java_org_libjpegturbo_turbojpeg_TJCompressor_destroy(env, obj);
+}
diff --git a/external/jpeg/turbojpeg.c b/external/jpeg/turbojpeg.c
index 8260555b51e7..793a3eedc2ab 100644
--- a/external/jpeg/turbojpeg.c
+++ b/external/jpeg/turbojpeg.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (C)2009-2020 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2021 D. R. Commander.  All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -112,6 +112,32 @@ typedef struct _tjinstance {
   boolean isInstanceError;
 } tjinstance;
 
+struct my_progress_mgr {
+  struct jpeg_progress_mgr pub;  /* first member: a pointer to it is cast back to my_progress_ptr */
+  tjinstance *this;              /* instance whose errStr and isInstanceError the monitor updates */
+};
+typedef struct my_progress_mgr *my_progress_ptr;
+
+/*
+ * Progress-monitor callback that aborts decompression of images with too
+ * many scans.
+ *
+ * Does nothing for compressors, or while the decompressor's
+ * input_scan_number is at most 500.  Beyond that it records the error text
+ * in both the instance's errStr and the global errStr, marks the error as an
+ * instance error, clears the warning flag, and unwinds through the error
+ * manager's setjmp buffer.
+ */
+static void my_progress_monitor(j_common_ptr dinfo)
+{
+  my_error_ptr errmgr = (my_error_ptr)dinfo->err;
+  my_progress_ptr progmgr = (my_progress_ptr)dinfo->progress;
+  tjinstance *inst;
+
+  if (!dinfo->is_decompressor)
+    return;
+  if (((j_decompress_ptr)dinfo)->input_scan_number <= 500)
+    return;
+
+  inst = progmgr->this;
+  snprintf(inst->errStr, JMSG_LENGTH_MAX,
+           "Progressive JPEG image has more than 500 scans");
+  snprintf(errStr, JMSG_LENGTH_MAX,
+           "Progressive JPEG image has more than 500 scans");
+  inst->isInstanceError = TRUE;
+  errmgr->warning = FALSE;
+  longjmp(errmgr->setjmp_buffer, 1);
+}
+
 static const int pixelsize[TJ_NUMSAMP] = { 3, 3, 3, 1, 3, 3 };
 
 static const JXFORM_CODE xformtypes[TJ_NUMXOP] = {
@@ -178,6 +204,11 @@ static int cs2pf[JPEG_NUMCS] = {
   this->isInstanceError = TRUE;  THROWG(m) \
 }
 
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+/* Private flag that triggers different TurboJPEG API behavior when fuzzing */
+#define TJFLAG_FUZZING  (1 << 30)
+#endif
+
 #define GET_INSTANCE(handle) \
   tjinstance *this = (tjinstance *)handle; \
   j_compress_ptr cinfo = NULL; \
@@ -689,7 +720,10 @@ DLLEXPORT int tjCompress2(tjhandle handle, const unsigned char *srcBuf,
   jpeg_finish_compress(cinfo);
 
 bailout:
-  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  if (cinfo->global_state > CSTATE_START) {
+    if (alloc) (*cinfo->dest->term_destination) (cinfo);
+    jpeg_abort_compress(cinfo);
+  }
   free(row_pointer);
   if (this->jerr.warning) retval = -1;
   this->jerr.stopOnWarning = FALSE;
@@ -1057,7 +1091,10 @@ DLLEXPORT int tjCompressFromYUVPlanes(tjhandle handle,
   jpeg_finish_compress(cinfo);
 
 bailout:
-  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  if (cinfo->global_state > CSTATE_START) {
+    if (alloc) (*cinfo->dest->term_destination) (cinfo);
+    jpeg_abort_compress(cinfo);
+  }
   for (i = 0; i < MAX_COMPONENTS; i++) {
     free(tmpbuf[i]);
     free(inbuf[i]);
@@ -1245,6 +1282,7 @@ DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
 {
   JSAMPROW *row_pointer = NULL;
   int i, retval = 0, jpegwidth, jpegheight, scaledw, scaledh;
+  struct my_progress_mgr progress;
 
   GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
@@ -1261,6 +1299,14 @@ DLLEXPORT int tjDecompress2(tjhandle handle, const unsigned char *jpegBuf,
   else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
+  if (flags & TJFLAG_LIMITSCANS) {
+    MEMZERO(&progress, sizeof(struct my_progress_mgr));
+    progress.pub.progress_monitor = my_progress_monitor;
+    progress.this = this;
+    dinfo->progress = &progress.pub;
+  } else
+    dinfo->progress = NULL;
+
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
     retval = -1;  goto bailout;
@@ -1579,6 +1625,7 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
   JSAMPLE *_tmpbuf = NULL, *ptr;
   JSAMPROW *outbuf[MAX_COMPONENTS], *tmpbuf[MAX_COMPONENTS];
   int dctsize;
+  struct my_progress_mgr progress;
 
   GET_DINSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
@@ -1600,6 +1647,14 @@ DLLEXPORT int tjDecompressToYUVPlanes(tjhandle handle,
   else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
+  if (flags & TJFLAG_LIMITSCANS) {
+    MEMZERO(&progress, sizeof(struct my_progress_mgr));
+    progress.pub.progress_monitor = my_progress_monitor;
+    progress.this = this;
+    dinfo->progress = &progress.pub;
+  } else
+    dinfo->progress = NULL;
+
   if (setjmp(this->jerr.setjmp_buffer)) {
     /* If we get here, the JPEG code has signaled an error. */
     retval = -1;  goto bailout;
@@ -1837,7 +1892,8 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
 {
   jpeg_transform_info *xinfo = NULL;
   jvirt_barray_ptr *srccoefs, *dstcoefs;
-  int retval = 0, i, jpegSubsamp, saveMarkers = 0;
+  int retval = 0, alloc = 1, i, jpegSubsamp, saveMarkers = 0;
+  struct my_progress_mgr progress;
 
   GET_INSTANCE(handle);
   this->jerr.stopOnWarning = (flags & TJFLAG_STOPONWARNING) ? TRUE : FALSE;
@@ -1854,6 +1910,14 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
   else if (flags & TJFLAG_FORCESSE2) putenv("JSIMD_FORCESSE2=1");
 #endif
 
+  if (flags & TJFLAG_LIMITSCANS) {
+    MEMZERO(&progress, sizeof(struct my_progress_mgr));
+    progress.pub.progress_monitor = my_progress_monitor;
+    progress.this = this;
+    dinfo->progress = &progress.pub;
+  } else
+    dinfo->progress = NULL;
+
   if ((xinfo =
        (jpeg_transform_info *)malloc(sizeof(jpeg_transform_info) * n)) == NULL)
     THROW("tjTransform(): Memory allocation failure");
@@ -1916,7 +1980,7 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
   srccoefs = jpeg_read_coefficients(dinfo);
 
   for (i = 0; i < n; i++) {
-    int w, h, alloc = 1;
+    int w, h;
 
     if (!xinfo[i].crop) {
       w = dinfo->image_width;  h = dinfo->image_height;
@@ -1974,7 +2038,10 @@ DLLEXPORT int tjTransform(tjhandle handle, const unsigned char *jpegBuf,
   jpeg_finish_decompress(dinfo);
 
 bailout:
-  if (cinfo->global_state > CSTATE_START) jpeg_abort_compress(cinfo);
+  if (cinfo->global_state > CSTATE_START) {
+    if (alloc) (*cinfo->dest->term_destination) (cinfo);
+    jpeg_abort_compress(cinfo);
+  }
   if (dinfo->global_state > DSTATE_START) jpeg_abort_decompress(dinfo);
   free(xinfo);
   if (this->jerr.warning) retval = -1;
@@ -2034,6 +2101,11 @@ DLLEXPORT unsigned char *tjLoadImage(const char *filename, int *width,
     THROWG("tjLoadImage(): Unsupported file type");
 
   src->input_file = file;
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  /* Refuse to load images larger than 1 Megapixel when fuzzing. */
+  if (flags & TJFLAG_FUZZING)
+    src->max_pixels = 1048576;
+#endif
   (*src->start_input) (cinfo, src);
   (*cinfo->mem->realize_virt_arrays) ((j_common_ptr)cinfo);
 
diff --git a/external/jpeg/turbojpeg.h b/external/jpeg/turbojpeg.h
index f3209dd34257..c2f6b5141dea 100644
--- a/external/jpeg/turbojpeg.h
+++ b/external/jpeg/turbojpeg.h
@@ -1,5 +1,6 @@
 /*
- * Copyright (C)2009-2015, 2017, 2020 D. R. Commander.  All Rights Reserved.
+ * Copyright (C)2009-2015, 2017, 2020-2021 D. R. Commander.
+ *                                         All Rights Reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -418,6 +419,16 @@ enum TJCS {
  * reduce compression and decompression performance considerably.
  */
 #define TJFLAG_PROGRESSIVE  16384
+/**
+ * Limit the number of progressive JPEG scans that the decompression and
+ * transform functions will process.  If a progressive JPEG image contains an
+ * unreasonably large number of scans, then this flag will cause the
+ * decompression and transform functions to return an error.  The primary
+ * purpose of this is to allow security-critical applications to guard against
+ * an exploit of the progressive JPEG format described in
+ * <a href="https://libjpeg-turbo.org/pmwiki/uploads/About/TwoIssueswiththeJPEGStandard.pdf" target="_blank">this report</a>.
+ */
+#define TJFLAG_LIMITSCANS  32768
 
 
 /**
diff --git a/external/jpeg/usage.txt b/external/jpeg/usage.txt
new file mode 100644
index 000000000000..f7fa3c08ed2e
--- /dev/null
+++ b/external/jpeg/usage.txt
@@ -0,0 +1,683 @@
+NOTE:  This file was modified by The libjpeg-turbo Project to include only
+information relevant to libjpeg-turbo and to wordsmith certain sections.
+
+USAGE instructions for the Independent JPEG Group's JPEG software
+=================================================================
+
+This file describes usage of the JPEG conversion programs cjpeg and djpeg,
+as well as the utility programs jpegtran, rdjpgcom and wrjpgcom.  (See
+the other documentation files if you wish to use the JPEG library within
+your own programs.)
+
+If you are on a Unix machine you may prefer to read the Unix-style manual
+pages in files cjpeg.1, djpeg.1, jpegtran.1, rdjpgcom.1, wrjpgcom.1.
+
+
+INTRODUCTION
+
+These programs implement JPEG image encoding, decoding, and transcoding.
+JPEG (pronounced "jay-peg") is a standardized compression method for
+full-color and grayscale images.
+
+
+GENERAL USAGE
+
+We provide two programs, cjpeg to compress an image file into JPEG format,
+and djpeg to decompress a JPEG file back into a conventional image format.
+
+On Unix-like systems, you say:
+        cjpeg [switches] [imagefile] >jpegfile
+or
+        djpeg [switches] [jpegfile]  >imagefile
+The programs read the specified input file, or standard input if none is
+named.  They always write to standard output (with trace/error messages to
+standard error).  These conventions are handy for piping images between
+programs.
+
+On most non-Unix systems, you say:
+        cjpeg [switches] imagefile jpegfile
+or
+        djpeg [switches] jpegfile  imagefile
+i.e., both the input and output files are named on the command line.  This
+style is a little more foolproof, and it loses no functionality if you don't
+have pipes.  (You can get this style on Unix too, if you prefer, by defining
+TWO_FILE_COMMANDLINE when you compile the programs; see install.txt.)
+
+You can also say:
+        cjpeg [switches] -outfile jpegfile  imagefile
+or
+        djpeg [switches] -outfile imagefile  jpegfile
+This syntax works on all systems, so it is useful for scripts.
+
+The currently supported image file formats are: PPM (PBMPLUS color format),
+PGM (PBMPLUS grayscale format), BMP, GIF, and Targa.  cjpeg recognizes the
+input image format automatically, with the exception of some Targa files.  You
+have to tell djpeg which format to generate.
+
+JPEG files are in the de facto standard JFIF file format.  There are other,
+less widely used JPEG-based file formats, but we don't support them.
+
+All switch names may be abbreviated; for example, -grayscale may be written
+-gray or -gr.  Most of the "basic" switches can be abbreviated to as little as
+one letter.  Upper and lower case are equivalent (-BMP is the same as -bmp).
+British spellings are also accepted (e.g., -greyscale), though for brevity
+these are not mentioned below.
+
+
+CJPEG DETAILS
+
+The basic command line switches for cjpeg are:
+
+        -quality N[,...]  Scale quantization tables to adjust image quality.
+                          Quality is 0 (worst) to 100 (best); default is 75.
+                          (See below for more info.)
+
+        -grayscale      Create monochrome JPEG file from color input.
+                        Be sure to use this switch when compressing a grayscale
+                        BMP or GIF file, because cjpeg isn't bright enough to
+                        notice whether a BMP or GIF file uses only shades of
+                        gray.  By saying -grayscale, you'll get a smaller JPEG
+                        file that takes less time to process.
+
+        -rgb            Create RGB JPEG file.
+                        Using this switch suppresses the conversion from RGB
+                        colorspace input to the default YCbCr JPEG colorspace.
+
+        -optimize       Perform optimization of entropy encoding parameters.
+                        Without this, default encoding parameters are used.
+                        -optimize usually makes the JPEG file a little smaller,
+                        but cjpeg runs somewhat slower and needs much more
+                        memory.  Image quality and speed of decompression are
+                        unaffected by -optimize.
+
+        -progressive    Create progressive JPEG file (see below).
+
+        -targa          Input file is Targa format.  Targa files that contain
+                        an "identification" field will not be automatically
+                        recognized by cjpeg; for such files you must specify
+                        -targa to make cjpeg treat the input as Targa format.
+                        For most Targa files, you won't need this switch.
+
+The -quality switch lets you trade off compressed file size against quality of
+the reconstructed image: the higher the quality setting, the larger the JPEG
+file, and the closer the output image will be to the original input.  Normally
+you want to use the lowest quality setting (smallest file) that decompresses
+into something visually indistinguishable from the original image.  For this
+purpose the quality setting should generally be between 50 and 95 (the default
+is 75) for photographic images.  If you see defects at -quality 75, then go up
+5 or 10 counts at a time until you are happy with the output image.  (The
+optimal setting will vary from one image to another.)
+
+-quality 100 will generate a quantization table of all 1's, minimizing loss
+in the quantization step (but there is still information loss in subsampling,
+as well as roundoff error.)  For most images, specifying a quality value above
+about 95 will increase the size of the compressed file dramatically, and while
+the quality gain from these higher quality values is measurable (using metrics
+such as PSNR or SSIM), it is rarely perceivable by human vision.
+
+In the other direction, quality values below 50 will produce very small files
+of low image quality.  Settings around 5 to 10 might be useful in preparing an
+index of a large image library, for example.  Try -quality 2 (or so) for some
+amusing Cubist effects.  (Note: quality values below about 25 generate 2-byte
+quantization tables, which are considered optional in the JPEG standard.
+cjpeg emits a warning message when you give such a quality value, because some
+other JPEG programs may be unable to decode the resulting file.  Use -baseline
+if you need to ensure compatibility at low quality values.)
+
+The -quality option has been extended in this version of cjpeg to support
+separate quality settings for luminance and chrominance (or, in general,
+separate settings for every quantization table slot.)  The principle is the
+same as chrominance subsampling:  since the human eye is more sensitive to
+spatial changes in brightness than spatial changes in color, the chrominance
+components can be quantized more than the luminance components without
+incurring any visible image quality loss.  However, unlike subsampling, this
+feature reduces data in the frequency domain instead of the spatial domain,
+which allows for more fine-grained control.  This option is useful in
+quality-sensitive applications, for which the artifacts generated by
+subsampling may be unacceptable.
+
+The -quality option accepts a comma-separated list of parameters, which
+respectively refer to the quality levels that should be assigned to the
+quantization table slots.  If there are more q-table slots than parameters,
+then the last parameter is replicated.  Thus, if only one quality parameter is
+given, this is used for both luminance and chrominance (slots 0 and 1,
+respectively), preserving the legacy behavior of cjpeg v6b and prior.  More (or
+customized) quantization tables can be set with the -qtables option and
+assigned to components with the -qslots option (see the "wizard" switches
+below.)
+
+JPEG files generated with separate luminance and chrominance quality are
+fully compliant with standard JPEG decoders.
+
+CAUTION: For this setting to be useful, be sure to pass an argument of
+-sample 1x1 to cjpeg to disable chrominance subsampling.  Otherwise, the
+default subsampling level (2x2, AKA "4:2:0") will be used.
+
+The -progressive switch creates a "progressive JPEG" file.  In this type of
+JPEG file, the data is stored in multiple scans of increasing quality.  If the
+file is being transmitted over a slow communications link, the decoder can use
+the first scan to display a low-quality image very quickly, and can then
+improve the display with each subsequent scan.  The final image is exactly
+equivalent to a standard JPEG file of the same quality setting, and the total
+file size is about the same --- often a little smaller.
+
+Switches for advanced users:
+
+        -arithmetic     Use arithmetic coding.  CAUTION: arithmetic coded JPEG
+                        is not yet widely implemented, so many decoders will
+                        be unable to view an arithmetic coded JPEG file at
+                        all.
+
+        -dct int        Use accurate integer DCT method (default).
+        -dct fast       Use less accurate integer DCT method [legacy feature].
+                        When the Independent JPEG Group's software was first
+                        released in 1991, the compression time for a
+                        1-megapixel JPEG image on a mainstream PC was measured
+                        in minutes.  Thus, the fast integer DCT algorithm
+                        provided noticeable performance benefits.  On modern
+                        CPUs running libjpeg-turbo, however, the compression
+                        time for a 1-megapixel JPEG image is measured in
+                        milliseconds, and thus the performance benefits of the
+                        fast algorithm are much less noticeable.  On modern
+                        x86/x86-64 CPUs that support AVX2 instructions, the
+                        fast and int methods have similar performance.  On
+                        other types of CPUs, the fast method is generally about
+                        5-15% faster than the int method.
+
+                        For quality levels of 90 and below, there should be
+                        little or no perceptible quality difference between the
+                        two algorithms.  For quality levels above 90, however,
+                        the difference between the fast and int methods becomes
+                        more pronounced.  With quality=97, for instance, the
+                        fast method incurs generally about a 1-3 dB loss in
+                        PSNR relative to the int method, but this can be larger
+                        for some images.  Do not use the fast method with
+                        quality levels above 97.  The algorithm often
+                        degenerates at quality=98 and above and can actually
+                        produce a more lossy image than if lower quality levels
+                        had been used.  Also, in libjpeg-turbo, the fast method
+                        is not fully accelerated for quality levels above 97,
+                        so it will be slower than the int method.
+        -dct float      Use floating-point DCT method [legacy feature].
+                        The float method does not produce significantly more
+                        accurate results than the int method, and it is much
+                        slower.  The float method may also give different
+                        results on different machines due to varying roundoff
+                        behavior, whereas the integer methods should give the
+                        same results on all machines.
+
+        -restart N      Emit a JPEG restart marker every N MCU rows, or every
+                        N MCU blocks if "B" is attached to the number.
+                        -restart 0 (the default) means no restart markers.
+
+        -smooth N       Smooth the input image to eliminate dithering noise.
+                        N, ranging from 1 to 100, indicates the strength of
+                        smoothing.  0 (the default) means no smoothing.
+
+        -maxmemory N    Set limit for amount of memory to use in processing
+                        large images.  Value is in thousands of bytes, or
+                        millions of bytes if "M" is attached to the number.
+                        For example, -max 4m selects 4000000 bytes.  If more
+                        space is needed, an error will occur.
+
+        -verbose        Enable debug printout.  More -v's give more printout.
+        or  -debug      Also, version information is printed at startup.
+
+The -restart option inserts extra markers that allow a JPEG decoder to
+resynchronize after a transmission error.  Without restart markers, any damage
+to a compressed file will usually ruin the image from the point of the error
+to the end of the image; with restart markers, the damage is usually confined
+to the portion of the image up to the next restart marker.  Of course, the
+restart markers occupy extra space.  We recommend -restart 1 for images that
+will be transmitted across unreliable networks such as Usenet.
+
+The -smooth option filters the input to eliminate fine-scale noise.  This is
+often useful when converting dithered images to JPEG: a moderate smoothing
+factor of 10 to 50 gets rid of dithering patterns in the input file, resulting
+in a smaller JPEG file and a better-looking image.  Too large a smoothing
+factor will visibly blur the image, however.
+
+Switches for wizards:
+
+        -baseline       Force baseline-compatible quantization tables to be
+                        generated.  This clamps quantization values to 8 bits
+                        even at low quality settings.  (This switch is poorly
+                        named, since it does not ensure that the output is
+                        actually baseline JPEG.  For example, you can use
+                        -baseline and -progressive together.)
+
+        -qtables file   Use the quantization tables given in the specified
+                        text file.
+
+        -qslots N[,...] Select which quantization table to use for each color
+                        component.
+
+        -sample HxV[,...]  Set JPEG sampling factors for each color component.
+
+        -scans file     Use the scan script given in the specified text file.
+
+The "wizard" switches are intended for experimentation with JPEG.  If you
+don't know what you are doing, DON'T USE THEM.  These switches are documented
+further in the file wizard.txt.
+
+
+DJPEG DETAILS
+
+The basic command line switches for djpeg are:
+
+        -colors N       Reduce image to at most N colors.  This reduces the
+        or -quantize N  number of colors used in the output image, so that it
+                        can be displayed on a colormapped display or stored in
+                        a colormapped file format.  For example, if you have
+                        an 8-bit display, you'd need to reduce to 256 or fewer
+                        colors.  (-colors is the recommended name, -quantize
+                        is provided only for backwards compatibility.)
+
+        -fast           Select recommended processing options for fast, low
+                        quality output.  (The default options are chosen for
+                        highest quality output.)  Currently, this is equivalent
+                        to "-dct fast -nosmooth -onepass -dither ordered".
+
+        -grayscale      Force grayscale output even if JPEG file is color.
+                        Useful for viewing on monochrome displays; also,
+                        djpeg runs noticeably faster in this mode.
+
+        -rgb            Force RGB output even if JPEG file is grayscale.
+
+        -scale M/N      Scale the output image by a factor M/N.  Currently
+                        the scale factor must be M/8, where M is an integer
+                        between 1 and 16 inclusive, or any reduced fraction
+                        thereof (such as 1/2, 3/4, etc.).  Scaling is handy if
+                        the image is larger than your screen; also, djpeg runs
+                        much faster when scaling down the output.
+
+        -bmp            Select BMP output format (Windows flavor).  8-bit
+                        colormapped format is emitted if -colors or -grayscale
+                        is specified, or if the JPEG file is grayscale;
+                        otherwise, 24-bit full-color format is emitted.
+
+        -gif            Select GIF output format (LZW-compressed).  Since GIF
+                        does not support more than 256 colors, -colors 256 is
+                        assumed (unless you specify a smaller number of
+                        colors).  If you specify -fast, the default number of
+                        colors is 216.
+
+        -gif0           Select GIF output format (uncompressed).  Since GIF
+                        does not support more than 256 colors, -colors 256 is
+                        assumed (unless you specify a smaller number of
+                        colors).  If you specify -fast, the default number of
+                        colors is 216.
+
+        -os2            Select BMP output format (OS/2 1.x flavor).  8-bit
+                        colormapped format is emitted if -colors or -grayscale
+                        is specified, or if the JPEG file is grayscale;
+                        otherwise, 24-bit full-color format is emitted.
+
+        -pnm            Select PBMPLUS (PPM/PGM) output format (this is the
+                        default format).  PGM is emitted if the JPEG file is
+                        grayscale or if -grayscale is specified; otherwise
+                        PPM is emitted.
+
+        -targa          Select Targa output format.  Grayscale format is
+                        emitted if the JPEG file is grayscale or if
+                        -grayscale is specified; otherwise, colormapped format
+                        is emitted if -colors is specified; otherwise, 24-bit
+                        full-color format is emitted.
+
+Switches for advanced users:
+
+        -dct int        Use accurate integer DCT method (default).
+        -dct fast       Use less accurate integer DCT method [legacy feature].
+                        When the Independent JPEG Group's software was first
+                        released in 1991, the decompression time for a
+                        1-megapixel JPEG image on a mainstream PC was measured
+                        in minutes.  Thus, the fast integer DCT algorithm
+                        provided noticeable performance benefits.  On modern
+                        CPUs running libjpeg-turbo, however, the decompression
+                        time for a 1-megapixel JPEG image is measured in
+                        milliseconds, and thus the performance benefits of the
+                        fast algorithm are much less noticeable.  On modern
+                        x86/x86-64 CPUs that support AVX2 instructions, the
+                        fast and int methods have similar performance.  On
+                        other types of CPUs, the fast method is generally about
+                        5-15% faster than the int method.
+
+                        If the JPEG image was compressed using a quality level
+                        of 85 or below, then there should be little or no
+                        perceptible quality difference between the two
+                        algorithms.  When decompressing images that were
+                        compressed using quality levels above 85, however, the
+                        difference between the fast and int methods becomes
+                        more pronounced.  With images compressed using
+                        quality=97, for instance, the fast method incurs
+                        generally about a 4-6 dB loss in PSNR relative to the
+                        int method, but this can be larger for some images.  If
+                        you can avoid it, do not use the fast method when
+                        decompressing images that were compressed using quality
+                        levels above 97.  The algorithm often degenerates for
+                        such images and can actually produce a more lossy
+                        output image than if the JPEG image had been compressed
+                        using lower quality levels.
+        -dct float      Use floating-point DCT method [legacy feature].
+                        The float method does not produce significantly more
+                        accurate results than the int method, and it is much
+                        slower.  The float method may also give different
+                        results on different machines due to varying roundoff
+                        behavior, whereas the integer methods should give the
+                        same results on all machines.
+
+        -dither fs      Use Floyd-Steinberg dithering in color quantization.
+        -dither ordered Use ordered dithering in color quantization.
+        -dither none    Do not use dithering in color quantization.
+                        By default, Floyd-Steinberg dithering is applied when
+                        quantizing colors; this is slow but usually produces
+                        the best results.  Ordered dither is a compromise
+                        between speed and quality; no dithering is fast but
+                        usually looks awful.  Note that these switches have
+                        no effect unless color quantization is being done.
+                        Ordered dither is only available in -onepass mode.
+
+        -map FILE       Quantize to the colors used in the specified image
+                        file.  This is useful for producing multiple files
+                        with identical color maps, or for forcing a predefined
+                        set of colors to be used.  The FILE must be a GIF
+                        or PPM file.  This option overrides -colors and
+                        -onepass.
+
+        -nosmooth       Use a faster, lower-quality upsampling routine.
+
+        -onepass        Use one-pass instead of two-pass color quantization.
+                        The one-pass method is faster and needs less memory,
+                        but it produces a lower-quality image.  -onepass is
+                        ignored unless you also say -colors N.  Also,
+                        the one-pass method is always used for grayscale
+                        output (the two-pass method is no improvement then).
+
+        -maxmemory N    Set limit for amount of memory to use in processing
+                        large images.  Value is in thousands of bytes, or
+                        millions of bytes if "M" is attached to the number.
+                        For example, -max 4m selects 4000000 bytes.  If more
+                        space is needed, an error will occur.
+
+        -verbose        Enable debug printout.  More -v's give more printout.
+        or  -debug      Also, version information is printed at startup.
+
+
+HINTS FOR CJPEG
+
+Color GIF files are not the ideal input for JPEG; JPEG is really intended for
+compressing full-color (24-bit) images.  In particular, don't try to convert
+cartoons, line drawings, and other images that have only a few distinct
+colors.  GIF works great on these, JPEG does not.  If you want to convert a
+GIF to JPEG, you should experiment with cjpeg's -quality and -smooth options
+to get a satisfactory conversion.  -smooth 10 or so is often helpful.
+
+Avoid running an image through a series of JPEG compression/decompression
+cycles.  Image quality loss will accumulate; after ten or so cycles the image
+may be noticeably worse than it was after one cycle.  It's best to use a
+lossless format while manipulating an image, then convert to JPEG format when
+you are ready to file the image away.
+
+The -optimize option to cjpeg is worth using when you are making a "final"
+version for posting or archiving.  It's also a win when you are using low
+quality settings to make very small JPEG files; the percentage improvement
+is often a lot more than it is on larger files.  (At present, -optimize
+mode is always selected when generating progressive JPEG files.)
+
+
+HINTS FOR DJPEG
+
+To get a quick preview of an image, use the -grayscale and/or -scale switches.
+"-grayscale -scale 1/8" is the fastest case.
+
+Several options are available that trade off image quality to gain speed.
+"-fast" turns on the recommended settings.
+
+"-dct fast" and/or "-nosmooth" gain speed at a small sacrifice in quality.
+When producing a color-quantized image, "-onepass -dither ordered" is fast but
+much lower quality than the default behavior.  "-dither none" may give
+acceptable results in two-pass mode, but is seldom tolerable in one-pass mode.
+
+
+HINTS FOR BOTH PROGRAMS
+
+If the memory needed by cjpeg or djpeg exceeds the limit specified by
+-maxmemory, an error will occur.  You can leave out -progressive and -optimize
+(for cjpeg) or specify -onepass (for djpeg) to reduce memory usage.
+
+On machines that have "environment" variables, you can define the environment
+variable JPEGMEM to set the default memory limit.  The value is specified as
+described for the -maxmemory switch.  JPEGMEM overrides the default value
+specified when the program was compiled, and itself is overridden by an
+explicit -maxmemory switch.
+
+
+JPEGTRAN
+
+jpegtran performs various useful transformations of JPEG files.
+It can translate the coded representation from one variant of JPEG to another,
+for example from baseline JPEG to progressive JPEG or vice versa.  It can also
+perform some rearrangements of the image data, for example turning an image
+from landscape to portrait format by rotation.  For EXIF files and JPEG files
+containing Exif data, you may prefer to use exiftran instead.
+
+jpegtran works by rearranging the compressed data (DCT coefficients), without
+ever fully decoding the image.  Therefore, its transformations are lossless:
+there is no image degradation at all, which would not be true if you used
+djpeg followed by cjpeg to accomplish the same conversion.  But by the same
+token, jpegtran cannot perform lossy operations such as changing the image
+quality.  However, while the image data is losslessly transformed, metadata
+can be removed.  See the -copy option for specifics.
+
+jpegtran uses a command line syntax similar to cjpeg or djpeg.
+On Unix-like systems, you say:
+        jpegtran [switches] [inputfile] >outputfile
+On most non-Unix systems, you say:
+        jpegtran [switches] inputfile outputfile
+where both the input and output files are JPEG files.
+
+To specify the coded JPEG representation used in the output file,
+jpegtran accepts a subset of the switches recognized by cjpeg:
+        -optimize       Perform optimization of entropy encoding parameters.
+        -progressive    Create progressive JPEG file.
+        -arithmetic     Use arithmetic coding.
+        -restart N      Emit a JPEG restart marker every N MCU rows, or every
+                        N MCU blocks if "B" is attached to the number.
+        -scans file     Use the scan script given in the specified text file.
+See the previous discussion of cjpeg for more details about these switches.
+If you specify none of these switches, you get a plain baseline-JPEG output
+file.  The quality setting and so forth are determined by the input file.
+
+The image can be losslessly transformed by giving one of these switches:
+        -flip horizontal        Mirror image horizontally (left-right).
+        -flip vertical          Mirror image vertically (top-bottom).
+        -rotate 90              Rotate image 90 degrees clockwise.
+        -rotate 180             Rotate image 180 degrees.
+        -rotate 270             Rotate image 270 degrees clockwise (or 90 ccw).
+        -transpose              Transpose image (across UL-to-LR axis).
+        -transverse             Transverse transpose (across UR-to-LL axis).
+
+The transpose transformation has no restrictions regarding image dimensions.
+The other transformations operate rather oddly if the image dimensions are not
+a multiple of the iMCU size (usually 8 or 16 pixels), because they can only
+transform complete blocks of DCT coefficient data in the desired way.
+
+jpegtran's default behavior when transforming an odd-size image is designed
+to preserve exact reversibility and mathematical consistency of the
+transformation set.  As stated, transpose is able to flip the entire image
+area.  Horizontal mirroring leaves any partial iMCU column at the right edge
+untouched, but is able to flip all rows of the image.  Similarly, vertical
+mirroring leaves any partial iMCU row at the bottom edge untouched, but is
+able to flip all columns.  The other transforms can be built up as sequences
+of transpose and flip operations; for consistency, their actions on edge
+pixels are defined to be the same as the end result of the corresponding
+transpose-and-flip sequence.
+
+For practical use, you may prefer to discard any untransformable edge pixels
+rather than having a strange-looking strip along the right and/or bottom edges
+of a transformed image.  To do this, add the -trim switch:
+        -trim           Drop non-transformable edge blocks.
+Obviously, a transformation with -trim is not reversible, so strictly speaking
+jpegtran with this switch is not lossless.  Also, the expected mathematical
+equivalences between the transformations no longer hold.  For example,
+"-rot 270 -trim" trims only the bottom edge, but "-rot 90 -trim" followed by
+"-rot 180 -trim" trims both edges.
+
+If you are only interested in perfect transformations, add the -perfect switch:
+        -perfect        Fail with an error if the transformation is not
+                        perfect.
+For example, you may want to do
+  jpegtran -rot 90 -perfect foo.jpg || djpeg foo.jpg | pnmflip -r90 | cjpeg
+to do a perfect rotation, if available, or an approximated one if not.
+
+This version of jpegtran also offers a lossless crop option, which discards
+data outside of a given image region but losslessly preserves what is inside.
+Like the rotate and flip transforms, lossless crop is restricted by the current
+JPEG format; the upper left corner of the selected region must fall on an iMCU
+boundary.  If it doesn't, then it is silently moved up and/or left to the
+nearest iMCU boundary (the lower right corner is unchanged).  Thus, the output
+image covers at least the requested region, but it may cover more.  The
+adjustment of the region dimensions may be optionally disabled by attaching an
+'f' character ("force") to the width or height number.
+
+The image can be losslessly cropped by giving the switch:
+        -crop WxH+X+Y   Crop to a rectangular region of width W and height H,
+                        starting at point X,Y.
+
+If W or H is larger than the width/height of the input image, then the output
+image is expanded in size, and the expanded region is filled in with zeros
+(neutral gray).  Attaching an 'f' character ("flatten") to the width number
+will cause each block in the expanded region to be filled in with the DC
+coefficient of the nearest block in the input image rather than grayed out.
+Attaching an 'r' character ("reflect") to the width number will cause the
+expanded region to be filled in with repeated reflections of the input image
+rather than grayed out.
+
+A complementary lossless wipe option is provided to discard (gray out) data
+inside a given image region while losslessly preserving what is outside:
+        -wipe WxH+X+Y   Wipe (gray out) a rectangular region of width W and
+                        height H from the input image, starting at point X,Y.
+
+Attaching an 'f' character ("flatten") to the width number will cause the
+region to be filled with the average of adjacent blocks rather than grayed out.
+If the wipe region and the region outside the wipe region, when adjusted to the
+nearest iMCU boundary, form two horizontally adjacent rectangles, then
+attaching an 'r' character ("reflect") to the width number will cause the wipe
+region to be filled with repeated reflections of the outside region rather than
+grayed out.
+
+A lossless drop option is also provided, which allows another JPEG image to be
+inserted ("dropped") into the input image data at a given position, replacing
+the existing image data at that position:
+        -drop +X+Y filename     Drop (insert) another image at point X,Y
+
+Both the input image and the drop image must have the same subsampling level.
+It is best if they also have the same quantization (quality).  Otherwise, the
+quantization of the output image will be adapted to accommodate the higher of
+the input image quality and the drop image quality.  The trim option can be
+used with the drop option to requantize the drop image to match the input
+image.  Note that a grayscale image can be dropped into a full-color image or
+vice versa, as long as the full-color image has no vertical subsampling.  If
+the input image is grayscale and the drop image is full-color, then the
+chrominance channels from the drop image will be discarded.
+
+Other not-strictly-lossless transformation switches are:
+
+        -grayscale      Force grayscale output.
+This option discards the chrominance channels if the input image is YCbCr
+(i.e., a standard color JPEG), resulting in a grayscale JPEG file.  The
+luminance channel is preserved exactly, so this is a better method of reducing
+to grayscale than decompression, conversion, and recompression.  This switch
+is particularly handy for fixing a monochrome picture that was mistakenly
+encoded as a color JPEG.  (In such a case, the space savings from getting rid
+of the near-empty chroma channels won't be large; but the decoding time for
+a grayscale JPEG is substantially less than that for a color JPEG.)
+
+jpegtran also recognizes these switches that control what to do with "extra"
+markers, such as comment blocks:
+        -copy none      Copy no extra markers from source file.  This setting
+                        suppresses all comments and other metadata in the
+                        source file.
+        -copy comments  Copy only comment markers.  This setting copies
+                        comments from the source file but discards any other
+                        metadata.
+        -copy all       Copy all extra markers.  This setting preserves
+                        miscellaneous markers found in the source file, such
+                        as JFIF thumbnails, Exif data, and Photoshop settings.
+                        In some files, these extra markers can be sizable.
+                        Note that this option will copy thumbnails as-is;
+                        they will not be transformed.
+The default behavior is -copy comments.  (Note: in IJG releases v6 and v6a,
+jpegtran always did the equivalent of -copy none.)
+
+Additional switches recognized by jpegtran are:
+        -outfile filename
+        -maxmemory N
+        -verbose
+        -debug
+These work the same as in cjpeg or djpeg.
+
+
+THE COMMENT UTILITIES
+
+The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
+Although the standard doesn't actually define what COM blocks are for, they
+are widely used to hold user-supplied text strings.  This lets you add
+annotations, titles, index terms, etc. to your JPEG files, and later retrieve
+them as text.  COM blocks do not interfere with the image stored in the JPEG
+file.  The maximum size of a COM block is 64K, but you can have as many of
+them as you like in one JPEG file.
+
+We provide two utility programs to display COM block contents and add COM
+blocks to a JPEG file.
+
+rdjpgcom searches a JPEG file and prints the contents of any COM blocks on
+standard output.  The command line syntax is
+        rdjpgcom [-raw] [-verbose] [inputfilename]
+The switch "-raw" (or just "-r") causes rdjpgcom to output non-printable
+characters in JPEG comments.  These characters are normally escaped for
+security reasons.
+The switch "-verbose" (or just "-v") causes rdjpgcom to also display the JPEG
+image dimensions.  If you omit the input file name from the command line,
+the JPEG file is read from standard input.  (This may not work on some
+operating systems, if binary data can't be read from stdin.)
+
+wrjpgcom adds a COM block, containing text you provide, to a JPEG file.
+Ordinarily, the COM block is added after any existing COM blocks, but you
+can delete the old COM blocks if you wish.  wrjpgcom produces a new JPEG
+file; it does not modify the input file.  DO NOT try to overwrite the input
+file by directing wrjpgcom's output back into it; on most systems this will
+just destroy your file.
+
+The command line syntax for wrjpgcom is similar to cjpeg's.  On Unix-like
+systems, it is
+        wrjpgcom [switches] [inputfilename]
+The output file is written to standard output.  The input file comes from
+the named file, or from standard input if no input file is named.
+
+On most non-Unix systems, the syntax is
+        wrjpgcom [switches] inputfilename outputfilename
+where both input and output file names must be given explicitly.
+
+wrjpgcom understands three switches:
+        -replace                 Delete any existing COM blocks from the file.
+        -comment "Comment text"  Supply new COM text on command line.
+        -cfile name              Read text for new COM block from named file.
+(Switch names can be abbreviated.)  If you have only one line of comment text
+to add, you can provide it on the command line with -comment.  The comment
+text must be surrounded with quotes so that it is treated as a single
+argument.  Longer comments can be read from a text file.
+
+If you give neither -comment nor -cfile, then wrjpgcom will read the comment
+text from standard input.  (In this case an input image file name MUST be
+supplied, so that the source JPEG file comes from somewhere else.)  You can
+enter multiple lines, up to 64KB worth.  Type an end-of-file indicator
+(usually control-D or control-Z) to terminate the comment text entry.
+
+wrjpgcom will not add a COM block if the provided comment string is empty.
+Therefore -replace -comment "" can be used to delete all COM blocks from a
+file.
+
+These utility programs do not depend on the IJG JPEG library.  In
+particular, the source code for rdjpgcom is intended as an illustration of
+the minimum amount of code required to parse a JPEG file header correctly.
diff --git a/external/jpeg/win/gcc/projectTargets-release.cmake.in b/external/jpeg/win/gcc/projectTargets-release.cmake.in
new file mode 100644
index 000000000000..1e1a8a34aff6
--- /dev/null
+++ b/external/jpeg/win/gcc/projectTargets-release.cmake.in
@@ -0,0 +1,49 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "@CMAKE_PROJECT_NAME@::jpeg" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::jpeg APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg PROPERTIES
+  IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/libjpeg.dll.a"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/libjpeg-62.dll"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::jpeg )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::jpeg "${_IMPORT_PREFIX}/lib/libjpeg.dll.a" "${_IMPORT_PREFIX}/bin/libjpeg-62.dll" )
+
+# Import target "@CMAKE_PROJECT_NAME@::turbojpeg" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::turbojpeg APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg PROPERTIES
+  IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/libturbojpeg.dll.a"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/libturbojpeg.dll"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::turbojpeg )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::turbojpeg "${_IMPORT_PREFIX}/lib/libturbojpeg.dll.a" "${_IMPORT_PREFIX}/bin/libturbojpeg.dll" )
+
+# Import target "@CMAKE_PROJECT_NAME@::turbojpeg-static" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::turbojpeg-static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg-static PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "ASM_NASM;C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libturbojpeg.a"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::turbojpeg-static )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::turbojpeg-static "${_IMPORT_PREFIX}/lib/libturbojpeg.a" )
+
+# Import target "@CMAKE_PROJECT_NAME@::jpeg-static" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::jpeg-static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg-static PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "ASM_NASM;C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/libjpeg.a"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::jpeg-static )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::jpeg-static "${_IMPORT_PREFIX}/lib/libjpeg.a" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/external/jpeg/win/jconfig.h.in b/external/jpeg/win/jconfig.h.in
index 6db0b345b2df..13cceef01d13 100644
--- a/external/jpeg/win/jconfig.h.in
+++ b/external/jpeg/win/jconfig.h.in
@@ -18,7 +18,6 @@
 #define HAVE_UNSIGNED_SHORT
 #undef INCOMPLETE_TYPES_BROKEN
 #undef RIGHT_SHIFT_IS_UNSIGNED
-#undef __CHAR_UNSIGNED__
 
 /* Define "boolean" as unsigned char, not int, per Windows custom */
 #ifndef __RPCNDR_H__            /* don't conflict if rpcndr.h already read */
diff --git a/external/jpeg/win/projectTargets.cmake.in b/external/jpeg/win/projectTargets.cmake.in
new file mode 100644
index 000000000000..05ab4984db1f
--- /dev/null
+++ b/external/jpeg/win/projectTargets.cmake.in
@@ -0,0 +1,115 @@
+# Generated by CMake
+
+if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.5)
+   message(FATAL_ERROR "CMake >= 2.6.0 required")
+endif()
+cmake_policy(PUSH)
+cmake_policy(VERSION 2.6)
+#----------------------------------------------------------------
+# Generated CMake target import file.
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Protect against multiple inclusion, which would fail when already imported targets are added once more.
+set(_targetsDefined)
+set(_targetsNotDefined)
+set(_expectedTargets)
+foreach(_expectedTarget @CMAKE_PROJECT_NAME@::jpeg @CMAKE_PROJECT_NAME@::turbojpeg @CMAKE_PROJECT_NAME@::turbojpeg-static @CMAKE_PROJECT_NAME@::jpeg-static)
+  list(APPEND _expectedTargets ${_expectedTarget})
+  if(NOT TARGET ${_expectedTarget})
+    list(APPEND _targetsNotDefined ${_expectedTarget})
+  endif()
+  if(TARGET ${_expectedTarget})
+    list(APPEND _targetsDefined ${_expectedTarget})
+  endif()
+endforeach()
+if("${_targetsDefined}" STREQUAL "${_expectedTargets}")
+  unset(_targetsDefined)
+  unset(_targetsNotDefined)
+  unset(_expectedTargets)
+  set(CMAKE_IMPORT_FILE_VERSION)
+  cmake_policy(POP)
+  return()
+endif()
+if(NOT "${_targetsDefined}" STREQUAL "")
+  message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_targetsDefined}\nTargets not yet defined: ${_targetsNotDefined}\n")
+endif()
+unset(_targetsDefined)
+unset(_targetsNotDefined)
+unset(_expectedTargets)
+
+
+# Compute the installation prefix relative to this file.
+get_filename_component(_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+get_filename_component(_IMPORT_PREFIX "${_IMPORT_PREFIX}" PATH)
+if(_IMPORT_PREFIX STREQUAL "/")
+  set(_IMPORT_PREFIX "")
+endif()
+
+# Create imported target @CMAKE_PROJECT_NAME@::jpeg
+add_library(@CMAKE_PROJECT_NAME@::jpeg SHARED IMPORTED)
+
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Create imported target @CMAKE_PROJECT_NAME@::turbojpeg
+add_library(@CMAKE_PROJECT_NAME@::turbojpeg SHARED IMPORTED)
+
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Create imported target @CMAKE_PROJECT_NAME@::turbojpeg-static
+add_library(@CMAKE_PROJECT_NAME@::turbojpeg-static STATIC IMPORTED)
+
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg-static PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Create imported target @CMAKE_PROJECT_NAME@::jpeg-static
+add_library(@CMAKE_PROJECT_NAME@::jpeg-static STATIC IMPORTED)
+
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg-static PROPERTIES
+  INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
+)
+
+# Load information for each installed configuration.
+get_filename_component(_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+file(GLOB CONFIG_FILES "${_DIR}/@CMAKE_PROJECT_NAME@Targets-*.cmake")
+foreach(f ${CONFIG_FILES})
+  include(${f})
+endforeach()
+
+# Cleanup temporary variables.
+set(_IMPORT_PREFIX)
+
+# Loop over all imported files and verify that they actually exist
+foreach(target ${_IMPORT_CHECK_TARGETS} )
+  foreach(file ${_IMPORT_CHECK_FILES_FOR_${target}} )
+    if(NOT EXISTS "${file}" )
+      message(FATAL_ERROR "The imported target \"${target}\" references the file
+   \"${file}\"
+but this file does not exist.  Possible reasons include:
+* The file was deleted, renamed, or moved to another location.
+* An install or uninstall procedure did not complete successfully.
+* The installation package was faulty and contained
+   \"${CMAKE_CURRENT_LIST_FILE}\"
+but not all the files it references.
+")
+    endif()
+  endforeach()
+  unset(_IMPORT_CHECK_FILES_FOR_${target})
+endforeach()
+unset(_IMPORT_CHECK_TARGETS)
+
+# This file does not depend on other imported targets which have
+# been exported from the same project but in a separate export set.
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
+cmake_policy(POP)
diff --git a/external/jpeg/win/vc/projectTargets-release.cmake.in b/external/jpeg/win/vc/projectTargets-release.cmake.in
new file mode 100644
index 000000000000..7abb281b70cf
--- /dev/null
+++ b/external/jpeg/win/vc/projectTargets-release.cmake.in
@@ -0,0 +1,49 @@
+#----------------------------------------------------------------
+# Generated CMake target import file for configuration "Release".
+#----------------------------------------------------------------
+
+# Commands may need to know the format version.
+set(CMAKE_IMPORT_FILE_VERSION 1)
+
+# Import target "@CMAKE_PROJECT_NAME@::jpeg" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::jpeg APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg PROPERTIES
+  IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/jpeg.lib"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/jpeg62.dll"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::jpeg )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::jpeg "${_IMPORT_PREFIX}/lib/jpeg.lib" "${_IMPORT_PREFIX}/bin/jpeg62.dll" )
+
+# Import target "@CMAKE_PROJECT_NAME@::turbojpeg" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::turbojpeg APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg PROPERTIES
+  IMPORTED_IMPLIB_RELEASE "${_IMPORT_PREFIX}/lib/turbojpeg.lib"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/bin/turbojpeg.dll"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::turbojpeg )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::turbojpeg "${_IMPORT_PREFIX}/lib/turbojpeg.lib" "${_IMPORT_PREFIX}/bin/turbojpeg.dll" )
+
+# Import target "@CMAKE_PROJECT_NAME@::turbojpeg-static" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::turbojpeg-static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::turbojpeg-static PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "ASM_NASM;C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/turbojpeg-static.lib"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::turbojpeg-static )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::turbojpeg-static "${_IMPORT_PREFIX}/lib/turbojpeg-static.lib" )
+
+# Import target "@CMAKE_PROJECT_NAME@::jpeg-static" for configuration "Release"
+set_property(TARGET @CMAKE_PROJECT_NAME@::jpeg-static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE)
+set_target_properties(@CMAKE_PROJECT_NAME@::jpeg-static PROPERTIES
+  IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "ASM_NASM;C"
+  IMPORTED_LOCATION_RELEASE "${_IMPORT_PREFIX}/lib/jpeg-static.lib"
+  )
+
+list(APPEND _IMPORT_CHECK_TARGETS @CMAKE_PROJECT_NAME@::jpeg-static )
+list(APPEND _IMPORT_CHECK_FILES_FOR_@CMAKE_PROJECT_NAME@::jpeg-static "${_IMPORT_PREFIX}/lib/jpeg-static.lib" )
+
+# Commands beyond this point should not need to know the version.
+set(CMAKE_IMPORT_FILE_VERSION)
diff --git a/external/jpeg/wizard.txt b/external/jpeg/wizard.txt
new file mode 100644
index 000000000000..c57fe38a542d
--- /dev/null
+++ b/external/jpeg/wizard.txt
@@ -0,0 +1,212 @@
+Advanced usage instructions for the Independent JPEG Group's JPEG software
+==========================================================================
+
+This file describes cjpeg's "switches for wizards".
+
+The "wizard" switches are intended for experimentation with JPEG by persons
+who are reasonably knowledgeable about the JPEG standard.  If you don't know
+what you are doing, DON'T USE THESE SWITCHES.  You'll likely produce files
+with worse image quality and/or poorer compression than you'd get from the
+default settings.  Furthermore, these switches must be used with caution
+when making files intended for general use, because not all JPEG decoders
+will support unusual JPEG parameter settings.
+
+
+Quantization Table Adjustment
+-----------------------------
+
+Ordinarily, cjpeg starts with a default set of tables (the same ones given
+as examples in the JPEG standard) and scales them up or down according to
+the -quality setting.  The details of the scaling algorithm can be found in
+jcparam.c.  At very low quality settings, some quantization table entries
+can get scaled up to values exceeding 255.  Although 2-byte quantization
+values are supported by the IJG software, this feature is not in baseline
+JPEG and is not supported by all implementations.  If you need to ensure
+wide compatibility of low-quality files, you can constrain the scaled
+quantization values to no more than 255 by giving the -baseline switch.
+Note that use of -baseline will result in poorer quality for the same file
+size, since more bits than necessary are expended on higher AC coefficients.
+
+You can substitute a different set of quantization values by using the
+-qtables switch:
+
+        -qtables file   Use the quantization tables given in the named file.
+
+The specified file should be a text file containing decimal quantization
+values.  The file should contain one to four tables, each of 64 elements.
+The tables are implicitly numbered 0,1,etc. in order of appearance.  Table
+entries appear in normal array order (NOT in the zigzag order in which they
+will be stored in the JPEG file).
+
+Quantization table files are free format, in that arbitrary whitespace can
+appear between numbers.  Also, comments can be included: a comment starts
+with '#' and extends to the end of the line.  Here is an example file that
+duplicates the default quantization tables:
+
+        # Quantization tables given in Annex K (Clause K.1) of
+        # Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
+
+        # This is table 0 (the luminance table):
+          16  11  10  16  24  40  51  61
+          12  12  14  19  26  58  60  55
+          14  13  16  24  40  57  69  56
+          14  17  22  29  51  87  80  62
+          18  22  37  56  68 109 103  77
+          24  35  55  64  81 104 113  92
+          49  64  78  87 103 121 120 101
+          72  92  95  98 112 100 103  99
+
+        # This is table 1 (the chrominance table):
+          17  18  24  47  99  99  99  99
+          18  21  26  66  99  99  99  99
+          24  26  56  99  99  99  99  99
+          47  66  99  99  99  99  99  99
+          99  99  99  99  99  99  99  99
+          99  99  99  99  99  99  99  99
+          99  99  99  99  99  99  99  99
+          99  99  99  99  99  99  99  99
+
+If the -qtables switch is used without -quality, then the specified tables
+are used exactly as-is.  If both -qtables and -quality are used, then the
+tables taken from the file are scaled in the same fashion that the default
+tables would be scaled for that quality setting.  If -baseline appears, then
+the quantization values are constrained to the range 1-255.
+
+By default, cjpeg will use quantization table 0 for luminance components and
+table 1 for chrominance components.  To override this choice, use the -qslots
+switch:
+
+        -qslots N[,...]         Select which quantization table to use for
+                                each color component.
+
+The -qslots switch specifies a quantization table number for each color
+component, in the order in which the components appear in the JPEG SOF marker.
+For example, to create a separate table for each of Y,Cb,Cr, you could
+provide a -qtables file that defines three quantization tables and say
+"-qslots 0,1,2".  If -qslots gives fewer table numbers than there are color
+components, then the last table number is repeated as necessary.
+
+
+Sampling Factor Adjustment
+--------------------------
+
+By default, cjpeg uses 2:1 horizontal and vertical downsampling when
+compressing YCbCr data, and no downsampling for all other color spaces.
+You can override this default with the -sample switch:
+
+        -sample HxV[,...]       Set JPEG sampling factors for each color
+                                component.
+
+The -sample switch specifies the JPEG sampling factors for each color
+component, in the order in which they appear in the JPEG SOF marker.
+If you specify fewer HxV pairs than there are components, the remaining
+components are set to 1x1 sampling.  For example, the default YCbCr setting
+is equivalent to "-sample 2x2,1x1,1x1", which can be abbreviated to
+"-sample 2x2".
+
+There are still some JPEG decoders in existence that support only 2x1
+sampling (also called 4:2:2 sampling).  Compatibility with such decoders can
+be achieved by specifying "-sample 2x1".  This is not recommended unless
+really necessary, since it increases file size and encoding/decoding time
+with very little quality gain.
+
+
+Multiple Scan / Progression Control
+-----------------------------------
+
+By default, cjpeg emits a single-scan sequential JPEG file.  The
+-progressive switch generates a progressive JPEG file using a default series
+of progression parameters.  You can create multiple-scan sequential JPEG
+files or progressive JPEG files with custom progression parameters by using
+the -scans switch:
+
+        -scans file     Use the scan sequence given in the named file.
+
+The specified file should be a text file containing a "scan script".
+The script specifies the contents and ordering of the scans to be emitted.
+Each entry in the script defines one scan.  A scan definition specifies
+the components to be included in the scan, and for progressive JPEG it also
+specifies the progression parameters Ss,Se,Ah,Al for the scan.  Scan
+definitions are separated by semicolons (';').  A semicolon after the last
+scan definition is optional.
+
+Each scan definition contains one to four component indexes, optionally
+followed by a colon (':') and the four progressive-JPEG parameters.  The
+component indexes denote which color component(s) are to be transmitted in
+the scan.  Components are numbered in the order in which they appear in the
+JPEG SOF marker, with the first component being numbered 0.  (Note that these
+indexes are not the "component ID" codes assigned to the components, just
+positional indexes.)
+
+The progression parameters for each scan are:
+        Ss      Zigzag index of first coefficient included in scan
+        Se      Zigzag index of last coefficient included in scan
+        Ah      Zero for first scan of a coefficient, else Al of prior scan
+        Al      Successive approximation low bit position for scan
+If the progression parameters are omitted, the values 0,63,0,0 are used,
+producing a sequential JPEG file.  cjpeg automatically determines whether
+the script represents a progressive or sequential file, by observing whether
+Ss and Se values other than 0 and 63 appear.  (The -progressive switch is
+not needed to specify this; in fact, it is ignored when -scans appears.)
+The scan script must meet the JPEG restrictions on progression sequences.
+(cjpeg checks that the spec's requirements are obeyed.)
+
+Scan script files are free format, in that arbitrary whitespace can appear
+between numbers and around punctuation.  Also, comments can be included: a
+comment starts with '#' and extends to the end of the line.  For additional
+legibility, commas or dashes can be placed between values.  (Actually, any
+single punctuation character other than ':' or ';' can be inserted.)  For
+example, the following two scan definitions are equivalent:
+        0 1 2: 0 63 0 0;
+        0,1,2 : 0-63, 0,0 ;
+
+Here is an example of a scan script that generates a partially interleaved
+sequential JPEG file:
+
+        0;                      # Y only in first scan
+        1 2;                    # Cb and Cr in second scan
+
+Here is an example of a progressive scan script using only spectral selection
+(no successive approximation):
+
+        # Interleaved DC scan for Y,Cb,Cr:
+        0,1,2: 0-0,   0, 0 ;
+        # AC scans:
+        0:     1-2,   0, 0 ;    # First two Y AC coefficients
+        0:     3-5,   0, 0 ;    # Three more
+        1:     1-63,  0, 0 ;    # All AC coefficients for Cb
+        2:     1-63,  0, 0 ;    # All AC coefficients for Cr
+        0:     6-9,   0, 0 ;    # More Y coefficients
+        0:     10-63, 0, 0 ;    # Remaining Y coefficients
+
+Here is an example of a successive-approximation script.  This is equivalent
+to the default script used by "cjpeg -progressive" for YCbCr images:
+
+        # Initial DC scan for Y,Cb,Cr (lowest bit not sent)
+        0,1,2: 0-0,   0, 1 ;
+        # First AC scan: send first 5 Y AC coefficients, minus 2 lowest bits:
+        0:     1-5,   0, 2 ;
+        # Send all Cr,Cb AC coefficients, minus lowest bit:
+        # (chroma data is usually too small to be worth subdividing further;
+        #  but note we send Cr first since eye is least sensitive to Cb)
+        2:     1-63,  0, 1 ;
+        1:     1-63,  0, 1 ;
+        # Send remaining Y AC coefficients, minus 2 lowest bits:
+        0:     6-63,  0, 2 ;
+        # Send next-to-lowest bit of all Y AC coefficients:
+        0:     1-63,  2, 1 ;
+        # At this point we've sent all but the lowest bit of all coefficients.
+        # Send lowest bit of DC coefficients
+        0,1,2: 0-0,   1, 0 ;
+        # Send lowest bit of AC coefficients
+        2:     1-63,  1, 0 ;
+        1:     1-63,  1, 0 ;
+        # Y AC lowest bit scan is last; it's usually the largest scan
+        0:     1-63,  1, 0 ;
+
+It may be worth pointing out that this script is tuned for quality settings
+of around 50 to 75.  For lower quality settings, you'd probably want to use
+a script with fewer stages of successive approximation (otherwise the
+initial scans will be really bad).  For higher quality settings, you might
+want to use more stages of successive approximation (so that the initial
+scans are not too large).
diff --git a/external/jpeg/wrbmp.c b/external/jpeg/wrbmp.c
index 239f64eb3c3f..408a722a1001 100644
--- a/external/jpeg/wrbmp.c
+++ b/external/jpeg/wrbmp.c
@@ -141,7 +141,6 @@ put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
     }
   } else if (cinfo->out_color_space == JCS_CMYK) {
     for (col = cinfo->output_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       JSAMPLE c = *inptr++, m = *inptr++, y = *inptr++, k = *inptr++;
       cmyk_to_rgb(c, m, y, k, outptr + 2, outptr + 1, outptr);
       outptr += 3;
@@ -153,7 +152,6 @@ put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
     register int ps = rgb_pixelsize[cinfo->out_color_space];
 
     for (col = cinfo->output_width; col > 0; col--) {
-      /* can omit GETJSAMPLE() safely */
       outptr[0] = inptr[bindex];
       outptr[1] = inptr[gindex];
       outptr[2] = inptr[rindex];
@@ -372,18 +370,18 @@ write_colormap(j_decompress_ptr cinfo, bmp_dest_ptr dest, int map_colors,
     if (cinfo->out_color_components == 3) {
       /* Normal case with RGB colormap */
       for (i = 0; i < num_colors; i++) {
-        putc(GETJSAMPLE(colormap[2][i]), outfile);
-        putc(GETJSAMPLE(colormap[1][i]), outfile);
-        putc(GETJSAMPLE(colormap[0][i]), outfile);
+        putc(colormap[2][i], outfile);
+        putc(colormap[1][i], outfile);
+        putc(colormap[0][i], outfile);
         if (map_entry_size == 4)
           putc(0, outfile);
       }
     } else {
       /* Grayscale colormap (only happens with grayscale quantization) */
       for (i = 0; i < num_colors; i++) {
-        putc(GETJSAMPLE(colormap[0][i]), outfile);
-        putc(GETJSAMPLE(colormap[0][i]), outfile);
-        putc(GETJSAMPLE(colormap[0][i]), outfile);
+        putc(colormap[0][i], outfile);
+        putc(colormap[0][i], outfile);
+        putc(colormap[0][i], outfile);
         if (map_entry_size == 4)
           putc(0, outfile);
       }
@@ -438,7 +436,6 @@ finish_output_bmp(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
   JSAMPARRAY image_ptr;
   register JSAMPROW data_ptr;
   JDIMENSION row;
-  register JDIMENSION col;
   cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
 
   if (dest->use_inversion_array) {
@@ -459,10 +456,7 @@ finish_output_bmp(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
         ((j_common_ptr)cinfo, dest->whole_image, row - 1, (JDIMENSION)1,
          FALSE);
       data_ptr = image_ptr[0];
-      for (col = dest->row_width; col > 0; col--) {
-        putc(GETJSAMPLE(*data_ptr), outfile);
-        data_ptr++;
-      }
+      (void)JFWRITE(outfile, data_ptr, dest->row_width);
     }
     if (progress != NULL)
       progress->completed_extra_passes++;
diff --git a/external/jpeg/wrgif.c b/external/jpeg/wrgif.c
index 1804e0bb39c4..82a24291d5e1 100644
--- a/external/jpeg/wrgif.c
+++ b/external/jpeg/wrgif.c
@@ -3,6 +3,7 @@
  *
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1997, Thomas G. Lane.
+ * Modified 2015-2019 by Guido Vollbeding.
  * libjpeg-turbo Modifications:
  * Copyright (C) 2015, 2017, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
@@ -10,12 +11,6 @@
  *
  * This file contains routines to write output images in GIF format.
  *
- **************************************************************************
- * NOTE: to avoid entanglements with Unisys' patent on LZW compression,   *
- * this code has been modified to output "uncompressed GIF" files.        *
- * There is no trace of the LZW algorithm in this file.                   *
- **************************************************************************
- *
  * These routines may need modification for non-Unix environments or
  * specialized applications.  As they stand, they assume output to
  * an ordinary stdio stream.
@@ -33,11 +28,6 @@
  *    copyright notice and this permission notice appear in supporting
  *    documentation.  This software is provided "as is" without express or
  *    implied warranty.
- *
- * We are also required to state that
- *    "The Graphics Interchange Format(c) is the Copyright property of
- *    CompuServe Incorporated. GIF(sm) is a Service Mark property of
- *    CompuServe Incorporated."
  */
 
 #include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
@@ -45,6 +35,37 @@
 #ifdef GIF_SUPPORTED
 
 
+#define MAX_LZW_BITS     12     /* maximum LZW code size (4096 symbols) */
+
+typedef INT16 code_int;         /* must hold -1 .. 2**MAX_LZW_BITS */
+
+#define LZW_TABLE_SIZE   ((code_int)1 << MAX_LZW_BITS)
+
+#define HSIZE            5003   /* hash table size for 80% occupancy */
+
+typedef int hash_int;           /* must hold -2*HSIZE..2*HSIZE */
+
+#define MAXCODE(n_bits)  (((code_int)1 << (n_bits)) - 1)
+
+
+/*
+ * The LZW hash table consists of two parallel arrays:
+ *   hash_code[i]       code of symbol in slot i, or 0 if empty slot
+ *   hash_value[i]      symbol's value; undefined if empty slot
+ * where slot values (i) range from 0 to HSIZE-1.  The symbol value is
+ * its prefix symbol's code concatenated with its suffix character.
+ *
+ * Algorithm:  use open addressing double hashing (no chaining) on the
+ * prefix code / suffix character combination.  We do a variant of Knuth's
+ * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
+ * secondary probe.
+ */
+
+typedef int hash_entry;         /* must hold (code_int << 8) | byte */
+
+#define HASH_ENTRY(prefix, suffix)  ((((hash_entry)(prefix)) << 8) | (suffix))
+
+
 /* Private version of data destination object */
 
 typedef struct {
@@ -54,14 +75,24 @@ typedef struct {
 
   /* State for packing variable-width codes into a bitstream */
   int n_bits;                   /* current number of bits/code */
-  int maxcode;                  /* maximum code, given n_bits */
-  long cur_accum;               /* holds bits not yet output */
+  code_int maxcode;             /* maximum code, given n_bits */
+  int init_bits;                /* initial n_bits ... restored after clear */
+  int cur_accum;                /* holds bits not yet output */
   int cur_bits;                 /* # of bits in cur_accum */
 
+  /* LZW string construction */
+  code_int waiting_code;        /* symbol not yet output; may be extendable */
+  boolean first_byte;           /* if TRUE, waiting_code is not valid */
+
   /* State for GIF code assignment */
-  int ClearCode;                /* clear code (doesn't change) */
-  int EOFCode;                  /* EOF code (ditto) */
-  int code_counter;             /* counts output symbols */
+  code_int ClearCode;           /* clear code (doesn't change) */
+  code_int EOFCode;             /* EOF code (ditto) */
+  code_int free_code;           /* LZW: first not-yet-used symbol code */
+  code_int code_counter;        /* not LZW: counts output symbols */
+
+  /* LZW hash table */
+  code_int *hash_code;          /* => hash table of symbol codes */
+  hash_entry *hash_value;       /* => hash table of symbol values */
 
   /* GIF data packet construction buffer */
   int bytesinpkt;               /* # of bytes in current packet */
@@ -71,9 +102,6 @@ typedef struct {
 
 typedef gif_dest_struct *gif_dest_ptr;
 
-/* Largest value that will fit in N bits */
-#define MAXCODE(n_bits)  ((1 << (n_bits)) - 1)
-
 
 /*
  * Routines to package finished data bytes into GIF data blocks.
@@ -105,7 +133,7 @@ flush_packet(gif_dest_ptr dinfo)
 /* Routine to convert variable-width codes into a byte stream */
 
 LOCAL(void)
-output(gif_dest_ptr dinfo, int code)
+output(gif_dest_ptr dinfo, code_int code)
 /* Emit a code of n_bits bits */
 /* Uses cur_accum and cur_bits to reblock into 8-bit bytes */
 {
@@ -117,74 +145,76 @@ output(gif_dest_ptr dinfo, int code)
     dinfo->cur_accum >>= 8;
     dinfo->cur_bits -= 8;
   }
+
+  /*
+   * If the next entry is going to be too big for the code size,
+   * then increase it, if possible.  We do this here to ensure
+   * that it's done in sync with the decoder's codesize increases.
+   */
+  if (dinfo->free_code > dinfo->maxcode) {
+    dinfo->n_bits++;
+    if (dinfo->n_bits == MAX_LZW_BITS)
+      dinfo->maxcode = LZW_TABLE_SIZE; /* free_code will never exceed this */
+    else
+      dinfo->maxcode = MAXCODE(dinfo->n_bits);
+  }
 }
 
 
-/* The pseudo-compression algorithm.
- *
- * In this module we simply output each pixel value as a separate symbol;
- * thus, no compression occurs.  In fact, there is expansion of one bit per
- * pixel, because we use a symbol width one bit wider than the pixel width.
- *
- * GIF ordinarily uses variable-width symbols, and the decoder will expect
- * to ratchet up the symbol width after a fixed number of symbols.
- * To simplify the logic and keep the expansion penalty down, we emit a
- * GIF Clear code to reset the decoder just before the width would ratchet up.
- * Thus, all the symbols in the output file will have the same bit width.
- * Note that emitting the Clear codes at the right times is a mere matter of
- * counting output symbols and is in no way dependent on the LZW patent.
- *
- * With a small basic pixel width (low color count), Clear codes will be
- * needed very frequently, causing the file to expand even more.  So this
- * simplistic approach wouldn't work too well on bilevel images, for example.
- * But for output of JPEG conversions the pixel width will usually be 8 bits
- * (129 to 256 colors), so the overhead added by Clear symbols is only about
- * one symbol in every 256.
- */
+/* Compression initialization & termination */
+
+
+LOCAL(void)
+clear_hash(gif_dest_ptr dinfo)
+/* Fill the hash table with empty entries */
+{
+  /* It's sufficient to zero hash_code[] */
+  MEMZERO(dinfo->hash_code, HSIZE * sizeof(code_int));
+}
+
+
+LOCAL(void)
+clear_block(gif_dest_ptr dinfo)
+/* Reset compressor and issue a Clear code */
+{
+  clear_hash(dinfo);                    /* delete all the symbols */
+  dinfo->free_code = dinfo->ClearCode + 2;
+  output(dinfo, dinfo->ClearCode);      /* inform decoder */
+  dinfo->n_bits = dinfo->init_bits;     /* reset code size */
+  dinfo->maxcode = MAXCODE(dinfo->n_bits);
+}
+
 
 LOCAL(void)
 compress_init(gif_dest_ptr dinfo, int i_bits)
-/* Initialize pseudo-compressor */
+/* Initialize compressor */
 {
   /* init all the state variables */
-  dinfo->n_bits = i_bits;
+  dinfo->n_bits = dinfo->init_bits = i_bits;
   dinfo->maxcode = MAXCODE(dinfo->n_bits);
-  dinfo->ClearCode = (1 << (i_bits - 1));
+  dinfo->ClearCode = ((code_int) 1 << (i_bits - 1));
   dinfo->EOFCode = dinfo->ClearCode + 1;
-  dinfo->code_counter = dinfo->ClearCode + 2;
+  dinfo->code_counter = dinfo->free_code = dinfo->ClearCode + 2;
+  dinfo->first_byte = TRUE;     /* no waiting symbol yet */
   /* init output buffering vars */
   dinfo->bytesinpkt = 0;
   dinfo->cur_accum = 0;
   dinfo->cur_bits = 0;
+  /* clear hash table */
+  if (dinfo->hash_code != NULL)
+    clear_hash(dinfo);
   /* GIF specifies an initial Clear code */
   output(dinfo, dinfo->ClearCode);
 }
 
 
-LOCAL(void)
-compress_pixel(gif_dest_ptr dinfo, int c)
-/* Accept and "compress" one pixel value.
- * The given value must be less than n_bits wide.
- */
-{
-  /* Output the given pixel value as a symbol. */
-  output(dinfo, c);
-  /* Issue Clear codes often enough to keep the reader from ratcheting up
-   * its symbol size.
-   */
-  if (dinfo->code_counter < dinfo->maxcode) {
-    dinfo->code_counter++;
-  } else {
-    output(dinfo, dinfo->ClearCode);
-    dinfo->code_counter = dinfo->ClearCode + 2; /* reset the counter */
-  }
-}
-
-
 LOCAL(void)
 compress_term(gif_dest_ptr dinfo)
 /* Clean up at end */
 {
+  /* Flush out the buffered LZW code */
+  if (!dinfo->first_byte)
+    output(dinfo, dinfo->waiting_code);
   /* Send an EOF code */
   output(dinfo, dinfo->EOFCode);
   /* Flush the bit-packing buffer */
@@ -221,7 +251,7 @@ put_3bytes(gif_dest_ptr dinfo, int val)
 LOCAL(void)
 emit_header(gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
 /* Output the GIF file header, including color map */
-/* If colormap==NULL, synthesize a grayscale colormap */
+/* If colormap == NULL, synthesize a grayscale colormap */
 {
   int BitsPerPixel, ColorMapSize, InitCodeSize, FlagByte;
   int cshift = dinfo->cinfo->data_precision - 8;
@@ -265,12 +295,12 @@ emit_header(gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
       if (colormap != NULL) {
         if (dinfo->cinfo->out_color_space == JCS_RGB) {
           /* Normal case: RGB color map */
-          putc(GETJSAMPLE(colormap[0][i]) >> cshift, dinfo->pub.output_file);
-          putc(GETJSAMPLE(colormap[1][i]) >> cshift, dinfo->pub.output_file);
-          putc(GETJSAMPLE(colormap[2][i]) >> cshift, dinfo->pub.output_file);
+          putc(colormap[0][i] >> cshift, dinfo->pub.output_file);
+          putc(colormap[1][i] >> cshift, dinfo->pub.output_file);
+          putc(colormap[2][i] >> cshift, dinfo->pub.output_file);
         } else {
           /* Grayscale "color map": possible if quantizing grayscale image */
-          put_3bytes(dinfo, GETJSAMPLE(colormap[0][i]) >> cshift);
+          put_3bytes(dinfo, colormap[0][i] >> cshift);
         }
       } else {
         /* Create a grayscale map of num_colors values, range 0..255 */
@@ -278,7 +308,7 @@ emit_header(gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
       }
     } else {
       /* fill out the map to a power of 2 */
-      put_3bytes(dinfo, 0);
+      put_3bytes(dinfo, CENTERJSAMPLE >> cshift);
     }
   }
   /* Write image separator and Image Descriptor */
@@ -292,7 +322,7 @@ emit_header(gif_dest_ptr dinfo, int num_colors, JSAMPARRAY colormap)
   /* Write Initial Code Size byte */
   putc(InitCodeSize, dinfo->pub.output_file);
 
-  /* Initialize for "compression" of image data */
+  /* Initialize for compression of image data */
   compress_init(dinfo, InitCodeSize + 1);
 }
 
@@ -318,17 +348,139 @@ start_output_gif(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
  * In this module rows_supplied will always be 1.
  */
 
+
+/*
+ * The LZW algorithm proper
+ */
+
+METHODDEF(void)
+put_LZW_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                   JDIMENSION rows_supplied)
+{
+  gif_dest_ptr dest = (gif_dest_ptr)dinfo;
+  register JSAMPROW ptr;
+  register JDIMENSION col;
+  code_int c;
+  register hash_int i;
+  register hash_int disp;
+  register hash_entry probe_value;
+
+  ptr = dest->pub.buffer[0];
+  for (col = cinfo->output_width; col > 0; col--) {
+    /* Accept and compress one 8-bit byte */
+    c = (code_int)(*ptr++);
+
+    if (dest->first_byte) {     /* need to initialize waiting_code */
+      dest->waiting_code = c;
+      dest->first_byte = FALSE;
+      continue;
+    }
+
+    /* Probe hash table to see if a symbol exists for
+     * waiting_code followed by c.
+     * If so, replace waiting_code by that symbol and continue.
+     */
+    i = ((hash_int)c << (MAX_LZW_BITS - 8)) + dest->waiting_code;
+    /* i is less than twice 2**MAX_LZW_BITS, therefore less than twice HSIZE */
+    if (i >= HSIZE)
+      i -= HSIZE;
+
+    probe_value = HASH_ENTRY(dest->waiting_code, c);
+
+    if (dest->hash_code[i] == 0) {
+      /* hit empty slot; desired symbol not in table */
+      output(dest, dest->waiting_code);
+      if (dest->free_code < LZW_TABLE_SIZE) {
+        dest->hash_code[i] = dest->free_code++; /* add symbol to hashtable */
+        dest->hash_value[i] = probe_value;
+      } else
+        clear_block(dest);
+      dest->waiting_code = c;
+      continue;
+    }
+    if (dest->hash_value[i] == probe_value) {
+      dest->waiting_code = dest->hash_code[i];
+      continue;
+    }
+
+    if (i == 0)                 /* secondary hash (after G. Knott) */
+      disp = 1;
+    else
+      disp = HSIZE - i;
+    for (;;) {
+      i -= disp;
+      if (i < 0)
+        i += HSIZE;
+      if (dest->hash_code[i] == 0) {
+        /* hit empty slot; desired symbol not in table */
+        output(dest, dest->waiting_code);
+        if (dest->free_code < LZW_TABLE_SIZE) {
+          dest->hash_code[i] = dest->free_code++; /* add symbol to hashtable */
+          dest->hash_value[i] = probe_value;
+        } else
+          clear_block(dest);
+        dest->waiting_code = c;
+        break;
+      }
+      if (dest->hash_value[i] == probe_value) {
+        dest->waiting_code = dest->hash_code[i];
+        break;
+      }
+    }
+  }
+}
+
+
+/*
+ * The pseudo-compression algorithm.
+ *
+ * In this version we simply output each pixel value as a separate symbol;
+ * thus, no compression occurs.  In fact, there is expansion of one bit per
+ * pixel, because we use a symbol width one bit wider than the pixel width.
+ *
+ * GIF ordinarily uses variable-width symbols, and the decoder will expect
+ * to ratchet up the symbol width after a fixed number of symbols.
+ * To simplify the logic and keep the expansion penalty down, we emit a
+ * GIF Clear code to reset the decoder just before the width would ratchet up.
+ * Thus, all the symbols in the output file will have the same bit width.
+ * Note that emitting the Clear codes at the right times is a mere matter of
+ * counting output symbols and is in no way dependent on the LZW algorithm.
+ *
+ * With a small basic pixel width (low color count), Clear codes will be
+ * needed very frequently, causing the file to expand even more.  So this
+ * simplistic approach wouldn't work too well on bilevel images, for example.
+ * But for output of JPEG conversions the pixel width will usually be 8 bits
+ * (129 to 256 colors), so the overhead added by Clear symbols is only about
+ * one symbol in every 256.
+ */
+
 METHODDEF(void)
-put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-               JDIMENSION rows_supplied)
+put_raw_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
+                   JDIMENSION rows_supplied)
 {
   gif_dest_ptr dest = (gif_dest_ptr)dinfo;
   register JSAMPROW ptr;
   register JDIMENSION col;
+  code_int c;
 
   ptr = dest->pub.buffer[0];
   for (col = cinfo->output_width; col > 0; col--) {
-    compress_pixel(dest, GETJSAMPLE(*ptr++));
+    c = (code_int)(*ptr++);
+    /* Accept and output one pixel value.
+     * The given value must be less than n_bits wide.
+     */
+
+    /* Output the given pixel value as a symbol. */
+    output(dest, c);
+    /* Issue Clear codes often enough to keep the reader from ratcheting up
+     * its symbol size.
+     */
+    if (dest->code_counter < dest->maxcode) {
+      dest->code_counter++;
+    } else {
+      output(dest, dest->ClearCode);
+      dest->code_counter = dest->ClearCode + 2; /* reset the counter */
+    }
   }
 }
 
@@ -342,7 +494,7 @@ finish_output_gif(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
 {
   gif_dest_ptr dest = (gif_dest_ptr)dinfo;
 
-  /* Flush "compression" mechanism */
+  /* Flush compression mechanism */
   compress_term(dest);
   /* Write a zero-length data block to end the series */
   putc(0, dest->pub.output_file);
@@ -370,7 +522,7 @@ calc_buffer_dimensions_gif(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
  */
 
 GLOBAL(djpeg_dest_ptr)
-jinit_write_gif(j_decompress_ptr cinfo)
+jinit_write_gif(j_decompress_ptr cinfo, boolean is_lzw)
 {
   gif_dest_ptr dest;
 
@@ -380,7 +532,6 @@ jinit_write_gif(j_decompress_ptr cinfo)
                                 sizeof(gif_dest_struct));
   dest->cinfo = cinfo;          /* make back link for subroutines */
   dest->pub.start_output = start_output_gif;
-  dest->pub.put_pixel_rows = put_pixel_rows;
   dest->pub.finish_output = finish_output_gif;
   dest->pub.calc_buffer_dimensions = calc_buffer_dimensions_gif;
 
@@ -407,6 +558,22 @@ jinit_write_gif(j_decompress_ptr cinfo)
     ((j_common_ptr)cinfo, JPOOL_IMAGE, cinfo->output_width, (JDIMENSION)1);
   dest->pub.buffer_height = 1;
 
+  if (is_lzw) {
+    dest->pub.put_pixel_rows = put_LZW_pixel_rows;
+    /* Allocate space for hash table */
+    dest->hash_code = (code_int *)
+      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  HSIZE * sizeof(code_int));
+    dest->hash_value = (hash_entry *)
+      (*cinfo->mem->alloc_large) ((j_common_ptr)cinfo, JPOOL_IMAGE,
+                                  HSIZE * sizeof(hash_entry));
+  } else {
+    dest->pub.put_pixel_rows = put_raw_pixel_rows;
+    /* Mark tables unused */
+    dest->hash_code = NULL;
+    dest->hash_value = NULL;
+  }
+
   return (djpeg_dest_ptr)dest;
 }
 
diff --git a/external/jpeg/wrjpgcom.1 b/external/jpeg/wrjpgcom.1
new file mode 100644
index 000000000000..a255cab8f8c9
--- /dev/null
+++ b/external/jpeg/wrjpgcom.1
@@ -0,0 +1,103 @@
+.TH WRJPGCOM 1 "15 June 1995"
+.SH NAME
+wrjpgcom \- insert text comments into a JPEG file
+.SH SYNOPSIS
+.B wrjpgcom
+[
+.B \-replace
+]
+[
+.BI \-comment " text"
+]
+[
+.BI \-cfile " name"
+]
+[
+.I filename
+]
+.LP
+.SH DESCRIPTION
+.LP
+.B wrjpgcom
+reads the named JPEG/JFIF file, or the standard input if no file is named,
+and generates a new JPEG/JFIF file on standard output.  A comment block is
+added to the file.
+.PP
+The JPEG standard allows "comment" (COM) blocks to occur within a JPEG file.
+Although the standard doesn't actually define what COM blocks are for, they
+are widely used to hold user-supplied text strings.  This lets you add
+annotations, titles, index terms, etc. to your JPEG files, and later retrieve
+them as text.  COM blocks do not interfere with the image stored in the JPEG
+file.  The maximum size of a COM block is 64K, but you can have as many of
+them as you like in one JPEG file.
+.PP
+.B wrjpgcom
+adds a COM block, containing text you provide, to a JPEG file.
+Ordinarily, the COM block is added after any existing COM blocks; but you
+can delete the old COM blocks if you wish.
+.SH OPTIONS
+Switch names may be abbreviated, and are not case sensitive.
+.TP
+.B \-replace
+Delete any existing COM blocks from the file.
+.TP
+.BI \-comment " text"
+Supply text for new COM block on command line.
+.TP
+.BI \-cfile " name"
+Read text for new COM block from named file.
+.PP
+If you have only one line of comment text to add, you can provide it on the
+command line with
+.BR \-comment .
+The comment text must be surrounded with quotes so that it is treated as a
+single argument.  Longer comments can be read from a text file.
+.PP
+If you give neither
+.B \-comment
+nor
+.BR \-cfile ,
+then
+.B wrjpgcom
+will read the comment text from standard input.  (In this case an input image
+file name MUST be supplied, so that the source JPEG file comes from somewhere
+else.)  You can enter multiple lines, up to 64KB worth.  Type an end-of-file
+indicator (usually control-D) to terminate the comment text entry.
+.PP
+.B wrjpgcom
+will not add a COM block if the provided comment string is empty.  Therefore
+\fB\-replace \-comment ""\fR can be used to delete all COM blocks from a file.
+.SH EXAMPLES
+.LP
+Add a short comment to in.jpg, producing out.jpg:
+.IP
+.B wrjpgcom \-c
+\fI"View of my back yard" in.jpg
+.B >
+.I out.jpg
+.PP
+Attach a long comment previously stored in comment.txt:
+.IP
+.B wrjpgcom
+.I in.jpg
+.B <
+.I comment.txt
+.B >
+.I out.jpg
+.PP
+or equivalently
+.IP
+.B wrjpgcom
+.B \-cfile
+.I comment.txt
+.B <
+.I in.jpg
+.B >
+.I out.jpg
+.SH SEE ALSO
+.BR cjpeg (1),
+.BR djpeg (1),
+.BR jpegtran (1),
+.BR rdjpgcom (1)
+.SH AUTHOR
+Independent JPEG Group
diff --git a/external/jpeg/wrppm.c b/external/jpeg/wrppm.c
index 8cabaf036a85..3081ec333f81 100644
--- a/external/jpeg/wrppm.c
+++ b/external/jpeg/wrppm.c
@@ -108,17 +108,17 @@ copy_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
   ppm_dest_ptr dest = (ppm_dest_ptr)dinfo;
   register char *bufferptr;
   register JSAMPROW ptr;
-#if BITS_IN_JSAMPLE != 8 || (!defined(HAVE_UNSIGNED_CHAR) && !defined(__CHAR_UNSIGNED__))
+#if BITS_IN_JSAMPLE != 8
   register JDIMENSION col;
 #endif
 
   ptr = dest->pub.buffer[0];
   bufferptr = dest->iobuffer;
-#if BITS_IN_JSAMPLE == 8 && (defined(HAVE_UNSIGNED_CHAR) || defined(__CHAR_UNSIGNED__))
+#if BITS_IN_JSAMPLE == 8
   MEMCOPY(bufferptr, ptr, dest->samples_per_row);
 #else
   for (col = dest->samples_per_row; col > 0; col--) {
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(*ptr++));
+    PUTPPMSAMPLE(bufferptr, *ptr++);
   }
 #endif
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
@@ -200,10 +200,10 @@ put_demapped_rgb(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
   ptr = dest->pub.buffer[0];
   bufferptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    pixval = GETJSAMPLE(*ptr++);
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map0[pixval]));
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map1[pixval]));
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map2[pixval]));
+    pixval = *ptr++;
+    PUTPPMSAMPLE(bufferptr, color_map0[pixval]);
+    PUTPPMSAMPLE(bufferptr, color_map1[pixval]);
+    PUTPPMSAMPLE(bufferptr, color_map2[pixval]);
   }
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
@@ -222,7 +222,7 @@ put_demapped_gray(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
   ptr = dest->pub.buffer[0];
   bufferptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    PUTPPMSAMPLE(bufferptr, GETJSAMPLE(color_map[GETJSAMPLE(*ptr++)]));
+    PUTPPMSAMPLE(bufferptr, color_map[*ptr++]);
   }
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
diff --git a/external/jpeg/wrrle.c b/external/jpeg/wrrle.c
deleted file mode 100644
index 5c98ec060efe..000000000000
--- a/external/jpeg/wrrle.c
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * wrrle.c
- *
- * This file was part of the Independent JPEG Group's software:
- * Copyright (C) 1991-1996, Thomas G. Lane.
- * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
- * For conditions of distribution and use, see the accompanying README.ijg
- * file.
- *
- * This file contains routines to write output images in RLE format.
- * The Utah Raster Toolkit library is required (version 3.1 or later).
- *
- * These routines may need modification for non-Unix environments or
- * specialized applications.  As they stand, they assume output to
- * an ordinary stdio stream.
- *
- * Based on code contributed by Mike Lijewski,
- * with updates from Robert Hutchinson.
- */
-
-#include "cdjpeg.h"             /* Common decls for cjpeg/djpeg applications */
-
-#ifdef RLE_SUPPORTED
-
-/* rle.h is provided by the Utah Raster Toolkit. */
-
-#include <rle.h>
-
-/*
- * We assume that JSAMPLE has the same representation as rle_pixel,
- * to wit, "unsigned char".  Hence we can't cope with 12- or 16-bit samples.
- */
-
-#if BITS_IN_JSAMPLE != 8
-  Sorry, this code only copes with 8-bit JSAMPLEs. /* deliberate syntax err */
-#endif
-
-
-/*
- * Since RLE stores scanlines bottom-to-top, we have to invert the image
- * from JPEG's top-to-bottom order.  To do this, we save the outgoing data
- * in a virtual array during put_pixel_row calls, then actually emit the
- * RLE file during finish_output.
- */
-
-
-/*
- * For now, if we emit an RLE color map then it is always 256 entries long,
- * though not all of the entries need be used.
- */
-
-#define CMAPBITS        8
-#define CMAPLENGTH      (1 << (CMAPBITS))
-
-typedef struct {
-  struct djpeg_dest_struct pub; /* public fields */
-
-  jvirt_sarray_ptr image;       /* virtual array to store the output image */
-  rle_map *colormap;            /* RLE-style color map, or NULL if none */
-  rle_pixel **rle_row;          /* To pass rows to rle_putrow() */
-
-} rle_dest_struct;
-
-typedef rle_dest_struct *rle_dest_ptr;
-
-/* Forward declarations */
-METHODDEF(void) rle_put_pixel_rows(j_decompress_ptr cinfo,
-                                   djpeg_dest_ptr dinfo,
-                                   JDIMENSION rows_supplied);
-
-
-/*
- * Write the file header.
- *
- * In this module it's easier to wait till finish_output to write anything.
- */
-
-METHODDEF(void)
-start_output_rle(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
-{
-  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
-  size_t cmapsize;
-  int i, ci;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  /*
-   * Make sure the image can be stored in RLE format.
-   *
-   * - RLE stores image dimensions as *signed* 16 bit integers.  JPEG
-   *   uses unsigned, so we have to check the width.
-   *
-   * - Colorspace is expected to be grayscale or RGB.
-   *
-   * - The number of channels (components) is expected to be 1 (grayscale/
-   *   pseudocolor) or 3 (truecolor/directcolor).
-   *   (could be 2 or 4 if using an alpha channel, but we aren't)
-   */
-
-  if (cinfo->output_width > 32767 || cinfo->output_height > 32767)
-    ERREXIT2(cinfo, JERR_RLE_DIMENSIONS, cinfo->output_width,
-             cinfo->output_height);
-
-  if (cinfo->out_color_space != JCS_GRAYSCALE &&
-      cinfo->out_color_space != JCS_RGB)
-    ERREXIT(cinfo, JERR_RLE_COLORSPACE);
-
-  if (cinfo->output_components != 1 && cinfo->output_components != 3)
-    ERREXIT1(cinfo, JERR_RLE_TOOMANYCHANNELS, cinfo->num_components);
-
-  /* Convert colormap, if any, to RLE format. */
-
-  dest->colormap = NULL;
-
-  if (cinfo->quantize_colors) {
-    /* Allocate storage for RLE-style cmap, zero any extra entries */
-    cmapsize = cinfo->out_color_components * CMAPLENGTH * sizeof(rle_map);
-    dest->colormap = (rle_map *)(*cinfo->mem->alloc_small)
-      ((j_common_ptr)cinfo, JPOOL_IMAGE, cmapsize);
-    MEMZERO(dest->colormap, cmapsize);
-
-    /* Save away data in RLE format --- note 8-bit left shift! */
-    /* Shifting would need adjustment for JSAMPLEs wider than 8 bits. */
-    for (ci = 0; ci < cinfo->out_color_components; ci++) {
-      for (i = 0; i < cinfo->actual_number_of_colors; i++) {
-        dest->colormap[ci * CMAPLENGTH + i] =
-          GETJSAMPLE(cinfo->colormap[ci][i]) << 8;
-      }
-    }
-  }
-
-  /* Set the output buffer to the first row */
-  dest->pub.buffer = (*cinfo->mem->access_virt_sarray)
-    ((j_common_ptr)cinfo, dest->image, (JDIMENSION)0, (JDIMENSION)1, TRUE);
-  dest->pub.buffer_height = 1;
-
-  dest->pub.put_pixel_rows = rle_put_pixel_rows;
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    progress->total_extra_passes++;  /* count file writing as separate pass */
-  }
-#endif
-}
-
-
-/*
- * Write some pixel data.
- *
- * This routine just saves the data away in a virtual array.
- */
-
-METHODDEF(void)
-rle_put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
-                   JDIMENSION rows_supplied)
-{
-  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
-
-  if (cinfo->output_scanline < cinfo->output_height) {
-    dest->pub.buffer = (*cinfo->mem->access_virt_sarray)
-      ((j_common_ptr)cinfo, dest->image,
-       cinfo->output_scanline, (JDIMENSION)1, TRUE);
-  }
-}
-
-/*
- * Finish up at the end of the file.
- *
- * Here is where we really output the RLE file.
- */
-
-METHODDEF(void)
-finish_output_rle(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
-{
-  rle_dest_ptr dest = (rle_dest_ptr)dinfo;
-  rle_hdr header;               /* Output file information */
-  rle_pixel **rle_row, *red, *green, *blue;
-  JSAMPROW output_row;
-  char cmapcomment[80];
-  int row, col;
-  int ci;
-#ifdef PROGRESS_REPORT
-  cd_progress_ptr progress = (cd_progress_ptr)cinfo->progress;
-#endif
-
-  /* Initialize the header info */
-  header = *rle_hdr_init(NULL);
-  header.rle_file = dest->pub.output_file;
-  header.xmin     = 0;
-  header.xmax     = cinfo->output_width  - 1;
-  header.ymin     = 0;
-  header.ymax     = cinfo->output_height - 1;
-  header.alpha    = 0;
-  header.ncolors  = cinfo->output_components;
-  for (ci = 0; ci < cinfo->output_components; ci++) {
-    RLE_SET_BIT(header, ci);
-  }
-  if (cinfo->quantize_colors) {
-    header.ncmap   = cinfo->out_color_components;
-    header.cmaplen = CMAPBITS;
-    header.cmap    = dest->colormap;
-    /* Add a comment to the output image with the true colormap length. */
-    sprintf(cmapcomment, "color_map_length=%d",
-            cinfo->actual_number_of_colors);
-    rle_putcom(cmapcomment, &header);
-  }
-
-  /* Emit the RLE header and color map (if any) */
-  rle_put_setup(&header);
-
-  /* Now output the RLE data from our virtual array.
-   * We assume here that rle_pixel is represented the same as JSAMPLE.
-   */
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL) {
-    progress->pub.pass_limit = cinfo->output_height;
-    progress->pub.pass_counter = 0;
-    (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-  }
-#endif
-
-  if (cinfo->output_components == 1) {
-    for (row = cinfo->output_height - 1; row >= 0; row--) {
-      rle_row = (rle_pixel **)(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, dest->image,
-         (JDIMENSION)row, (JDIMENSION)1, FALSE);
-      rle_putrow(rle_row, (int)cinfo->output_width, &header);
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-  } else {
-    for (row = cinfo->output_height - 1; row >= 0; row--) {
-      rle_row = (rle_pixel **)dest->rle_row;
-      output_row = *(*cinfo->mem->access_virt_sarray)
-        ((j_common_ptr)cinfo, dest->image,
-         (JDIMENSION)row, (JDIMENSION)1, FALSE);
-      red = rle_row[0];
-      green = rle_row[1];
-      blue = rle_row[2];
-      for (col = cinfo->output_width; col > 0; col--) {
-        *red++ = GETJSAMPLE(*output_row++);
-        *green++ = GETJSAMPLE(*output_row++);
-        *blue++ = GETJSAMPLE(*output_row++);
-      }
-      rle_putrow(rle_row, (int)cinfo->output_width, &header);
-#ifdef PROGRESS_REPORT
-      if (progress != NULL) {
-        progress->pub.pass_counter++;
-        (*progress->pub.progress_monitor) ((j_common_ptr)cinfo);
-      }
-#endif
-    }
-  }
-
-#ifdef PROGRESS_REPORT
-  if (progress != NULL)
-    progress->completed_extra_passes++;
-#endif
-
-  /* Emit file trailer */
-  rle_puteof(&header);
-  fflush(dest->pub.output_file);
-  if (ferror(dest->pub.output_file))
-    ERREXIT(cinfo, JERR_FILE_WRITE);
-}
-
-
-/*
- * The module selection routine for RLE format output.
- */
-
-GLOBAL(djpeg_dest_ptr)
-jinit_write_rle(j_decompress_ptr cinfo)
-{
-  rle_dest_ptr dest;
-
-  /* Create module interface object, fill in method pointers */
-  dest = (rle_dest_ptr)
-    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
-                                sizeof(rle_dest_struct));
-  dest->pub.start_output = start_output_rle;
-  dest->pub.finish_output = finish_output_rle;
-  dest->pub.calc_buffer_dimensions = NULL;
-
-  /* Calculate output image dimensions so we can allocate space */
-  jpeg_calc_output_dimensions(cinfo);
-
-  /* Allocate a work array for output to the RLE library. */
-  dest->rle_row = (*cinfo->mem->alloc_sarray)
-    ((j_common_ptr)cinfo, JPOOL_IMAGE,
-     cinfo->output_width, (JDIMENSION)cinfo->output_components);
-
-  /* Allocate a virtual array to hold the image. */
-  dest->image = (*cinfo->mem->request_virt_sarray)
-    ((j_common_ptr)cinfo, JPOOL_IMAGE, FALSE,
-     (JDIMENSION)(cinfo->output_width * cinfo->output_components),
-     cinfo->output_height, (JDIMENSION)1);
-
-  return (djpeg_dest_ptr)dest;
-}
-
-#endif /* RLE_SUPPORTED */
diff --git a/external/jpeg/wrtarga.c b/external/jpeg/wrtarga.c
index 9dfa9201936d..7a654ff57634 100644
--- a/external/jpeg/wrtarga.c
+++ b/external/jpeg/wrtarga.c
@@ -4,7 +4,7 @@
  * This file was part of the Independent JPEG Group's software:
  * Copyright (C) 1991-1996, Thomas G. Lane.
  * libjpeg-turbo Modifications:
- * Copyright (C) 2017, D. R. Commander.
+ * Copyright (C) 2017, 2019, D. R. Commander.
  * For conditions of distribution and use, see the accompanying README.ijg
  * file.
  *
@@ -102,9 +102,9 @@ put_pixel_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    outptr[0] = (char)GETJSAMPLE(inptr[2]); /* RGB to BGR order */
-    outptr[1] = (char)GETJSAMPLE(inptr[1]);
-    outptr[2] = (char)GETJSAMPLE(inptr[0]);
+    outptr[0] = inptr[2]; /* RGB to BGR order */
+    outptr[1] = inptr[1];
+    outptr[2] = inptr[0];
     inptr += 3, outptr += 3;
   }
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
@@ -118,13 +118,10 @@ put_gray_rows(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
   tga_dest_ptr dest = (tga_dest_ptr)dinfo;
   register JSAMPROW inptr;
   register char *outptr;
-  register JDIMENSION col;
 
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
-  for (col = cinfo->output_width; col > 0; col--) {
-    *outptr++ = (char)GETJSAMPLE(*inptr++);
-  }
+  MEMCOPY(outptr, inptr, cinfo->output_width);
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
 
@@ -147,7 +144,7 @@ put_demapped_gray(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo,
   inptr = dest->pub.buffer[0];
   outptr = dest->iobuffer;
   for (col = cinfo->output_width; col > 0; col--) {
-    *outptr++ = (char)GETJSAMPLE(color_map0[GETJSAMPLE(*inptr++)]);
+    *outptr++ = color_map0[*inptr++];
   }
   (void)JFWRITE(dest->pub.output_file, dest->iobuffer, dest->buffer_width);
 }
@@ -182,9 +179,9 @@ start_output_tga(j_decompress_ptr cinfo, djpeg_dest_ptr dinfo)
       /* Write the colormap.  Note Targa uses BGR byte order */
       outfile = dest->pub.output_file;
       for (i = 0; i < num_colors; i++) {
-        putc(GETJSAMPLE(cinfo->colormap[2][i]), outfile);
-        putc(GETJSAMPLE(cinfo->colormap[1][i]), outfile);
-        putc(GETJSAMPLE(cinfo->colormap[0][i]), outfile);
+        putc(cinfo->colormap[2][i], outfile);
+        putc(cinfo->colormap[1][i], outfile);
+        putc(cinfo->colormap[0][i], outfile);
       }
       dest->pub.put_pixel_rows = put_gray_rows;
     } else {
diff --git a/tools/cocos2d-console/bin/cocos b/tools/cocos2d-console/bin/cocos
index 87df7d883a13..8fbd6ef6cd61 100755
--- a/tools/cocos2d-console/bin/cocos
+++ b/tools/cocos2d-console/bin/cocos
@@ -9,6 +9,7 @@ if hash python2 2>/dev/null; then
 else
     if hash python 2>/dev/null; then
         VERSION="python --version"
+        PYTHON=python
     else
         echo "Python 2+ required."
         exit 1