From 61e2c939a92e2b8c8ed1a2d7f81fe48cc119b61c Mon Sep 17 00:00:00 2001 From: "K. S. Ernest (iFire) Lee" Date: Tue, 30 Apr 2019 06:56:15 -0700 Subject: [PATCH] Add thirdparty library etcpak. --- COPYRIGHT.txt | 5 + thirdparty/README.md | 9 + thirdparty/etcpak/.gitignore | 8 + thirdparty/etcpak/AUTHORS.txt | 2 + thirdparty/etcpak/Application.cpp | 282 ++++ thirdparty/etcpak/Bitmap.cpp | 214 +++ thirdparty/etcpak/Bitmap.hpp | 50 + thirdparty/etcpak/BitmapDownsampled.cpp | 86 ++ thirdparty/etcpak/BitmapDownsampled.hpp | 13 + thirdparty/etcpak/BlockData.cpp | 889 ++++++++++++ thirdparty/etcpak/BlockData.hpp | 50 + thirdparty/etcpak/ColorSpace.cpp | 114 ++ thirdparty/etcpak/ColorSpace.hpp | 36 + thirdparty/etcpak/CpuArch.cpp | 103 ++ thirdparty/etcpak/CpuArch.hpp | 6 + thirdparty/etcpak/DataProvider.cpp | 76 ++ thirdparty/etcpak/DataProvider.hpp | 41 + thirdparty/etcpak/Debug.cpp | 31 + thirdparty/etcpak/Debug.hpp | 27 + thirdparty/etcpak/Dither.cpp | 396 ++++++ thirdparty/etcpak/Dither.hpp | 18 + thirdparty/etcpak/Error.cpp | 48 + thirdparty/etcpak/Error.hpp | 9 + thirdparty/etcpak/LICENSE.txt | 24 + thirdparty/etcpak/Math.hpp | 89 ++ thirdparty/etcpak/MipMap.hpp | 11 + thirdparty/etcpak/ProcessAlpha.cpp | 314 +++++ thirdparty/etcpak/ProcessAlpha.hpp | 8 + thirdparty/etcpak/ProcessAlpha_AVX2.cpp | 223 +++ thirdparty/etcpak/ProcessAlpha_AVX2.hpp | 12 + thirdparty/etcpak/ProcessCommon.hpp | 50 + thirdparty/etcpak/ProcessRGB.cpp | 723 ++++++++++ thirdparty/etcpak/ProcessRGB.hpp | 9 + thirdparty/etcpak/ProcessRGB_AVX2.cpp | 978 +++++++++++++ thirdparty/etcpak/ProcessRGB_AVX2.hpp | 15 + thirdparty/etcpak/README.md | 43 + thirdparty/etcpak/Semaphore.hpp | 46 + thirdparty/etcpak/System.cpp | 68 + thirdparty/etcpak/System.hpp | 15 + thirdparty/etcpak/Tables.cpp | 177 +++ thirdparty/etcpak/Tables.hpp | 32 + thirdparty/etcpak/TaskDispatch.cpp | 113 ++ thirdparty/etcpak/TaskDispatch.hpp | 34 + thirdparty/etcpak/Timing.cpp | 8 + thirdparty/etcpak/Timing.hpp | 8 + thirdparty/etcpak/Vector.hpp | 222 +++ thirdparty/etcpak/bitbucket-pipelines.yml | 12 + thirdparty/etcpak/lz4/lz4.c | 1516 +++++++++++++++++++++ thirdparty/etcpak/lz4/lz4.h | 360 +++++ thirdparty/etcpak/mmap.cpp | 38 + thirdparty/etcpak/mmap.hpp | 19 + 51 files changed, 7680 insertions(+) create mode 100644 thirdparty/etcpak/.gitignore create mode 100644 thirdparty/etcpak/AUTHORS.txt create mode 100644 thirdparty/etcpak/Application.cpp create mode 100644 thirdparty/etcpak/Bitmap.cpp create mode 100644 thirdparty/etcpak/Bitmap.hpp create mode 100644 thirdparty/etcpak/BitmapDownsampled.cpp create mode 100644 thirdparty/etcpak/BitmapDownsampled.hpp create mode 100644 thirdparty/etcpak/BlockData.cpp create mode 100644 thirdparty/etcpak/BlockData.hpp create mode 100644 thirdparty/etcpak/ColorSpace.cpp create mode 100644 thirdparty/etcpak/ColorSpace.hpp create mode 100644 thirdparty/etcpak/CpuArch.cpp create mode 100644 thirdparty/etcpak/CpuArch.hpp create mode 100644 thirdparty/etcpak/DataProvider.cpp create mode 100644 thirdparty/etcpak/DataProvider.hpp create mode 100644 thirdparty/etcpak/Debug.cpp create mode 100644 thirdparty/etcpak/Debug.hpp create mode 100644 thirdparty/etcpak/Dither.cpp create mode 100644 thirdparty/etcpak/Dither.hpp create mode 100644 thirdparty/etcpak/Error.cpp create mode 100644 thirdparty/etcpak/Error.hpp create mode 100644 thirdparty/etcpak/LICENSE.txt create mode 100644 thirdparty/etcpak/Math.hpp create mode 100644 thirdparty/etcpak/MipMap.hpp create mode 100644 thirdparty/etcpak/ProcessAlpha.cpp create mode 100644 
thirdparty/etcpak/ProcessAlpha.hpp
 create mode 100644 thirdparty/etcpak/ProcessAlpha_AVX2.cpp
 create mode 100644 thirdparty/etcpak/ProcessAlpha_AVX2.hpp
 create mode 100644 thirdparty/etcpak/ProcessCommon.hpp
 create mode 100644 thirdparty/etcpak/ProcessRGB.cpp
 create mode 100644 thirdparty/etcpak/ProcessRGB.hpp
 create mode 100644 thirdparty/etcpak/ProcessRGB_AVX2.cpp
 create mode 100644 thirdparty/etcpak/ProcessRGB_AVX2.hpp
 create mode 100644 thirdparty/etcpak/README.md
 create mode 100644 thirdparty/etcpak/Semaphore.hpp
 create mode 100644 thirdparty/etcpak/System.cpp
 create mode 100644 thirdparty/etcpak/System.hpp
 create mode 100644 thirdparty/etcpak/Tables.cpp
 create mode 100644 thirdparty/etcpak/Tables.hpp
 create mode 100644 thirdparty/etcpak/TaskDispatch.cpp
 create mode 100644 thirdparty/etcpak/TaskDispatch.hpp
 create mode 100644 thirdparty/etcpak/Timing.cpp
 create mode 100644 thirdparty/etcpak/Timing.hpp
 create mode 100644 thirdparty/etcpak/Vector.hpp
 create mode 100644 thirdparty/etcpak/bitbucket-pipelines.yml
 create mode 100644 thirdparty/etcpak/lz4/lz4.c
 create mode 100644 thirdparty/etcpak/lz4/lz4.h
 create mode 100644 thirdparty/etcpak/mmap.cpp
 create mode 100644 thirdparty/etcpak/mmap.hpp

diff --git a/COPYRIGHT.txt b/COPYRIGHT.txt
index 5431e2b4034a..4853ed2c70ae 100644
--- a/COPYRIGHT.txt
+++ b/COPYRIGHT.txt
@@ -147,6 +147,11 @@ Comment: Etc2Comp
 Copyright: 2015, Etc2Comp Authors
 License: Apache-2.0
 
+Files: ./thirdparty/etcpak/
+Comment: etcpak
+Copyright: 2013, Bartosz Taudul
+License: BSD-3-clause
+
 Files: ./thirdparty/fonts/DroidSans*.ttf
 Comment: DroidSans font
 Copyright: 2008, The Android Open Source Project
diff --git a/thirdparty/README.md b/thirdparty/README.md
index bc820634bb27..1758c4229727 100644
--- a/thirdparty/README.md
+++ b/thirdparty/README.md
@@ -96,6 +96,15 @@ They are marked with `// -- GODOT start --` and `// -- GODOT end --` comments.
 
+## etcpak
+
+- Upstream: https://bitbucket.org/wolfpld/etcpak/src
+- Version: git (1f69f20, 2018)
+- License: BSD-3-Clause
+
+Important: Contains some Godot-made changes.
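For reference, the encoder vendored here is driven through the `BlockData` API this patch adds. A minimal sketch of a compress/decode round trip, mirroring the `-b` benchmark path in Application.cpp (the file name and the ETC1 type are illustrative; input dimensions are assumed to be multiples of 4):

    // Sketch only; mirrors Application.cpp's benchmark path.
    #include <limits>
    #include <memory>
    #include "Bitmap.hpp"
    #include "BlockData.hpp"

    void CompressExample()
    {
        // Load the image in one chunk (lines = max disables streamed loading).
        auto bmp = std::make_shared<Bitmap>( "input.png", std::numeric_limits<unsigned int>::max() );
        // One 64-bit ETC1 block per 4x4 pixel group, hence x * y / 16 blocks.
        auto bd = std::make_shared<BlockData>( bmp->Size(), false, BlockData::Etc1 );
        bd->Process( bmp->Data(), bmp->Size().x * bmp->Size().y / 16, 0, bmp->Size().x, Channels::RGB, false );
        BitmapPtr out = bd->Decode(); // round trip, e.g. for RMSE/PSNR stats
    }

Alpha is handled either as a separate grayscale block stream (Channels::Alpha) or interleaved with color via ProcessRGBA() in ETC2 RGBA mode, as in Application.cpp.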
+ + ## fonts ### Noto Sans diff --git a/thirdparty/etcpak/.gitignore b/thirdparty/etcpak/.gitignore new file mode 100644 index 000000000000..488e9ad99ab2 --- /dev/null +++ b/thirdparty/etcpak/.gitignore @@ -0,0 +1,8 @@ +build/Debug +build/Release +build/x64 +build/*sdf +build/*.suo +build/*.vcxproj.user +*.o +*.d diff --git a/thirdparty/etcpak/AUTHORS.txt b/thirdparty/etcpak/AUTHORS.txt new file mode 100644 index 000000000000..73892f5fdac5 --- /dev/null +++ b/thirdparty/etcpak/AUTHORS.txt @@ -0,0 +1,2 @@ +Bartosz Taudul +Daniel Jungmann diff --git a/thirdparty/etcpak/Application.cpp b/thirdparty/etcpak/Application.cpp new file mode 100644 index 000000000000..d33722bbdfba --- /dev/null +++ b/thirdparty/etcpak/Application.cpp @@ -0,0 +1,282 @@ +#include +#include +#include +#include +#include +#include + +#include "Bitmap.hpp" +#include "BlockData.hpp" +#include "CpuArch.hpp" +#include "DataProvider.hpp" +#include "Debug.hpp" +#include "Dither.hpp" +#include "Error.hpp" +#include "System.hpp" +#include "TaskDispatch.hpp" +#include "Timing.hpp" + +struct DebugCallback_t : public DebugLog::Callback +{ + void OnDebugMessage( const char* msg ) override + { + fprintf( stderr, "%s\n", msg ); + } +} DebugCallback; + +void Usage() +{ + fprintf( stderr, "Usage: etcpak input.png [options]\n" ); +#ifdef __SSE4_1__ + if( can_use_intel_core_4th_gen_features() ) + { + fprintf( stderr, " Using AVX 2 instructions.\n" ); + } + else + { + fprintf( stderr, " Using SSE 4.1 instructions.\n" ); + } +#else + fprintf( stderr, " SIMD not available.\n" ); +#endif + fprintf( stderr, " Options:\n" ); + fprintf( stderr, " -v view mode (loads pvr/ktx file, decodes it and saves to png)\n" ); + fprintf( stderr, " -o 1 output selection (sum of: 1 - save pvr file; 2 - save png file)\n" ); + fprintf( stderr, " note: pvr files are written regardless of this option\n" ); + fprintf( stderr, " -a disable alpha channel processing\n" ); + fprintf( stderr, " -s display image quality measurements\n" ); + fprintf( stderr, " -b benchmark mode\n" ); + fprintf( stderr, " -m generate mipmaps\n" ); + fprintf( stderr, " -d enable dithering\n" ); + fprintf( stderr, " -debug dissect ETC texture\n" ); + fprintf( stderr, " -etc2 enable ETC2 mode\n" ); + fprintf( stderr, " -rgba enable ETC2 RGBA mode\n" ); +} + +int main( int argc, char** argv ) +{ + DebugLog::AddCallback( &DebugCallback ); + + bool viewMode = false; + int save = 1; + bool alpha = true; + bool stats = false; + bool benchmark = false; + bool mipmap = false; + bool dither = false; + bool debug = false; + bool etc2 = false; + bool rgba = false; + + if( argc < 2 ) + { + Usage(); + return 1; + } + +#define CSTR(x) strcmp( argv[i], x ) == 0 + for( int i=2; i( argv[1], std::numeric_limits::max() ); + auto data = bmp->Data(); + auto end = GetTime(); + printf( "Image load time: %0.3f ms\n", ( end - start ) / 1000.f ); + + const int NumTasks = System::CPUCores() * 10; + start = GetTime(); + for( int i=0; i( bmp->Size(), false, type ); + if( rgba ) + { + bd->ProcessRGBA( bmp->Data(), bmp->Size().x * bmp->Size().y / 16, 0, bmp->Size().x, dither ); + } + else + { + bd->Process( bmp->Data(), bmp->Size().x * bmp->Size().y / 16, 0, bmp->Size().x, Channels::RGB, dither ); + } + } ); + } + TaskDispatch::Sync(); + end = GetTime(); + printf( "Mean compression time for %i runs: %0.3f ms\n", NumTasks, ( end - start ) / ( NumTasks * 1000.f ) ); + } + else if( viewMode ) + { + auto bd = std::make_shared( argv[1] ); + auto out = bd->Decode(); + out->Write( "out.png" ); + } + else if( debug ) + { + auto 
bd = std::make_shared( argv[1] ); + bd->Dissect(); + } + else + { + DataProvider dp( argv[1], mipmap ); + auto num = dp.NumberOfParts(); + + BlockData::Type type; + if( etc2 ) + { + if( rgba && dp.Alpha() ) + { + type = BlockData::Etc2_RGBA; + } + else + { + type = BlockData::Etc2_RGB; + } + } + else + { + type = BlockData::Etc1; + } + + auto bd = std::make_shared( "out.pvr", dp.Size(), mipmap, type ); + BlockDataPtr bda; + if( alpha && dp.Alpha() && !rgba ) + { + bda = std::make_shared( "outa.pvr", dp.Size(), mipmap, type ); + } + + if( bda ) + { + for( int i=0; iProcess( part.src, part.width / 4 * part.lines, part.offset, part.width, Channels::RGB, dither ); + } ); + TaskDispatch::Queue( [part, i, &bda]() + { + bda->Process( part.src, part.width / 4 * part.lines, part.offset, part.width, Channels::Alpha, false ); + } ); + } + } + else + { + for( int i=0; iProcessRGBA( part.src, part.width / 4 * part.lines, part.offset, part.width, dither ); + } ); + } + else + { + TaskDispatch::Queue( [part, i, &bd, &dither]() + { + bd->Process( part.src, part.width / 4 * part.lines, part.offset, part.width, Channels::RGB, dither ); + } ); + } + } + } + + TaskDispatch::Sync(); + + if( stats ) + { + auto out = bd->Decode(); + float mse = CalcMSE3( dp.ImageData(), *out ); + printf( "RGB data\n" ); + printf( " RMSE: %f\n", sqrt( mse ) ); + printf( " PSNR: %f\n", 20 * log10( 255 ) - 10 * log10( mse ) ); + if( bda ) + { + auto out = bda->Decode(); + float mse = CalcMSE1( dp.ImageData(), *out ); + printf( "A data\n" ); + printf( " RMSE: %f\n", sqrt( mse ) ); + printf( " PSNR: %f\n", 20 * log10( 255 ) - 10 * log10( mse ) ); + } + } + + if( save & 0x2 ) + { + auto out = bd->Decode(); + out->Write( "out.png" ); + if( bda ) + { + auto outa = bda->Decode(); + outa->Write( "outa.png" ); + } + } + + bd.reset(); + bda.reset(); + } + + return 0; +} diff --git a/thirdparty/etcpak/Bitmap.cpp b/thirdparty/etcpak/Bitmap.cpp new file mode 100644 index 000000000000..83c589ab67e0 --- /dev/null +++ b/thirdparty/etcpak/Bitmap.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include + +#include "png.h" + +#include "lz4/lz4.h" + +#include "Bitmap.hpp" +#include "Debug.hpp" + +Bitmap::Bitmap( const char* fn, unsigned int lines ) + : m_block( nullptr ) + , m_lines( lines ) + , m_alpha( true ) + , m_sema( 0 ) +{ + FILE* f = fopen( fn, "rb" ); + assert( f ); + + char buf[4]; + fread( buf, 1, 4, f ); + if( memcmp( buf, "raw4", 4 ) == 0 ) + { + uint8_t a; + fread( &a, 1, 1, f ); + m_alpha = a == 1; + uint32_t d; + fread( &d, 1, 4, f ); + m_size.x = d; + fread( &d, 1, 4, f ); + m_size.y = d; + DBGPRINT( "Raw bitmap " << fn << " " << m_size.x << "x" << m_size.y ); + + assert( m_size.x % 4 == 0 ); + assert( m_size.y % 4 == 0 ); + + int32_t csize; + fread( &csize, 1, 4, f ); + char* cbuf = new char[csize]; + fread( cbuf, 1, csize, f ); + fclose( f ); + + m_block = m_data = new uint32_t[m_size.x*m_size.y]; + m_linesLeft = m_size.y / 4; + + LZ4_decompress_fast( cbuf, (char*)m_data, m_size.x*m_size.y*4 ); + delete[] cbuf; + + for( int i=0; i= m_lines ) + { + lines = 0; + m_sema.unlock(); + } + } + + if( lines != 0 ) + { + m_sema.unlock(); + } + + png_read_end( png_ptr, info_ptr ); + png_destroy_read_struct( &png_ptr, &info_ptr, NULL ); + fclose( f ); + } ); + } +} + +Bitmap::Bitmap( const v2i& size ) + : m_data( new uint32_t[size.x*size.y] ) + , m_block( nullptr ) + , m_lines( 1 ) + , m_linesLeft( size.y / 4 ) + , m_size( size ) + , m_sema( 0 ) +{ +} + +Bitmap::Bitmap( const Bitmap& src, unsigned int lines ) + : m_lines( lines ) + , 
m_alpha( src.Alpha() ) + , m_sema( 0 ) +{ +} + +Bitmap::~Bitmap() +{ + delete[] m_data; +} + +void Bitmap::Write( const char* fn ) +{ + FILE* f = fopen( fn, "wb" ); + assert( f ); + + png_structp png_ptr = png_create_write_struct( PNG_LIBPNG_VER_STRING, NULL, NULL, NULL ); + png_infop info_ptr = png_create_info_struct( png_ptr ); + setjmp( png_jmpbuf( png_ptr ) ); + png_init_io( png_ptr, f ); + + png_set_IHDR( png_ptr, info_ptr, m_size.x, m_size.y, 8, PNG_COLOR_TYPE_RGB_ALPHA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_BASE, PNG_FILTER_TYPE_BASE ); + + png_write_info( png_ptr, info_ptr ); + + uint32_t* ptr = m_data; + for( int i=0; i lock( m_lock ); + lines = std::min( m_lines, m_linesLeft ); + auto ret = m_block; + m_sema.lock(); + m_block += m_size.x * 4 * lines; + m_linesLeft -= lines; + done = m_linesLeft == 0; + return ret; +} diff --git a/thirdparty/etcpak/Bitmap.hpp b/thirdparty/etcpak/Bitmap.hpp new file mode 100644 index 000000000000..999889ff0c64 --- /dev/null +++ b/thirdparty/etcpak/Bitmap.hpp @@ -0,0 +1,50 @@ +#ifndef __DARKRL__BITMAP_HPP__ +#define __DARKRL__BITMAP_HPP__ + +#include +#include +#include +#include + +#include "Semaphore.hpp" +#include "Vector.hpp" + +enum class Channels +{ + RGB, + Alpha +}; + +class Bitmap +{ +public: + Bitmap( const char* fn, unsigned int lines ); + Bitmap( const v2i& size ); + virtual ~Bitmap(); + + void Write( const char* fn ); + + uint32_t* Data() { if( m_load.valid() ) m_load.wait(); return m_data; } + const uint32_t* Data() const { if( m_load.valid() ) m_load.wait(); return m_data; } + const v2i& Size() const { return m_size; } + bool Alpha() const { return m_alpha; } + + const uint32_t* NextBlock( unsigned int& lines, bool& done ); + +protected: + Bitmap( const Bitmap& src, unsigned int lines ); + + uint32_t* m_data; + uint32_t* m_block; + unsigned int m_lines; + unsigned int m_linesLeft; + v2i m_size; + bool m_alpha; + Semaphore m_sema; + std::mutex m_lock; + std::future m_load; +}; + +typedef std::shared_ptr BitmapPtr; + +#endif diff --git a/thirdparty/etcpak/BitmapDownsampled.cpp b/thirdparty/etcpak/BitmapDownsampled.cpp new file mode 100644 index 000000000000..0eb0d8118504 --- /dev/null +++ b/thirdparty/etcpak/BitmapDownsampled.cpp @@ -0,0 +1,86 @@ +#include +#include + +#include "BitmapDownsampled.hpp" +#include "Debug.hpp" + +BitmapDownsampled::BitmapDownsampled( const Bitmap& bmp, unsigned int lines ) + : Bitmap( bmp, lines ) +{ + m_size.x = std::max( 1, bmp.Size().x / 2 ); + m_size.y = std::max( 1, bmp.Size().y / 2 ); + + int w = std::max( m_size.x, 4 ); + int h = std::max( m_size.y, 4 ); + + DBGPRINT( "Subbitmap " << m_size.x << "x" << m_size.y ); + + m_block = m_data = new uint32_t[w*h]; + + if( m_size.x < w || m_size.y < h ) + { + memset( m_data, 0, w*h*sizeof( uint32_t ) ); + m_linesLeft = h / 4; + unsigned int lines = 0; + for( int i=0; i m_lines ) + { + lines = 0; + m_sema.unlock(); + } + } + } + if( lines != 0 ) + { + m_sema.unlock(); + } + } + else + { + m_linesLeft = h / 4; + m_load = std::async( std::launch::async, [this, &bmp, w, h]() mutable + { + auto ptr = m_data; + auto src1 = bmp.Data(); + auto src2 = src1 + bmp.Size().x; + unsigned int lines = 0; + for( int i=0; i> 8 ) + ( ( *(src1+1) & 0xFF000000 ) >> 8 ) + ( ( *src2 & 0xFF000000 ) >> 8 ) + ( ( *(src2+1) & 0xFF000000 ) >> 8 ) ) / 4 ) & 0x00FF0000 ) << 8; + *ptr++ = r | g | b | a; + src1 += 2; + src2 += 2; + } + src1 += m_size.x * 2; + src2 += m_size.x * 2; + } + lines++; + if( lines >= m_lines ) + { + lines = 0; + m_sema.unlock(); + } + } + + if( lines != 0 ) + { 
+ m_sema.unlock(); + } + } ); + } +} + +BitmapDownsampled::~BitmapDownsampled() +{ +} diff --git a/thirdparty/etcpak/BitmapDownsampled.hpp b/thirdparty/etcpak/BitmapDownsampled.hpp new file mode 100644 index 000000000000..b7313808df98 --- /dev/null +++ b/thirdparty/etcpak/BitmapDownsampled.hpp @@ -0,0 +1,13 @@ +#ifndef __DARKRL__BITMAPDOWNSAMPLED_HPP__ +#define __DARKRL__BITMAPDOWNSAMPLED_HPP__ + +#include "Bitmap.hpp" + +class BitmapDownsampled : public Bitmap +{ +public: + BitmapDownsampled( const Bitmap& bmp, unsigned int lines ); + ~BitmapDownsampled(); +}; + +#endif diff --git a/thirdparty/etcpak/BlockData.cpp b/thirdparty/etcpak/BlockData.cpp new file mode 100644 index 000000000000..0f9caf78f1ac --- /dev/null +++ b/thirdparty/etcpak/BlockData.cpp @@ -0,0 +1,889 @@ +#include +#include + +#include "BlockData.hpp" +#include "ColorSpace.hpp" +#include "CpuArch.hpp" +#include "Debug.hpp" +#include "Dither.hpp" +#include "MipMap.hpp" +#include "mmap.hpp" +#include "ProcessAlpha.hpp" +#include "ProcessAlpha_AVX2.hpp" +#include "ProcessRGB.hpp" +#include "ProcessRGB_AVX2.hpp" +#include "Tables.hpp" +#include "TaskDispatch.hpp" + +BlockData::BlockData( const char* fn ) + : m_file( fopen( fn, "rb" ) ) +{ + assert( m_file ); + fseek( m_file, 0, SEEK_END ); + m_maplen = ftell( m_file ); + fseek( m_file, 0, SEEK_SET ); + m_data = (uint8_t*)mmap( nullptr, m_maplen, PROT_READ, MAP_SHARED, fileno( m_file ), 0 ); + + auto data32 = (uint32_t*)m_data; + if( *data32 == 0x03525650 ) + { + // PVR + switch( *(data32+2) ) + { + case 6: + m_type = Etc1; + break; + case 22: + m_type = Etc2_RGB; + break; + case 23: + m_type = Etc2_RGBA; + break; + default: + assert( false ); + break; + } + + m_size.y = *(data32+6); + m_size.x = *(data32+7); + m_dataOffset = 52 + *(data32+12); + } + else if( *data32 == 0x58544BAB ) + { + // KTX + switch( *(data32+7) ) + { + case 0x9274: + m_type = Etc2_RGB; + break; + case 0x9278: + m_type = Etc2_RGBA; + break; + default: + assert( false ); + break; + } + + m_size.x = *(data32+9); + m_size.y = *(data32+10); + m_dataOffset = sizeof( uint32_t ) * 17 + *(data32+15); + } + else + { + assert( false ); + } +} + +static uint8_t* OpenForWriting( const char* fn, size_t len, const v2i& size, FILE** f, int levels, BlockData::Type type ) +{ + *f = fopen( fn, "wb+" ); + assert( *f ); + fseek( *f, len - 1, SEEK_SET ); + const char zero = 0; + fwrite( &zero, 1, 1, *f ); + fseek( *f, 0, SEEK_SET ); + + auto ret = (uint8_t*)mmap( nullptr, len, PROT_WRITE, MAP_SHARED, fileno( *f ), 0 ); + auto dst = (uint32_t*)ret; + + *dst++ = 0x03525650; // version + *dst++ = 0; // flags + switch( type ) // pixelformat[0] + { + case BlockData::Etc1: + *dst++ = 6; + break; + case BlockData::Etc2_RGB: + *dst++ = 22; + break; + case BlockData::Etc2_RGBA: + *dst++ = 23; + break; + default: + assert( false ); + break; + } + *dst++ = 0; // pixelformat[1] + *dst++ = 0; // colourspace + *dst++ = 0; // channel type + *dst++ = size.y; // height + *dst++ = size.x; // width + *dst++ = 1; // depth + *dst++ = 1; // num surfs + *dst++ = 1; // num faces + *dst++ = levels; // mipmap count + *dst++ = 0; // metadata size + + return ret; +} + +static int AdjustSizeForMipmaps( const v2i& size, int levels ) +{ + int len = 0; + v2i current = size; + for( int i=1; i> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + src += width; + a = *src >> 24; + *ptr++ = a | ( a << 8 ) | ( a << 16 ); + 
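+            // Each alpha byte is replicated into R, G and B above, so the alpha
+            // plane compresses through the same RGB path into its own ETC block.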
src -= width * 3 - 1; + } + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + + *dst++ = func( (uint8_t*)buf ); + } + while( --blocks ); + } + else + { +#ifdef __SSE4_1__ + if( can_use_intel_core_4th_gen_features() ) + { + if( m_type != Etc1 ) + { + if( dither ) + { + func = _f_rgb_etc2_dither_avx2; + } + else + { + func = _f_rgb_etc2_avx2; + } + } + else + { + if( dither ) + { + func = _f_rgb_dither_avx2; + } + else + { + func = _f_rgb_avx2; + } + } + } + else +#endif + { + if( m_type != Etc1 ) + { + if( dither ) + { + func = _f_rgb_etc2_dither; + } + else + { + func = _f_rgb_etc2; + } + } + else + { + if( dither ) + { + func = _f_rgb_dither; + } + else + { + func = _f_rgb; + } + } + } + + do + { + auto ptr = buf; + for( int x=0; x<4; x++ ) + { + *ptr++ = *src; + src += width; + *ptr++ = *src; + src += width; + *ptr++ = *src; + src += width; + *ptr++ = *src; + src -= width * 3 - 1; + } + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + + *dst++ = func( (uint8_t*)buf ); + } + while( --blocks ); + } +} + +void BlockData::ProcessRGBA( const uint32_t* src, uint32_t blocks, size_t offset, size_t width, bool dither ) +{ + assert( m_type == Etc2_RGBA ); + + uint32_t buf[4*4]; + uint8_t buf8[4*4]; + int w = 0; + + auto dst = ((uint64_t*)( m_data + m_dataOffset )) + offset * 2; + + uint64_t (*func)(uint8_t*); + uint64_t (*func_alpha)(uint8_t*); + +#ifdef __SSE4_1__ + if( can_use_intel_core_4th_gen_features() ) + { + if( dither ) + { + func = _f_rgb_etc2_dither_avx2; + } + else + { + func = _f_rgb_etc2_avx2; + } + + func_alpha = _f_rgba_avx2; + } + else +#endif + { + if( dither ) + { + func = _f_rgb_etc2_dither; + } + else + { + func = _f_rgb_etc2; + } + + func_alpha = _f_rgba; + } + + do + { + auto ptr = buf; + auto ptr8 = buf8; + for( int x=0; x<4; x++ ) + { + auto v = *src; + *ptr++ = v; + *ptr8++ = v >> 24; + src += width; + v = *src; + *ptr++ = v; + *ptr8++ = v >> 24; + src += width; + v = *src; + *ptr++ = v; + *ptr8++ = v >> 24; + src += width; + v = *src; + *ptr++ = v; + *ptr8++ = v >> 24; + src -= width * 3 - 1; + } + if( ++w == width/4 ) + { + src += width * 3; + w = 0; + } + + *dst++ = func_alpha( buf8 ); + *dst++ = func( (uint8_t*)buf ); + } + while( --blocks ); +} + +namespace +{ +struct BlockColor +{ + uint32_t r[2], g[2], b[2]; +}; + +enum class Etc2Mode +{ + none, + t, + h, + planar +}; + +Etc2Mode DecodeBlockColor( uint64_t d, BlockColor& c ) +{ + if( d & 0x2 ) + { + int32_t dr, dg, db; + + c.r[0] = ( d & 0xF8000000 ) >> 27; + c.g[0] = ( d & 0x00F80000 ) >> 19; + c.b[0] = ( d & 0x0000F800 ) >> 11; + + dr = ( d & 0x07000000 ) >> 24; + dg = ( d & 0x00070000 ) >> 16; + db = ( d & 0x00000700 ) >> 8; + + if( dr & 0x4 ) + { + dr |= 0xFFFFFFF8; + } + if( dg & 0x4 ) + { + dg |= 0xFFFFFFF8; + } + if( db & 0x4 ) + { + db |= 0xFFFFFFF8; + } + + int32_t r = static_cast(c.r[0]) + dr; + int32_t g = static_cast(c.g[0]) + dg; + int32_t b = static_cast(c.b[0]) + db; + + if ((r < 0) || (r > 31)) + { + return Etc2Mode::t; + } + + if ((g < 0) || (g > 31)) + { + return Etc2Mode::h; + } + + if ((b < 0) || (b > 31)) + { + return Etc2Mode::planar; + } + + c.r[1] = c.r[0] + dr; + c.g[1] = c.g[0] + dg; + c.b[1] = c.b[0] + db; + + for( int i=0; i<2; i++ ) + { + c.r[i] = ( c.r[i] << 3 ) | ( c.r[i] >> 2 ); + c.g[i] = ( c.g[i] << 3 ) | ( c.g[i] >> 2 ); + c.b[i] = ( c.b[i] << 3 ) | ( c.b[i] >> 2 ); + } + } + else + { + c.r[0] = ( ( d & 0xF0000000 ) >> 24 ) | ( ( d & 0xF0000000 ) >> 28 ); + c.r[1] = ( ( d & 0x0F000000 ) >> 20 ) | ( ( d & 0x0F000000 ) >> 24 ); + c.g[0] = ( ( d & 0x00F00000 ) >> 
16 ) | ( ( d & 0x00F00000 ) >> 20 ); + c.g[1] = ( ( d & 0x000F0000 ) >> 12 ) | ( ( d & 0x000F0000 ) >> 16 ); + c.b[0] = ( ( d & 0x0000F000 ) >> 8 ) | ( ( d & 0x0000F000 ) >> 12 ); + c.b[1] = ( ( d & 0x00000F00 ) >> 4 ) | ( ( d & 0x00000F00 ) >> 8 ); + } + return Etc2Mode::none; +} + +inline int32_t expand6(uint32_t value) +{ + return (value << 2) | (value >> 4); +} + +inline int32_t expand7(uint32_t value) +{ + return (value << 1) | (value >> 6); +} + +void DecodePlanar(uint64_t block, uint32_t* l[4]) +{ + const auto bv = expand6((block >> ( 0 + 32)) & 0x3F); + const auto gv = expand7((block >> ( 6 + 32)) & 0x7F); + const auto rv = expand6((block >> (13 + 32)) & 0x3F); + + const auto bh = expand6((block >> (19 + 32)) & 0x3F); + const auto gh = expand7((block >> (25 + 32)) & 0x7F); + + const auto rh0 = (block >> (32 - 32)) & 0x01; + const auto rh1 = ((block >> (34 - 32)) & 0x1F) << 1; + const auto rh = expand6(rh0 | rh1); + + const auto bo0 = (block >> (39 - 32)) & 0x07; + const auto bo1 = ((block >> (43 - 32)) & 0x3) << 3; + const auto bo2 = ((block >> (48 - 32)) & 0x1) << 5; + const auto bo = expand6(bo0 | bo1 | bo2); + const auto go0 = (block >> (49 - 32)) & 0x3F; + const auto go1 = ((block >> (56 - 32)) & 0x01) << 6; + const auto go = expand7(go0 | go1); + const auto ro = expand6((block >> (57 - 32)) & 0x3F); + + for (auto j = 0; j < 4; j++) + { + for (auto i = 0; i < 4; i++) + { + uint32_t r = clampu8((i * (rh - ro) + j * (rv - ro) + 4 * ro + 2) >> 2); + uint32_t g = clampu8((i * (gh - go) + j * (gv - go) + 4 * go + 2) >> 2); + uint32_t b = clampu8((i * (bh - bo) + j * (bv - bo) + 4 * bo + 2) >> 2); + + *l[j]++ = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000; + } + } +} + +} + +BitmapPtr BlockData::Decode() +{ + if( m_type == Etc2_RGBA ) + { + return DecodeRGBA(); + } + else + { + return DecodeRGB(); + } +} + +static uint64_t ConvertByteOrder( uint64_t d ) +{ + return ( ( d & 0xFF000000FF000000 ) >> 24 ) | + ( ( d & 0x000000FF000000FF ) << 24 ) | + ( ( d & 0x00FF000000FF0000 ) >> 8 ) | + ( ( d & 0x0000FF000000FF00 ) << 8 ); +} + +static void DecodeRGBPart( uint32_t* l[4], uint64_t d ) +{ + d = ConvertByteOrder( d ); + + BlockColor c; + const auto mode = DecodeBlockColor( d, c ); + + if (mode == Etc2Mode::planar) + { + DecodePlanar(d, l); + return; + } + + unsigned int tcw[2]; + tcw[0] = ( d & 0xE0 ) >> 5; + tcw[1] = ( d & 0x1C ) >> 2; + + if( d & 0x1 ) + { + int o = 0; + for( int i=0; i<4; i++ ) + { + for( int j=0; j<4; j++ ) + { + const auto mod = g_table[tcw[j/2]][ ( ( d >> ( o + 32 + j ) ) & 0x1 ) | ( ( d >> ( o + 47 + j ) ) & 0x2 ) ]; + const auto r = clampu8( c.r[j/2] + mod ); + const auto g = clampu8( c.g[j/2] + mod ); + const auto b = clampu8( c.b[j/2] + mod ); + *l[j]++ = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000; + } + o += 4; + } + } + else + { + int o = 0; + for( int i=0; i<4; i++ ) + { + const auto tbl = g_table[tcw[i/2]]; + const auto cr = c.r[i/2]; + const auto cg = c.g[i/2]; + const auto cb = c.b[i/2]; + + for( int j=0; j<4; j++ ) + { + const auto mod = tbl[ ( ( d >> ( o + 32 + j ) ) & 0x1 ) | ( ( d >> ( o + 47 + j ) ) & 0x2 ) ]; + const auto r = clampu8( cr + mod ); + const auto g = clampu8( cg + mod ); + const auto b = clampu8( cb + mod ); + *l[j]++ = r | ( g << 8 ) | ( b << 16 ) | 0xFF000000; + } + o += 4; + } + } +} + +static void DecodeAlphaPart( uint32_t* l[4], uint64_t d ) +{ + d = ( ( d & 0xFF00000000000000 ) >> 56 ) | + ( ( d & 0x00FF000000000000 ) >> 40 ) | + ( ( d & 0x0000FF0000000000 ) >> 24 ) | + ( ( d & 0x000000FF00000000 ) >> 8 ) | + ( ( d & 0x00000000FF000000 
) << 8 ) | + ( ( d & 0x0000000000FF0000 ) << 24 ) | + ( ( d & 0x000000000000FF00 ) << 40 ) | + ( ( d & 0x00000000000000FF ) << 56 ); + + unsigned int base = d >> 56; + unsigned int mul = ( d >> 52 ) & 0xF; + unsigned int idx = ( d >> 48 ) & 0xF; + + const auto tbl = g_alpha[idx]; + + int o = 45; + for( int i=0; i<4; i++ ) + { + for( int j=0; j<4; j++ ) + { + const auto mod = tbl[ ( d >> o ) & 0x7 ]; + const auto a = clampu8( base + mod * mul ); + *l[j] = ( *l[j] & 0x00FFFFFF ) | ( a << 24 ); + l[j]++; + o -= 3; + } + } +} + +BitmapPtr BlockData::DecodeRGB() +{ + auto ret = std::make_shared( m_size ); + + uint32_t* l[4]; + l[0] = ret->Data(); + l[1] = l[0] + m_size.x; + l[2] = l[1] + m_size.x; + l[3] = l[2] + m_size.x; + + const uint64_t* src = (const uint64_t*)( m_data + m_dataOffset ); + + for( int y=0; y( m_size ); + + uint32_t* l[4]; + l[0] = ret->Data(); + l[1] = l[0] + m_size.x; + l[2] = l[1] + m_size.x; + l[3] = l[2] + m_size.x; + + const uint64_t* src = (const uint64_t*)( m_data + m_dataOffset ); + + for( int y=0; y( size ); + auto dst = bmp->Data(); + + auto bmp2 = std::make_shared( m_size ); + uint32_t* l[4]; + l[0] = bmp2->Data(); + l[1] = l[0] + m_size.x; + l[2] = l[1] + m_size.x; + l[3] = l[2] + m_size.x; + + auto bmp3 = std::make_shared( size ); + auto dst3 = bmp3->Data(); + + for( int y=0; yWrite( "out_block_type.png" ); + bmp2->Write( "out_block_color.png" ); + bmp3->Write( "out_block_selectors.png" ); +} diff --git a/thirdparty/etcpak/BlockData.hpp b/thirdparty/etcpak/BlockData.hpp new file mode 100644 index 000000000000..eb2ed818c140 --- /dev/null +++ b/thirdparty/etcpak/BlockData.hpp @@ -0,0 +1,50 @@ +#ifndef __BLOCKDATA_HPP__ +#define __BLOCKDATA_HPP__ + +#include +#include +#include +#include +#include +#include +#include + +#include "Bitmap.hpp" +#include "Vector.hpp" + +class BlockData +{ +public: + enum Type + { + Etc1, + Etc2_RGB, + Etc2_RGBA, + }; + + BlockData( const char* fn ); + BlockData( const char* fn, const v2i& size, bool mipmap, Type type ); + BlockData( const v2i& size, bool mipmap, Type type ); + ~BlockData(); + + BitmapPtr Decode(); + void Dissect(); + + void Process( const uint32_t* src, uint32_t blocks, size_t offset, size_t width, Channels type, bool dither ); + void ProcessRGBA( const uint32_t* src, uint32_t blocks, size_t offset, size_t width, bool dither ); + +private: + BitmapPtr DecodeRGB(); + BitmapPtr DecodeRGBA(); + + uint8_t* m_data; + v2i m_size; + size_t m_dataOffset; + FILE* m_file; + size_t m_maplen; + Type m_type; +}; + +typedef std::shared_ptr BlockDataPtr; + +#endif diff --git a/thirdparty/etcpak/ColorSpace.cpp b/thirdparty/etcpak/ColorSpace.cpp new file mode 100644 index 000000000000..041154106620 --- /dev/null +++ b/thirdparty/etcpak/ColorSpace.cpp @@ -0,0 +1,114 @@ +#include +#include + +#include "Math.hpp" +#include "ColorSpace.hpp" + +namespace Color +{ + + static const XYZ white( v3b( 255, 255, 255 ) ); + static const v3f rwhite( 1.f / white.x, 1.f / white.y, 1.f / white.z ); + + + XYZ::XYZ( float _x, float _y, float _z ) + : x( _x ) + , y( _y ) + , z( _z ) + { + } + + XYZ::XYZ( const v3b& rgb ) + { + const float r = rgb.x / 255.f; + const float g = rgb.y / 255.f; + const float b = rgb.z / 255.f; + + const float rl = sRGB2linear( r ); + const float gl = sRGB2linear( g ); + const float bl = sRGB2linear( b ); + + x = 0.4124f * rl + 0.3576f * gl + 0.1805f * bl; + y = 0.2126f * rl + 0.7152f * gl + 0.0722f * bl; + z = 0.0193f * rl + 0.1192f * gl + 0.9505f * bl; + } + + static float revlab( float t ) + { + const float p1 = 6.f/29.f; + 
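+        // Inverse of the CIELAB f(t) helper: cube above the threshold t = 6/29,
+        // linear segment 3*(6/29)^2 * (t - 4/29) below it.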
const float p2 = 4.f/29.f; + + if( t > p1 ) + { + return t*t*t; + } + else + { + return 3 * sq( p1 ) * ( t - p2 ); + } + } + + XYZ::XYZ( const Lab& lab ) + { + y = white.y * revlab( 1.f/116.f * ( lab.L + 16 ) ); + x = white.x * revlab( 1.f/116.f * ( lab.L + 16 ) + 1.f/500.f * lab.a ); + z = white.z * revlab( 1.f/116.f * ( lab.L + 16 ) - 1.f/200.f * lab.b ); + } + + v3i XYZ::RGB() const + { + const float rl = 3.2406f * x - 1.5372f * y - 0.4986f * z; + const float gl = -0.9689f * x + 1.8758f * y + 0.0415f * z; + const float bl = 0.0557f * x - 0.2040f * y + 1.0570f * z; + + const float r = linear2sRGB( rl ); + const float g = linear2sRGB( gl ); + const float b = linear2sRGB( bl ); + + return v3i( clampu8( int32_t( r * 255 ) ), clampu8( int32_t( g * 255 ) ), clampu8( int32_t( b * 255 ) ) ); + } + + + Lab::Lab() + : L( 0 ) + , a( 0 ) + , b( 0 ) + { + } + + Lab::Lab( float L, float a, float b ) + : L( L ) + , a( a ) + , b( b ) + { + } + + static float labfunc( float t ) + { + const float p1 = (6.f/29.f)*(6.f/29.f)*(6.f/29.f); + const float p2 = (1.f/3.f)*(29.f/6.f)*(29.f/6.f); + const float p3 = (4.f/29.f); + + if( t > p1 ) + { + return pow( t, 1.f/3.f ); + } + else + { + return p2 * t + p3; + } + } + + Lab::Lab( const XYZ& xyz ) + { + L = 116 * labfunc( xyz.y * rwhite.y ) - 16; + a = 500 * ( labfunc( xyz.x * rwhite.x ) - labfunc( xyz.y * rwhite.y ) ); + b = 200 * ( labfunc( xyz.y * rwhite.y ) - labfunc( xyz.z * rwhite.z ) ); + } + + Lab::Lab( const v3b& rgb ) + { + new(this) Lab( XYZ( rgb ) ); + } + +} diff --git a/thirdparty/etcpak/ColorSpace.hpp b/thirdparty/etcpak/ColorSpace.hpp new file mode 100644 index 000000000000..c9d0a9cf3f0c --- /dev/null +++ b/thirdparty/etcpak/ColorSpace.hpp @@ -0,0 +1,36 @@ +#ifndef __DARKRL__COLORSPACE_HPP__ +#define __DARKRL__COLORSPACE_HPP__ + +#include "Vector.hpp" + +namespace Color +{ + + class Lab; + + class XYZ + { + public: + XYZ( float x, float y, float z ); + XYZ( const v3b& rgb ); + XYZ( const Lab& lab ); + + v3i RGB() const; + + float x, y, z; + }; + + class Lab + { + public: + Lab(); + Lab( float L, float a, float b ); + Lab( const XYZ& xyz ); + Lab( const v3b& rgb ); + + float L, a, b; + }; + +} + +#endif diff --git a/thirdparty/etcpak/CpuArch.cpp b/thirdparty/etcpak/CpuArch.cpp new file mode 100644 index 000000000000..fc1a803385d4 --- /dev/null +++ b/thirdparty/etcpak/CpuArch.cpp @@ -0,0 +1,103 @@ +#include "CpuArch.hpp" + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) + +#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300) + +#include + +int check_4th_gen_intel_core_features() +{ + const int the_4th_gen_features = + (_FEATURE_AVX2 | _FEATURE_FMA | _FEATURE_BMI | _FEATURE_LZCNT | _FEATURE_MOVBE); + return _may_i_use_cpu_feature( the_4th_gen_features ); +} + +#else /* non-Intel compiler */ + +#include +#if defined(_MSC_VER) +# include +#endif + +void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd) +{ +#if defined(_MSC_VER) + __cpuidex((int*)abcd, eax, ecx); +#else + uint32_t ebx, edx; +# if defined( __i386__ ) && defined ( __PIC__ ) + /* in case of PIC under 32-bit EBX cannot be clobbered */ + __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx), +# else + __asm__ ( "cpuid" : "+b" (ebx), +# endif + "+a" (eax), "+c" (ecx), "=d" (edx) ); + abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx; +#endif +} + +int check_xcr0_ymm() +{ + uint32_t xcr0; +#if defined(_MSC_VER) + xcr0 = (uint32_t)_xgetbv(0); /* min VS2010 SP1 compiler is required */ +#else + __asm__ ("xgetbv" : "=a" 
(xcr0) : "c" (0) : "%edx" ); +#endif + return ((xcr0 & 6) == 6); /* checking if xmm and ymm state are enabled in XCR0 */ +} + + +int check_4th_gen_intel_core_features() +{ + uint32_t abcd[4]; + uint32_t fma_movbe_osxsave_mask = ((1 << 12) | (1 << 22) | (1 << 27)); + uint32_t avx2_bmi12_mask = (1 << 5) | (1 << 3) | (1 << 8); + + /* CPUID.(EAX=01H, ECX=0H):ECX.FMA[bit 12]==1 && + CPUID.(EAX=01H, ECX=0H):ECX.MOVBE[bit 22]==1 && + CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1 */ + run_cpuid( 1, 0, abcd ); + if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask ) + return 0; + + if ( ! check_xcr0_ymm() ) + return 0; + + /* CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 && + CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 && + CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1 */ + run_cpuid( 7, 0, abcd ); + if ( (abcd[1] & avx2_bmi12_mask) != avx2_bmi12_mask ) + return 0; + + /* CPUID.(EAX=80000001H):ECX.LZCNT[bit 5]==1 */ + run_cpuid( 0x80000001, 0, abcd ); + if ( (abcd[2] & (1 << 5)) == 0) + return 0; + + return 1; +} + +#endif /* non-Intel compiler */ + + +bool can_use_intel_core_4th_gen_features() +{ + static int the_4th_gen_features_available = -1; + /* test is performed once */ + if (the_4th_gen_features_available < 0 ) + the_4th_gen_features_available = check_4th_gen_intel_core_features(); + + return the_4th_gen_features_available == 1; +} + +#else + +bool can_use_intel_core_4th_gen_features() +{ + return false; +} + +#endif diff --git a/thirdparty/etcpak/CpuArch.hpp b/thirdparty/etcpak/CpuArch.hpp new file mode 100644 index 000000000000..f127700ad3b9 --- /dev/null +++ b/thirdparty/etcpak/CpuArch.hpp @@ -0,0 +1,6 @@ +#ifndef __CPUARCH_HPP__ +#define __CPUARCH_HPP__ + +bool can_use_intel_core_4th_gen_features(); + +#endif diff --git a/thirdparty/etcpak/DataProvider.cpp b/thirdparty/etcpak/DataProvider.cpp new file mode 100644 index 000000000000..1c462a6771f5 --- /dev/null +++ b/thirdparty/etcpak/DataProvider.cpp @@ -0,0 +1,76 @@ +#include +#include + +#include "BitmapDownsampled.hpp" +#include "DataProvider.hpp" +#include "MipMap.hpp" + +DataProvider::DataProvider( const char* fn, bool mipmap ) + : m_offset( 0 ) + , m_mipmap( mipmap ) + , m_done( false ) + , m_lines( 32 ) +{ + m_bmp.emplace_back( new Bitmap( fn, m_lines ) ); + m_current = m_bmp[0].get(); +} + +DataProvider::~DataProvider() +{ +} + +unsigned int DataProvider::NumberOfParts() const +{ + unsigned int parts = ( ( m_bmp[0]->Size().y / 4 ) + m_lines - 1 ) / m_lines; + + if( m_mipmap ) + { + v2i current = m_bmp[0]->Size(); + int levels = NumberOfMipLevels( current ); + unsigned int lines = m_lines; + for( int i=1; iNextBlock( lines, done ), + std::max( 4, m_current->Size().x ), + lines, + m_offset + }; + + m_offset += m_current->Size().x / 4 * lines; + + if( done ) + { + if( m_mipmap && ( m_current->Size().x != 1 || m_current->Size().y != 1 ) ) + { + m_lines *= 2; + m_bmp.emplace_back( new BitmapDownsampled( *m_current, m_lines ) ); + m_current = m_bmp[m_bmp.size()-1].get(); + } + else + { + m_done = true; + } + } + + return ret; +} diff --git a/thirdparty/etcpak/DataProvider.hpp b/thirdparty/etcpak/DataProvider.hpp new file mode 100644 index 000000000000..4d0b815b3f44 --- /dev/null +++ b/thirdparty/etcpak/DataProvider.hpp @@ -0,0 +1,41 @@ +#ifndef __DATAPROVIDER_HPP__ +#define __DATAPROVIDER_HPP__ + +#include +#include +#include + +#include "Bitmap.hpp" + +struct DataPart +{ + const uint32_t* src; + unsigned int width; + unsigned int lines; + unsigned int offset; +}; + +class DataProvider +{ +public: + DataProvider( const char* fn, 
bool mipmap ); + ~DataProvider(); + + unsigned int NumberOfParts() const; + + DataPart NextPart(); + + bool Alpha() const { return m_bmp[0]->Alpha(); } + const v2i& Size() const { return m_bmp[0]->Size(); } + const Bitmap& ImageData() const { return *m_bmp[0]; } + +private: + std::vector> m_bmp; + Bitmap* m_current; + unsigned int m_offset; + unsigned int m_lines; + bool m_mipmap; + bool m_done; +}; + +#endif diff --git a/thirdparty/etcpak/Debug.cpp b/thirdparty/etcpak/Debug.cpp new file mode 100644 index 000000000000..72dc4e052680 --- /dev/null +++ b/thirdparty/etcpak/Debug.cpp @@ -0,0 +1,31 @@ +#include +#include +#include "Debug.hpp" + +static std::vector s_callbacks; + +void DebugLog::Message( const char* msg ) +{ + for( auto it = s_callbacks.begin(); it != s_callbacks.end(); ++it ) + { + (*it)->OnDebugMessage( msg ); + } +} + +void DebugLog::AddCallback( Callback* c ) +{ + const auto it = std::find( s_callbacks.begin(), s_callbacks.end(), c ); + if( it == s_callbacks.end() ) + { + s_callbacks.push_back( c ); + } +} + +void DebugLog::RemoveCallback( Callback* c ) +{ + const auto it = std::find( s_callbacks.begin(), s_callbacks.end(), c ); + if( it != s_callbacks.end() ) + { + s_callbacks.erase( it ); + } +} diff --git a/thirdparty/etcpak/Debug.hpp b/thirdparty/etcpak/Debug.hpp new file mode 100644 index 000000000000..524eaa7bafe1 --- /dev/null +++ b/thirdparty/etcpak/Debug.hpp @@ -0,0 +1,27 @@ +#ifndef __DARKRL__DEBUG_HPP__ +#define __DARKRL__DEBUG_HPP__ + +#ifdef DEBUG +# include +# define DBGPRINT(msg) { std::stringstream __buf; __buf << msg; DebugLog::Message( __buf.str().c_str() ); } +#else +# define DBGPRINT(msg) ((void)0) +#endif + +class DebugLog +{ +public: + struct Callback + { + virtual void OnDebugMessage( const char* msg ) = 0; + }; + + static void Message( const char* msg ); + static void AddCallback( Callback* c ); + static void RemoveCallback( Callback* c ); + +private: + DebugLog() {} +}; + +#endif diff --git a/thirdparty/etcpak/Dither.cpp b/thirdparty/etcpak/Dither.cpp new file mode 100644 index 000000000000..87a28b97897b --- /dev/null +++ b/thirdparty/etcpak/Dither.cpp @@ -0,0 +1,396 @@ +#include +#include + +#include "Dither.hpp" +#include "Math.hpp" +#ifdef __SSE4_1__ +# ifdef _MSC_VER +# include +# include +# else +# include +# endif +#endif + +static uint8_t e5[32]; +static uint8_t e6[64]; +static uint8_t qrb[256+16]; +static uint8_t qg[256+16]; + +void InitDither() +{ + for( int i=0; i<32; i++ ) + { + e5[i] = (i<<3) | (i>>2); + } + for( int i=0; i<64; i++ ) + { + e6[i] = (i<<2) | (i>>4); + } + for( int i=0; i<256+16; i++ ) + { + int v = std::min( std::max( 0, i-8 ), 255 ); + qrb[i] = e5[mul8bit( v, 31 )]; + qg[i] = e6[mul8bit( v, 63 )]; + } +} + +void Dither( uint8_t* data ) +{ + int err[8]; + int* ep1 = err; + int* ep2 = err+4; + + for( int ch=0; ch<3; ch++ ) + { + uint8_t* ptr = data + ch; + uint8_t* quant = (ch == 1) ? 
qg + 8 : qrb + 8;
+        memset( err, 0, sizeof( err ) );
+
+        for( int y=0; y<4; y++ )
+        {
+            uint8_t tmp;
+            tmp = quant[ptr[0] + ( ( 3 * ep2[1] + 5 * ep2[0] ) >> 4 )];
+            ep1[0] = ptr[0] - tmp;
+            ptr[0] = tmp;
+            tmp = quant[ptr[4] + ( ( 7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0] ) >> 4 )];
+            ep1[1] = ptr[4] - tmp;
+            ptr[4] = tmp;
+            tmp = quant[ptr[8] + ( ( 7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1] ) >> 4 )];
+            ep1[2] = ptr[8] - tmp;
+            ptr[8] = tmp;
+            tmp = quant[ptr[12] + ( ( 7 * ep1[2] + 5 * ep2[3] + ep2[2] ) >> 4 )];
+            ep1[3] = ptr[12] - tmp;
+            ptr[12] = tmp;
+            ptr += 16;
+            std::swap( ep1, ep2 );
+        }
+    }
+}
+
+void Swizzle(const uint8_t* data, const ptrdiff_t pitch, uint8_t* output)
+{
+    for (int i = 0; i < 4; ++i)
+    {
+        uint64_t d0 = *(const uint64_t*)(data + i * pitch + 0);
+        uint64_t d1 = *(const uint64_t*)(data + i * pitch + 8);
+
+        *(uint64_t*)(output + i * 16 + 0) = d0;
+        *(uint64_t*)(output + i * 16 + 8) = d1;
+    }
+}
+
+#ifdef __SSE4_1__
+// This version uses a 5 bit quantization for each channel to allow SIMD acceleration.
+// Two blocks are processed in parallel
+void Dither_SSE41(const uint8_t* data0, const uint8_t* data1, uint8_t* output0, uint8_t* output1)
+{
+    __m128i ep1[4];
+    __m128i ep2[4];
+
+    ep1[0] = _mm_setzero_si128();
+    ep1[1] = _mm_setzero_si128();
+    ep1[2] = _mm_setzero_si128();
+    ep1[3] = _mm_setzero_si128();
+
+    ep2[0] = _mm_setzero_si128();
+    ep2[1] = _mm_setzero_si128();
+    ep2[2] = _mm_setzero_si128();
+    ep2[3] = _mm_setzero_si128();
+
+    for( int y=0; y<4; y++ )
+    {
+        __m128i d0 = _mm_loadl_epi64((const __m128i*)(data0 + y * 16));
+        __m128i d1 = _mm_loadl_epi64((const __m128i*)(data1 + y * 16));
+        __m128i d2 = _mm_unpacklo_epi32(d0, d1);
+
+        __m128i o0;
+        __m128i o1;
+
+        // tmp = quant[ptr[0] + ( ( 3 * ep2[1] + 5 * ep2[0] ) >> 4 )];
+        {
+            __m128i d3 = _mm_cvtepu8_epi16(d2);
+
+            __m128i t0 = _mm_mullo_epi16(ep2[1], _mm_set1_epi16(3));
+            __m128i t1 = _mm_mullo_epi16(ep2[0], _mm_set1_epi16(5));
+            __m128i t2 = _mm_add_epi16(t0, t1);
+            __m128i t3 = _mm_srai_epi16(t2, 4);
+            __m128i t4 = _mm_add_epi16(t3, d3);
+            __m128i t5 = _mm_add_epi16(t4, _mm_set1_epi16(4));
+
+            // clamp to 0..255
+            __m128i c0 = _mm_min_epi16(t5, _mm_set1_epi16(255));
+            __m128i c1 = _mm_max_epi16(c0, _mm_set1_epi16(0));
+
+            __m128i q0 = _mm_and_si128(c1, _mm_set1_epi16(0xF8));
+            __m128i q1 = _mm_srli_epi16(c1, 5);
+            __m128i q2 = _mm_or_si128(q0, q1);
+            o0 = q2;
+
+            // ep1[0] = ptr[0] - tmp;
+            ep1[0] = _mm_sub_epi16(d3, q2);
+        }
+
+        // tmp = quant[ptr[4] + ( ( 7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0] ) >> 4 )];
+        {
+            __m128i d3 = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
+
+            __m128i t0 = _mm_mullo_epi16(ep1[0], _mm_set1_epi16(7));
+            __m128i t1 = _mm_mullo_epi16(ep2[2], _mm_set1_epi16(3));
+            __m128i t2 = _mm_mullo_epi16(ep2[1], _mm_set1_epi16(5));
+            __m128i t3 = _mm_add_epi16(t0, t1);
+            __m128i t4 = _mm_add_epi16(t2, ep2[0]);
+            __m128i t5 = _mm_add_epi16(t3, t4);
+            __m128i t6 = _mm_srai_epi16(t5, 4);
+            __m128i t7 = _mm_add_epi16(t6, d3);
+            __m128i t8 = _mm_add_epi16(t7, _mm_set1_epi16(4));
+
+            // clamp to 0..255
+            __m128i c0 = _mm_min_epi16(t8, _mm_set1_epi16(255));
+            __m128i c1 = _mm_max_epi16(c0, _mm_set1_epi16(0));
+
+            __m128i q0 = _mm_and_si128(c1, _mm_set1_epi16(0xF8));
+            __m128i q1 = _mm_srli_epi16(c1, 5);
+            __m128i q2 = _mm_or_si128(q0, q1);
+            o1 = q2;
+
+            // ep1[1] = ptr[4] - tmp;
+            ep1[1] = _mm_sub_epi16(d3, q2);
+        }
+
+        __m128i o2 = _mm_packus_epi16(o0, o1);
+
+        _mm_storel_epi64((__m128i*)(output0 + y * 16), _mm_shuffle_epi32(o2, _MM_SHUFFLE(2, 0, 2, 0)));
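+        // o2 interleaves the two blocks' quantized pixels in 32-bit lanes; the
+        // _MM_SHUFFLE(2, 0, 2, 0) / (3, 1, 3, 1) selectors de-interleave them,
+        // sending lanes 0,2 to output0 and lanes 1,3 to output1.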
+        _mm_storel_epi64((__m128i*)(output1 + y * 16), _mm_shuffle_epi32(o2, _MM_SHUFFLE(3, 1, 3, 1)));
+
+        d0 = _mm_loadl_epi64((const __m128i*)(data0 + y * 16 + 8));
+        d1 = _mm_loadl_epi64((const __m128i*)(data1 + y * 16 + 8));
+        d2 = _mm_unpacklo_epi32(d0, d1);
+
+        // tmp = quant[ptr[8] + ( ( 7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1] ) >> 4 )];
+        {
+            __m128i d3 = _mm_cvtepu8_epi16(d2);
+
+            __m128i t0 = _mm_mullo_epi16(ep1[1], _mm_set1_epi16(7));
+            __m128i t1 = _mm_mullo_epi16(ep2[3], _mm_set1_epi16(3));
+            __m128i t2 = _mm_mullo_epi16(ep2[2], _mm_set1_epi16(5));
+            __m128i t3 = _mm_add_epi16(t0, t1);
+            __m128i t4 = _mm_add_epi16(t2, ep2[1]);
+            __m128i t5 = _mm_add_epi16(t3, t4);
+            __m128i t6 = _mm_srai_epi16(t5, 4);
+            __m128i t7 = _mm_add_epi16(t6, d3);
+            __m128i t8 = _mm_add_epi16(t7, _mm_set1_epi16(4));
+
+            // clamp to 0..255
+            __m128i c0 = _mm_min_epi16(t8, _mm_set1_epi16(255));
+            __m128i c1 = _mm_max_epi16(c0, _mm_set1_epi16(0));
+
+            __m128i q0 = _mm_and_si128(c1, _mm_set1_epi16(0xF8));
+            __m128i q1 = _mm_srli_epi16(c1, 5);
+            __m128i q2 = _mm_or_si128(q0, q1);
+            o0 = q2;
+
+            // ep1[2] = ptr[8] - tmp;
+            ep1[2] = _mm_sub_epi16(d3, q2);
+        }
+
+        // tmp = quant[ptr[12] + ( ( 7 * ep1[2] + 5 * ep2[3] + ep2[2] ) >> 4 )];
+        {
+            __m128i d3 = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
+
+            __m128i t0 = _mm_mullo_epi16(ep1[2], _mm_set1_epi16(7));
+            __m128i t1 = _mm_mullo_epi16(ep2[3], _mm_set1_epi16(5));
+            __m128i t3 = _mm_add_epi16(t0, t1);
+            __m128i t4 = _mm_add_epi16(t3, ep2[2]);
+            __m128i t5 = _mm_srai_epi16(t4, 4);
+            __m128i t6 = _mm_add_epi16(t5, d3);
+            __m128i t7 = _mm_add_epi16(t6, _mm_set1_epi16(4));
+
+            // clamp to 0..255
+            __m128i c0 = _mm_min_epi16(t7, _mm_set1_epi16(255));
+            __m128i c1 = _mm_max_epi16(c0, _mm_set1_epi16(0));
+
+            __m128i q0 = _mm_and_si128(c1, _mm_set1_epi16(0xF8));
+            __m128i q1 = _mm_srli_epi16(c1, 5);
+            __m128i q2 = _mm_or_si128(q0, q1);
+            o1 = q2;
+
+            // ep1[3] = ptr[12] - tmp;
+            ep1[3] = _mm_sub_epi16(d3, q2);
+        }
+
+        o2 = _mm_packus_epi16(o0, o1);
+
+        _mm_storel_epi64((__m128i*)(output0 + y * 16 + 8), _mm_shuffle_epi32(o2, _MM_SHUFFLE(2, 0, 2, 0)));
+        _mm_storel_epi64((__m128i*)(output1 + y * 16 + 8), _mm_shuffle_epi32(o2, _MM_SHUFFLE(3, 1, 3, 1)));
+
+        for (int i = 0; i < 4; ++i)
+        {
+            std::swap( ep1[i], ep2[i] );
+        }
+    }
+}
+
+// Two blocks are processed in parallel
+void Swizzle_SSE41(const uint8_t* data, const ptrdiff_t pitch, uint8_t* output0, uint8_t* output1)
+{
+    for (int i = 0; i < 4; ++i)
+    {
+        __m128i d0 = _mm_loadu_si128((const __m128i*)(data + i * pitch + 0));
+        __m128i d1 = _mm_loadu_si128((const __m128i*)(data + i * pitch + 16));
+        _mm_storeu_si128((__m128i*)(output0 + i * 16), d0);
+        _mm_storeu_si128((__m128i*)(output1 + i * 16), d1);
+    }
+}
+
+// This version uses a 5 bit quantization for each channel to allow SIMD acceleration.
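+// The in-register quantizer rounds (adds 4), clamps to 0..255, then computes
+// (c & 0xF8) | (c >> 5), which expands the kept 5 bits back to 8 bits exactly
+// like the scalar e5[] table does (e.g. 200 quantizes to 206).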
+// Two blocks are processed in parallel
+void Dither_Swizzle_SSE41(const uint8_t* data, const ptrdiff_t pitch, uint8_t* output0, uint8_t* output1)
+{
+    __m128i ep1[4];
+    __m128i ep2[4];
+
+    ep1[0] = _mm_setzero_si128();
+    ep1[1] = _mm_setzero_si128();
+    ep1[2] = _mm_setzero_si128();
+    ep1[3] = _mm_setzero_si128();
+
+    ep2[0] = _mm_setzero_si128();
+    ep2[1] = _mm_setzero_si128();
+    ep2[2] = _mm_setzero_si128();
+    ep2[3] = _mm_setzero_si128();
+
+    for( int y=0; y<4; y++ )
+    {
+        __m128i d0 = _mm_loadl_epi64((const __m128i*)(data + y * pitch + 0));
+        __m128i d1 = _mm_loadl_epi64((const __m128i*)(data + y * pitch + 16));
+        __m128i d2 = _mm_unpacklo_epi32(d0, d1);
+
+        __m128i o0;
+        __m128i o1;
+
+        // tmp = quant[ptr[0] + ( ( 3 * ep2[1] + 5 * ep2[0] ) >> 4 )];
+        {
+            __m128i d3 = _mm_cvtepu8_epi16(d2);
+
+            __m128i t0 = _mm_mullo_epi16(ep2[1], _mm_set1_epi16(3));
+            __m128i t1 = _mm_mullo_epi16(ep2[0], _mm_set1_epi16(5));
+            __m128i t2 = _mm_add_epi16(t0, t1);
+            __m128i t3 = _mm_srai_epi16(t2, 4);
+            __m128i t4 = _mm_add_epi16(t3, d3);
+            __m128i t5 = _mm_add_epi16(t4, _mm_set1_epi16(4));
+
+            // clamp to 0..255
+            __m128i c0 = _mm_min_epi16(t5, _mm_set1_epi16(255));
+            __m128i c1 = _mm_max_epi16(c0, _mm_set1_epi16(0));
+
+            __m128i q0 = _mm_and_si128(c1, _mm_set1_epi16(0xF8));
+            __m128i q1 = _mm_srli_epi16(c1, 5);
+            __m128i q2 = _mm_or_si128(q0, q1);
+            o0 = q2;
+
+            // ep1[0] = ptr[0] - tmp;
+            ep1[0] = _mm_sub_epi16(d3, q2);
+        }
+
+        // tmp = quant[ptr[4] + ( ( 7 * ep1[0] + 3 * ep2[2] + 5 * ep2[1] + ep2[0] ) >> 4 )];
+        {
+            __m128i d3 = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
+
+            __m128i t0 = _mm_mullo_epi16(ep1[0], _mm_set1_epi16(7));
+            __m128i t1 = _mm_mullo_epi16(ep2[2], _mm_set1_epi16(3));
+            __m128i t2 = _mm_mullo_epi16(ep2[1], _mm_set1_epi16(5));
+            __m128i t3 = _mm_add_epi16(t0, t1);
+            __m128i t4 = _mm_add_epi16(t2, ep2[0]);
+            __m128i t5 = _mm_add_epi16(t3, t4);
+            __m128i t6 = _mm_srai_epi16(t5, 4);
+            __m128i t7 = _mm_add_epi16(t6, d3);
+            __m128i t8 = _mm_add_epi16(t7, _mm_set1_epi16(4));
+
+            // clamp to 0..255
+            __m128i c0 = _mm_min_epi16(t8, _mm_set1_epi16(255));
+            __m128i c1 = _mm_max_epi16(c0, _mm_set1_epi16(0));
+
+            __m128i q0 = _mm_and_si128(c1, _mm_set1_epi16(0xF8));
+            __m128i q1 = _mm_srli_epi16(c1, 5);
+            __m128i q2 = _mm_or_si128(q0, q1);
+            o1 = q2;
+
+            // ep1[1] = ptr[4] - tmp;
+            ep1[1] = _mm_sub_epi16(d3, q2);
+        }
+
+        __m128i o2 = _mm_packus_epi16(o0, o1);
+
+        _mm_storel_epi64((__m128i*)(output0 + y * 16), _mm_shuffle_epi32(o2, _MM_SHUFFLE(2, 0, 2, 0)));
+        _mm_storel_epi64((__m128i*)(output1 + y * 16), _mm_shuffle_epi32(o2, _MM_SHUFFLE(3, 1, 3, 1)));
+
+        d0 = _mm_loadl_epi64((const __m128i*)(data + y * pitch + 8));
+        d1 = _mm_loadl_epi64((const __m128i*)(data + y * pitch + 24));
+        d2 = _mm_unpacklo_epi32(d0, d1);
+
+        // tmp = quant[ptr[8] + ( ( 7 * ep1[1] + 3 * ep2[3] + 5 * ep2[2] + ep2[1] ) >> 4 )];
+        {
+            __m128i d3 = _mm_cvtepu8_epi16(d2);
+
+            __m128i t0 = _mm_mullo_epi16(ep1[1], _mm_set1_epi16(7));
+            __m128i t1 = _mm_mullo_epi16(ep2[3], _mm_set1_epi16(3));
+            __m128i t2 = _mm_mullo_epi16(ep2[2], _mm_set1_epi16(5));
+            __m128i t3 = _mm_add_epi16(t0, t1);
+            __m128i t4 = _mm_add_epi16(t2, ep2[1]);
+            __m128i t5 = _mm_add_epi16(t3, t4);
+            __m128i t6 = _mm_srai_epi16(t5, 4);
+            __m128i t7 = _mm_add_epi16(t6, d3);
+            __m128i t8 = _mm_add_epi16(t7, _mm_set1_epi16(4));
+
+            // clamp to 0..255
+            __m128i c0 = _mm_min_epi16(t8, _mm_set1_epi16(255));
+            __m128i c1 = _mm_max_epi16(c0, _mm_set1_epi16(0));
+
+            __m128i q0 = _mm_and_si128(c1, _mm_set1_epi16(0xF8));
+            __m128i q1 = _mm_srli_epi16(c1, 5);
+
__m128i q2 = _mm_or_si128(q0, q1); + o0 = q2; + + // ep1[2] = ptr[8] - tmp; + ep1[2] = _mm_sub_epi16(d3, q2); + } + + // tmp = quant[ptr[12] + ( ( 7 * ep1[2] + 5 * ep2[3] + ep2[2] ) >> 4 )]; + { + __m128i d3 = _mm_unpackhi_epi8(d2, _mm_setzero_si128()); + + __m128i t0 = _mm_mullo_epi16(ep1[2], _mm_set1_epi16(7)); + __m128i t1 = _mm_mullo_epi16(ep2[3], _mm_set1_epi16(5)); + __m128i t3 = _mm_add_epi16(t0, t1); + __m128i t4 = _mm_add_epi16(t3, ep2[2]); + __m128i t5 = _mm_srai_epi16(t4, 4); + __m128i t6 = _mm_add_epi16(t5, d3); + __m128i t7 = _mm_add_epi16(t6, _mm_set1_epi16(4)); + + // clamp to 0..255 + __m128i c0 = _mm_min_epi16(t7, _mm_set1_epi16(255)); + __m128i c1 = _mm_max_epi16(c0, _mm_set1_epi16(0)); + + __m128i q0 = _mm_and_si128(c1, _mm_set1_epi16(0xF8)); + __m128i q1 = _mm_srli_epi16(c1, 5); + __m128i q2 = _mm_or_si128(q0, q1); + o1 = q2; + + // ep1[3] = ptr[12] - tmp; + ep1[3] = _mm_sub_epi16(d3, q2); + } + + o2 = _mm_packus_epi16(o0, o1); + + _mm_storel_epi64((__m128i*)(output0 + y * 16 + 8), _mm_shuffle_epi32(o2, _MM_SHUFFLE(2, 0, 2, 0))); + _mm_storel_epi64((__m128i*)(output1 + y * 16 + 8), _mm_shuffle_epi32(o2, _MM_SHUFFLE(3, 1, 3, 1))); + + for (int i = 0; i < 4; ++i) + { + std::swap( ep1[i], ep2[i] ); + } + } +} +#endif + diff --git a/thirdparty/etcpak/Dither.hpp b/thirdparty/etcpak/Dither.hpp new file mode 100644 index 000000000000..ef1a5b4839ac --- /dev/null +++ b/thirdparty/etcpak/Dither.hpp @@ -0,0 +1,18 @@ +#ifndef __DITHER_HPP__ +#define __DITHER_HPP__ + +#include +#include + +void InitDither(); +void Dither( uint8_t* data ); + +void Swizzle(const uint8_t* data, const ptrdiff_t pitch, uint8_t* output); + +#ifdef __SSE4_1__ +void Dither_SSE41(const uint8_t* data0, const uint8_t* data1, uint8_t* output0, uint8_t* output1); +void Swizzle_SSE41(const uint8_t* data, const ptrdiff_t pitch, uint8_t* output0, uint8_t* output1); +void Dither_Swizzle_SSE41(const uint8_t* data, const ptrdiff_t pitch, uint8_t* output0, uint8_t* output1); +#endif + +#endif diff --git a/thirdparty/etcpak/Error.cpp b/thirdparty/etcpak/Error.cpp new file mode 100644 index 000000000000..014ecdab6678 --- /dev/null +++ b/thirdparty/etcpak/Error.cpp @@ -0,0 +1,48 @@ +#include + +#include "Error.hpp" +#include "Math.hpp" + +float CalcMSE3( const Bitmap& bmp, const Bitmap& out ) +{ + float err = 0; + + const uint32_t* p1 = bmp.Data(); + const uint32_t* p2 = out.Data(); + size_t cnt = bmp.Size().x * bmp.Size().y; + + for( size_t i=0; i> 8 ) - ( ( c2 & 0x0000FF00 ) >> 8 ) ); + err += sq( ( ( c1 & 0x00FF0000 ) >> 16 ) - ( ( c2 & 0x00FF0000 ) >> 16 ) ); + } + + err /= cnt * 3; + + return err; +} + +float CalcMSE1( const Bitmap& bmp, const Bitmap& out ) +{ + float err = 0; + + const uint32_t* p1 = bmp.Data(); + const uint32_t* p2 = out.Data(); + size_t cnt = bmp.Size().x * bmp.Size().y; + + for( size_t i=0; i> 24 ) - ( c2 & 0xFF ) ); + } + + err /= cnt; + + return err; +} diff --git a/thirdparty/etcpak/Error.hpp b/thirdparty/etcpak/Error.hpp new file mode 100644 index 000000000000..9817754b7419 --- /dev/null +++ b/thirdparty/etcpak/Error.hpp @@ -0,0 +1,9 @@ +#ifndef __ERROR_HPP__ +#define __ERROR_HPP__ + +#include "Bitmap.hpp" + +float CalcMSE3( const Bitmap& bmp, const Bitmap& out ); +float CalcMSE1( const Bitmap& bmp, const Bitmap& out ); + +#endif diff --git a/thirdparty/etcpak/LICENSE.txt b/thirdparty/etcpak/LICENSE.txt new file mode 100644 index 000000000000..2254f9ece88d --- /dev/null +++ b/thirdparty/etcpak/LICENSE.txt @@ -0,0 +1,24 @@ +Copyright (c) 2013, Bartosz Taudul +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/thirdparty/etcpak/Math.hpp b/thirdparty/etcpak/Math.hpp new file mode 100644 index 000000000000..be9c73ae4e03 --- /dev/null +++ b/thirdparty/etcpak/Math.hpp @@ -0,0 +1,89 @@ +#ifndef __DARKRL__MATH_HPP__ +#define __DARKRL__MATH_HPP__ + +#include +#include +#include + +template +inline T AlignPOT( T val ) +{ + if( val == 0 ) return 1; + val--; + for( unsigned int i=1; i> i; + } + return val + 1; +} + +inline int CountSetBits( uint32_t val ) +{ + val -= ( val >> 1 ) & 0x55555555; + val = ( ( val >> 2 ) & 0x33333333 ) + ( val & 0x33333333 ); + val = ( ( val >> 4 ) + val ) & 0x0f0f0f0f; + val += val >> 8; + val += val >> 16; + return val & 0x0000003f; +} + +inline int CountLeadingZeros( uint32_t val ) +{ + val |= val >> 1; + val |= val >> 2; + val |= val >> 4; + val |= val >> 8; + val |= val >> 16; + return 32 - CountSetBits( val ); +} + +inline float sRGB2linear( float v ) +{ + const float a = 0.055f; + if( v <= 0.04045f ) + { + return v / 12.92f; + } + else + { + return pow( ( v + a ) / ( 1 + a ), 2.4f ); + } +} + +inline float linear2sRGB( float v ) +{ + const float a = 0.055f; + if( v <= 0.0031308f ) + { + return 12.92f * v; + } + else + { + return ( 1 + a ) * pow( v, 1/2.4f ) - a; + } +} + +template +inline T SmoothStep( T x ) +{ + return x*x*(3-2*x); +} + +inline uint8_t clampu8( int32_t val ) +{ + return std::min( std::max( 0, val ), 255 ); +} + +template +inline T sq( T val ) +{ + return val * val; +} + +static inline int mul8bit( int a, int b ) +{ + int t = a*b + 128; + return ( t + ( t >> 8 ) ) >> 8; +} + +#endif diff --git a/thirdparty/etcpak/MipMap.hpp b/thirdparty/etcpak/MipMap.hpp new file mode 100644 index 000000000000..d3b4bc9e7cf6 --- /dev/null +++ b/thirdparty/etcpak/MipMap.hpp @@ -0,0 +1,11 @@ +#ifndef __MIPMAP_HPP__ +#define __MIPMAP_HPP__ + +#include "Vector.hpp" + +inline int NumberOfMipLevels( const v2i& size ) +{ + return (int)floor( log2( std::max( size.x, size.y ) ) ) + 1; +} + +#endif diff --git a/thirdparty/etcpak/ProcessAlpha.cpp b/thirdparty/etcpak/ProcessAlpha.cpp new file mode 100644 index 000000000000..bea3e38a26c0 --- /dev/null +++ b/thirdparty/etcpak/ProcessAlpha.cpp @@ -0,0 
diff --git a/thirdparty/etcpak/ProcessAlpha.cpp b/thirdparty/etcpak/ProcessAlpha.cpp
new file mode 100644
index 000000000000..bea3e38a26c0
--- /dev/null
+++ b/thirdparty/etcpak/ProcessAlpha.cpp
@@ -0,0 +1,314 @@
+#include <limits>
+
+#include "core/typedefs.h"
+
+#include "Math.hpp"
+#include "ProcessAlpha.hpp"
+#include "Tables.hpp"
+
+// Godot
+//#ifdef __SSE4_1__
+//#  ifdef _MSC_VER
+//#    include <intrin.h>
+//#    include <Windows.h>
+//#    define _bswap(x) _byteswap_ulong(x)
+//#    define _bswap64(x) _byteswap_uint64(x)
+//#  else
+//#    include <x86intrin.h>
+//#  endif
+//#else
+//#  ifndef _MSC_VER
+//#    include <byteswap.h>
+//#    define _bswap(x) bswap_32(x)
+//#    define _bswap64(x) bswap_64(x)
+//#  endif
+//#endif
+//
+//#ifndef _bswap
+//#  define _bswap(x) __builtin_bswap32(x)
+//#  define _bswap64(x) __builtin_bswap64(x)
+//#endif
+
+#ifdef __SSE4_1__
+template<int K>
+static inline __m128i Widen( const __m128i src )
+{
+    static_assert( K >= 0 && K <= 7, "Index out of range" );
+
+    __m128i tmp;
+    switch( K )
+    {
+    case 0:
+        tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+        return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+    case 1:
+        tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+        return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+    case 2:
+        tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) );
+        return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+    case 3:
+        tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) );
+        return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+    case 4:
+        tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) );
+        return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
+    case 5:
+        tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) );
+        return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
+    case 6:
+        tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) );
+        return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
+    case 7:
+        tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) );
+        return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) );
+    }
+}
+
+static inline int GetMulSel( int sel )
+{
+    switch( sel )
+    {
+    case 0:
+        return 0;
+    case 1:
+    case 2:
+    case 3:
+        return 1;
+    case 4:
+        return 2;
+    case 5:
+    case 6:
+    case 7:
+        return 3;
+    case 8:
+    case 9:
+    case 10:
+    case 11:
+    case 12:
+    case 13:
+        return 4;
+    case 14:
+    case 15:
+        return 5;
+    }
+}
+#endif
+
+uint64_t ProcessAlpha( const uint8_t* src )
+{
+#if defined __SSE4_1__
+    // Check solid
+    __m128i s = _mm_loadu_si128( (__m128i*)src );
+    __m128i solidCmp = _mm_set1_epi8( src[0] );
+    __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp );
+    if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) )
+    {
+        return src[0];
+    }
+
+    // Calculate min, max
+    __m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) );
+    __m128i max1 = _mm_max_epu8( s, s1 );
+    __m128i min1 = _mm_min_epu8( s, s1 );
+    __m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) );
+    __m128i max2 = _mm_max_epu8( max1, smax2 );
+    __m128i min2 = _mm_min_epu8( min1, smin2 );
+    __m128i smax3 = _mm_alignr_epi8( max2, max2, 2 );
+    __m128i smin3 = _mm_alignr_epi8( min2, min2, 2 );
+    __m128i max3 = _mm_max_epu8( max2, smax3 );
+    __m128i min3 = _mm_min_epu8( min2, smin3 );
+    __m128i smax4 = _mm_alignr_epi8( max3, max3, 1 );
+    __m128i smin4 = _mm_alignr_epi8( min3, min3, 1 );
+    __m128i max = _mm_max_epu8( max3, smax4 );
+    __m128i min = _mm_min_epu8( min3, smin4 );
+    __m128i max16 = _mm_unpacklo_epi8( max, _mm_setzero_si128() );
+    __m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() );
+
+    // src range, mid
+    __m128i srcRange = _mm_sub_epi16( max16, min16 );
+    __m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 );
+    __m128i srcMid
= _mm_add_epi16( min16, srcRangeHalf ); + + // multiplier + __m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD ); + __m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) ); + + // wide multiplier + __m128i rangeMul[16] = { + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ), + 
_mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ) + }; + + // wide source + __m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + __m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) }; + + __m128i sr[16] = { + Widen<0>( s16[0] ), + Widen<1>( s16[0] ), + Widen<2>( s16[0] ), + Widen<3>( s16[0] ), + Widen<4>( s16[0] ), + Widen<5>( s16[0] ), + Widen<6>( s16[0] ), + Widen<7>( s16[0] ), + Widen<0>( s16[1] ), + Widen<1>( s16[1] ), + Widen<2>( s16[1] ), + Widen<3>( s16[1] ), + Widen<4>( s16[1] ), + Widen<5>( s16[1] ), + Widen<6>( s16[1] ), + Widen<7>( s16[1] ) + }; + + // find indices + uint8_t buf[16][16]; + int err = std::numeric_limits::max(); + int sel; + for( int r=0; r<16; r++ ) + { + __m128i recVal16 = rangeMul[r]; + + int rangeErr = 0; + for( int i=0; i<16; i++ ) + { + __m128i err1 = _mm_sub_epi16( sr[i], recVal16 ); + __m128i err = _mm_mullo_epi16( err1, err1 ); + __m128i minerr = _mm_minpos_epu16( err ); + uint64_t tmp = _mm_cvtsi128_si64( minerr ); + buf[r][i] = tmp >> 16; + rangeErr += tmp & 0xFFFF; + } + + if( rangeErr < err ) + { + err = rangeErr; + sel = r; + if( err == 0 ) break; + } + } + + uint16_t rm[8]; + _mm_storeu_si128( (__m128i*)rm, mul ); + uint16_t sm = _mm_cvtsi128_si64( srcMid ); + + uint64_t d = ( uint64_t( sm ) << 56 ) | + ( uint64_t( rm[GetMulSel( sel )] ) << 52 ) | + ( uint64_t( sel ) << 48 ); + + int offset = 45; + auto ptr = buf[sel]; + for( int i=0; i<16; i++ ) + { + d |= uint64_t( *ptr++ ) << offset; + offset -= 3; + } + + return _bswap64( d ); +#else + { + bool solid = true; + const uint8_t* ptr = src + 1; + const uint8_t ref = *src; + for( int i=1; i<16; i++ ) + { + if( ref != *ptr++ ) + { + solid = false; + break; + } + } + if( solid ) + { + return ref; + } + } + + uint8_t min = src[0]; + uint8_t max = src[0]; + for( int i=1; i<16; i++ ) + { + if( min > src[i] ) min = src[i]; + else if( max < src[i] ) max = src[i]; + } + int srcRange = max - min; + int srcMid = min + srcRange / 2; + + uint8_t buf[16][16]; + int err = std::numeric_limits::max(); + int sel; + int selmul; + for( int r=0; r<16; r++ ) + { + int mul = ( ( srcRange * g_alphaRange[r] ) >> 16 ) + 1; + + int rangeErr = 0; + for( int i=0; i<16; i++ ) + { + const auto srcVal = src[i]; + + int idx = 0; + const auto modVal = g_alpha[r][0] * mul; + const auto recVal = clampu8( srcMid + modVal ); + int localErr = sq( srcVal - recVal ); + + if( localErr != 0 ) + { + for( int j=1; j<8; j++ ) + { + const auto modVal = g_alpha[r][j] * mul; + const auto recVal = clampu8( srcMid + modVal ); + const auto errProbe = sq( srcVal - recVal ); + if( errProbe < localErr ) + { + localErr = errProbe; + idx = j; + } + } + } + + buf[r][i] = idx; + rangeErr += localErr; + } + + if( rangeErr < err ) + { + err = rangeErr; + sel = r; + selmul = mul; + if( err == 0 ) break; + } + } + + uint64_t d = ( uint64_t( srcMid ) << 56 ) | + ( uint64_t( selmul ) << 52 ) | + ( uint64_t( sel ) << 48 ); + + int offset = 45; + auto ptr = buf[sel]; + for( int i=0; i<16; i++ ) + { + d |= uint64_t( *ptr++ ) << offset; + offset -= 3; + } + + return BSWAP64( d ); +#endif +} diff --git a/thirdparty/etcpak/ProcessAlpha.hpp b/thirdparty/etcpak/ProcessAlpha.hpp new file mode 100644 index 000000000000..ef476763c679 --- /dev/null +++ b/thirdparty/etcpak/ProcessAlpha.hpp @@ -0,0 +1,8 @@ +#ifndef __PROCESSALPHA_HPP__ 
+#define __PROCESSALPHA_HPP__ + +#include + +uint64_t ProcessAlpha( const uint8_t* src ); + +#endif diff --git a/thirdparty/etcpak/ProcessAlpha_AVX2.cpp b/thirdparty/etcpak/ProcessAlpha_AVX2.cpp new file mode 100644 index 000000000000..819629fa4c97 --- /dev/null +++ b/thirdparty/etcpak/ProcessAlpha_AVX2.cpp @@ -0,0 +1,223 @@ +#ifdef __SSE4_1__ + +#include + +#include "Math.hpp" +#include "ProcessAlpha.hpp" +#include "Tables.hpp" + +#ifdef _MSC_VER +# include +# include +# define _bswap(x) _byteswap_ulong(x) +# define _bswap64(x) _byteswap_uint64(x) +# define VS_VECTORCALL _vectorcall +#else +# include +# pragma GCC push_options +# pragma GCC target ("avx2,fma,bmi2") +# define VS_VECTORCALL +#endif + +#ifndef _bswap +# define _bswap(x) __builtin_bswap32(x) +# define _bswap64(x) __builtin_bswap64(x) +#endif + +template +static inline __m128i VS_VECTORCALL Widen( const __m128i src ) +{ + static_assert( K >= 0 && K <= 7, "Index out of range" ); + + __m128i tmp; + switch( K ) + { + case 0: + tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + case 1: + tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + case 2: + tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + case 3: + tmp = _mm_shufflelo_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + case 4: + tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 0, 0, 0, 0 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + case 5: + tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 1, 1, 1, 1 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + case 6: + tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + case 7: + tmp = _mm_shufflehi_epi16( src, _MM_SHUFFLE( 3, 3, 3, 3 ) ); + return _mm_shuffle_epi32( tmp, _MM_SHUFFLE( 2, 2, 2, 2 ) ); + } +} + +static inline int VS_VECTORCALL GetMulSel( int sel ) +{ + switch( sel ) + { + case 0: + return 0; + case 1: + case 2: + case 3: + return 1; + case 4: + return 2; + case 5: + case 6: + case 7: + return 3; + case 8: + case 9: + case 10: + case 11: + case 12: + case 13: + return 4; + case 14: + case 15: + return 5; + } +} + +uint64_t ProcessAlpha_AVX2( const uint8_t* src ) +{ + // Check solid + __m128i s = _mm_loadu_si128( (__m128i*)src ); + __m128i solidCmp = _mm_broadcastb_epi8( s ); + __m128i cmpRes = _mm_cmpeq_epi8( s, solidCmp ); + if( _mm_testc_si128( cmpRes, _mm_set1_epi32( -1 ) ) ) + { + return src[0]; + } + + // Calculate min, max + __m128i s1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 2, 3, 0, 1 ) ); + __m128i max1 = _mm_max_epu8( s, s1 ); + __m128i min1 = _mm_min_epu8( s, s1 ); + __m128i smax2 = _mm_shuffle_epi32( max1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i smin2 = _mm_shuffle_epi32( min1, _MM_SHUFFLE( 0, 0, 2, 2 ) ); + __m128i max2 = _mm_max_epu8( max1, smax2 ); + __m128i min2 = _mm_min_epu8( min1, smin2 ); + __m128i smax3 = _mm_alignr_epi8( max2, max2, 2 ); + __m128i smin3 = _mm_alignr_epi8( min2, min2, 2 ); + __m128i max3 = _mm_max_epu8( max2, smax3 ); + __m128i min3 = _mm_min_epu8( min2, smin3 ); + __m128i smax4 = _mm_alignr_epi8( max3, max3, 1 ); + __m128i smin4 = _mm_alignr_epi8( min3, min3, 1 ); + __m128i max = _mm_max_epu8( max3, smax4 ); + __m128i min = _mm_min_epu8( min3, smin4 ); + __m128i max16 = _mm_unpacklo_epi8( max, 
_mm_setzero_si128() ); + __m128i min16 = _mm_unpacklo_epi8( min, _mm_setzero_si128() ); + + // src range, mid + __m128i srcRange = _mm_sub_epi16( max16, min16 ); + __m128i srcRangeHalf = _mm_srli_epi16( srcRange, 1 ); + __m128i srcMid = _mm_add_epi16( min16, srcRangeHalf ); + + // multiplier + __m128i mul1 = _mm_mulhi_epi16( srcRange, g_alphaRange_SIMD ); + __m128i mul = _mm_add_epi16( mul1, _mm_set1_epi16( 1 ) ); + + // wide multiplier + __m128i rangeMul[16] = { + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<0>( mul ), g_alpha_SIMD[0] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[1] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[2] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<1>( mul ), g_alpha_SIMD[3] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<2>( mul ), g_alpha_SIMD[4] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[5] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[6] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<3>( mul ), g_alpha_SIMD[7] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[8] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[9] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[10] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[11] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[12] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<4>( mul ), g_alpha_SIMD[13] ) ) ), _mm_setzero_si128() 
), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[14] ) ) ), _mm_setzero_si128() ), + _mm_unpacklo_epi8( _mm_packus_epi16( _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ), _mm_add_epi16( srcMid, _mm_mullo_epi16( Widen<5>( mul ), g_alpha_SIMD[15] ) ) ), _mm_setzero_si128() ) + }; + + // wide source + __m128i s16_1 = _mm_shuffle_epi32( s, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + __m128i s16[2] = { _mm_unpacklo_epi8( s, _mm_setzero_si128() ), _mm_unpacklo_epi8( s16_1, _mm_setzero_si128() ) }; + + __m128i sr[16] = { + Widen<0>( s16[0] ), + Widen<1>( s16[0] ), + Widen<2>( s16[0] ), + Widen<3>( s16[0] ), + Widen<4>( s16[0] ), + Widen<5>( s16[0] ), + Widen<6>( s16[0] ), + Widen<7>( s16[0] ), + Widen<0>( s16[1] ), + Widen<1>( s16[1] ), + Widen<2>( s16[1] ), + Widen<3>( s16[1] ), + Widen<4>( s16[1] ), + Widen<5>( s16[1] ), + Widen<6>( s16[1] ), + Widen<7>( s16[1] ) + }; + + // find indices + uint8_t buf[16][16]; + int err = std::numeric_limits::max(); + int sel; + for( int r=0; r<16; r++ ) + { + __m128i recVal16 = rangeMul[r]; + + int rangeErr = 0; + for( int i=0; i<16; i++ ) + { + __m128i err1 = _mm_sub_epi16( sr[i], recVal16 ); + __m128i err = _mm_mullo_epi16( err1, err1 ); + __m128i minerr = _mm_minpos_epu16( err ); + uint64_t tmp = _mm_cvtsi128_si64( minerr ); + buf[r][i] = tmp >> 16; + rangeErr += tmp & 0xFFFF; + } + + if( rangeErr < err ) + { + err = rangeErr; + sel = r; + if( err == 0 ) break; + } + } + + uint16_t rm[8]; + _mm_storeu_si128( (__m128i*)rm, mul ); + uint16_t sm = _mm_cvtsi128_si64( srcMid ); + + uint64_t d = ( uint64_t( sm ) << 56 ) | + ( uint64_t( rm[GetMulSel( sel )] ) << 52 ) | + ( uint64_t( sel ) << 48 ); + + int offset = 45; + auto ptr = buf[sel]; + for( int i=0; i<16; i++ ) + { + d |= uint64_t( *ptr++ ) << offset; + offset -= 3; + } + + return _bswap64( d ); +} + +#ifndef _MSC_VER +# pragma GCC pop_options +#endif + +#endif diff --git a/thirdparty/etcpak/ProcessAlpha_AVX2.hpp b/thirdparty/etcpak/ProcessAlpha_AVX2.hpp new file mode 100644 index 000000000000..f024f06c2ba8 --- /dev/null +++ b/thirdparty/etcpak/ProcessAlpha_AVX2.hpp @@ -0,0 +1,12 @@ +#ifndef __PROCESSALPHA_AVX2_HPP__ +#define __PROCESSALPHA_AVX2_HPP__ + +#ifdef __SSE4_1__ + +#include + +uint64_t ProcessAlpha_AVX2( const uint8_t* src ); + +#endif + +#endif diff --git a/thirdparty/etcpak/ProcessCommon.hpp b/thirdparty/etcpak/ProcessCommon.hpp new file mode 100644 index 000000000000..657d68888f97 --- /dev/null +++ b/thirdparty/etcpak/ProcessCommon.hpp @@ -0,0 +1,50 @@ +#ifndef __PROCESSCOMMON_HPP__ +#define __PROCESSCOMMON_HPP__ + +#include +#include +#include + +template +static size_t GetLeastError( const T* err, size_t num ) +{ + size_t idx = 0; + for( size_t i=1; i> 24 ) | + ( ( d & 0x000000FF00000000 ) << 24 ) | + ( ( d & 0x00FF000000000000 ) >> 8 ) | + ( ( d & 0x0000FF0000000000 ) << 8 ); +} + +template +static uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id ) +{ + size_t tidx[2]; + tidx[0] = GetLeastError( terr[0], 8 ); + tidx[1] = GetLeastError( terr[1], 8 ); + + d |= tidx[0] << 26; + d |= tidx[1] << 29; + for( int i=0; i<16; i++ ) + { + uint64_t t = tsel[i][tidx[id[i]%2]]; + d |= ( t & 0x1 ) << ( i + 32 ); + d |= ( t & 0x2 ) << ( i + 47 ); + } + + return d; +} + +#endif diff --git a/thirdparty/etcpak/ProcessRGB.cpp b/thirdparty/etcpak/ProcessRGB.cpp new file mode 100644 index 
000000000000..344153df37a5 --- /dev/null +++ b/thirdparty/etcpak/ProcessRGB.cpp @@ -0,0 +1,723 @@ +#include +#include + +// Godot +#include "core/typedefs.h" + +#include "Math.hpp" +#include "ProcessCommon.hpp" +#include "ProcessRGB.hpp" +#include "Tables.hpp" +#include "Vector.hpp" +// Godot +//#ifdef __SSE4_1__ +//# ifdef _MSC_VER +//# include +//# include +//# define _bswap(x) _byteswap_ulong(x) +//# else +//# include +//# endif +//#else +//# ifndef _MSC_VER +//# include +//# define _bswap(x) bswap_32(x) +//# endif +//#endif + +//#ifndef _bswap +//# define _bswap(x) __builtin_bswap32(x) +//#endif + +namespace +{ + +typedef std::array v4i; + +void Average( const uint8_t* data, v4i* a ) +{ +#ifdef __SSE4_1__ + __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); + + __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128()); + __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128()); + __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128()); + __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128()); + __m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128()); + __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128()); + __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128()); + __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128()); + + __m128i sum0 = _mm_add_epi16(d0l, d1l); + __m128i sum1 = _mm_add_epi16(d0h, d1h); + __m128i sum2 = _mm_add_epi16(d2l, d3l); + __m128i sum3 = _mm_add_epi16(d2h, d3h); + + __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); + __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); + __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); + __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); + __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); + __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); + __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); + __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); + + __m128i b0 = _mm_add_epi32(sum0l, sum0h); + __m128i b1 = _mm_add_epi32(sum1l, sum1h); + __m128i b2 = _mm_add_epi32(sum2l, sum2h); + __m128i b3 = _mm_add_epi32(sum3l, sum3h); + + __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3); + __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3); + __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3); + __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3); + + _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2)))); + _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2)))); +#else + uint32_t r[4]; + uint32_t g[4]; + uint32_t b[4]; + + memset(r, 0, sizeof(r)); + memset(g, 0, sizeof(g)); + memset(b, 0, sizeof(b)); + + for( int j=0; j<4; j++ ) + { + for( int i=0; i<4; i++ ) + { + int index = (j & 2) + (i >> 1); + b[index] += *data++; + g[index] += *data++; + r[index] += *data++; + data++; + } + } + + a[0] = v4i{ uint16_t( (r[2] + r[3] + 4) / 8 ), uint16_t( (g[2] + g[3] + 4) / 8 ), uint16_t( (b[2] + b[3] + 4) / 8 ), 0}; + a[1] = v4i{ uint16_t( (r[0] + r[1] + 4) / 8 ), uint16_t( (g[0] + g[1] + 4) / 8 ), uint16_t( (b[0] + b[1] + 4) / 8 ), 0}; + a[2] = v4i{ uint16_t( 
(r[1] + r[3] + 4) / 8 ), uint16_t( (g[1] + g[3] + 4) / 8 ), uint16_t( (b[1] + b[3] + 4) / 8 ), 0}; + a[3] = v4i{ uint16_t( (r[0] + r[2] + 4) / 8 ), uint16_t( (g[0] + g[2] + 4) / 8 ), uint16_t( (b[0] + b[2] + 4) / 8 ), 0}; +#endif +} + +void CalcErrorBlock( const uint8_t* data, unsigned int err[4][4] ) +{ +#ifdef __SSE4_1__ + __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); + + __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); + + __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128()); + __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128()); + __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128()); + __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128()); + __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128()); + __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128()); + __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128()); + __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128()); + + __m128i sum0 = _mm_add_epi16(d0l, d1l); + __m128i sum1 = _mm_add_epi16(d0h, d1h); + __m128i sum2 = _mm_add_epi16(d2l, d3l); + __m128i sum3 = _mm_add_epi16(d2h, d3h); + + __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128()); + __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128()); + __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128()); + __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128()); + __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128()); + __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128()); + __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128()); + __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128()); + + __m128i b0 = _mm_add_epi32(sum0l, sum0h); + __m128i b1 = _mm_add_epi32(sum1l, sum1h); + __m128i b2 = _mm_add_epi32(sum2l, sum2h); + __m128i b3 = _mm_add_epi32(sum3l, sum3h); + + __m128i a0 = _mm_add_epi32(b2, b3); + __m128i a1 = _mm_add_epi32(b0, b1); + __m128i a2 = _mm_add_epi32(b1, b3); + __m128i a3 = _mm_add_epi32(b0, b2); + + _mm_storeu_si128((__m128i*)&err[0], a0); + _mm_storeu_si128((__m128i*)&err[1], a1); + _mm_storeu_si128((__m128i*)&err[2], a2); + _mm_storeu_si128((__m128i*)&err[3], a3); +#else + unsigned int terr[4][4]; + + memset(terr, 0, 16 * sizeof(unsigned int)); + + for( int j=0; j<4; j++ ) + { + for( int i=0; i<4; i++ ) + { + int index = (j & 2) + (i >> 1); + unsigned int d = *data++; + terr[index][0] += d; + d = *data++; + terr[index][1] += d; + d = *data++; + terr[index][2] += d; + data++; + } + } + + for( int i=0; i<3; i++ ) + { + err[0][i] = terr[2][i] + terr[3][i]; + err[1][i] = terr[0][i] + terr[1][i]; + err[2][i] = terr[1][i] + terr[3][i]; + err[3][i] = terr[0][i] + terr[2][i]; + } + for( int i=0; i<4; i++ ) + { + err[i][3] = 0; + } +#endif +} + +unsigned int CalcError( const unsigned int block[4], const v4i& average ) +{ + unsigned int err = 0x3FFFFFFF; // Big value to prevent negative values, but small enough to prevent overflow + err -= block[0] * 2 * average[2]; + err -= block[1] * 2 * average[1]; + err -= block[2] * 2 * average[0]; + err += 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); + return err; +} + +void ProcessAverages( v4i* a ) +{ +#ifdef __SSE4_1__ + for( int i=0; i<2; i++ ) + { + __m128i d = 
_mm_loadu_si128((__m128i*)a[i*2].data()); + + __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128)); + + __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8); + + __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); + __m128i diff = _mm_sub_epi16(c, c1); + diff = _mm_max_epi16(diff, _mm_set1_epi16(-4)); + diff = _mm_min_epi16(diff, _mm_set1_epi16(3)); + + __m128i co = _mm_add_epi16(c1, diff); + + c = _mm_blend_epi16(co, c, 0xF0); + + __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2)); + + _mm_storeu_si128((__m128i*)a[4+i*2].data(), a0); + } + + for( int i=0; i<2; i++ ) + { + __m128i d = _mm_loadu_si128((__m128i*)a[i*2].data()); + + __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128)); + __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8); + + __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4)); + + _mm_storeu_si128((__m128i*)a[i*2].data(), t2); + } +#else + for( int i=0; i<2; i++ ) + { + for( int j=0; j<3; j++ ) + { + int32_t c1 = mul8bit( a[i*2+1][j], 31 ); + int32_t c2 = mul8bit( a[i*2][j], 31 ); + + int32_t diff = c2 - c1; + if( diff > 3 ) diff = 3; + else if( diff < -4 ) diff = -4; + + int32_t co = c1 + diff; + + a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 ); + a[4+i*2][j] = ( co << 3 ) | ( co >> 2 ); + } + } + + for( int i=0; i<4; i++ ) + { + a[i][0] = g_avg2[mul8bit( a[i][0], 15 )]; + a[i][1] = g_avg2[mul8bit( a[i][1], 15 )]; + a[i][2] = g_avg2[mul8bit( a[i][2], 15 )]; + } +#endif +} + +void EncodeAverages( uint64_t& _d, const v4i* a, size_t idx ) +{ + auto d = _d; + d |= ( idx << 24 ); + size_t base = idx << 1; + + if( ( idx & 0x2 ) == 0 ) + { + for( int i=0; i<3; i++ ) + { + d |= uint64_t( a[base+0][i] >> 4 ) << ( i*8 ); + d |= uint64_t( a[base+1][i] >> 4 ) << ( i*8 + 4 ); + } + } + else + { + for( int i=0; i<3; i++ ) + { + d |= uint64_t( a[base+1][i] & 0xF8 ) << ( i*8 ); + int32_t c = ( ( a[base+0][i] & 0xF8 ) - ( a[base+1][i] & 0xF8 ) ) >> 3; + c &= ~0xFFFFFFF8; + d |= ((uint64_t)c) << ( i*8 ); + } + } + _d = d; +} + +uint64_t CheckSolid( const uint8_t* src ) +{ +#ifdef __SSE4_1__ + __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3); + + __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0)); + + __m128i c0 = _mm_cmpeq_epi8(d0, c); + __m128i c1 = _mm_cmpeq_epi8(d1, c); + __m128i c2 = _mm_cmpeq_epi8(d2, c); + __m128i c3 = _mm_cmpeq_epi8(d3, c); + + __m128i m0 = _mm_and_si128(c0, c1); + __m128i m1 = _mm_and_si128(c2, c3); + __m128i m = _mm_and_si128(m0, m1); + + if (!_mm_testc_si128(m, _mm_set1_epi32(-1))) + { + return 0; + } +#else + const uint8_t* ptr = src + 4; + for( int i=1; i<16; i++ ) + { + if( memcmp( src, ptr, 4 ) != 0 ) + { + return 0; + } + ptr += 4; + } +#endif + return 0x02000000 | + ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | + ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | + ( (unsigned int)( src[2] & 0xF8 ) ); +} + +void PrepareAverages( v4i a[8], const uint8_t* src, unsigned int err[4] ) +{ + Average( src, a ); + ProcessAverages( a ); + + unsigned int errblock[4][4]; + CalcErrorBlock( src, errblock ); + + for( int i=0; i<4; i++ ) + { + err[i/2] += CalcError( errblock[i], a[i] ); + err[2+i/2] += CalcError( errblock[i], a[i+4] ); + } +} + +void FindBestFit( uint64_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) +{ + for( size_t i=0; i<16; i++ ) + { 
+ uint16_t* sel = tsel[i]; + unsigned int bid = id[i]; + uint64_t* ter = terr[bid%2]; + + uint8_t b = *data++; + uint8_t g = *data++; + uint8_t r = *data++; + data++; + + int dr = a[bid][0] - r; + int dg = a[bid][1] - g; + int db = a[bid][2] - b; + +#ifdef __SSE4_1__ + // Reference implementation + + __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28); + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0])); + __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1])); + __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0])); + __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1])); + + __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); + __m128i minError0 = _mm_min_epi32(error0, error1); + + __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); + __m128i minError1 = _mm_min_epi32(error2, error3); + + __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); + __m128i minError = _mm_min_epi32(minError0, minError1); + + // Squaring the minimum error to produce correct values when adding + __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); + squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); + _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); + __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); + __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); + squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); + _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. 
+ error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2])); + error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3])); + error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2])); + error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3])); + + index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1)); + minError0 = _mm_min_epi32(error0, error1); + + index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2)); + minError1 = _mm_min_epi32(error2, error3); + + __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0)); + minError = _mm_min_epi32(minError0, minError1); + + // Squaring the minimum error to produce correct values when adding + minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0)); + squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow); + squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2)); + _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow); + minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2)); + squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh); + squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3)); + _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh); + __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1); + _mm_storeu_si128((__m128i*)sel, minIndex); +#else + int pix = dr * 77 + dg * 151 + db * 28; + + for( int t=0; t<8; t++ ) + { + const int64_t* tab = g_table256[t]; + unsigned int idx = 0; + uint64_t err = sq( tab[0] + pix ); + for( int j=1; j<4; j++ ) + { + uint64_t local = sq( tab[j] + pix ); + if( local < err ) + { + err = local; + idx = j; + } + } + *sel++ = idx; + *ter++ += err; + } +#endif + } +} + +#ifdef __SSE4_1__ +// Non-reference implementation, but faster. Produces same results as the AVX2 version +void FindBestFit( uint32_t terr[2][8], uint16_t tsel[16][8], v4i a[8], const uint32_t* id, const uint8_t* data ) +{ + for( size_t i=0; i<16; i++ ) + { + uint16_t* sel = tsel[i]; + unsigned int bid = id[i]; + uint32_t* ter = terr[bid%2]; + + uint8_t b = *data++; + uint8_t g = *data++; + uint8_t r = *data++; + data++; + + int dr = a[bid][0] - r; + int dg = a[bid][1] - g; + int db = a[bid][2] - b; + + // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 + // This produces slightly different results, but is significant faster + __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14); + __m128i pix = _mm_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. 
+ __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0])); + __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1])); + + __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1)); + __m128i minError = _mm_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + // This produces slightly different results, but is needed to produce same results as AVX2 implementation + __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1)); + __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit)); + + // Squaring the minimum error to produce correct values when adding + __m128i squareErrorLo = _mm_mullo_epi16(minError, minError); + __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError); + + __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi); + __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi); + + squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0)); + _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow); + squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1)); + _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh); + + _mm_storeu_si128((__m128i*)sel, minIndex); + } +} +#endif + +uint8_t convert6(float f) +{ + int i = (std::min(std::max(static_cast(f), 0), 1023) - 15) >> 1; + return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3; +} + +uint8_t convert7(float f) +{ + int i = (std::min(std::max(static_cast(f), 0), 1023) - 15) >> 1; + return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2; +} + +std::pair Planar(const uint8_t* src) +{ + int32_t r = 0; + int32_t g = 0; + int32_t b = 0; + + for (int i = 0; i < 16; ++i) + { + b += src[i * 4 + 0]; + g += src[i * 4 + 1]; + r += src[i * 4 + 2]; + } + + int32_t difRyz = 0; + int32_t difGyz = 0; + int32_t difByz = 0; + int32_t difRxz = 0; + int32_t difGxz = 0; + int32_t difBxz = 0; + + const int32_t scaling[] = { -255, -85, 85, 255 }; + + for (int i = 0; i < 16; ++i) + { + int32_t difB = (static_cast(src[i * 4 + 0]) << 4) - b; + int32_t difG = (static_cast(src[i * 4 + 1]) << 4) - g; + int32_t difR = (static_cast(src[i * 4 + 2]) << 4) - r; + + difRyz += difR * scaling[i % 4]; + difGyz += difG * scaling[i % 4]; + difByz += difB * scaling[i % 4]; + + difRxz += difR * scaling[i / 4]; + difGxz += difG * scaling[i / 4]; + difBxz += difB * scaling[i / 4]; + } + + const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f); + + float aR = difRxz * scale; + float aG = difGxz * scale; + float aB = difBxz * scale; + + float bR = difRyz * scale; + float bG = difGyz * scale; + float bB = difByz * scale; + + float dR = r * (4.0f / 16.0f); + float dG = g * (4.0f / 16.0f); + float dB = b * (4.0f / 16.0f); + + // calculating the three colors RGBO, RGBH, and RGBV. 
RGB = df - af * x - bf * y; + float cofR = std::fma(aR, 255.0f, std::fma(bR, 255.0f, dR)); + float cofG = std::fma(aG, 255.0f, std::fma(bG, 255.0f, dG)); + float cofB = std::fma(aB, 255.0f, std::fma(bB, 255.0f, dB)); + float chfR = std::fma(aR, -425.0f, std::fma(bR, 255.0f, dR)); + float chfG = std::fma(aG, -425.0f, std::fma(bG, 255.0f, dG)); + float chfB = std::fma(aB, -425.0f, std::fma(bB, 255.0f, dB)); + float cvfR = std::fma(aR, 255.0f, std::fma(bR, -425.0f, dR)); + float cvfG = std::fma(aG, 255.0f, std::fma(bG, -425.0f, dG)); + float cvfB = std::fma(aB, 255.0f, std::fma(bB, -425.0f, dB)); + + // convert to r6g7b6 + int32_t coR = convert6(cofR); + int32_t coG = convert7(cofG); + int32_t coB = convert6(cofB); + int32_t chR = convert6(chfR); + int32_t chG = convert7(chfG); + int32_t chB = convert6(chfB); + int32_t cvR = convert6(cvfR); + int32_t cvG = convert7(cvfG); + int32_t cvB = convert6(cvfB); + + // Error calculation + auto ro0 = coR; + auto go0 = coG; + auto bo0 = coB; + auto ro1 = (ro0 >> 4) | (ro0 << 2); + auto go1 = (go0 >> 6) | (go0 << 1); + auto bo1 = (bo0 >> 4) | (bo0 << 2); + auto ro2 = (ro1 << 2) + 2; + auto go2 = (go1 << 2) + 2; + auto bo2 = (bo1 << 2) + 2; + + auto rh0 = chR; + auto gh0 = chG; + auto bh0 = chB; + auto rh1 = (rh0 >> 4) | (rh0 << 2); + auto gh1 = (gh0 >> 6) | (gh0 << 1); + auto bh1 = (bh0 >> 4) | (bh0 << 2); + + auto rh2 = rh1 - ro1; + auto gh2 = gh1 - go1; + auto bh2 = bh1 - bo1; + + auto rv0 = cvR; + auto gv0 = cvG; + auto bv0 = cvB; + auto rv1 = (rv0 >> 4) | (rv0 << 2); + auto gv1 = (gv0 >> 6) | (gv0 << 1); + auto bv1 = (bv0 >> 4) | (bv0 << 2); + + auto rv2 = rv1 - ro1; + auto gv2 = gv1 - go1; + auto bv2 = bv1 - bo1; + + uint64_t error = 0; + + for (int i = 0; i < 16; ++i) + { + int32_t cR = clampu8((rh2 * (i / 4) + rv2 * (i % 4) + ro2) >> 2); + int32_t cG = clampu8((gh2 * (i / 4) + gv2 * (i % 4) + go2) >> 2); + int32_t cB = clampu8((bh2 * (i / 4) + bv2 * (i % 4) + bo2) >> 2); + + int32_t difB = static_cast(src[i * 4 + 0]) - cB; + int32_t difG = static_cast(src[i * 4 + 1]) - cG; + int32_t difR = static_cast(src[i * 4 + 2]) - cR; + + int32_t dif = difR * 38 + difG * 76 + difB * 14; + + error += dif * dif; + } + + /**/ + uint32_t rgbv = cvB | (cvG << 6) | (cvR << 13); + uint32_t rgbh = chB | (chG << 6) | (chR << 13); + uint32_t hi = rgbv | ((rgbh & 0x1FFF) << 19); + uint32_t lo = (chR & 0x1) | 0x2 | ((chR << 1) & 0x7C); + lo |= ((coB & 0x07) << 7) | ((coB & 0x18) << 8) | ((coB & 0x20) << 11); + lo |= ((coG & 0x3F) << 17) | ((coG & 0x40) << 18); + lo |= coR << 25; + + const auto idx = (coR & 0x20) | ((coG & 0x20) >> 1) | ((coB & 0x1E) >> 1); + + lo |= g_flags[idx]; + + uint64_t result = static_cast(BSWAP32(lo)); + result |= static_cast(static_cast(BSWAP32(hi))) << 32; + + return std::make_pair(result, error); +} + +template +uint64_t EncodeSelectors( uint64_t d, const T terr[2][8], const S tsel[16][8], const uint32_t* id, const uint64_t value, const uint64_t error) +{ + size_t tidx[2]; + tidx[0] = GetLeastError( terr[0], 8 ); + tidx[1] = GetLeastError( terr[1], 8 ); + + if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error) + { + return value; + } + + d |= tidx[0] << 26; + d |= tidx[1] << 29; + for( int i=0; i<16; i++ ) + { + uint64_t t = tsel[i][tidx[id[i]%2]]; + d |= ( t & 0x1 ) << ( i + 32 ); + d |= ( t & 0x2 ) << ( i + 47 ); + } + + return FixByteOrder(d); +} +} + +uint64_t ProcessRGB( const uint8_t* src ) +{ + uint64_t d = CheckSolid( src ); + if( d != 0 ) return d; + + v4i a[8]; + unsigned int err[4] = {}; + PrepareAverages( a, src, err ); + size_t idx 
= GetLeastError( err, 4 ); + EncodeAverages( d, a, idx ); + +#if defined __SSE4_1__ && !defined REFERENCE_IMPLEMENTATION + uint32_t terr[2][8] = {}; +#else + uint64_t terr[2][8] = {}; +#endif + uint16_t tsel[16][8]; + auto id = g_id[idx]; + FindBestFit( terr, tsel, a, id, src ); + + return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) ); +} + +uint64_t ProcessRGB_ETC2( const uint8_t* src ) +{ + auto result = Planar( src ); + + uint64_t d = 0; + + v4i a[8]; + unsigned int err[4] = {}; + PrepareAverages( a, src, err ); + size_t idx = GetLeastError( err, 4 ); + EncodeAverages( d, a, idx ); + +#if defined __SSE4_1__ && !defined REFERENCE_IMPLEMENTATION + uint32_t terr[2][8] = {}; +#else + uint64_t terr[2][8] = {}; +#endif + uint16_t tsel[16][8]; + auto id = g_id[idx]; + FindBestFit( terr, tsel, a, id, src ); + + return EncodeSelectors( d, terr, tsel, id, result.first, result.second ); +} + diff --git a/thirdparty/etcpak/ProcessRGB.hpp b/thirdparty/etcpak/ProcessRGB.hpp new file mode 100644 index 000000000000..f4cc4f8995df --- /dev/null +++ b/thirdparty/etcpak/ProcessRGB.hpp @@ -0,0 +1,9 @@ +#ifndef __PROCESSRGB_HPP__ +#define __PROCESSRGB_HPP__ + +#include + +uint64_t ProcessRGB( const uint8_t* src ); +uint64_t ProcessRGB_ETC2( const uint8_t* src ); + +#endif diff --git a/thirdparty/etcpak/ProcessRGB_AVX2.cpp b/thirdparty/etcpak/ProcessRGB_AVX2.cpp new file mode 100644 index 000000000000..dbe5f96e5e20 --- /dev/null +++ b/thirdparty/etcpak/ProcessRGB_AVX2.cpp @@ -0,0 +1,978 @@ +#ifdef __SSE4_1__ + +#include +#include + +#include "Math.hpp" +#include "ProcessCommon.hpp" +#include "ProcessRGB_AVX2.hpp" +#include "Tables.hpp" +#include "Vector.hpp" +#ifdef _MSC_VER +# include +# include +# define _bswap(x) _byteswap_ulong(x) +# define VS_VECTORCALL _vectorcall +#else +# include +# pragma GCC push_options +# pragma GCC target ("avx2,fma,bmi2") +# define VS_VECTORCALL +#endif + +#ifndef _bswap +# define _bswap(x) __builtin_bswap32(x) +#endif + +namespace +{ + +#ifdef _MSC_VER + inline unsigned long _bit_scan_forward( unsigned long mask ) + { + unsigned long ret; + _BitScanForward( &ret, mask ); + return ret; + } +#endif + +typedef std::array v4i; + +__m256i VS_VECTORCALL Sum4_AVX2( const uint8_t* data) noexcept +{ + __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0); + __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1); + __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2); + __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3); + + __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF)); + __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF)); + + __m256i t0 = _mm256_cvtepu8_epi16(dm0); + __m256i t1 = _mm256_cvtepu8_epi16(dm1); + __m256i t2 = _mm256_cvtepu8_epi16(dm2); + __m256i t3 = _mm256_cvtepu8_epi16(dm3); + + __m256i sum0 = _mm256_add_epi16(t0, t1); + __m256i sum1 = _mm256_add_epi16(t2, t3); + + __m256i s0 = _mm256_permute2x128_si256(sum0, sum1, (0) | (3 << 4)); // 0, 0, 3, 3 + __m256i s1 = _mm256_permute2x128_si256(sum0, sum1, (1) | (2 << 4)); // 1, 1, 2, 2 + + __m256i s2 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(1, 3, 0, 2)); + __m256i s3 = _mm256_permute4x64_epi64(s0, _MM_SHUFFLE(0, 2, 1, 3)); + __m256i s4 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(3, 1, 0, 2)); + __m256i s5 = _mm256_permute4x64_epi64(s1, _MM_SHUFFLE(2, 0, 1, 3)); + + __m256i sum5 = _mm256_add_epi16(s2, s3); // 3, 0, 3, 0 + __m256i sum6 = _mm256_add_epi16(s4, s5); // 2, 1, 1, 2 + return 
_mm256_add_epi16(sum5, sum6); // 3+2, 0+1, 3+1, 3+2 +} + +__m256i VS_VECTORCALL Average_AVX2( const __m256i data) noexcept +{ + __m256i a = _mm256_add_epi16(data, _mm256_set1_epi16(4)); + + return _mm256_srli_epi16(a, 3); +} + +__m128i VS_VECTORCALL CalcErrorBlock_AVX2( const __m256i data, const v4i a[8]) noexcept +{ + // + __m256i a0 = _mm256_load_si256((__m256i*)a[0].data()); + __m256i a1 = _mm256_load_si256((__m256i*)a[4].data()); + + // err = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) ); + __m256i a4 = _mm256_madd_epi16(a0, a0); + __m256i a5 = _mm256_madd_epi16(a1, a1); + + __m256i a6 = _mm256_hadd_epi32(a4, a5); + __m256i a7 = _mm256_slli_epi32(a6, 3); + + __m256i a8 = _mm256_add_epi32(a7, _mm256_set1_epi32(0x3FFFFFFF)); // Big value to prevent negative values, but small enough to prevent overflow + + // average is not swapped + // err -= block[0] * 2 * average[0]; + // err -= block[1] * 2 * average[1]; + // err -= block[2] * 2 * average[2]; + __m256i a2 = _mm256_slli_epi16(a0, 1); + __m256i a3 = _mm256_slli_epi16(a1, 1); + __m256i b0 = _mm256_madd_epi16(a2, data); + __m256i b1 = _mm256_madd_epi16(a3, data); + + __m256i b2 = _mm256_hadd_epi32(b0, b1); + __m256i b3 = _mm256_sub_epi32(a8, b2); + __m256i b4 = _mm256_hadd_epi32(b3, b3); + + __m256i b5 = _mm256_permutevar8x32_epi32(b4, _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0)); + + return _mm256_castsi256_si128(b5); +} + +void VS_VECTORCALL ProcessAverages_AVX2(const __m256i d, v4i a[8] ) noexcept +{ + __m256i t = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(31)), _mm256_set1_epi16(128)); + + __m256i c = _mm256_srli_epi16(_mm256_add_epi16(t, _mm256_srli_epi16(t, 8)), 8); + + __m256i c1 = _mm256_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2)); + __m256i diff = _mm256_sub_epi16(c, c1); + diff = _mm256_max_epi16(diff, _mm256_set1_epi16(-4)); + diff = _mm256_min_epi16(diff, _mm256_set1_epi16(3)); + + __m256i co = _mm256_add_epi16(c1, diff); + + c = _mm256_blend_epi16(co, c, 0xF0); + + __m256i a0 = _mm256_or_si256(_mm256_slli_epi16(c, 3), _mm256_srli_epi16(c, 2)); + + _mm256_store_si256((__m256i*)a[4].data(), a0); + + __m256i t0 = _mm256_add_epi16(_mm256_mullo_epi16(d, _mm256_set1_epi16(15)), _mm256_set1_epi16(128)); + __m256i t1 = _mm256_srli_epi16(_mm256_add_epi16(t0, _mm256_srli_epi16(t0, 8)), 8); + + __m256i t2 = _mm256_or_si256(t1, _mm256_slli_epi16(t1, 4)); + + _mm256_store_si256((__m256i*)a[0].data(), t2); +} + +uint64_t VS_VECTORCALL EncodeAverages_AVX2( const v4i a[8], size_t idx ) noexcept +{ + uint64_t d = ( idx << 24 ); + size_t base = idx << 1; + + __m128i a0 = _mm_load_si128((const __m128i*)a[base].data()); + + __m128i r0, r1; + + if( ( idx & 0x2 ) == 0 ) + { + r0 = _mm_srli_epi16(a0, 4); + + __m128i a1 = _mm_unpackhi_epi64(r0, r0); + r1 = _mm_slli_epi16(a1, 4); + } + else + { + __m128i a1 = _mm_and_si128(a0, _mm_set1_epi16(-8)); + + r0 = _mm_unpackhi_epi64(a1, a1); + __m128i a2 = _mm_sub_epi16(a1, r0); + __m128i a3 = _mm_srai_epi16(a2, 3); + r1 = _mm_and_si128(a3, _mm_set1_epi16(0x07)); + } + + __m128i r2 = _mm_or_si128(r0, r1); + // do missing swap for average values + __m128i r3 = _mm_shufflelo_epi16(r2, _MM_SHUFFLE(3, 0, 1, 2)); + __m128i r4 = _mm_packus_epi16(r3, _mm_setzero_si128()); + d |= _mm_cvtsi128_si32(r4); + + return d; +} + +uint64_t VS_VECTORCALL CheckSolid_AVX2( const uint8_t* src ) noexcept +{ + __m256i d0 = _mm256_loadu_si256(((__m256i*)src) + 0); + __m256i d1 = _mm256_loadu_si256(((__m256i*)src) + 1); + + __m256i c = _mm256_broadcastd_epi32(_mm256_castsi256_si128(d0)); + + __m256i c0 = 
_mm256_cmpeq_epi8(d0, c); + __m256i c1 = _mm256_cmpeq_epi8(d1, c); + + __m256i m = _mm256_and_si256(c0, c1); + + if (!_mm256_testc_si256(m, _mm256_set1_epi32(-1))) + { + return 0; + } + + return 0x02000000 | + ( (unsigned int)( src[0] & 0xF8 ) << 16 ) | + ( (unsigned int)( src[1] & 0xF8 ) << 8 ) | + ( (unsigned int)( src[2] & 0xF8 ) ); +} + +__m128i VS_VECTORCALL PrepareAverages_AVX2( v4i a[8], const uint8_t* src) noexcept +{ + __m256i sum4 = Sum4_AVX2( src ); + + ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); + + return CalcErrorBlock_AVX2( sum4, a); +} + +__m128i VS_VECTORCALL PrepareAverages_AVX2( v4i a[8], const __m256i sum4) noexcept +{ + ProcessAverages_AVX2(Average_AVX2( sum4 ), a ); + + return CalcErrorBlock_AVX2( sum4, a); +} + +void VS_VECTORCALL FindBestFit_4x2_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept +{ + __m256i sel0 = _mm256_setzero_si256(); + __m256i sel1 = _mm256_setzero_si256(); + + for (unsigned int j = 0; j < 2; ++j) + { + unsigned int bid = offset + 1 - j; + + __m256i squareErrorSum = _mm256_setzero_si256(); + + __m128i a0 = _mm_loadl_epi64((const __m128i*)a[bid].data()); + __m256i a1 = _mm256_broadcastq_epi64(a0); + + // Processing one full row each iteration + for (size_t i = 0; i < 8; i += 4) + { + __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); + + __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); + __m256i d = _mm256_sub_epi16(a1, rgb16); + + // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 + // This produces slightly different results, but is significant faster + __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); + __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); + __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); + __m128i pixel3 = _mm256_castsi256_si128(pixel2); + + __m128i pix0 = _mm_broadcastw_epi16(pixel3); + __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); + __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + + // Processing first two pixels of the row + { + __m256i pix = _mm256_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. 
+ __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); + __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); + + __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); + __m256i minError = _mm256_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + // This produces slightly different results, but is significant faster + __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); + + // Interleaving values so madd instruction can be used + __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); + // Squaring the minimum error to produce correct values when adding + __m256i squareError = _mm256_madd_epi16(minError2, minError2); + + squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); + + // Packing selector bits + __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); + __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); + + sel0 = _mm256_or_si256(sel0, minIndexLo2); + sel1 = _mm256_or_si256(sel1, minIndexHi2); + } + + pixel3 = _mm256_extracti128_si256(pixel2, 1); + pix0 = _mm_broadcastw_epi16(pixel3); + pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); + pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + + // Processing second two pixels of the row + { + __m256i pix = _mm256_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. 
+ __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); + __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); + + __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); + __m256i minError = _mm256_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); + + // Interleaving values so madd instruction can be used + __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); + // Squaring the minimum error to produce correct values when adding + __m256i squareError = _mm256_madd_epi16(minError2, minError2); + + squareErrorSum = _mm256_add_epi32(squareErrorSum, squareError); + + // Packing selector bits + __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i + j * 8)); + __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i + j * 8)); + __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); + __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); + + sel0 = _mm256_or_si256(sel0, minIndexLo3); + sel1 = _mm256_or_si256(sel1, minIndexHi3); + } + } + + data += 8 * 4; + + _mm256_store_si256((__m256i*)terr[1 - j], squareErrorSum); + } + + // Interleave selector bits + __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); + __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); + + __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); + __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); + + __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); + + __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); + + _mm256_store_si256((__m256i*)tsel, sel); +} + +void VS_VECTORCALL FindBestFit_2x4_AVX2( uint32_t terr[2][8], uint32_t tsel[8], v4i a[8], const uint32_t offset, const uint8_t* data) noexcept +{ + __m256i sel0 = _mm256_setzero_si256(); + __m256i sel1 = _mm256_setzero_si256(); + + __m256i squareErrorSum0 = _mm256_setzero_si256(); + __m256i squareErrorSum1 = _mm256_setzero_si256(); + + __m128i a0 = _mm_loadl_epi64((const __m128i*)a[offset + 1].data()); + __m128i a1 = _mm_loadl_epi64((const __m128i*)a[offset + 0].data()); + + __m128i a2 = _mm_broadcastq_epi64(a0); + __m128i a3 = _mm_broadcastq_epi64(a1); + __m256i a4 = _mm256_insertf128_si256(_mm256_castsi128_si256(a2), a3, 1); + + // Processing one full row each iteration + for (size_t i = 0; i < 16; i += 4) + { + __m128i rgb = _mm_loadu_si128((const __m128i*)(data + i * 4)); + + __m256i rgb16 = _mm256_cvtepu8_epi16(rgb); + __m256i d = _mm256_sub_epi16(a4, rgb16); + + // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16 + // This produces slightly different results, but is significant faster + __m256i pixel0 = _mm256_madd_epi16(d, _mm256_set_epi16(0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14, 0, 38, 76, 14)); + __m256i pixel1 = _mm256_packs_epi32(pixel0, pixel0); + __m256i pixel2 = _mm256_hadd_epi16(pixel1, pixel1); + __m128i pixel3 = _mm256_castsi256_si128(pixel2); + + __m128i pix0 = _mm_broadcastw_epi16(pixel3); + __m128i pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); + __m256i pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + + 
// Processing first two pixels of the row + { + __m256i pix = _mm256_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. + __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); + __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); + + __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); + __m256i minError = _mm256_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); + + // Interleaving values so madd instruction can be used + __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); + // Squaring the minimum error to produce correct values when adding + __m256i squareError = _mm256_madd_epi16(minError2, minError2); + + squareErrorSum0 = _mm256_add_epi32(squareErrorSum0, squareError); + + // Packing selector bits + __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); + __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); + + sel0 = _mm256_or_si256(sel0, minIndexLo2); + sel1 = _mm256_or_si256(sel1, minIndexHi2); + } + + pixel3 = _mm256_extracti128_si256(pixel2, 1); + pix0 = _mm_broadcastw_epi16(pixel3); + pix1 = _mm_broadcastw_epi16(_mm_srli_epi32(pixel3, 16)); + pixel = _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1); + + // Processing second two pixels of the row + { + __m256i pix = _mm256_abs_epi16(pixel); + + // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same. + // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries. 
+ __m256i error0 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[0]))); + __m256i error1 = _mm256_abs_epi16(_mm256_sub_epi16(pix, _mm256_broadcastsi128_si256(g_table128_SIMD[1]))); + + __m256i minIndex0 = _mm256_and_si256(_mm256_cmpgt_epi16(error0, error1), _mm256_set1_epi16(1)); + __m256i minError = _mm256_min_epi16(error0, error1); + + // Exploiting symmetry of the selector table and use the sign bit + __m256i minIndex1 = _mm256_srli_epi16(pixel, 15); + + // Interleaving values so madd instruction can be used + __m256i minErrorLo = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(1, 1, 0, 0)); + __m256i minErrorHi = _mm256_permute4x64_epi64(minError, _MM_SHUFFLE(3, 3, 2, 2)); + + __m256i minError2 = _mm256_unpacklo_epi16(minErrorLo, minErrorHi); + // Squaring the minimum error to produce correct values when adding + __m256i squareError = _mm256_madd_epi16(minError2, minError2); + + squareErrorSum1 = _mm256_add_epi32(squareErrorSum1, squareError); + + // Packing selector bits + __m256i minIndexLo2 = _mm256_sll_epi16(minIndex0, _mm_cvtsi64_si128(i)); + __m256i minIndexHi2 = _mm256_sll_epi16(minIndex1, _mm_cvtsi64_si128(i)); + __m256i minIndexLo3 = _mm256_slli_epi16(minIndexLo2, 2); + __m256i minIndexHi3 = _mm256_slli_epi16(minIndexHi2, 2); + + sel0 = _mm256_or_si256(sel0, minIndexLo3); + sel1 = _mm256_or_si256(sel1, minIndexHi3); + } + } + + _mm256_store_si256((__m256i*)terr[1], squareErrorSum0); + _mm256_store_si256((__m256i*)terr[0], squareErrorSum1); + + // Interleave selector bits + __m256i minIndexLo0 = _mm256_unpacklo_epi16(sel0, sel1); + __m256i minIndexHi0 = _mm256_unpackhi_epi16(sel0, sel1); + + __m256i minIndexLo1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (0) | (2 << 4)); + __m256i minIndexHi1 = _mm256_permute2x128_si256(minIndexLo0, minIndexHi0, (1) | (3 << 4)); + + __m256i minIndexHi2 = _mm256_slli_epi32(minIndexHi1, 1); + + __m256i sel = _mm256_or_si256(minIndexLo1, minIndexHi2); + + _mm256_store_si256((__m256i*)tsel, sel); +} + +uint64_t VS_VECTORCALL EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate) noexcept +{ + size_t tidx[2]; + + // Get index of minimum error (terr[0] and terr[1]) + __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]); + __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]); + + __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4)); + __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4)); + + __m256i errMin0 = _mm256_min_epu32(errLo, errHi); + + __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1); + + __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2); + + __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4)); + __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4)); + + __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0); + __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1); + + uint32_t mask0 = _mm256_movemask_epi8(errMask0); + uint32_t mask1 = _mm256_movemask_epi8(errMask1); + + tidx[0] = _bit_scan_forward(mask0) >> 2; + tidx[1] = _bit_scan_forward(mask1) >> 2; + + d |= tidx[0] << 26; + d |= tidx[1] << 29; + + unsigned int t0 = tsel[tidx[0]]; + unsigned int t1 = tsel[tidx[1]]; + + if (!rotate) + { + t0 &= 0xFF00FF00; + t1 &= 0x00FF00FF; + } + else + { + t0 &= 0xCCCCCCCC; + t1 &= 0x33333333; + } + + // Flip 
selectors from sign bit
+    unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
+
+    return d | static_cast<uint64_t>(_bswap(t2)) << 32;
+}
+
+__m128i VS_VECTORCALL r6g7b6_AVX2(__m128 cof, __m128 chf, __m128 cvf) noexcept
+{
+    __m128i co = _mm_cvttps_epi32(cof);
+    __m128i ch = _mm_cvttps_epi32(chf);
+    __m128i cv = _mm_cvttps_epi32(cvf);
+
+    __m128i coh = _mm_packus_epi32(co, ch);
+    __m128i cv0 = _mm_packus_epi32(cv, _mm_setzero_si128());
+
+    __m256i cohv0 = _mm256_inserti128_si256(_mm256_castsi128_si256(coh), cv0, 1);
+    __m256i cohv1 = _mm256_min_epu16(cohv0, _mm256_set1_epi16(1023));
+
+    __m256i cohv2 = _mm256_sub_epi16(cohv1, _mm256_set1_epi16(15));
+    __m256i cohv3 = _mm256_srai_epi16(cohv2, 1);
+
+    __m256i cohvrb0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(11));
+    __m256i cohvrb1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(4));
+    __m256i cohvg0 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(9));
+    __m256i cohvg1 = _mm256_add_epi16(cohv3, _mm256_set1_epi16(6));
+
+    __m256i cohvrb2 = _mm256_srai_epi16(cohvrb0, 7);
+    __m256i cohvrb3 = _mm256_srai_epi16(cohvrb1, 7);
+    __m256i cohvg2 = _mm256_srai_epi16(cohvg0, 8);
+    __m256i cohvg3 = _mm256_srai_epi16(cohvg1, 8);
+
+    __m256i cohvrb4 = _mm256_sub_epi16(cohvrb0, cohvrb2);
+    __m256i cohvrb5 = _mm256_sub_epi16(cohvrb4, cohvrb3);
+    __m256i cohvg4 = _mm256_sub_epi16(cohvg0, cohvg2);
+    __m256i cohvg5 = _mm256_sub_epi16(cohvg4, cohvg3);
+
+    __m256i cohvrb6 = _mm256_srai_epi16(cohvrb5, 3);
+    __m256i cohvg6 = _mm256_srai_epi16(cohvg5, 2);
+
+    __m256i cohv4 = _mm256_blend_epi16(cohvg6, cohvrb6, 0x55);
+
+    __m128i cohv5 = _mm_packus_epi16(_mm256_castsi256_si128(cohv4), _mm256_extracti128_si256(cohv4, 1));
+    return _mm_shuffle_epi8(cohv5, _mm_setr_epi8(6, 5, 4, -1, 2, 1, 0, -1, 10, 9, 8, -1, -1, -1, -1, -1));
+}
+
+struct Plane
+{
+    uint64_t plane;
+    uint64_t error;
+    __m256i sum4;
+};
+
+Plane Planar_AVX2(const uint8_t* src)
+{
+    __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
+    __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
+
+    __m128i rgb0 = _mm_shuffle_epi8(d0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
+    __m128i rgb1 = _mm_shuffle_epi8(d1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
+    __m128i rgb2 = _mm_shuffle_epi8(d2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
+    __m128i rgb3 = _mm_shuffle_epi8(d3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, -1, -1, -1, -1));
+
+    __m128i rg0 = _mm_unpacklo_epi32(rgb0, rgb1);
+    __m128i rg1 = _mm_unpacklo_epi32(rgb2, rgb3);
+    __m128i b0 = _mm_unpackhi_epi32(rgb0, rgb1);
+    __m128i b1 = _mm_unpackhi_epi32(rgb2, rgb3);
+
+    // swap channels
+    __m128i b8 = _mm_unpacklo_epi64(rg0, rg1);
+    __m128i g8 = _mm_unpackhi_epi64(rg0, rg1);
+    __m128i r8 = _mm_unpacklo_epi64(b0, b1);
+
+    __m128i t0 = _mm_sad_epu8(r8, _mm_setzero_si128());
+    __m128i t1 = _mm_sad_epu8(g8, _mm_setzero_si128());
+    __m128i t2 = _mm_sad_epu8(b8, _mm_setzero_si128());
+
+    __m128i r8s = _mm_shuffle_epi8(r8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+    __m128i g8s = _mm_shuffle_epi8(g8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+    __m128i b8s = _mm_shuffle_epi8(b8, _mm_set_epi8(0xF, 0xE, 0xB, 0xA, 0x7, 0x6, 0x3, 0x2, 0xD, 0xC, 0x9, 0x8, 0x5, 0x4, 0x1, 0x0));
+
+    __m128i s0 = _mm_sad_epu8(r8s, _mm_setzero_si128());
+    __m128i s1 = _mm_sad_epu8(g8s, _mm_setzero_si128());
+
__m128i s2 = _mm_sad_epu8(b8s, _mm_setzero_si128()); + + __m256i sr0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t0), s0, 1); + __m256i sg0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t1), s1, 1); + __m256i sb0 = _mm256_insertf128_si256(_mm256_castsi128_si256(t2), s2, 1); + + __m256i sr1 = _mm256_slli_epi64(sr0, 32); + __m256i sg1 = _mm256_slli_epi64(sg0, 16); + + __m256i srb = _mm256_or_si256(sr1, sb0); + __m256i srgb = _mm256_or_si256(srb, sg1); + + __m128i t3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(t0), _mm_castsi128_ps(t1), _MM_SHUFFLE(2, 0, 2, 0))); + __m128i t4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i t5 = _mm_hadd_epi32(t3, t4); + __m128i t6 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(1, 1, 1, 1)); + __m128i t7 = _mm_shuffle_epi32(t5, _MM_SHUFFLE(2, 2, 2, 2)); + + __m256i sr = _mm256_broadcastw_epi16(t5); + __m256i sg = _mm256_broadcastw_epi16(t6); + __m256i sb = _mm256_broadcastw_epi16(t7); + + __m256i r08 = _mm256_cvtepu8_epi16(r8); + __m256i g08 = _mm256_cvtepu8_epi16(g8); + __m256i b08 = _mm256_cvtepu8_epi16(b8); + + __m256i r16 = _mm256_slli_epi16(r08, 4); + __m256i g16 = _mm256_slli_epi16(g08, 4); + __m256i b16 = _mm256_slli_epi16(b08, 4); + + __m256i difR0 = _mm256_sub_epi16(r16, sr); + __m256i difG0 = _mm256_sub_epi16(g16, sg); + __m256i difB0 = _mm256_sub_epi16(b16, sb); + + __m256i difRyz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255)); + __m256i difGyz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255)); + __m256i difByz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255, 255, 85, -85, -255)); + + __m256i difRxz = _mm256_madd_epi16(difR0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255)); + __m256i difGxz = _mm256_madd_epi16(difG0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255)); + __m256i difBxz = _mm256_madd_epi16(difB0, _mm256_set_epi16(255, 255, 255, 255, 85, 85, 85, 85, -85, -85, -85, -85, -255, -255, -255, -255)); + + __m256i difRGyz = _mm256_hadd_epi32(difRyz, difGyz); + __m256i difByzxz = _mm256_hadd_epi32(difByz, difBxz); + + __m256i difRGxz = _mm256_hadd_epi32(difRxz, difGxz); + + __m128i sumRGyz = _mm_add_epi32(_mm256_castsi256_si128(difRGyz), _mm256_extracti128_si256(difRGyz, 1)); + __m128i sumByzxz = _mm_add_epi32(_mm256_castsi256_si128(difByzxz), _mm256_extracti128_si256(difByzxz, 1)); + __m128i sumRGxz = _mm_add_epi32(_mm256_castsi256_si128(difRGxz), _mm256_extracti128_si256(difRGxz, 1)); + + __m128i sumRGByz = _mm_hadd_epi32(sumRGyz, sumByzxz); + __m128i sumRGByzxz = _mm_hadd_epi32(sumRGxz, sumByzxz); + + __m128i sumRGBxz = _mm_shuffle_epi32(sumRGByzxz, _MM_SHUFFLE(2, 3, 1, 0)); + + __m128 sumRGByzf = _mm_cvtepi32_ps(sumRGByz); + __m128 sumRGBxzf = _mm_cvtepi32_ps(sumRGBxz); + + const float value = (255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f; + + __m128 scale = _mm_set1_ps(-4.0f / value); + + __m128 af = _mm_mul_ps(sumRGBxzf, scale); + __m128 bf = _mm_mul_ps(sumRGByzf, scale); + + __m128 df = _mm_mul_ps(_mm_cvtepi32_ps(t5), _mm_set1_ps(4.0f / 16.0f)); + + // calculating the three colors RGBO, RGBH, and RGBV. 
RGB = df - af * x - bf * y; + __m128 cof0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df)); + __m128 chf0 = _mm_fnmadd_ps(af, _mm_set1_ps( 425.0f), _mm_fnmadd_ps(bf, _mm_set1_ps(-255.0f), df)); + __m128 cvf0 = _mm_fnmadd_ps(af, _mm_set1_ps(-255.0f), _mm_fnmadd_ps(bf, _mm_set1_ps( 425.0f), df)); + + // convert to r6g7b6 + __m128i cohv = r6g7b6_AVX2(cof0, chf0, cvf0); + + uint64_t rgbho = _mm_extract_epi64(cohv, 0); + uint32_t rgbv0 = _mm_extract_epi32(cohv, 2); + + // Error calculation + auto ro0 = (rgbho >> 48) & 0x3F; + auto go0 = (rgbho >> 40) & 0x7F; + auto bo0 = (rgbho >> 32) & 0x3F; + auto ro1 = (ro0 >> 4) | (ro0 << 2); + auto go1 = (go0 >> 6) | (go0 << 1); + auto bo1 = (bo0 >> 4) | (bo0 << 2); + auto ro2 = (ro1 << 2) + 2; + auto go2 = (go1 << 2) + 2; + auto bo2 = (bo1 << 2) + 2; + + __m256i ro3 = _mm256_set1_epi16(ro2); + __m256i go3 = _mm256_set1_epi16(go2); + __m256i bo3 = _mm256_set1_epi16(bo2); + + auto rh0 = (rgbho >> 16) & 0x3F; + auto gh0 = (rgbho >> 8) & 0x7F; + auto bh0 = (rgbho >> 0) & 0x3F; + auto rh1 = (rh0 >> 4) | (rh0 << 2); + auto gh1 = (gh0 >> 6) | (gh0 << 1); + auto bh1 = (bh0 >> 4) | (bh0 << 2); + + auto rh2 = rh1 - ro1; + auto gh2 = gh1 - go1; + auto bh2 = bh1 - bo1; + + __m256i rh3 = _mm256_set1_epi16(rh2); + __m256i gh3 = _mm256_set1_epi16(gh2); + __m256i bh3 = _mm256_set1_epi16(bh2); + + auto rv0 = (rgbv0 >> 16) & 0x3F; + auto gv0 = (rgbv0 >> 8) & 0x7F; + auto bv0 = (rgbv0 >> 0) & 0x3F; + auto rv1 = (rv0 >> 4) | (rv0 << 2); + auto gv1 = (gv0 >> 6) | (gv0 << 1); + auto bv1 = (bv0 >> 4) | (bv0 << 2); + + auto rv2 = rv1 - ro1; + auto gv2 = gv1 - go1; + auto bv2 = bv1 - bo1; + + __m256i rv3 = _mm256_set1_epi16(rv2); + __m256i gv3 = _mm256_set1_epi16(gv2); + __m256i bv3 = _mm256_set1_epi16(bv2); + + __m256i x = _mm256_set_epi16(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0); + + __m256i rh4 = _mm256_mullo_epi16(rh3, x); + __m256i gh4 = _mm256_mullo_epi16(gh3, x); + __m256i bh4 = _mm256_mullo_epi16(bh3, x); + + __m256i y = _mm256_set_epi16(3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); + + __m256i rv4 = _mm256_mullo_epi16(rv3, y); + __m256i gv4 = _mm256_mullo_epi16(gv3, y); + __m256i bv4 = _mm256_mullo_epi16(bv3, y); + + __m256i rxy = _mm256_add_epi16(rh4, rv4); + __m256i gxy = _mm256_add_epi16(gh4, gv4); + __m256i bxy = _mm256_add_epi16(bh4, bv4); + + __m256i rp0 = _mm256_add_epi16(rxy, ro3); + __m256i gp0 = _mm256_add_epi16(gxy, go3); + __m256i bp0 = _mm256_add_epi16(bxy, bo3); + + __m256i rp1 = _mm256_srai_epi16(rp0, 2); + __m256i gp1 = _mm256_srai_epi16(gp0, 2); + __m256i bp1 = _mm256_srai_epi16(bp0, 2); + + __m256i rp2 = _mm256_max_epi16(_mm256_min_epi16(rp1, _mm256_set1_epi16(255)), _mm256_setzero_si256()); + __m256i gp2 = _mm256_max_epi16(_mm256_min_epi16(gp1, _mm256_set1_epi16(255)), _mm256_setzero_si256()); + __m256i bp2 = _mm256_max_epi16(_mm256_min_epi16(bp1, _mm256_set1_epi16(255)), _mm256_setzero_si256()); + + __m256i rdif = _mm256_sub_epi16(r08, rp2); + __m256i gdif = _mm256_sub_epi16(g08, gp2); + __m256i bdif = _mm256_sub_epi16(b08, bp2); + + __m256i rerr = _mm256_mullo_epi16(rdif, _mm256_set1_epi16(38)); + __m256i gerr = _mm256_mullo_epi16(gdif, _mm256_set1_epi16(76)); + __m256i berr = _mm256_mullo_epi16(bdif, _mm256_set1_epi16(14)); + + __m256i sum0 = _mm256_add_epi16(rerr, gerr); + __m256i sum1 = _mm256_add_epi16(sum0, berr); + + __m256i sum2 = _mm256_madd_epi16(sum1, sum1); + + __m128i sum3 = _mm_add_epi32(_mm256_castsi256_si128(sum2), _mm256_extracti128_si256(sum2, 1)); + + uint32_t err0 = 
_mm_extract_epi32(sum3, 0);
+    uint32_t err1 = _mm_extract_epi32(sum3, 1);
+    uint32_t err2 = _mm_extract_epi32(sum3, 2);
+    uint32_t err3 = _mm_extract_epi32(sum3, 3);
+
+    uint64_t error = err0 + err1 + err2 + err3;
+
+    uint32_t rgbv = _pext_u32(rgbv0, 0x3F7F3F);
+    uint64_t rgbho0 = _pext_u64(rgbho, 0x3F7F3F003F7F3F);
+
+    uint32_t hi = rgbv | ((rgbho0 & 0x1FFF) << 19);
+    uint32_t lo = _pdep_u32(rgbho0 >> 13, 0x7F7F1BFD);
+
+    uint32_t idx = _pext_u64(rgbho, 0x20201E00000000);
+    lo |= _pdep_u32(g_flags_AVX2[idx], 0x8080E402);
+    uint64_t result = static_cast<uint32_t>(_bswap(lo));
+    result |= static_cast<uint64_t>(static_cast<uint32_t>(_bswap(hi))) << 32;
+
+    Plane plane;
+
+    plane.plane = result;
+    plane.error = error;
+    plane.sum4 = _mm256_permute4x64_epi64(srgb, _MM_SHUFFLE(2, 3, 0, 1));
+
+    return plane;
+}
+
+uint64_t VS_VECTORCALL EncodeSelectors_AVX2( uint64_t d, const uint32_t terr[2][8], const uint32_t tsel[8], const bool rotate, const uint64_t value, const uint32_t error) noexcept
+{
+    size_t tidx[2];
+
+    // Get index of minimum error (terr[0] and terr[1])
+    __m256i err0 = _mm256_load_si256((const __m256i*)terr[0]);
+    __m256i err1 = _mm256_load_si256((const __m256i*)terr[1]);
+
+    __m256i errLo = _mm256_permute2x128_si256(err0, err1, (0) | (2 << 4));
+    __m256i errHi = _mm256_permute2x128_si256(err0, err1, (1) | (3 << 4));
+
+    __m256i errMin0 = _mm256_min_epu32(errLo, errHi);
+
+    __m256i errMin1 = _mm256_shuffle_epi32(errMin0, _MM_SHUFFLE(2, 3, 0, 1));
+    __m256i errMin2 = _mm256_min_epu32(errMin0, errMin1);
+
+    __m256i errMin3 = _mm256_shuffle_epi32(errMin2, _MM_SHUFFLE(1, 0, 3, 2));
+    __m256i errMin4 = _mm256_min_epu32(errMin3, errMin2);
+
+    __m256i errMin5 = _mm256_permute2x128_si256(errMin4, errMin4, (0) | (0 << 4));
+    __m256i errMin6 = _mm256_permute2x128_si256(errMin4, errMin4, (1) | (1 << 4));
+
+    __m256i errMask0 = _mm256_cmpeq_epi32(errMin5, err0);
+    __m256i errMask1 = _mm256_cmpeq_epi32(errMin6, err1);
+
+    uint32_t mask0 = _mm256_movemask_epi8(errMask0);
+    uint32_t mask1 = _mm256_movemask_epi8(errMask1);
+
+    tidx[0] = _bit_scan_forward(mask0) >> 2;
+    tidx[1] = _bit_scan_forward(mask1) >> 2;
+
+    if ((terr[0][tidx[0]] + terr[1][tidx[1]]) >= error)
+    {
+        return value;
+    }
+
+    d |= tidx[0] << 26;
+    d |= tidx[1] << 29;
+
+    unsigned int t0 = tsel[tidx[0]];
+    unsigned int t1 = tsel[tidx[1]];
+
+    if (!rotate)
+    {
+        t0 &= 0xFF00FF00;
+        t1 &= 0x00FF00FF;
+    }
+    else
+    {
+        t0 &= 0xCCCCCCCC;
+        t1 &= 0x33333333;
+    }
+
+    // Flip selectors from sign bit
+    unsigned int t2 = (t0 | t1) ^ 0xFFFF0000;
+
+    return d | static_cast<uint64_t>(_bswap(t2)) << 32;
+}
+
+}
+
+uint64_t ProcessRGB_AVX2( const uint8_t* src )
+{
+    uint64_t d = CheckSolid_AVX2( src );
+    if( d != 0 ) return d;
+
+    alignas(32) v4i a[8];
+
+    __m128i err0 = PrepareAverages_AVX2( a, src );
+
+    // Get index of minimum error (err0)
+    __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1));
+    __m128i errMin0 = _mm_min_epu32(err0, err1);
+
+    __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2));
+    __m128i errMin2 = _mm_min_epu32(errMin1, errMin0);
+
+    __m128i errMask = _mm_cmpeq_epi32(errMin2, err0);
+
+    uint32_t mask = _mm_movemask_epi8(errMask);
+
+    uint32_t idx = _bit_scan_forward(mask) >> 2;
+
+    d |= EncodeAverages_AVX2( a, idx );
+
+    alignas(32) uint32_t terr[2][8] = {};
+    alignas(32) uint32_t tsel[8];
+
+    if ((idx == 0) || (idx == 2))
+    {
+        FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
+    }
+    else
+    {
+        FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
+    }
+
+    return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1 );
+}
+
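+// Illustrative scalar reference (a hypothetical helper, not part of upstream etcpak): the
+// argmin searches above and in EncodeSelectors_AVX2 fold the error lanes together with
+// shuffle+min steps, broadcast the minimum, compare it against the original lanes, and take
+// the first matching lane via _bit_scan_forward. A plain-C++ equivalent of that reduction:
+inline size_t ArgMinError_Scalar( const uint32_t err[8] )
+{
+    size_t idx = 0;
+    for( size_t i=1; i<8; i++ )
+    {
+        // ties resolve to the lowest index, matching the first set bit of the compare mask
+        if( err[i] < err[idx] ) idx = i;
+    }
+    return idx;
+}
+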
+uint64_t ProcessRGB_4x2_AVX2( const uint8_t* src )
+{
+    uint64_t d = CheckSolid_AVX2( src );
+    if( d != 0 ) return d;
+
+    alignas(32) v4i a[8];
+
+    __m128i err0 = PrepareAverages_AVX2( a, src );
+
+    uint32_t idx = _mm_extract_epi32(err0, 0) < _mm_extract_epi32(err0, 2) ? 0 : 2;
+
+    d |= EncodeAverages_AVX2( a, idx );
+
+    alignas(32) uint32_t terr[2][8] = {};
+    alignas(32) uint32_t tsel[8];
+
+    FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
+
+    return EncodeSelectors_AVX2( d, terr, tsel, false);
+}
+
+uint64_t ProcessRGB_2x4_AVX2( const uint8_t* src )
+{
+    uint64_t d = CheckSolid_AVX2( src );
+    if( d != 0 ) return d;
+
+    alignas(32) v4i a[8];
+
+    __m128i err0 = PrepareAverages_AVX2( a, src );
+
+    uint32_t idx = _mm_extract_epi32(err0, 1) < _mm_extract_epi32(err0, 3) ? 1 : 3;
+
+    d |= EncodeAverages_AVX2( a, idx );
+
+    alignas(32) uint32_t terr[2][8] = {};
+    alignas(32) uint32_t tsel[8];
+
+    FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
+
+    return EncodeSelectors_AVX2( d, terr, tsel, true);
+}
+
+uint64_t ProcessRGB_ETC2_AVX2( const uint8_t* src )
+{
+    auto plane = Planar_AVX2( src );
+
+    alignas(32) v4i a[8];
+
+    __m128i err0 = PrepareAverages_AVX2( a, plane.sum4 );
+
+    // Get index of minimum error (err0)
+    __m128i err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(2, 3, 0, 1));
+    __m128i errMin0 = _mm_min_epu32(err0, err1);
+
+    __m128i errMin1 = _mm_shuffle_epi32(errMin0, _MM_SHUFFLE(1, 0, 3, 2));
+    __m128i errMin2 = _mm_min_epu32(errMin1, errMin0);
+
+    __m128i errMask = _mm_cmpeq_epi32(errMin2, err0);
+
+    uint32_t mask = _mm_movemask_epi8(errMask);
+
+    size_t idx = _bit_scan_forward(mask) >> 2;
+
+    uint64_t d = EncodeAverages_AVX2( a, idx );
+
+    alignas(32) uint32_t terr[2][8] = {};
+    alignas(32) uint32_t tsel[8];
+
+    if ((idx == 0) || (idx == 2))
+    {
+        FindBestFit_4x2_AVX2( terr, tsel, a, idx * 2, src );
+    }
+    else
+    {
+        FindBestFit_2x4_AVX2( terr, tsel, a, idx * 2, src );
+    }
+
+    return EncodeSelectors_AVX2( d, terr, tsel, (idx % 2) == 1, plane.plane, plane.error );
+}
+
+#ifndef _MSC_VER
+# pragma GCC pop_options
+#endif
+
+#endif
diff --git a/thirdparty/etcpak/ProcessRGB_AVX2.hpp b/thirdparty/etcpak/ProcessRGB_AVX2.hpp
new file mode 100644
index 000000000000..ad7739cf3af6
--- /dev/null
+++ b/thirdparty/etcpak/ProcessRGB_AVX2.hpp
@@ -0,0 +1,15 @@
+#ifndef __PROCESSRGB_AVX2_HPP__
+#define __PROCESSRGB_AVX2_HPP__
+
+#ifdef __SSE4_1__
+
+#include <stdint.h>
+
+uint64_t ProcessRGB_AVX2( const uint8_t* src );
+uint64_t ProcessRGB_4x2_AVX2( const uint8_t* src );
+uint64_t ProcessRGB_2x4_AVX2( const uint8_t* src );
+uint64_t ProcessRGB_ETC2_AVX2( const uint8_t* src );
+
+#endif
+
+#endif
diff --git a/thirdparty/etcpak/README.md b/thirdparty/etcpak/README.md
new file mode 100644
index 000000000000..cee9a79c815d
--- /dev/null
+++ b/thirdparty/etcpak/README.md
@@ -0,0 +1,43 @@
+# etcpak 0.6 #
+(Updated 2018-07-11)
+
+## The fastest ETC compressor on the planet ##
+
+etcpak is an extremely fast [Ericsson Texture Compression](http://en.wikipedia.org/wiki/Ericsson_Texture_Compression) utility. Currently it's best suited for rapid asset preparation during development, when graphics quality is not a concern, but it's also used in production builds of applications used by millions of people.
+
+## Compression times ##
+
+Benchmark performed on an Intel i7 8700K, using a real-life RGBA 16K × 16K atlas:
+
+ETC1: **113 ms** (only RGB part)
+ETC2 RGB: **213 ms** (only RGB part)
+ETC2 RGBA: **404 ms**
+
+This is 100× - 1000× faster than any other ETC compression tool (there's no typo in the numbers).
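+
+## Library usage ##
+
+The compressor kernels have a deliberately small block-level interface: each `ProcessRGB*`
+function consumes one 4×4 block of 16 RGBA pixels (64 bytes) and returns the compressed
+64-bit block. A minimal sketch of calling the AVX2 ETC1 kernel directly (the contiguous
+4-bytes-per-pixel input layout assumed here is inferred from how the kernels read their
+input, not a documented API):
+
+```cpp
+#include <stdint.h>
+#include "ProcessRGB_AVX2.hpp"
+
+// Compress one 4x4 RGBA block: 64 bytes in, one 8-byte ETC1 block out.
+uint64_t CompressBlock( const uint8_t block[64] )
+{
+    return ProcessRGB_AVX2( block );
+}
+```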
+
+[Why are there no image quality metrics? / Quality comparison.](http://i.imgur.com/FxlmUOF.png)
+[Workload distribution.](https://i.imgur.com/9ZUy4KP.png)
+
+## Quality comparison ##
+
+Original image:
+
+![](http://1.bp.blogspot.com/-kqFgRVL0uKY/UbSclN-fZdI/AAAAAAAAAxU/Fy87I8P4Yxs/s1600/kodim23.png)
+
+Compressed image:
+
+ETC1:
+![](http://i.imgur.com/xmdht4u.png "ETC1 mode")
+ETC2:
+![](http://i.imgur.com/v7Dw2Yz.png "ETC2 mode")
+
+## More information ##
+
+[etcpak 0.6](http://zgredowo.blogspot.com/2018/07/etcpak-06.html)
+[etcpak 0.5](http://zgredowo.blogspot.com/2016/01/etcpak-05.html)
+[etcpak 0.4](http://zgredowo.blogspot.com/2016/01/etcpak-04.html)
+[etcpak 0.3](http://zgredowo.blogspot.com/2014/05/etcpak-03.html)
+[etcpak 0.2.2](http://zgredowo.blogspot.com/2014/03/etcpack-022.html)
+[etcpak 0.2.1](http://zgredowo.blogspot.com/2013/08/etcpak-021.html)
+[etcpak 0.2](http://zgredowo.blogspot.com/2013/07/etcpak-02.html)
+[etcpak 0.1](http://zgredowo.blogspot.com/2013/06/fastest-etc-compressor-on-planet.html)
diff --git a/thirdparty/etcpak/Semaphore.hpp b/thirdparty/etcpak/Semaphore.hpp
new file mode 100644
index 000000000000..9e42dbb9e006
--- /dev/null
+++ b/thirdparty/etcpak/Semaphore.hpp
@@ -0,0 +1,46 @@
+#ifndef __DARKRL__SEMAPHORE_HPP__
+#define __DARKRL__SEMAPHORE_HPP__
+
+#include <condition_variable>
+#include <mutex>
+
+class Semaphore
+{
+public:
+    Semaphore( int count ) : m_count( count ) {}
+
+    void lock()
+    {
+        std::unique_lock<std::mutex> lock( m_mutex );
+        m_cv.wait( lock, [this](){ return m_count != 0; } );
+        m_count--;
+    }
+
+    void unlock()
+    {
+        std::lock_guard<std::mutex> lock( m_mutex );
+        m_count++;
+        m_cv.notify_one();
+    }
+
+    bool try_lock()
+    {
+        std::lock_guard<std::mutex> lock( m_mutex );
+        if( m_count == 0 )
+        {
+            return false;
+        }
+        else
+        {
+            m_count--;
+            return true;
+        }
+    }
+
+private:
+    std::mutex m_mutex;
+    std::condition_variable m_cv;
+    unsigned int m_count;
+};
+
+#endif
diff --git a/thirdparty/etcpak/System.cpp b/thirdparty/etcpak/System.cpp
new file mode 100644
index 000000000000..1383d0ecd0eb
--- /dev/null
+++ b/thirdparty/etcpak/System.cpp
@@ -0,0 +1,68 @@
+#include <algorithm>
+#ifdef _WIN32
+# include <windows.h>
+#else
+# include <pthread.h>
+# include <unistd.h>
+#endif
+
+#include "System.hpp"
+
+unsigned int System::CPUCores()
+{
+    static unsigned int cores = 0;
+    if( cores == 0 )
+    {
+        int tmp;
+#ifdef _WIN32
+        SYSTEM_INFO info;
+        GetSystemInfo( &info );
+        tmp = (int)info.dwNumberOfProcessors;
+#else
+# ifndef _SC_NPROCESSORS_ONLN
+# ifdef _SC_NPROC_ONLN
+# define _SC_NPROCESSORS_ONLN _SC_NPROC_ONLN
+# elif defined _SC_CRAY_NCPU
+# define _SC_NPROCESSORS_ONLN _SC_CRAY_NCPU
+# endif
+# endif
+        tmp = (int)(long)sysconf( _SC_NPROCESSORS_ONLN );
+#endif
+        cores = (unsigned int)std::max( tmp, 1 );
+    }
+    return cores;
+}
+
+void System::SetThreadName( std::thread& thread, const char* name )
+{
+#ifdef _WIN32
+    const DWORD MS_VC_EXCEPTION=0x406D1388;
+
+# pragma pack( push, 8 )
+    struct THREADNAME_INFO
+    {
+        DWORD dwType;
+        LPCSTR szName;
+        DWORD dwThreadID;
+        DWORD dwFlags;
+    };
+# pragma pack(pop)
+
+    DWORD ThreadId = GetThreadId( static_cast<HANDLE>( thread.native_handle() ) );
+    THREADNAME_INFO info;
+    info.dwType = 0x1000;
+    info.szName = name;
+    info.dwThreadID = ThreadId;
+    info.dwFlags = 0;
+
+    __try
+    {
+        RaiseException( MS_VC_EXCEPTION, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info );
+    }
+    __except(EXCEPTION_EXECUTE_HANDLER)
+    {
+    }
+#elif !defined(__APPLE__)
+    pthread_setname_np( thread.native_handle(), name );
+#endif
+}
diff --git a/thirdparty/etcpak/System.hpp b/thirdparty/etcpak/System.hpp
new file mode 100644
index
000000000000..1a09bb15e1c5 --- /dev/null +++ b/thirdparty/etcpak/System.hpp @@ -0,0 +1,15 @@ +#ifndef __DARKRL__SYSTEM_HPP__ +#define __DARKRL__SYSTEM_HPP__ + +#include + +class System +{ +public: + System() = delete; + + static unsigned int CPUCores(); + static void SetThreadName( std::thread& thread, const char* name ); +}; + +#endif diff --git a/thirdparty/etcpak/Tables.cpp b/thirdparty/etcpak/Tables.cpp new file mode 100644 index 000000000000..ac5009c627b9 --- /dev/null +++ b/thirdparty/etcpak/Tables.cpp @@ -0,0 +1,177 @@ +#include "Tables.hpp" + +const int32_t g_table[8][4] = { + { 2, 8, -2, -8 }, + { 5, 17, -5, -17 }, + { 9, 29, -9, -29 }, + { 13, 42, -13, -42 }, + { 18, 60, -18, -60 }, + { 24, 80, -24, -80 }, + { 33, 106, -33, -106 }, + { 47, 183, -47, -183 } +}; + +const int64_t g_table256[8][4] = { + { 2*256, 8*256, -2*256, -8*256 }, + { 5*256, 17*256, -5*256, -17*256 }, + { 9*256, 29*256, -9*256, -29*256 }, + { 13*256, 42*256, -13*256, -42*256 }, + { 18*256, 60*256, -18*256, -60*256 }, + { 24*256, 80*256, -24*256, -80*256 }, + { 33*256, 106*256, -33*256, -106*256 }, + { 47*256, 183*256, -47*256, -183*256 } +}; + +const uint32_t g_id[4][16] = { + { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 }, + { 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 }, + { 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 } +}; + +const uint32_t g_avg2[16] = { + 0x00, + 0x11, + 0x22, + 0x33, + 0x44, + 0x55, + 0x66, + 0x77, + 0x88, + 0x99, + 0xAA, + 0xBB, + 0xCC, + 0xDD, + 0xEE, + 0xFF +}; + +const uint32_t g_flags[64] = { + 0x80800402, 0x80800402, 0x80800402, 0x80800402, + 0x80800402, 0x80800402, 0x80800402, 0x8080E002, + 0x80800402, 0x80800402, 0x8080E002, 0x8080E002, + 0x80800402, 0x8080E002, 0x8080E002, 0x8080E002, + 0x80000402, 0x80000402, 0x80000402, 0x80000402, + 0x80000402, 0x80000402, 0x80000402, 0x8000E002, + 0x80000402, 0x80000402, 0x8000E002, 0x8000E002, + 0x80000402, 0x8000E002, 0x8000E002, 0x8000E002, + 0x00800402, 0x00800402, 0x00800402, 0x00800402, + 0x00800402, 0x00800402, 0x00800402, 0x0080E002, + 0x00800402, 0x00800402, 0x0080E002, 0x0080E002, + 0x00800402, 0x0080E002, 0x0080E002, 0x0080E002, + 0x00000402, 0x00000402, 0x00000402, 0x00000402, + 0x00000402, 0x00000402, 0x00000402, 0x0000E002, + 0x00000402, 0x00000402, 0x0000E002, 0x0000E002, + 0x00000402, 0x0000E002, 0x0000E002, 0x0000E002 +}; + +const int32_t g_alpha[16][8] = { + { -3, -6, -9, -15, 2, 5, 8, 14 }, + { -3, -7, -10, -13, 2, 6, 9, 12 }, + { -2, -5, -8, -13, 1, 4, 7, 12 }, + { -2, -4, -6, -13, 1, 3, 5, 12 }, + { -3, -6, -8, -12, 2, 5, 7, 11 }, + { -3, -7, -9, -11, 2, 6, 8, 10 }, + { -4, -7, -8, -11, 3, 6, 7, 10 }, + { -3, -5, -8, -11, 2, 4, 7, 10 }, + { -2, -6, -8, -10, 1, 5, 7, 9 }, + { -2, -5, -8, -10, 1, 4, 7, 9 }, + { -2, -4, -8, -10, 1, 3, 7, 9 }, + { -2, -5, -7, -10, 1, 4, 6, 9 }, + { -3, -4, -7, -10, 2, 3, 6, 9 }, + { -1, -2, -3, -10, 0, 1, 2, 9 }, + { -4, -6, -8, -9, 3, 5, 7, 8 }, + { -3, -5, -7, -9, 2, 4, 6, 8 } +}; + +const int32_t g_alphaRange[16] = { + 0x100FF / ( 1 + g_alpha[0][7] - g_alpha[0][3] ), + 0x100FF / ( 1 + g_alpha[1][7] - g_alpha[1][3] ), + 0x100FF / ( 1 + g_alpha[2][7] - g_alpha[2][3] ), + 0x100FF / ( 1 + g_alpha[3][7] - g_alpha[3][3] ), + 0x100FF / ( 1 + g_alpha[4][7] - g_alpha[4][3] ), + 0x100FF / ( 1 + g_alpha[5][7] - g_alpha[5][3] ), + 0x100FF / ( 1 + g_alpha[6][7] - g_alpha[6][3] ), + 0x100FF / ( 1 + g_alpha[7][7] - g_alpha[7][3] ), + 0x100FF / ( 1 + g_alpha[8][7] - g_alpha[8][3] ), + 0x100FF / ( 1 + g_alpha[9][7] - g_alpha[9][3] ), + 
0x100FF / ( 1 + g_alpha[10][7] - g_alpha[10][3] ), + 0x100FF / ( 1 + g_alpha[11][7] - g_alpha[11][3] ), + 0x100FF / ( 1 + g_alpha[12][7] - g_alpha[12][3] ), + 0x100FF / ( 1 + g_alpha[13][7] - g_alpha[13][3] ), + 0x100FF / ( 1 + g_alpha[14][7] - g_alpha[14][3] ), + 0x100FF / ( 1 + g_alpha[15][7] - g_alpha[15][3] ), +}; + +#ifdef __SSE4_1__ +const uint8_t g_flags_AVX2[64] = +{ + 0x63, 0x63, 0x63, 0x63, + 0x63, 0x63, 0x63, 0x7D, + 0x63, 0x63, 0x7D, 0x7D, + 0x63, 0x7D, 0x7D, 0x7D, + 0x43, 0x43, 0x43, 0x43, + 0x43, 0x43, 0x43, 0x5D, + 0x43, 0x43, 0x5D, 0x5D, + 0x43, 0x5D, 0x5D, 0x5D, + 0x23, 0x23, 0x23, 0x23, + 0x23, 0x23, 0x23, 0x3D, + 0x23, 0x23, 0x3D, 0x3D, + 0x23, 0x3D, 0x3D, 0x3D, + 0x03, 0x03, 0x03, 0x03, + 0x03, 0x03, 0x03, 0x1D, + 0x03, 0x03, 0x1D, 0x1D, + 0x03, 0x1D, 0x1D, 0x1D, +}; + +const __m128i g_table_SIMD[2] = +{ + _mm_setr_epi16( 2, 5, 9, 13, 18, 24, 33, 47), + _mm_setr_epi16( 8, 17, 29, 42, 60, 80, 106, 183) +}; +const __m128i g_table128_SIMD[2] = +{ + _mm_setr_epi16( 2*128, 5*128, 9*128, 13*128, 18*128, 24*128, 33*128, 47*128), + _mm_setr_epi16( 8*128, 17*128, 29*128, 42*128, 60*128, 80*128, 106*128, 183*128) +}; +const __m128i g_table256_SIMD[4] = +{ + _mm_setr_epi32( 2*256, 5*256, 9*256, 13*256), + _mm_setr_epi32( 8*256, 17*256, 29*256, 42*256), + _mm_setr_epi32( 18*256, 24*256, 33*256, 47*256), + _mm_setr_epi32( 60*256, 80*256, 106*256, 183*256) +}; + +const __m128i g_alpha_SIMD[16] = { + _mm_setr_epi16( g_alpha[ 0][0], g_alpha[ 0][1], g_alpha[ 0][2], g_alpha[ 0][3], g_alpha[ 0][4], g_alpha[ 0][5], g_alpha[ 0][6], g_alpha[ 0][7] ), + _mm_setr_epi16( g_alpha[ 1][0], g_alpha[ 1][1], g_alpha[ 1][2], g_alpha[ 1][3], g_alpha[ 1][4], g_alpha[ 1][5], g_alpha[ 1][6], g_alpha[ 1][7] ), + _mm_setr_epi16( g_alpha[ 2][0], g_alpha[ 2][1], g_alpha[ 2][2], g_alpha[ 2][3], g_alpha[ 2][4], g_alpha[ 2][5], g_alpha[ 2][6], g_alpha[ 2][7] ), + _mm_setr_epi16( g_alpha[ 3][0], g_alpha[ 3][1], g_alpha[ 3][2], g_alpha[ 3][3], g_alpha[ 3][4], g_alpha[ 3][5], g_alpha[ 3][6], g_alpha[ 3][7] ), + _mm_setr_epi16( g_alpha[ 4][0], g_alpha[ 4][1], g_alpha[ 4][2], g_alpha[ 4][3], g_alpha[ 4][4], g_alpha[ 4][5], g_alpha[ 4][6], g_alpha[ 4][7] ), + _mm_setr_epi16( g_alpha[ 5][0], g_alpha[ 5][1], g_alpha[ 5][2], g_alpha[ 5][3], g_alpha[ 5][4], g_alpha[ 5][5], g_alpha[ 5][6], g_alpha[ 5][7] ), + _mm_setr_epi16( g_alpha[ 6][0], g_alpha[ 6][1], g_alpha[ 6][2], g_alpha[ 6][3], g_alpha[ 6][4], g_alpha[ 6][5], g_alpha[ 6][6], g_alpha[ 6][7] ), + _mm_setr_epi16( g_alpha[ 7][0], g_alpha[ 7][1], g_alpha[ 7][2], g_alpha[ 7][3], g_alpha[ 7][4], g_alpha[ 7][5], g_alpha[ 7][6], g_alpha[ 7][7] ), + _mm_setr_epi16( g_alpha[ 8][0], g_alpha[ 8][1], g_alpha[ 8][2], g_alpha[ 8][3], g_alpha[ 8][4], g_alpha[ 8][5], g_alpha[ 8][6], g_alpha[ 8][7] ), + _mm_setr_epi16( g_alpha[ 9][0], g_alpha[ 9][1], g_alpha[ 9][2], g_alpha[ 9][3], g_alpha[ 9][4], g_alpha[ 9][5], g_alpha[ 9][6], g_alpha[ 9][7] ), + _mm_setr_epi16( g_alpha[10][0], g_alpha[10][1], g_alpha[10][2], g_alpha[10][3], g_alpha[10][4], g_alpha[10][5], g_alpha[10][6], g_alpha[10][7] ), + _mm_setr_epi16( g_alpha[11][0], g_alpha[11][1], g_alpha[11][2], g_alpha[11][3], g_alpha[11][4], g_alpha[11][5], g_alpha[11][6], g_alpha[11][7] ), + _mm_setr_epi16( g_alpha[12][0], g_alpha[12][1], g_alpha[12][2], g_alpha[12][3], g_alpha[12][4], g_alpha[12][5], g_alpha[12][6], g_alpha[12][7] ), + _mm_setr_epi16( g_alpha[13][0], g_alpha[13][1], g_alpha[13][2], g_alpha[13][3], g_alpha[13][4], g_alpha[13][5], g_alpha[13][6], g_alpha[13][7] ), + _mm_setr_epi16( g_alpha[14][0], g_alpha[14][1], 
g_alpha[14][2], g_alpha[14][3], g_alpha[14][4], g_alpha[14][5], g_alpha[14][6], g_alpha[14][7] ), + _mm_setr_epi16( g_alpha[15][0], g_alpha[15][1], g_alpha[15][2], g_alpha[15][3], g_alpha[15][4], g_alpha[15][5], g_alpha[15][6], g_alpha[15][7] ), +}; + +const __m128i g_alphaRange_SIMD = _mm_setr_epi16( + g_alphaRange[0], + g_alphaRange[1], + g_alphaRange[4], + g_alphaRange[5], + g_alphaRange[8], + g_alphaRange[14], + 0, + 0 ); + +#endif + diff --git a/thirdparty/etcpak/Tables.hpp b/thirdparty/etcpak/Tables.hpp new file mode 100644 index 000000000000..6a0128aa4dc8 --- /dev/null +++ b/thirdparty/etcpak/Tables.hpp @@ -0,0 +1,32 @@ +#ifndef __TABLES_HPP__ +#define __TABLES_HPP__ + +#include + +#ifdef __SSE4_1__ +#include +#endif + +extern const int32_t g_table[8][4]; +extern const int64_t g_table256[8][4]; + +extern const uint32_t g_id[4][16]; + +extern const uint32_t g_avg2[16]; + +extern const uint32_t g_flags[64]; + +extern const int32_t g_alpha[16][8]; +extern const int32_t g_alphaRange[16]; + +#ifdef __SSE4_1__ +extern const uint8_t g_flags_AVX2[64]; +extern const __m128i g_table_SIMD[2]; +extern const __m128i g_table128_SIMD[2]; +extern const __m128i g_table256_SIMD[4]; + +extern const __m128i g_alpha_SIMD[16]; +extern const __m128i g_alphaRange_SIMD; +#endif + +#endif diff --git a/thirdparty/etcpak/TaskDispatch.cpp b/thirdparty/etcpak/TaskDispatch.cpp new file mode 100644 index 000000000000..c330b1451eaf --- /dev/null +++ b/thirdparty/etcpak/TaskDispatch.cpp @@ -0,0 +1,113 @@ +#include +#include + +#include "Debug.hpp" +#include "System.hpp" +#include "TaskDispatch.hpp" + +static TaskDispatch* s_instance = nullptr; + +TaskDispatch::TaskDispatch( size_t workers ) + : m_exit( false ) + , m_jobs( 0 ) +{ + assert( !s_instance ); + s_instance = this; + + assert( workers >= 1 ); + workers--; + + m_workers.reserve( workers ); + for( size_t i=0; i& f ) +{ + std::unique_lock lock( s_instance->m_queueLock ); + s_instance->m_queue.emplace_back( f ); + const auto size = s_instance->m_queue.size(); + lock.unlock(); + if( size > 1 ) + { + s_instance->m_cvWork.notify_one(); + } +} + +void TaskDispatch::Queue( std::function&& f ) +{ + std::unique_lock lock( s_instance->m_queueLock ); + s_instance->m_queue.emplace_back( std::move( f ) ); + const auto size = s_instance->m_queue.size(); + lock.unlock(); + if( size > 1 ) + { + s_instance->m_cvWork.notify_one(); + } +} + +void TaskDispatch::Sync() +{ + std::unique_lock lock( s_instance->m_queueLock ); + while( !s_instance->m_queue.empty() ) + { + auto f = s_instance->m_queue.back(); + s_instance->m_queue.pop_back(); + lock.unlock(); + f(); + lock.lock(); + } + s_instance->m_cvJobs.wait( lock, []{ return s_instance->m_jobs == 0; } ); +} + +void TaskDispatch::Worker() +{ + for(;;) + { + std::unique_lock lock( m_queueLock ); + m_cvWork.wait( lock, [this]{ return !m_queue.empty() || m_exit; } ); + if( m_exit ) return; + auto f = m_queue.back(); + m_queue.pop_back(); + m_jobs++; + lock.unlock(); + f(); + lock.lock(); + m_jobs--; + bool notify = m_jobs == 0 && m_queue.empty(); + lock.unlock(); + if( notify ) + { + m_cvJobs.notify_all(); + } + } +} diff --git a/thirdparty/etcpak/TaskDispatch.hpp b/thirdparty/etcpak/TaskDispatch.hpp new file mode 100644 index 000000000000..b513de4c0c5c --- /dev/null +++ b/thirdparty/etcpak/TaskDispatch.hpp @@ -0,0 +1,34 @@ +#ifndef __DARKRL__TASKDISPATCH_HPP__ +#define __DARKRL__TASKDISPATCH_HPP__ + +#include +#include +#include +#include +#include +#include + +class TaskDispatch +{ +public: + TaskDispatch( size_t workers ); + 
~TaskDispatch(); + + static void Queue( const std::function& f ); + static void Queue( std::function&& f ); + + static void Sync(); + +private: + void Worker(); + + std::vector> m_queue; + std::mutex m_queueLock; + std::condition_variable m_cvWork, m_cvJobs; + std::atomic m_exit; + size_t m_jobs; + + std::vector m_workers; +}; + +#endif diff --git a/thirdparty/etcpak/Timing.cpp b/thirdparty/etcpak/Timing.cpp new file mode 100644 index 000000000000..2af851f9a918 --- /dev/null +++ b/thirdparty/etcpak/Timing.cpp @@ -0,0 +1,8 @@ +#include + +#include "Timing.hpp" + +uint64_t GetTime() +{ + return std::chrono::time_point_cast( std::chrono::high_resolution_clock::now() ).time_since_epoch().count(); +} diff --git a/thirdparty/etcpak/Timing.hpp b/thirdparty/etcpak/Timing.hpp new file mode 100644 index 000000000000..3767e20f24a8 --- /dev/null +++ b/thirdparty/etcpak/Timing.hpp @@ -0,0 +1,8 @@ +#ifndef __DARKRL__TIMING_HPP__ +#define __DARKRL__TIMING_HPP__ + +#include + +uint64_t GetTime(); + +#endif diff --git a/thirdparty/etcpak/Vector.hpp b/thirdparty/etcpak/Vector.hpp new file mode 100644 index 000000000000..d765caa1c762 --- /dev/null +++ b/thirdparty/etcpak/Vector.hpp @@ -0,0 +1,222 @@ +#ifndef __DARKRL__VECTOR_HPP__ +#define __DARKRL__VECTOR_HPP__ + +#include +#include +#include +#include + +#include "Math.hpp" + +template +struct EtcpakVector2 +{ + EtcpakVector2() : x( 0 ), y( 0 ) {} + EtcpakVector2( T v ) : x( v ), y( v ) {} + EtcpakVector2( T _x, T _y ) : x( _x ), y( _y ) {} + + bool operator==( const EtcpakVector2& rhs ) const { return x == rhs.x && y == rhs.y; } + bool operator!=( const EtcpakVector2& rhs ) const { return !( *this == rhs ); } + + EtcpakVector2& operator+=( const EtcpakVector2& rhs ) + { + x += rhs.x; + y += rhs.y; + return *this; + } + EtcpakVector2& operator-=( const EtcpakVector2& rhs ) + { + x -= rhs.x; + y -= rhs.y; + return *this; + } + EtcpakVector2& operator*=( const EtcpakVector2& rhs ) + { + x *= rhs.x; + y *= rhs.y; + return *this; + } + + T x, y; +}; + +template +EtcpakVector2 operator+( const EtcpakVector2& lhs, const EtcpakVector2& rhs ) +{ + return EtcpakVector2( lhs.x + rhs.x, lhs.y + rhs.y ); +} + +template +EtcpakVector2 operator-( const EtcpakVector2& lhs, const EtcpakVector2& rhs ) +{ + return EtcpakVector2( lhs.x - rhs.x, lhs.y - rhs.y ); +} + +template +EtcpakVector2 operator*( const EtcpakVector2& lhs, const float& rhs ) +{ + return EtcpakVector2( lhs.x * rhs, lhs.y * rhs ); +} + +template +EtcpakVector2 operator/( const EtcpakVector2& lhs, const T& rhs ) +{ + return EtcpakVector2( lhs.x / rhs, lhs.y / rhs ); +} + + +typedef EtcpakVector2 v2i; +typedef EtcpakVector2 v2f; + + +template +struct EtcpakVector3 +{ + EtcpakVector3() : x( 0 ), y( 0 ), z( 0 ) {} + EtcpakVector3( T v ) : x( v ), y( v ), z( v ) {} + EtcpakVector3( T _x, T _y, T _z ) : x( _x ), y( _y ), z( _z ) {} + template + EtcpakVector3( const EtcpakVector3& v ) : x( T( v.x ) ), y( T( v.y ) ), z( T( v.z ) ) {} + + T Luminance() const { return T( x * 0.3f + y * 0.59f + z * 0.11f ); } + void Clamp() + { + x = std::min( T(1), std::max( T(0), x ) ); + y = std::min( T(1), std::max( T(0), y ) ); + z = std::min( T(1), std::max( T(0), z ) ); + } + + bool operator==( const EtcpakVector3& rhs ) const { return x == rhs.x && y == rhs.y && z == rhs.z; } + bool operator!=( const EtcpakVector2& rhs ) const { return !( *this == rhs ); } + + T& operator[]( unsigned int idx ) { assert( idx < 3 ); return ((T*)this)[idx]; } + const T& operator[]( unsigned int idx ) const { assert( idx < 3 ); return 
((T*)this)[idx]; } + + EtcpakVector3 operator+=( const EtcpakVector3& rhs ) + { + x += rhs.x; + y += rhs.y; + z += rhs.z; + return *this; + } + + EtcpakVector3 operator*=( const EtcpakVector3& rhs ) + { + x *= rhs.x; + y *= rhs.y; + z *= rhs.z; + return *this; + } + + EtcpakVector3 operator*=( const float& rhs ) + { + x *= rhs; + y *= rhs; + z *= rhs; + return *this; + } + + T x, y, z; + T padding; +}; + +template +EtcpakVector3 operator+( const EtcpakVector3& lhs, const EtcpakVector3& rhs ) +{ + return EtcpakVector3( lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z ); +} + +template +EtcpakVector3 operator-( const EtcpakVector3& lhs, const EtcpakVector3& rhs ) +{ + return EtcpakVector3( lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z ); +} + +template +EtcpakVector3 operator*( const EtcpakVector3& lhs, const EtcpakVector3& rhs ) +{ + return EtcpakVector3( lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z ); +} + +template +EtcpakVector3 operator*( const EtcpakVector3& lhs, const float& rhs ) +{ + return EtcpakVector3( T( lhs.x * rhs ), T( lhs.y * rhs ), T( lhs.z * rhs ) ); +} + +template +EtcpakVector3 operator/( const EtcpakVector3& lhs, const T& rhs ) +{ + return EtcpakVector3( lhs.x / rhs, lhs.y / rhs, lhs.z / rhs ); +} + +template +bool operator<( const EtcpakVector3& lhs, const EtcpakVector3& rhs ) +{ + return lhs.Luminance() < rhs.Luminance(); +} + +typedef EtcpakVector3 v3i; +typedef EtcpakVector3 v3f; +typedef EtcpakVector3 v3b; + + +static inline v3b v3f_to_v3b( const v3f& v ) +{ + return v3b( uint8_t( std::min( 1.f, v.x ) * 255 ), uint8_t( std::min( 1.f, v.y ) * 255 ), uint8_t( std::min( 1.f, v.z ) * 255 ) ); +} + +template +EtcpakVector3 Mix( const EtcpakVector3& v1, const EtcpakVector3& v2, float amount ) +{ + return v1 + ( v2 - v1 ) * amount; +} + +template<> +inline v3b Mix( const v3b& v1, const v3b& v2, float amount ) +{ + return v3b( v3f( v1 ) + ( v3f( v2 ) - v3f( v1 ) ) * amount ); +} + +template +EtcpakVector3 Desaturate( const EtcpakVector3& v ) +{ + T l = v.Luminance(); + return EtcpakVector3( l, l, l ); +} + +template +EtcpakVector3 Desaturate( const EtcpakVector3& v, float mul ) +{ + T l = T( v.Luminance() * mul ); + return EtcpakVector3( l, l, l ); +} + +template +EtcpakVector3 pow( const EtcpakVector3& base, float exponent ) +{ + return EtcpakVector3( + pow( base.x, exponent ), + pow( base.y, exponent ), + pow( base.z, exponent ) ); +} + +template +EtcpakVector3 sRGB2linear( const EtcpakVector3& v ) +{ + return EtcpakVector3( + sRGB2linear( v.x ), + sRGB2linear( v.y ), + sRGB2linear( v.z ) ); +} + +template +EtcpakVector3 linear2sRGB( const EtcpakVector3& v ) +{ + return EtcpakVector3( + linear2sRGB( v.x ), + linear2sRGB( v.y ), + linear2sRGB( v.z ) ); +} + +#endif diff --git a/thirdparty/etcpak/bitbucket-pipelines.yml b/thirdparty/etcpak/bitbucket-pipelines.yml new file mode 100644 index 000000000000..c8d94963c356 --- /dev/null +++ b/thirdparty/etcpak/bitbucket-pipelines.yml @@ -0,0 +1,12 @@ +# This is a sample build configuration for C++. +# Check our guides at https://confluence.atlassian.com/x/VYk8Lw for more examples. +# Only use spaces to indent your .yml configuration. +# ----- +# You can specify a custom docker image from Docker Hub as your build environment. +image: gcc:6.1 + +pipelines: + default: + - step: + script: # Modify the commands below to build your repository. 
+ - make -C unix diff --git a/thirdparty/etcpak/lz4/lz4.c b/thirdparty/etcpak/lz4/lz4.c new file mode 100644 index 000000000000..08cf6b5cd72b --- /dev/null +++ b/thirdparty/etcpak/lz4/lz4.c @@ -0,0 +1,1516 @@ +/* + LZ4 - Fast LZ compression algorithm + Copyright (C) 2011-2015, Yann Collet. + + BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + You can contact the author at : + - LZ4 source repository : https://github.com/Cyan4973/lz4 + - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c +*/ + + +/************************************** +* Tuning parameters +**************************************/ +/* + * HEAPMODE : + * Select how default compression functions will allocate memory for their hash table, + * in memory stack (0:default, fastest), or in memory heap (1:requires malloc()). 
+ */ +#define HEAPMODE 0 + +/* + * ACCELERATION_DEFAULT : + * Select "acceleration" for LZ4_compress_fast() when parameter value <= 0 + */ +#define ACCELERATION_DEFAULT 1 + + +/************************************** +* CPU Feature Detection +**************************************/ +/* + * LZ4_FORCE_SW_BITCOUNT + * Define this parameter if your target system or compiler does not support hardware bit count + */ +#if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ +# define LZ4_FORCE_SW_BITCOUNT +#endif + + +/************************************** +* Includes +**************************************/ +#include "lz4.h" + + +/************************************** +* Compiler Options +**************************************/ +#ifdef _MSC_VER /* Visual Studio */ +# define FORCE_INLINE static __forceinline +# include +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +# pragma warning(disable : 4293) /* disable: C4293: too large shift (32-bits) */ +#else +# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# if defined(__GNUC__) || defined(__clang__) +# define FORCE_INLINE static inline __attribute__((always_inline)) +# else +# define FORCE_INLINE static inline +# endif +# else +# define FORCE_INLINE static +# endif /* __STDC_VERSION__ */ +#endif /* _MSC_VER */ + +/* LZ4_GCC_VERSION is defined into lz4.h */ +#if (LZ4_GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) +# define expect(expr,value) (__builtin_expect ((expr),(value)) ) +#else +# define expect(expr,value) (expr) +#endif + +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + + +/************************************** +* Memory routines +**************************************/ +#include /* malloc, calloc, free */ +#define ALLOCATOR(n,s) calloc(n,s) +#define FREEMEM free +#include /* memset, memcpy */ +#define MEM_INIT memset + + +/************************************** +* Basic Types +**************************************/ +#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ +# include + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; +#else + typedef unsigned char BYTE; + typedef unsigned short U16; + typedef unsigned int U32; + typedef signed int S32; + typedef unsigned long long U64; +#endif + + +/************************************** +* Reading and writing into memory +**************************************/ +#define STEPSIZE sizeof(size_t) + +static unsigned LZ4_64bits(void) { return sizeof(void*)==8; } + +static unsigned LZ4_isLittleEndian(void) +{ + const union { U32 i; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + + +static U16 LZ4_read16(const void* memPtr) +{ + U16 val16; + memcpy(&val16, memPtr, 2); + return val16; +} + +static U16 LZ4_readLE16(const void* memPtr) +{ + if (LZ4_isLittleEndian()) + { + return LZ4_read16(memPtr); + } + else + { + const BYTE* p = (const BYTE*)memPtr; + return (U16)((U16)p[0] + (p[1]<<8)); + } +} + +static void LZ4_writeLE16(void* memPtr, U16 value) +{ + if (LZ4_isLittleEndian()) + { + memcpy(memPtr, &value, 2); + } + else + { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE) value; + p[1] = (BYTE)(value>>8); + } +} + +static U32 LZ4_read32(const void* memPtr) +{ + U32 val32; + memcpy(&val32, memPtr, 4); + return val32; +} + +static U64 LZ4_read64(const void* memPtr) +{ + U64 
val64; + memcpy(&val64, memPtr, 8); + return val64; +} + +static size_t LZ4_read_ARCH(const void* p) +{ + if (LZ4_64bits()) + return (size_t)LZ4_read64(p); + else + return (size_t)LZ4_read32(p); +} + + +static void LZ4_copy4(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 4); } + +static void LZ4_copy8(void* dstPtr, const void* srcPtr) { memcpy(dstPtr, srcPtr, 8); } + +/* customized version of memcpy, which may overwrite up to 7 bytes beyond dstEnd */ +static void LZ4_wildCopy(void* dstPtr, const void* srcPtr, void* dstEnd) +{ + BYTE* d = (BYTE*)dstPtr; + const BYTE* s = (const BYTE*)srcPtr; + BYTE* e = (BYTE*)dstEnd; + do { LZ4_copy8(d,s); d+=8; s+=8; } while (d>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r; + _BitScanForward( &r, (U32)val ); + return (int)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } + else /* Big Endian CPU */ + { + if (LZ4_64bits()) + { +# if defined(_MSC_VER) && defined(_WIN64) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse64( &r, val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clzll((U64)val) >> 3); +# else + unsigned r; + if (!(val>>32)) { r=4; } else { r=0; val>>=32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } + else /* 32 bits */ + { +# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) + unsigned long r = 0; + _BitScanReverse( &r, (unsigned long)val ); + return (unsigned)(r>>3); +# elif (defined(__clang__) || (LZ4_GCC_VERSION >= 304)) && !defined(LZ4_FORCE_SW_BITCOUNT) + return (__builtin_clz((U32)val) >> 3); +# else + unsigned r; + if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } + r += (!val); + return r; +# endif + } + } +} + +static unsigned LZ4_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* pInLimit) +{ + const BYTE* const pStart = pIn; + + while (likely(pIn compression run slower on incompressible data */ + + +/************************************** +* Local Structures and types +**************************************/ +typedef struct { + U32 hashTable[HASH_SIZE_U32]; + U32 currentOffset; + U32 initCheck; + const BYTE* dictionary; + BYTE* bufferStart; /* obsolete, used for slideInputBuffer */ + U32 dictSize; +} LZ4_stream_t_internal; + +typedef enum { notLimited = 0, limitedOutput = 1 } limitedOutput_directive; +typedef enum { byPtr, byU32, byU16 } tableType_t; + +typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; +typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; + +typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; 
+typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+
+
+/**************************************
+*  Local Utils
+**************************************/
+int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; }
+int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); }
+int LZ4_sizeofState() { return LZ4_STREAMSIZE; }
+
+
+
+/********************************
+*  Compression functions
+********************************/
+
+static U32 LZ4_hashSequence(U32 sequence, tableType_t const tableType)
+{
+    if (tableType == byU16)
+        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1)));
+    else
+        return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG));
+}
+
+static const U64 prime5bytes = 889523592379ULL;
+static U32 LZ4_hashSequence64(size_t sequence, tableType_t const tableType)
+{
+    const U32 hashLog = (tableType == byU16) ? LZ4_HASHLOG+1 : LZ4_HASHLOG;
+    const U32 hashMask = (1<<hashLog) - 1;
+    return ((sequence * prime5bytes) >> (40 - hashLog)) & hashMask;
+}
+
+static U32 LZ4_hashSequenceT(size_t sequence, tableType_t const tableType)
+{
+    if (LZ4_64bits())
+        return LZ4_hashSequence64(sequence, tableType);
+    return LZ4_hashSequence((U32)sequence, tableType);
+}
+
+static U32 LZ4_hashPosition(const void* p, tableType_t tableType) { return LZ4_hashSequenceT(LZ4_read_ARCH(p), tableType); }
+
+static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t const tableType, const BYTE* srcBase)
+{
+    switch (tableType)
+    {
+    case byPtr: { const BYTE** hashTable = (const BYTE**)tableBase; hashTable[h] = p; return; }
+    case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); return; }
+    case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); return; }
+    }
+}
+
+static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    U32 h = LZ4_hashPosition(p, tableType);
+    LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
+}
+
+static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; }
+    if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; }
+    { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; }   /* default, to ensure a return */
+}
+
+static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    U32 h = LZ4_hashPosition(p, tableType);
+    return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
+}
+
+FORCE_INLINE int LZ4_compress_generic(
+                 void* const ctx,
+                 const char* const source,
+                 char* const dest,
+                 const int inputSize,
+                 const int maxOutputSize,
+                 const limitedOutput_directive outputLimited,
+                 const tableType_t tableType,
+                 const dict_directive dict,
+                 const dictIssue_directive dictIssue,
+                 const U32 acceleration)
+{
+    LZ4_stream_t_internal* const dictPtr = (LZ4_stream_t_internal*)ctx;
+
+    const BYTE* ip = (const BYTE*) source;
+    const BYTE* base;
+    const BYTE* lowLimit;
+    const BYTE* const lowRefLimit = ip - dictPtr->dictSize;
+    const BYTE* const dictionary = dictPtr->dictionary;
+    const BYTE* const dictEnd = dictionary + dictPtr->dictSize;
+    const size_t dictDelta = dictEnd - (const BYTE*)source;
+    const BYTE* anchor = (const BYTE*) source;
+    const BYTE* const iend = ip + inputSize;
+    const BYTE* const mflimit = iend - MFLIMIT;
+    const BYTE* const matchlimit = iend - LASTLITERALS;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const olimit = op + maxOutputSize;
+
+    U32 forwardH;
+    size_t refDelta=0;
+
+    /* Init conditions */
+    if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0;   /* Unsupported input size, too large (or negative) */
+    switch(dict)
+    {
+    case noDict:
+    default:
+        base = (const BYTE*)source;
+        lowLimit = (const BYTE*)source;
+        break;
+    case withPrefix64k:
+        base = (const BYTE*)source - dictPtr->currentOffset;
+        lowLimit = (const BYTE*)source - dictPtr->dictSize;
+        break;
+    case usingExtDict:
+        base = (const BYTE*)source - dictPtr->currentOffset;
+        lowLimit = (const BYTE*)source;
+        break;
+    }
+    if ((tableType == byU16) && (inputSize>=LZ4_64Klimit)) return 0;   /* Size too large (not within 64K limit) */
+    if (inputSize<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */
+
+    /* First Byte */
+    LZ4_putPosition(ip, ctx, tableType, base);
+    ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+    /* Main Loop */
+    for ( ; ; )
+    {
+        const BYTE* match;
+        BYTE* token;
+
+        /* Find a match */
+        {
+            const BYTE* forwardIp = ip;
+            unsigned step = 1;
+            unsigned searchMatchNb = acceleration << LZ4_skipTrigger;
+            do {
+                U32 h = forwardH;
+                ip = forwardIp;
+                forwardIp += step;
+                step = (searchMatchNb++ >> LZ4_skipTrigger);
+
+                if (unlikely(forwardIp > mflimit)) goto _last_literals;
+
+                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
+                if (dict==usingExtDict)
+                {
+                    if (match<(const BYTE*)source)
+                    {
+                        refDelta = dictDelta;
+                        lowLimit = dictionary;
+                    }
+                    else
+                    {
+                        refDelta = 0;
+                        lowLimit = (const BYTE*)source;
+                    }
+                }
+                forwardH = LZ4_hashPosition(forwardIp, tableType);
+                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+            } while ( ((dictIssue==dictSmall) ? (match < lowRefLimit) : 0)
+                || ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
+                || (LZ4_read32(match+refDelta) != LZ4_read32(ip)) );
+        }
+
+        /* Catch up */
+        while ((ip>anchor) && (match+refDelta > lowLimit) && (unlikely(ip[-1]==match[refDelta-1]))) { ip--; match--; }
+
+        {
+            /* Encode Literal length */
+            unsigned litLength = (unsigned)(ip - anchor);
+            token = op++;
+            if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit)))
+                return 0;   /* Check output limit */
+            if (litLength>=RUN_MASK)
+            {
+                int len = (int)litLength-RUN_MASK;
+                *token=(RUN_MASK<<ML_BITS);
+                for(; len >= 255 ; len-=255) *op++ = 255;
+                *op++ = (BYTE)len;
+            }
+            else *token = (BYTE)(litLength<<ML_BITS);
+
+            /* Copy Literals */
+            LZ4_wildCopy(op, anchor, op+litLength);
+            op+=litLength;
+        }
+
+_next_match:
+        /* Encode Offset */
+        LZ4_writeLE16(op, (U16)(ip-match)); op+=2;
+
+        /* Encode MatchLength */
+        {
+            unsigned matchLength;
+
+            if ((dict==usingExtDict) && (lowLimit==dictionary))
+            {
+                const BYTE* limit;
+                match += refDelta;
+                limit = ip + (dictEnd-match);
+                if (limit > matchlimit) limit = matchlimit;
+                matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, limit);
+                ip += MINMATCH + matchLength;
+                if (ip==limit)
+                {
+                    unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit);
+                    matchLength += more;
+                    ip += more;
+                }
+            }
+            else
+            {
+                matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
+                ip += MINMATCH + matchLength;
+            }
+
+            if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit)))
+                return 0;   /* Check output limit */
+            if (matchLength>=ML_MASK)
+            {
+                *token += ML_MASK;
+                matchLength -= ML_MASK;
+                for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; }
+                if (matchLength >= 255) { matchLength-=255; *op++ = 255; }
+                *op++ = (BYTE)matchLength;
+            }
+            else *token += (BYTE)(matchLength);
+        }
+
+        anchor = ip;
+
+        /* Test end of chunk */
+        if (ip > mflimit) break;
+
+        /* Fill table */
+        LZ4_putPosition(ip-2, ctx, tableType, base);
+
+        /* Test next position */
+        match = LZ4_getPosition(ip, ctx, tableType, base);
+        if (dict==usingExtDict)
+        {
+            if (match<(const BYTE*)source)
+            {
+                refDelta = dictDelta;
+                lowLimit = dictionary;
+            }
+            else
+            {
+                refDelta = 0;
+                lowLimit = (const BYTE*)source;
+            }
+        }
+        LZ4_putPosition(ip, ctx, tableType, base);
+        if ( ((dictIssue==dictSmall) ? (match>=lowRefLimit) : 1)
+            && (match+MAX_DISTANCE>=ip)
+            && (LZ4_read32(match+refDelta)==LZ4_read32(ip)) )
+        { token=op++; *token=0; goto _next_match; }
+
+        /* Prepare next loop */
+        forwardH = LZ4_hashPosition(++ip, tableType);
+    }
+
+_last_literals:
+    /* Encode Last Literals */
+    {
+        const size_t lastRun = (size_t)(iend - anchor);
+        if ((outputLimited) && ((op - (BYTE*)dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize))
+            return 0;   /* Check output limit */
+        if (lastRun >= RUN_MASK)
+        {
+            size_t accumulator = lastRun - RUN_MASK;
+            *op++ = RUN_MASK << ML_BITS;
+            for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
+            *op++ = (BYTE) accumulator;
+        }
+        else
+        {
+            *op++ = (BYTE)(lastRun<<ML_BITS);
+        }
+        memcpy(op, anchor, lastRun);
+        op += lastRun;
+    }
+
+    /* End */
+    return (int) (((char*)op)-dest);
+}
+
+
+int LZ4_compress_fast_extState(void* state, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_resetStream((LZ4_stream_t*)state);
+    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
+
+    if (maxOutputSize >= LZ4_compressBound(inputSize))
+    {
+        if (inputSize < LZ4_64Klimit)
+            return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, byU16, noDict, noDictIssue, acceleration);
+        else
+            return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+    }
+    else
+    {
+        if (inputSize < LZ4_64Klimit)
+            return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration);
+        else
+            return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+    }
+}
+
+
+int LZ4_compress_fast(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+#if (HEAPMODE)
+    void* ctxPtr = ALLOCATOR(1, sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
+#else
+    LZ4_stream_t ctx;
+    void* ctxPtr = &ctx;
+#endif
+
+    int result = LZ4_compress_fast_extState(ctxPtr, source, dest, inputSize, maxOutputSize, acceleration);
+
+#if (HEAPMODE)
+    FREEMEM(ctxPtr);
+#endif
+    return result;
+}
+
+
+int LZ4_compress_default(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+    return LZ4_compress_fast(source, dest, inputSize, maxOutputSize, 1);
+}
+
+
+/* hidden debug function */
+/* strangely enough, gcc generates faster code when this function is uncommented, even if unused */
+int LZ4_compress_fast_force(const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_stream_t ctx;
+
+    LZ4_resetStream(&ctx);
+
+    if (inputSize < LZ4_64Klimit)
+        return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue, acceleration);
+    else
+        return LZ4_compress_generic(&ctx, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64bits() ? byU32 : byPtr, noDict, noDictIssue, acceleration);
+}
+
+
+/********************************
+*  destSize variant
+********************************/
+
+static int LZ4_compress_destSize_generic(
+                 void* const ctx,
+                 const char* const src,
+                 char* const dst,
+                 int* const srcSizePtr,
+                 const int targetDstSize,
+                 const tableType_t tableType)
+{
+    const BYTE* ip = (const BYTE*) src;
+    const BYTE* base = (const BYTE*) src;
+    const BYTE* lowLimit = (const BYTE*) src;
+    const BYTE* anchor = ip;
+    const BYTE* const iend = ip + *srcSizePtr;
+    const BYTE* const mflimit = iend - MFLIMIT;
+    const BYTE* const matchlimit = iend - LASTLITERALS;
+
+    BYTE* op = (BYTE*) dst;
+    BYTE* const oend = op + targetDstSize;
+    BYTE* const oMaxLit = op + targetDstSize - 2 /* offset */ - 8 /* because 8+MINMATCH==MFLIMIT */ - 1 /* token */;
+    BYTE* const oMaxMatch = op + targetDstSize - (LASTLITERALS + 1 /* token */);
+    BYTE* const oMaxSeq = oMaxLit - 1 /* token */;
+
+    U32 forwardH;
+
+
+    /* Init conditions */
+    if (targetDstSize < 1) return 0;                                     /* Impossible to store anything */
+    if ((U32)*srcSizePtr > (U32)LZ4_MAX_INPUT_SIZE) return 0;            /* Unsupported input size, too large (or negative) */
+    if ((tableType == byU16) && (*srcSizePtr>=LZ4_64Klimit)) return 0;   /* Size too large (not within 64K limit) */
+    if (*srcSizePtr<LZ4_minLength) goto _last_literals;                  /* Input too small, no compression (all literals) */
+
+    /* First Byte */
+    *srcSizePtr = 0;
+    LZ4_putPosition(ip, ctx, tableType, base);
+    ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+    /* Main Loop */
+    for ( ; ; )
+    {
+        const BYTE* match;
+        BYTE* token;
+
+        /* Find a match */
+        {
+            const BYTE* forwardIp = ip;
+            unsigned step = 1;
+            unsigned searchMatchNb = 1 << LZ4_skipTrigger;
+
+            do {
+                U32 h = forwardH;
+                ip = forwardIp;
+                forwardIp += step;
+                step = (searchMatchNb++ >> LZ4_skipTrigger);
+
+                if (unlikely(forwardIp > mflimit))
+                    goto _last_literals;
+
+                match = LZ4_getPositionOnHash(h, ctx, tableType, base);
+                forwardH = LZ4_hashPosition(forwardIp, tableType);
+                LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+            } while ( ((tableType==byU16) ? 0 : (match + MAX_DISTANCE < ip))
+                || (LZ4_read32(match) != LZ4_read32(ip)) );
+        }
+
+        /* Catch up */
+        while ((ip>anchor) && (match > lowLimit) && (unlikely(ip[-1]==match[-1]))) { ip--; match--; }
+
+        {
+            /* Encode Literal length */
+            unsigned litLength = (unsigned)(ip - anchor);
+            token = op++;
+            if (op + ((litLength+240)/255) + litLength > oMaxLit)
+            {
+                /* Not enough space for a last match */
+                op--;
+                goto _last_literals;
+            }
+            if (litLength>=RUN_MASK)
+            {
+                unsigned len = litLength - RUN_MASK;
+                *token=(RUN_MASK<<ML_BITS);
+                for(; len >= 255 ; len-=255) *op++ = 255;
+                *op++ = (BYTE)len;
+            }
+            else *token = (BYTE)(litLength<<ML_BITS);
+
+            /* Copy Literals */
+            LZ4_wildCopy(op, anchor, op+litLength);
+            op += litLength;
+        }
+
+_next_match:
+        /* Encode Offset */
+        LZ4_writeLE16(op, (U16)(ip-match)); op+=2;
+
+        /* Encode MatchLength */
+        {
+            size_t matchLength;
+
+            matchLength = LZ4_count(ip+MINMATCH, match+MINMATCH, matchlimit);
+
+            if (op + ((matchLength+240)/255) > oMaxMatch)
+            {
+                /* Match description too long : reduce it */
+                matchLength = (15-1) + (oMaxMatch-op) * 255;
+            }
+            //printf("offset %5i, matchLength%5i \n", (int)(ip-match), matchLength + MINMATCH);
+            ip += MINMATCH + matchLength;
+
+            if (matchLength>=ML_MASK)
+            {
+                *token += ML_MASK;
+                matchLength -= ML_MASK;
+                while (matchLength >= 255) { matchLength-=255; *op++ = 255; }
+                *op++ = (BYTE)matchLength;
+            }
+            else *token += (BYTE)(matchLength);
+        }
+
+        anchor = ip;
+
+        /* Test end of block */
+        if (ip > mflimit) break;
+        if (op > oMaxSeq) break;
+
+        /* Fill table */
+        LZ4_putPosition(ip-2, ctx, tableType, base);
+
+        /* Test next position */
+        match = LZ4_getPosition(ip, ctx, tableType, base);
+        LZ4_putPosition(ip, ctx, tableType, base);
+        if ( (match+MAX_DISTANCE>=ip)
+            && (LZ4_read32(match)==LZ4_read32(ip)) )
+        { token=op++; *token=0; goto _next_match; }
+
+        /* Prepare next loop */
+        forwardH = LZ4_hashPosition(++ip, tableType);
+    }
+
+_last_literals:
+    /* Encode Last Literals */
+    {
+        size_t lastRunSize = (size_t)(iend - anchor);
+        if (op + 1 /* token */ + ((lastRunSize+240)/255) /* litLength */ + lastRunSize /* literals */ > oend)
+        {
+            /* adapt lastRunSize to fill 'dst' */
+            lastRunSize = (oend-op) - 1;
+            lastRunSize -= (lastRunSize+240)/255;
+        }
+        ip = anchor + lastRunSize;
+
+        if (lastRunSize >= RUN_MASK)
+        {
+            size_t accumulator = lastRunSize - RUN_MASK;
+            *op++ = RUN_MASK << ML_BITS;
+            for(; accumulator >= 255 ; accumulator-=255) *op++ = 255;
+            *op++ = (BYTE) accumulator;
+        }
+        else
+        {
+            *op++ = (BYTE)(lastRunSize<<ML_BITS);
+        }
+        memcpy(op, anchor, lastRunSize);
+        op += lastRunSize;
+    }
+
+    /* End */
+    *srcSizePtr = (int) (((const char*)ip)-src);
+    return (int) (((char*)op)-dst);
+}
+
+
+static int LZ4_compress_destSize_extState (void* state, const char* src, char* dst, int* srcSizePtr, int targetDstSize)
+{
+    LZ4_resetStream((LZ4_stream_t*)state);
+
+    if (targetDstSize >= LZ4_compressBound(*srcSizePtr))   /* compression success is guaranteed */
+    {
+        return LZ4_compress_fast_extState(state, src, dst, *srcSizePtr, targetDstSize, 1);
+    }
+    else
+    {
+        if (*srcSizePtr < LZ4_64Klimit)
+            return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, byU16);
+        else
+            return LZ4_compress_destSize_generic(state, src, dst, srcSizePtr, targetDstSize, LZ4_64bits() ? byU32 : byPtr);
+    }
+}
+
+
+int LZ4_compress_destSize(const char* src, char* dst, int* srcSizePtr, int targetDstSize)
+{
+#if (HEAPMODE)
+    void* ctx = ALLOCATOR(1, sizeof(LZ4_stream_t));   /* malloc-calloc always properly aligned */
+#else
+    LZ4_stream_t ctxBody;
+    void* ctx = &ctxBody;
+#endif
+
+    int result = LZ4_compress_destSize_extState(ctx, src, dst, srcSizePtr, targetDstSize);
+
+#if (HEAPMODE)
+    FREEMEM(ctx);
+#endif
+    return result;
+}
+
+
+
+/********************************
+*  Streaming functions
+********************************/
+
+LZ4_stream_t* LZ4_createStream(void)
+{
+    LZ4_stream_t* lz4s = (LZ4_stream_t*)ALLOCATOR(8, LZ4_STREAMSIZE_U64);
+    LZ4_STATIC_ASSERT(LZ4_STREAMSIZE >= sizeof(LZ4_stream_t_internal));   /* A compilation error here means LZ4_STREAMSIZE is not large enough */
+    LZ4_resetStream(lz4s);
+    return lz4s;
+}
+
+void LZ4_resetStream (LZ4_stream_t* LZ4_stream)
+{
+    MEM_INIT(LZ4_stream, 0, sizeof(LZ4_stream_t));
+}
+
+int LZ4_freeStream (LZ4_stream_t* LZ4_stream)
+{
+    FREEMEM(LZ4_stream);
+    return (0);
+}
+
+
+#define HASH_UNIT sizeof(size_t)
+int LZ4_loadDict (LZ4_stream_t* LZ4_dict, const char* dictionary, int dictSize)
+{
+    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
+    const BYTE* p = (const BYTE*)dictionary;
+    const BYTE* const dictEnd = p + dictSize;
+    const BYTE* base;
+
+    if ((dict->initCheck) || (dict->currentOffset > 1 GB))   /* Uninitialized structure, or reuse overflow */
+        LZ4_resetStream(LZ4_dict);
+
+    if (dictSize < (int)HASH_UNIT)
+    {
+        dict->dictionary = NULL;
+        dict->dictSize = 0;
+        return 0;
+    }
+
+    if ((dictEnd - p) > 64 KB) p = dictEnd - 64 KB;
+    dict->currentOffset += 64 KB;
+    base = p - dict->currentOffset;
+    dict->dictionary = p;
+    dict->dictSize = (U32)(dictEnd - p);
+    dict->currentOffset += dict->dictSize;
+
+    while (p <= dictEnd-HASH_UNIT)
+    {
+        LZ4_putPosition(p, dict->hashTable, byU32, base);
+        p+=3;
+    }
+
+    return dict->dictSize;
+}
+
+
+static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src)
+{
+    if ((LZ4_dict->currentOffset > 0x80000000) ||
+        ((size_t)LZ4_dict->currentOffset > (size_t)src))   /* address space overflow */
+    {
+        /* rescale hash table */
+        U32 delta = LZ4_dict->currentOffset - 64 KB;
+        const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize;
+        int i;
+        for (i=0; i<HASH_SIZE_U32; i++)
+        {
+            if (LZ4_dict->hashTable[i] < delta) LZ4_dict->hashTable[i]=0;
+            else LZ4_dict->hashTable[i] -= delta;
+        }
+        LZ4_dict->currentOffset = 64 KB;
+        if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB;
+        LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize;
+    }
+}
+
+
+int LZ4_compress_fast_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, int acceleration)
+{
+    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream;
+    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
+
+    const BYTE* smallest = (const BYTE*) source;
+    if (streamPtr->initCheck) return 0;   /* Uninitialized structure detected */
+    if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd;
+    LZ4_renormDictT(streamPtr, smallest);
+    if (acceleration < 1) acceleration = ACCELERATION_DEFAULT;
+
+    /* Check overlapping input/dictionary space */
+    {
+        const BYTE* sourceEnd = (const BYTE*) source + inputSize;
+        if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd))
+        {
+            streamPtr->dictSize = (U32)(dictEnd - sourceEnd);
+            if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB;
+            if (streamPtr->dictSize < 4) streamPtr->dictSize = 0;
+            streamPtr->dictionary = dictEnd - streamPtr->dictSize;
+        }
+    }
+
+    /* prefix mode : source data follows dictionary */
+    if (dictEnd == (const BYTE*)source)
+    {
+        int result;
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, dictSmall, acceleration);
+        else
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, withPrefix64k, noDictIssue, acceleration);
+        streamPtr->dictSize += (U32)inputSize;
+        streamPtr->currentOffset += (U32)inputSize;
+        return result;
+    }
+
+    /* external dictionary mode */
+    {
+        int result;
+        if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset))
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, dictSmall, acceleration);
+        else
+            result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput, byU32, usingExtDict, noDictIssue, acceleration);
+        streamPtr->dictionary = (const BYTE*)source;
+        streamPtr->dictSize = (U32)inputSize;
+        streamPtr->currentOffset += (U32)inputSize;
+        return result;
+    }
+}
+
+
+/* Hidden debug function, to force external dictionary mode */
+int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize)
+{
+    LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict;
+    int result;
+    const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize;
+
+    const BYTE* smallest = dictEnd;
+    if (smallest > (const BYTE*) source) smallest = (const BYTE*) source;
+    LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest);
+
+    result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue, 1);
+
+    streamPtr->dictionary = (const BYTE*)source;
+    streamPtr->dictSize = (U32)inputSize;
+    streamPtr->currentOffset += (U32)inputSize;
+
+    return result;
+}
+
+
+int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize)
+{
+    LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict;
+    const BYTE* previousDictEnd = dict->dictionary + dict->dictSize;
+
+    if ((U32)dictSize > 64 KB) dictSize = 64 KB;   /* useless to define a dictionary > 64 KB */
+    if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize;
+
+    memmove(safeBuffer, previousDictEnd - dictSize, dictSize);
+
+    dict->dictionary = (const BYTE*)safeBuffer;
+    dict->dictSize = (U32)dictSize;
+
+    return dictSize;
+}
+
+
+
+/*******************************
+*  Decompression functions
+*******************************/
+/*
+ * This generic decompression function cover all use cases.
+ * It shall be instantiated several times, using different sets of directives
+ * Note that it is essential this generic function is really inlined,
+ * in order to remove useless branches during compilation optimization.
+ */
+FORCE_INLINE int LZ4_decompress_generic(
+                 const char* const source,
+                 char* const dest,
+                 int inputSize,
+                 int outputSize,         /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. */
+
+                 int endOnInput,         /* endOnOutputSize, endOnInputSize */
+                 int partialDecoding,    /* full, partial */
+                 int targetOutputSize,   /* only used if partialDecoding==partial */
+                 int dict,               /* noDict, withPrefix64k, usingExtDict */
+                 const BYTE* const lowPrefix,   /* == dest if dict == noDict */
+                 const BYTE* const dictStart,   /* only if dict==usingExtDict */
+                 const size_t dictSize          /* note : = 0 if noDict */
+                 )
+{
+    /* Local Variables */
+    const BYTE* ip = (const BYTE*) source;
+    const BYTE* const iend = ip + inputSize;
+
+    BYTE* op = (BYTE*) dest;
+    BYTE* const oend = op + outputSize;
+    BYTE* cpy;
+    BYTE* oexit = op + targetOutputSize;
+    const BYTE* const lowLimit = lowPrefix - dictSize;
+
+    const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize;
+    const size_t dec32table[] = {4, 1, 2, 1, 4, 4, 4, 4};
+    const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+
+    const int safeDecode = (endOnInput==endOnInputSize);
+    const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB)));
+
+
+    /* Special cases */
+    if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT;   /* targetOutputSize too high => decode everything */
+    if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 0 : -1;   /* Empty output buffer */
+    if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1);
+
+
+    /* Main Loop */
+    while (1)
+    {
+        unsigned token;
+        size_t length;
+        const BYTE* match;
+
+        /* get literal length */
+        token = *ip++;
+        if ((length=(token>>ML_BITS)) == RUN_MASK)
+        {
+            unsigned s;
+            do
+            {
+                s = *ip++;
+                length += s;
+            }
+            while (likely((endOnInput)?ip<iend-RUN_MASK:1) && (s==255));
+            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)(op))) goto _output_error;   /* overflow detection */
+            if ((safeDecode) && unlikely((size_t)(ip+length)<(size_t)(ip))) goto _output_error;   /* overflow detection */
+        }
+
+        /* copy literals */
+        cpy = op+length;
+        if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
+            || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+        {
+            if (partialDecoding)
+            {
+                if (cpy > oend) goto _output_error;   /* Error : write attempt beyond end of output buffer */
+                if ((endOnInput) && (ip+length > iend)) goto _output_error;   /* Error : read attempt beyond end of input buffer */
+            }
+            else
+            {
+                if ((!endOnInput) && (cpy != oend)) goto _output_error;   /* Error : block decoding must stop exactly there */
+                if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error;   /* Error : input must be consumed */
+            }
+            memcpy(op, ip, length);
+            ip += length;
+            op += length;
+            break;   /* Necessarily EOF, due to parsing restrictions */
+        }
+        LZ4_wildCopy(op, ip, cpy);
+        ip += length; op = cpy;
+
+        /* get offset */
+        match = cpy - LZ4_readLE16(ip); ip+=2;
+        if ((checkOffset) && (unlikely(match < lowLimit))) goto _output_error;   /* Error : offset outside destination buffer */
+
+        /* get matchlength */
+        length = token & ML_MASK;
+        if (length == ML_MASK)
+        {
+            unsigned s;
+            do
+            {
+                if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error;
+                s = *ip++;
+                length += s;
+            } while (s==255);
+            if ((safeDecode) && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error;   /* overflow detection */
+        }
+        length += MINMATCH;
+
+        /* check external dictionary */
+        if ((dict==usingExtDict) && (match < lowPrefix))
+        {
+            if (unlikely(op+length > oend-LASTLITERALS)) goto _output_error;   /* doesn't respect parsing restriction */
+
+            if (length <= (size_t)(lowPrefix-match))
+            {
+                /* match can be copied as a single segment from external dictionary */
+                match = dictEnd - (lowPrefix-match);
+                memmove(op, match, length); op += length;
+            }
+            else
+            {
+                /* match encompass external dictionary and current segment */
+                size_t copySize = (size_t)(lowPrefix-match);
+                memcpy(op, dictEnd - copySize, copySize);
+                op += copySize;
+                copySize = length - copySize;
+                if (copySize > (size_t)(op-lowPrefix))   /* overlap within current segment */
+                {
+                    BYTE* const endOfMatch = op + copySize;
+                    const BYTE* copyFrom = lowPrefix;
+                    while (op < endOfMatch) *op++ = *copyFrom++;
+                }
+                else
+                {
+                    memcpy(op, lowPrefix, copySize);
+                    op += copySize;
+                }
+            }
+            continue;
+        }
+
+        /* copy repeated sequence */
+        cpy = op + length;
+        if (unlikely((op-match)<8))
+        {
+            const size_t dec64 = dec64table[op-match];
+            op[0] = match[0];
+            op[1] = match[1];
+            op[2] = match[2];
+            op[3] = match[3];
+            match += dec32table[op-match];
+            LZ4_copy4(op+4, match);
+            op += 8; match -= dec64;
+        } else { LZ4_copy8(op, match); op+=8; match+=8; }
+
+        if (unlikely(cpy>oend-12))
+        {
+            if (cpy > oend-LASTLITERALS) goto _output_error;   /* Error : last LASTLITERALS bytes must be literals */
+            if (op < oend-8)
+            {
+                LZ4_wildCopy(op, match, oend-8);
+                match += (oend-8) - op;
+                op = oend-8;
+            }
+            while (op<cpy) *op++ = *match++;
+        }
+        else
+            LZ4_wildCopy(op, match, cpy);
+        op=cpy;   /* correction */
+    }
+
+    /* end of decoding */
+    if (endOnInput)
+        return (int) (((char*)op)-dest);   /* Nb of output bytes decoded */
+    else
+        return (int) (((const char*)ip)-source);   /* Nb of input bytes read */
+
+    /* Overflow error detected */
+_output_error:
+    return (int) (-(((const char*)ip)-source))-1;
+}
+
+
+int LZ4_decompress_safe(const char* source, char* dest, int compressedSize, int maxDecompressedSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, full, 0, noDict, (BYTE*)dest, NULL, 0);
+}
+
+int LZ4_decompress_safe_partial(const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxDecompressedSize, endOnInputSize, partial, targetOutputSize, noDict, (BYTE*)dest, NULL, 0);
+}
+
+int LZ4_decompress_fast(const char* source, char* dest, int originalSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)(dest - 64 KB), NULL, 64 KB);
+}
+
+
+/* streaming decompression functions */
+
+typedef struct
+{
+    const BYTE* externalDict;
+    size_t extDictSize;
+    const BYTE* prefixEnd;
+    size_t prefixSize;
+} LZ4_streamDecode_t_internal;
+
+/*
+ * If you prefer dynamic allocation methods,
+ * LZ4_createStreamDecode()
+ * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
+ */
+LZ4_streamDecode_t* LZ4_createStreamDecode(void)
+{
+    LZ4_streamDecode_t* lz4s = (LZ4_streamDecode_t*) ALLOCATOR(1, sizeof(LZ4_streamDecode_t));
+    return lz4s;
+}
+
+int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream)
+{
+    FREEMEM(LZ4_stream);
+    return 0;
+}
+
+/*
+ * LZ4_setStreamDecode
+ * Use this function to instruct where to find the dictionary.
+ * This function is not necessary if previous data is still available where it was decoded.
+ * Loading a size of 0 is allowed (same effect as no dictionary).
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    lz4sd->prefixSize = (size_t) dictSize;
+    lz4sd->prefixEnd = (const BYTE*) dictionary + dictSize;
+    lz4sd->externalDict = NULL;
+    lz4sd->extDictSize = 0;
+    return 1;
+}
+
+/*
+*_continue() :
+    These decoding functions allow decompression of multiple blocks in "streaming" mode.
+    Previously decoded blocks must still be available at the memory position where they were decoded.
+    If it's not possible, save the relevant part of decoded data into a safe buffer,
+    and indicate where it stands using LZ4_setStreamDecode()
+*/
+int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    int result;
+
+    if (lz4sd->prefixEnd == (BYTE*)dest)
+    {
+        result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+                                        endOnInputSize, full, 0,
+                                        usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize += result;
+        lz4sd->prefixEnd += result;
+    }
+    else
+    {
+        lz4sd->extDictSize = lz4sd->prefixSize;
+        lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize;
+        result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize,
+                                        endOnInputSize, full, 0,
+                                        usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize = result;
+        lz4sd->prefixEnd = (BYTE*)dest + result;
+    }
+
+    return result;
+}
+
+int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize)
+{
+    LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode;
+    int result;
+
+    if (lz4sd->prefixEnd == (BYTE*)dest)
+    {
+        result = LZ4_decompress_generic(source, dest, 0, originalSize,
+                                        endOnOutputSize, full, 0,
+                                        usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize += originalSize;
+        lz4sd->prefixEnd += originalSize;
+    }
+    else
+    {
+        lz4sd->extDictSize = lz4sd->prefixSize;
+        lz4sd->externalDict = (BYTE*)dest - lz4sd->extDictSize;
+        result = LZ4_decompress_generic(source, dest, 0, originalSize,
+                                        endOnOutputSize, full, 0,
+                                        usingExtDict, (BYTE*)dest, lz4sd->externalDict, lz4sd->extDictSize);
+        if (result <= 0) return result;
+        lz4sd->prefixSize = originalSize;
+        lz4sd->prefixEnd = (BYTE*)dest + originalSize;
+    }
+
+    return result;
+}
+
+
+/*
+Advanced decoding functions :
+*_usingDict() :
+    These decoding functions work the same as "_continue" ones,
+    the dictionary must be explicitly provided within parameters
+*/
+
+FORCE_INLINE int LZ4_decompress_usingDict_generic(const char* source, char* dest, int compressedSize, int maxOutputSize, int safe, const char* dictStart, int dictSize)
+{
+    if (dictSize==0)
+        return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest, NULL, 0);
+    if (dictStart+dictSize == dest)
+    {
+        if (dictSize >= (int)(64 KB - 1))
+            return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, withPrefix64k, (BYTE*)dest-64 KB, NULL, 0);
+        return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, noDict, (BYTE*)dest-dictSize, NULL, 0);
+    }
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, safe, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
+}
+
+int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_usingDict_generic(source, dest, compressedSize, maxOutputSize, 1, dictStart, dictSize);
+}
+
+int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_usingDict_generic(source, dest, 0, originalSize, 0, dictStart, dictSize);
+}
+
+/* debug function */
+int LZ4_decompress_safe_forceExtDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, (BYTE*)dest, (const BYTE*)dictStart, dictSize);
+}
+
+
+/***************************************************
+*  Obsolete Functions
+***************************************************/
+/* obsolete compression functions */
+int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_default(source, dest, inputSize, maxOutputSize); }
+int LZ4_compress(const char* source, char* dest, int inputSize) { return LZ4_compress_default(source, dest, inputSize, LZ4_compressBound(inputSize)); }
+int LZ4_compress_limitedOutput_withState (void* state, const char* src, char* dst, int srcSize, int dstSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, dstSize, 1); }
+int LZ4_compress_withState (void* state, const char* src, char* dst, int srcSize) { return LZ4_compress_fast_extState(state, src, dst, srcSize, LZ4_compressBound(srcSize), 1); }
+int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* src, char* dst, int srcSize, int maxDstSize) { return LZ4_compress_fast_continue(LZ4_stream, src, dst, srcSize, maxDstSize, 1); }
+int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_fast_continue(LZ4_stream, source, dest, inputSize, LZ4_compressBound(inputSize), 1); }
+
+/*
+These function names are deprecated and should no longer be used.
+They are only provided here for compatibility with older user programs.
+- LZ4_uncompress is totally equivalent to LZ4_decompress_fast
+- LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe
+*/
+int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
+int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
+
+
+/* Obsolete Streaming functions */
+
+int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; }
+
+static void LZ4_init(LZ4_stream_t_internal* lz4ds, BYTE* base)
+{
+    MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE);
+    lz4ds->bufferStart = base;
+}
+
+int LZ4_resetStreamState(void* state, char* inputBuffer)
+{
+    if ((((size_t)state) & 3) != 0) return 1;   /* Error : pointer is not aligned on 4-bytes boundary */
+    LZ4_init((LZ4_stream_t_internal*)state, (BYTE*)inputBuffer);
+    return 0;
+}
+
+void* LZ4_create (char* inputBuffer)
+{
+    void* lz4ds = ALLOCATOR(8, LZ4_STREAMSIZE_U64);
+    LZ4_init ((LZ4_stream_t_internal*)lz4ds, (BYTE*)inputBuffer);
+    return lz4ds;
+}
+
+char* LZ4_slideInputBuffer (void* LZ4_Data)
+{
+    LZ4_stream_t_internal* ctx = (LZ4_stream_t_internal*)LZ4_Data;
+    int dictSize = LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)ctx->bufferStart, 64 KB);
+    return (char*)(ctx->bufferStart + dictSize);
+}
+
+/* Obsolete streaming decompression functions */
+
+int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize)
+{
+    return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
+}
+
+int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize)
+{
+    return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, (BYTE*)dest - 64 KB, NULL, 64 KB);
+}
+
+#endif /* LZ4_COMMONDEFS_ONLY */
+
diff --git a/thirdparty/etcpak/lz4/lz4.h b/thirdparty/etcpak/lz4/lz4.h
new file mode 100644
index 000000000000..3e7400225612
--- /dev/null
+++ b/thirdparty/etcpak/lz4/lz4.h
@@ -0,0 +1,360 @@
+/*
+   LZ4 - Fast LZ compression algorithm
+   Header File
+   Copyright (C) 2011-2015, Yann Collet.
+
+   BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions are
+   met:
+
+       * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+       * Redistributions in binary form must reproduce the above
+   copyright notice, this list of conditions and the following disclaimer
+   in the documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+   You can contact the author at :
+   - LZ4 source repository : https://github.com/Cyan4973/lz4
+   - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*
+ * lz4.h provides block compression functions, and gives full buffer control to programmer.
+ * If you need to generate inter-operable compressed data (respecting LZ4 frame specification),
+ * and can let the library handle its own memory, please use lz4frame.h instead.
+*/
+
+/**************************************
+*  Version
+**************************************/
+#define LZ4_VERSION_MAJOR    1    /* for breaking interface changes  */
+#define LZ4_VERSION_MINOR    7    /* for new (non-breaking) interface capabilities */
+#define LZ4_VERSION_RELEASE  1    /* for tweaks, bug-fixes, or development */
+#define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE)
+int LZ4_versionNumber (void);
+
+/**************************************
+*  Tuning parameter
+**************************************/
+/*
+ * LZ4_MEMORY_USAGE :
+ * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+ * Increasing memory usage improves compression ratio
+ * Reduced memory usage can improve speed, due to cache effect
+ * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+ */
+#define LZ4_MEMORY_USAGE 14
+
+
+/**************************************
+*  Simple Functions
+**************************************/
+
+int LZ4_compress_default(const char* source, char* dest, int sourceSize, int maxDestSize);
+int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize);
+
+/*
+LZ4_compress_default() :
+    Compresses 'sourceSize' bytes from buffer 'source'
+    into already allocated 'dest' buffer of size 'maxDestSize'.
+    Compression is guaranteed to succeed if 'maxDestSize' >= LZ4_compressBound(sourceSize).
+    It also runs faster, so it's a recommended setting.
+    If the function cannot compress 'source' into a more limited 'dest' budget,
+    compression stops *immediately*, and the function result is zero.
+    As a consequence, 'dest' content is not valid.
+    This function never writes outside 'dest' buffer, nor read outside 'source' buffer.
+        sourceSize  : Max supported value is LZ4_MAX_INPUT_VALUE
+        maxDestSize : full or partial size of buffer 'dest' (which must be already allocated)
+        return : the number of bytes written into buffer 'dest' (necessarily <= maxOutputSize)
+              or 0 if compression fails
+
+LZ4_decompress_safe() :
+    compressedSize : is the precise full size of the compressed block.
+    maxDecompressedSize : is the size of destination buffer, which must be already allocated.
+    return : the number of bytes decompressed into destination buffer (necessarily <= maxDecompressedSize)
+             If destination buffer is not large enough, decoding will stop and output an error code (<0).
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             This function is protected against buffer overflow exploits, including malicious data packets.
+             It never writes outside output buffer, nor reads outside input buffer.
+*/
+
+
+/**************************************
+*  Advanced Functions
+**************************************/
+#define LZ4_MAX_INPUT_SIZE        0x7E000000   /* 2 113 929 216 bytes */
+#define LZ4_COMPRESSBOUND(isize)  ((unsigned)(isize) > (unsigned)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
+
+/*
+LZ4_compressBound() :
+    Provides the maximum size that LZ4 compression may output in a "worst case" scenario (input data not compressible)
+    This function is primarily useful for memory allocation purposes (destination buffer size).
+    Macro LZ4_COMPRESSBOUND() is also provided for compilation-time evaluation (stack memory allocation for example).
+    Note that LZ4_compress_default() compress faster when dest buffer size is >= LZ4_compressBound(srcSize)
+        inputSize  : max supported value is LZ4_MAX_INPUT_SIZE
+        return : maximum output size in a "worst case" scenario
+              or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
+*/
+int LZ4_compressBound(int inputSize);
+
+/*
+LZ4_compress_fast() :
+    Same as LZ4_compress_default(), but allows to select an "acceleration" factor.
+    The larger the acceleration value, the faster the algorithm, but also the lesser the compression.
+    It's a trade-off. It can be fine tuned, with each successive value providing roughly +~3% to speed.
+    An acceleration value of "1" is the same as regular LZ4_compress_default()
+    Values <= 0 will be replaced by ACCELERATION_DEFAULT (see lz4.c), which is 1.
+*/
+int LZ4_compress_fast (const char* source, char* dest, int sourceSize, int maxDestSize, int acceleration);
+
+
+/*
+LZ4_compress_fast_extState() :
+    Same compression function, just using an externally allocated memory space to store compression state.
+    Use LZ4_sizeofState() to know how much memory must be allocated,
+    and allocate it on 8-bytes boundaries (using malloc() typically).
+    Then, provide it as 'void* state' to compression function.
+*/
+int LZ4_sizeofState(void);
+int LZ4_compress_fast_extState (void* state, const char* source, char* dest, int inputSize, int maxDestSize, int acceleration);
+
+
+/*
+LZ4_compress_destSize() :
+    Reverse the logic, by compressing as much data as possible from 'source' buffer
+    into already allocated buffer 'dest' of size 'targetDestSize'.
+    This function either compresses the entire 'source' content into 'dest' if it's large enough,
+    or fill 'dest' buffer completely with as much data as possible from 'source'.
+        *sourceSizePtr : will be modified to indicate how many bytes where read from 'source' to fill 'dest'.
+                         New value is necessarily <= old value.
+        return : Nb bytes written into 'dest' (necessarily <= targetDestSize)
+              or 0 if compression fails
+*/
+int LZ4_compress_destSize (const char* source, char* dest, int* sourceSizePtr, int targetDestSize);
+
+
+/*
+LZ4_decompress_fast() :
+    originalSize : is the original and therefore uncompressed size
+    return : the number of bytes read from the source buffer (in other words, the compressed size)
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes.
+    note : This function fully respect memory boundaries for properly formed compressed data.
+           It is a bit faster than LZ4_decompress_safe().
+           However, it does not provide any protection against intentionally modified data stream (malicious input).
+           Use this function in trusted environment only (data to decode comes from a trusted source).
+*/
+int LZ4_decompress_fast (const char* source, char* dest, int originalSize);
+
+/*
+LZ4_decompress_safe_partial() :
+    This function decompress a compressed block of size 'compressedSize' at position 'source'
+    into destination buffer 'dest' of size 'maxDecompressedSize'.
+    The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
+    reducing decompression time.
+    return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize)
+       Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
+             Always control how many bytes were decoded.
+             If the source stream is detected malformed, the function will stop decoding and return a negative result.
+             This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+*/
+int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize);
+
+
+/***********************************************
+*  Streaming Compression Functions
+***********************************************/
+#define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
+#define LZ4_STREAMSIZE     (LZ4_STREAMSIZE_U64 * sizeof(long long))
+/*
+ * LZ4_stream_t
+ * information structure to track an LZ4 stream.
+ * important : init this structure content before first use !
+ * note : only allocated directly the structure if you are statically linking LZ4
+ *        If you are using liblz4 as a DLL, please use below construction methods instead.
+ */
+typedef struct { long long table[LZ4_STREAMSIZE_U64]; } LZ4_stream_t;
+
+/*
+ * LZ4_resetStream
+ * Use this function to init an allocated LZ4_stream_t structure
+ */
+void LZ4_resetStream (LZ4_stream_t* streamPtr);
+
+/*
+ * LZ4_createStream will allocate and initialize an LZ4_stream_t structure
+ * LZ4_freeStream releases its memory.
+ * In the context of a DLL (liblz4), please use these methods rather than the static struct.
+ * They are more future proof, in case of a change of LZ4_stream_t size.
+ */
+LZ4_stream_t* LZ4_createStream(void);
+int           LZ4_freeStream (LZ4_stream_t* streamPtr);
+
+/*
+ * LZ4_loadDict
+ * Use this function to load a static dictionary into LZ4_stream.
+ * Any previous data will be forgotten, only 'dictionary' will remain in memory.
+ * Loading a size of 0 is allowed.
+ * Return : dictionary size, in bytes (necessarily <= 64 KB)
+ */
+int LZ4_loadDict (LZ4_stream_t* streamPtr, const char* dictionary, int dictSize);
+
+/*
+ * LZ4_compress_fast_continue
+ * Compress buffer content 'src', using data from previously compressed blocks as dictionary to improve compression ratio.
+ * Important : Previous data blocks are assumed to still be present and unmodified !
+ * 'dst' buffer must be already allocated.
+ * If maxDstSize >= LZ4_compressBound(srcSize), compression is guaranteed to succeed, and runs faster.
+ * If not, and if compressed data cannot fit into 'dst' buffer size, compression stops, and function returns a zero.
+ */
+int LZ4_compress_fast_continue (LZ4_stream_t* streamPtr, const char* src, char* dst, int srcSize, int maxDstSize, int acceleration);
+
+/*
+ * LZ4_saveDict
+ * If previously compressed data block is not guaranteed to remain available at its memory location
+ * save it into a safer place (char* safeBuffer)
+ * Note : you don't need to call LZ4_loadDict() afterwards,
+ *        dictionary is immediately usable, you can therefore call LZ4_compress_fast_continue()
+ * Return : saved dictionary size in bytes (necessarily <= dictSize), or 0 if error
+ */
+int LZ4_saveDict (LZ4_stream_t* streamPtr, char* safeBuffer, int dictSize);
+
+
+/************************************************
+*  Streaming Decompression Functions
+************************************************/
+
+#define LZ4_STREAMDECODESIZE_U64  4
+#define LZ4_STREAMDECODESIZE     (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
+typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t;
+/*
+ * LZ4_streamDecode_t
+ * information structure to track an LZ4 stream.
+ * init this structure content using LZ4_setStreamDecode or memset() before first use !
+ *
+ * In the context of a DLL (liblz4) please prefer usage of construction methods below.
+ * They are more future proof, in case of a change of LZ4_streamDecode_t size in the future.
+ * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure
+ * LZ4_freeStreamDecode releases its memory.
+ */
+LZ4_streamDecode_t* LZ4_createStreamDecode(void);
+int                 LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream);
+
+/*
+ * LZ4_setStreamDecode
+ * Use this function to instruct where to find the dictionary.
+ * Setting a size of 0 is allowed (same effect as reset).
+ * Return : 1 if OK, 0 if error
+ */
+int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
+
+/*
+*_continue() :
+    These decoding functions allow decompression of multiple blocks in "streaming" mode.
+    Previously decoded blocks *must* remain available at the memory position where they were decoded (up to 64 KB)
+    In the case of a ring buffers, decoding buffer must be either :
+    - Exactly same size as encoding buffer, with same update rule (block boundaries at same positions)
+      In which case, the decoding & encoding ring buffer can have any size, including very small ones ( < 64 KB).
+    - Larger than encoding buffer, by a minimum of maxBlockSize more bytes.
+      maxBlockSize is implementation dependent. It's the maximum size you intend to compress into a single block.
+      In which case, encoding and decoding buffers do not need to be synchronized,
+      and encoding ring buffer can have any size, including small ones ( < 64 KB).
+    - _At least_ 64 KB + 8 bytes + maxBlockSize.
+      In which case, encoding and decoding buffers do not need to be synchronized,
+      and encoding ring buffer can have any size, including larger than decoding buffer.
+    Whenever these conditions are not possible, save the last 64KB of decoded data into a safe buffer,
+    and indicate where it is saved using LZ4_setStreamDecode()
+*/
+int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxDecompressedSize);
+int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);
+
+
+/*
+Advanced decoding functions :
+*_usingDict() :
+    These decoding functions work the same as
+    a combination of LZ4_setStreamDecode() followed by LZ4_decompress_x_continue()
+    They are stand-alone. They don't need nor update an LZ4_streamDecode_t structure.
+*/
+int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize);
+int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
+
+
+
+/**************************************
+*  Obsolete Functions
+**************************************/
+/* Deprecate Warnings */
+/* Should these warnings messages be a problem,
+   it is generally possible to disable them,
+   with -Wno-deprecated-declarations for gcc
+   or _CRT_SECURE_NO_WARNINGS in Visual for example.
+   You can also define LZ4_DEPRECATE_WARNING_DEFBLOCK. */
+#ifndef LZ4_DEPRECATE_WARNING_DEFBLOCK
+# define LZ4_DEPRECATE_WARNING_DEFBLOCK
+# define LZ4_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+# if (LZ4_GCC_VERSION >= 405) || defined(__clang__)
+#   define LZ4_DEPRECATED(message) __attribute__((deprecated(message)))
+# elif (LZ4_GCC_VERSION >= 301)
+#   define LZ4_DEPRECATED(message) __attribute__((deprecated))
+# elif defined(_MSC_VER)
+#   define LZ4_DEPRECATED(message) __declspec(deprecated(message))
+# else
+#   pragma message("WARNING: You need to implement LZ4_DEPRECATED for this compiler")
+#   define LZ4_DEPRECATED(message)
+# endif
+#endif /* LZ4_DEPRECATE_WARNING_DEFBLOCK */
+
+/* Obsolete compression functions */
+/* These functions are planned to start generate warnings by r131 approximately */
+int LZ4_compress               (const char* source, char* dest, int sourceSize);
+int LZ4_compress_limitedOutput (const char* source, char* dest, int sourceSize, int maxOutputSize);
+int LZ4_compress_withState               (void* state, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);
+int LZ4_compress_continue                (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_continue  (LZ4_stream_t* LZ4_streamPtr, const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/* Obsolete decompression functions */
+/* These function names are completely deprecated and must no longer be used.
+   They are only provided here for compatibility with older programs.
+    - LZ4_uncompress is the same as LZ4_decompress_fast
+    - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
+   These function prototypes are now disabled; uncomment them only if you really need them.
+   It is highly recommended to stop using these prototypes and migrate to maintained ones */
+/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */
+/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */
+
+/* Obsolete streaming functions; use new streaming interface whenever possible */
+LZ4_DEPRECATED("use LZ4_createStream() instead") void* LZ4_create (char* inputBuffer);
+LZ4_DEPRECATED("use LZ4_createStream() instead") int   LZ4_sizeofStreamState(void);
+LZ4_DEPRECATED("use LZ4_resetStream() instead")  int   LZ4_resetStreamState(void* state, char* inputBuffer);
+LZ4_DEPRECATED("use LZ4_saveDict() instead")     char* LZ4_slideInputBuffer (void* state);
+
+/* Obsolete streaming decoding functions */
+LZ4_DEPRECATED("use LZ4_decompress_safe_usingDict() instead") int LZ4_decompress_safe_withPrefix64k (const char* src, char* dst, int compressedSize, int maxDstSize);
+LZ4_DEPRECATED("use LZ4_decompress_fast_usingDict() instead") int LZ4_decompress_fast_withPrefix64k (const char* src, char* dst, int originalSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/thirdparty/etcpak/mmap.cpp b/thirdparty/etcpak/mmap.cpp
new file mode 100644
index 000000000000..c2460ee9e4e8
--- /dev/null
+++ b/thirdparty/etcpak/mmap.cpp
@@ -0,0 +1,38 @@
+#include "mmap.hpp"
+
+#ifdef _WIN32
+#  include <io.h>
+#  include <windows.h>
+
+void* mmap( void* addr, size_t length, int prot, int flags, int fd, off_t offset )
+{
+    HANDLE hnd;
+    void* map = nullptr;
+
+    switch( prot )
+    {
+    case PROT_READ:
+        if( hnd = CreateFileMapping( HANDLE( _get_osfhandle( fd ) ), nullptr, PAGE_READONLY, 0, DWORD( length ), nullptr ) )
+        {
+            map = MapViewOfFile( hnd, FILE_MAP_READ, 0, 0, length );
+            CloseHandle( hnd );
+        }
+        break;
+    case PROT_WRITE:
+        if( hnd = CreateFileMapping( HANDLE( _get_osfhandle( fd ) ), nullptr, PAGE_READWRITE, 0, DWORD( length ), nullptr ) )
+        {
+            map = MapViewOfFile( hnd, FILE_MAP_WRITE, 0, 0, length );
+            CloseHandle( hnd );
+        }
+        break;
+    }
+
+    return map ? (char*)map + offset : (void*)-1;
+}
+
+int munmap( void* addr, size_t length )
+{
+    return UnmapViewOfFile( addr ) != 0 ? 0 : -1;
+}
+
+#endif
diff --git a/thirdparty/etcpak/mmap.hpp b/thirdparty/etcpak/mmap.hpp
new file mode 100644
index 000000000000..e4cfe7759ca7
--- /dev/null
+++ b/thirdparty/etcpak/mmap.hpp
@@ -0,0 +1,19 @@
+#ifndef __MMAP_HPP__
+#define __MMAP_HPP__
+
+#ifndef _WIN32
+#  include <sys/mman.h>
+#else
+#  include <string.h>
+#  include <sys/types.h>
+
+#  define PROT_READ 1
+#  define PROT_WRITE 2
+#  define MAP_SHARED 0
+
+void* mmap( void* addr, size_t length, int prot, int flags, int fd, off_t offset );
+int munmap( void* addr, size_t length );
+
+#endif
+
+#endif
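
Not part of the patch itself, but a minimal round-trip sketch against the block API declared in lz4.h above; the buffer names and string contents are illustrative. LZ4_compressBound() sizes the destination so the compression call cannot fail for lack of space, and LZ4_decompress_safe() must be given the exact compressed size.

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include "lz4.h"

    int main(void)
    {
        const char src[] = "LZ4 block round trip. LZ4 block round trip. LZ4 block round trip.";
        const int srcSize = (int)sizeof(src);
        const int bound = LZ4_compressBound(srcSize);   /* worst-case compressed size */

        char* cmp = (char*)malloc((size_t)bound);
        char* dec = (char*)malloc((size_t)srcSize);
        if (!cmp || !dec) return 1;

        /* Returns 0 if the output did not fit; cannot happen with 'bound' bytes. */
        const int cmpSize = LZ4_compress_default(src, cmp, srcSize, bound);

        /* LZ4_decompress_safe() needs the precise compressed size as input. */
        const int decSize = (cmpSize > 0)
            ? LZ4_decompress_safe(cmp, dec, cmpSize, srcSize) : -1;

        printf("%d -> %d -> %d bytes\n", srcSize, cmpSize, decSize);
        const int ok = (decSize == srcSize) && !memcmp(src, dec, (size_t)srcSize);
        free(cmp); free(dec);
        return ok ? 0 : 1;
    }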
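For hot loops, LZ4_compress_fast_extState() lets the caller own the scratch state instead of paying for setup inside the library on every call; the header above says the state must be allocated on 8-byte boundaries, which malloc() provides. A sketch under that reading; compressMany() is a hypothetical helper, and the concatenated-output framing is the caller's problem.

    #include <stdlib.h>
    #include "lz4.h"

    /* Hypothetical helper: compress n buffers back to back with one shared state. */
    int compressMany(const char* const* srcs, const int* sizes, int n,
                     char* dst, int dstCapacity)
    {
        void* const state = malloc((size_t)LZ4_sizeofState());  /* 8-byte aligned */
        int total = 0, i;
        if (!state) return -1;
        for (i = 0; i < n; i++)
        {
            const int written = LZ4_compress_fast_extState(state, srcs[i], dst + total,
                                                           sizes[i], dstCapacity - total, 1);
            if (written <= 0) { total = -1; break; }    /* ran out of output room */
            total += written;
        }
        free(state);
        return total;
    }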
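LZ4_compress_destSize() inverts the usual contract: the output budget is fixed and the library reports how much input it managed to consume. A small sketch with an assumed 512-byte page size:

    #include "lz4.h"

    /* Packs as much of 'src' as fits into a fixed 512-byte page.
       Returns the compressed size; *consumed reports how many input bytes were used. */
    int packPage(const char* src, int srcSize, char page[512], int* consumed)
    {
        *consumed = srcSize;    /* in: bytes available, out: bytes actually read */
        return LZ4_compress_destSize(src, page, consumed, 512);
    }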
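The streaming compressor only requires that the previously compressed block still be readable at its old address; a double-buffer, as sketched below, is one way to satisfy that. The size-prefix framing is this sketch's own convention, not something the LZ4 block format defines, and compressStream() is an illustrative name.

    #include <stdio.h>
    #include "lz4.h"

    enum { BLOCK_BYTES = 8 * 1024 };

    void compressStream(FILE* in, FILE* out)
    {
        LZ4_stream_t* const stream = LZ4_createStream();
        static char inpBuf[2][BLOCK_BYTES];
        int idx = 0;

        for (;;)
        {
            char cmpBuf[LZ4_COMPRESSBOUND(BLOCK_BYTES)];
            const int inpBytes = (int)fread(inpBuf[idx], 1, BLOCK_BYTES, in);
            if (inpBytes == 0) break;

            const int cmpBytes = LZ4_compress_fast_continue(stream, inpBuf[idx], cmpBuf,
                                                            inpBytes, (int)sizeof(cmpBuf), 1);
            if (cmpBytes <= 0) break;

            fwrite(&cmpBytes, sizeof(cmpBytes), 1, out);   /* size prefix (sketch framing) */
            fwrite(cmpBuf, 1, (size_t)cmpBytes, out);

            idx ^= 1;   /* the previous block stays untouched in the other half */
        }
        LZ4_freeStream(stream);
    }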
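Decoding mirrors the same idea: each block decompresses into the half of the buffer not holding the previous block, so the "_continue" rule quoted above (previously decoded blocks must remain readable) holds. This assumes every block was compressed from at most BLOCK_BYTES of input, matching the sketch before it.

    #include <stdio.h>
    #include "lz4.h"

    enum { BLOCK_BYTES = 8 * 1024 };

    void decompressStream(FILE* in, FILE* out)
    {
        LZ4_streamDecode_t* const stream = LZ4_createStreamDecode();
        static char decBuf[2][BLOCK_BYTES];
        int idx = 0;

        LZ4_setStreamDecode(stream, NULL, 0);   /* start with no dictionary */

        for (;;)
        {
            char cmpBuf[LZ4_COMPRESSBOUND(BLOCK_BYTES)];
            int cmpBytes = 0;
            if (fread(&cmpBytes, sizeof(cmpBytes), 1, in) != 1) break;
            if (cmpBytes <= 0 || cmpBytes > (int)sizeof(cmpBuf)) break;
            if (fread(cmpBuf, 1, (size_t)cmpBytes, in) != (size_t)cmpBytes) break;

            const int decBytes = LZ4_decompress_safe_continue(stream, cmpBuf, decBuf[idx],
                                                              cmpBytes, BLOCK_BYTES);
            if (decBytes <= 0) break;

            fwrite(decBuf[idx], 1, (size_t)decBytes, out);
            idx ^= 1;   /* keep the last decoded block addressable as the dictionary */
        }
        LZ4_freeStreamDecode(stream);
    }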