From 508eb92e32aad7aacd49f4547511d7c41a0795dc Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Fri, 29 Mar 2024 07:03:28 +1100
Subject: [PATCH 01/37] Added DFS and ATK, but not fully tested.  Need to add
 COC.  Integrate with the rest of the code.

---
 src/core/codestream/ojph_codestream_local.cpp |  20 +-
 src/core/codestream/ojph_codestream_local.h   |  27 +-
 src/core/codestream/ojph_params.cpp           | 371 ++++++++++++++++--
 src/core/codestream/ojph_params_local.h       | 206 ++++++++--
 src/core/codestream/ojph_subband.cpp          |   9 +-
 src/core/common/ojph_version.h                |   4 +-
 6 files changed, 554 insertions(+), 83 deletions(-)

diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp
index df2f18c4..c2154fa0 100644
--- a/src/core/codestream/ojph_codestream_local.cpp
+++ b/src/core/codestream/ojph_codestream_local.cpp
@@ -81,6 +81,8 @@ namespace ojph {
 
       used_qcc_fields = 0;
       qcc = qcc_store;
+      used_coc_fields = 0;
+      coc = coc_store;
 
       allocator = new mem_fixed_allocator;
       elastic_alloc = new mem_elastic_allocator(1048576); //1 megabyte
@@ -717,15 +719,15 @@ namespace ojph {
       {
         if (msg_level == OJPH_MSG_LEVEL::INFO)
         {
-          OJPH_INFO(0x00030001, "%s\n", msg);
+          OJPH_INFO(0x00030001, "%s", msg);
         }
         else if (msg_level == OJPH_MSG_LEVEL::WARN)
         {
-          OJPH_WARN(0x00030001, "%s\n", msg);
+          OJPH_WARN(0x00030001, "%s", msg);
         }
         else if (msg_level == OJPH_MSG_LEVEL::ERROR)
         {
-          OJPH_ERROR(0x00030001, "%s\n", msg);
+          OJPH_ERROR(0x00030001, "%s", msg);
         }
         else
           assert(0);
@@ -736,8 +738,8 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void codestream::read_headers(infile_base *file)
     {
-      ui16 marker_list[17] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC,
-        RGN, POC, PPM, TLM, PLM, CRG, COM, SOT };
+      ui16 marker_list[19] = { SOC, SIZ, CAP, PRF, CPF, COD, COC, QCD, QCC,
+        RGN, POC, PPM, TLM, PLM, CRG, COM, DFS, ATK, SOT };
       find_marker(file, marker_list, 1); //find SOC
       find_marker(file, marker_list + 1, 1); //find SIZ
       siz.read(file);
@@ -745,7 +747,7 @@ namespace ojph {
       int received_markers = 0; //check that COD, & QCD received
       while (true)
       {
-        marker_idx = find_marker(file, marker_list + 2, 15);
+        marker_idx = find_marker(file, marker_list + 2, 17);
         if (marker_idx == 0)
           cap.read(file);
         else if (marker_idx == 1)
@@ -805,11 +807,17 @@ namespace ojph {
         else if (marker_idx == 13)
           skip_marker(file, "COM", NULL, OJPH_MSG_LEVEL::NO_MSG, false);
         else if (marker_idx == 14)
+          dfs.read(file);
+        else if (marker_idx == 15)
+          atk.read(file);
+        else if (marker_idx == 16)
           break;
         else
           OJPH_ERROR(0x00030051, "File ended before finding a tile segment");
       }
 
+      //qcd.update(&dfs);
+
       if (received_markers != 3)
         OJPH_ERROR(0x00030052, "markers error, COD and QCD are required");
 
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index 5e0bbfaf..035b534f 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -148,20 +148,27 @@ namespace ojph {
       bool employ_color_transform;
       int planar;
       int profile;
-      ui32 tilepart_div;    // tilepart division value
-      bool need_tlm;       // true if tlm markers are needed
+      ui32 tilepart_div;     // tilepart division value
+      bool need_tlm;         // true if tlm markers are needed
       
     private:
-      param_siz siz;
-      param_cod cod;
-      param_cap cap;
-      param_qcd qcd;
-      param_tlm tlm;
+      param_siz siz;         // image and tile size
+      param_cod cod;         // coding style default
+      param_cap cap;         // extended capabilities
+      param_qcd qcd;         // quantization default
+      param_tlm tlm;         // tile-part lengths
 
-    private: // this is to handle qcc
+    private: // this is to handle qcc and coc
       int used_qcc_fields;
-      param_qcc qcc_store[4], *qcc; // we allocate 4, 
-                                    // if not enough, we allocate more
+      param_qcc *qcc;         // quantization component
+      param_qcc qcc_store[4]; // we allocate 4, we allocate more if needed
+      int used_coc_fields;
+      param_coc *coc;         // coding style component
+      param_coc coc_store[4]; // we allocate 4, we allocate more if needed
+
+    private:  // these are from Part 2 of the standard
+      param_dfs dfs;         // downsampling factor styles
+      param_atk atk;         // arbitrary transformation kernels
 
     private:
       mem_fixed_allocator *allocator;
diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index fa194431..5243762f 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -417,6 +417,16 @@ namespace ojph {
       return u;
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    ui64 swap_byte(ui64 t)
+    { // byte-reverse each 32-bit half, then exchange the two halves
+      const ui32 low_word = (ui32)(t & 0xFFFFFFFFu);
+      const ui32 high_word = (ui32)(t >> 32);
+      const ui64 upper = (ui64)swap_byte(low_word) << 32;
+      return upper | swap_byte(high_word);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -790,7 +800,7 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void param_qcd::set_rev_quant(ui32 bit_depth,
+    void param_qcd::set_rev_quant(int num_decomps, ui32 bit_depth,
                                   bool is_employing_color_transform)
     {
       int guard_bits = 1;
@@ -815,7 +825,7 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void param_qcd::set_irrev_quant()
+    void param_qcd::set_irrev_quant(int num_decomps)
     {
       int guard_bits = 1;
       Sqcd = (ui8)((guard_bits<<5)|0x2);//one guard bit, scalar quantization
@@ -859,13 +869,17 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     ui32 param_qcd::get_MAGBp() const
     { //this can be written better, but it is only executed once
+
+      // this assumes a bi-directional wavelet (conventional DWT)
+      ui32 num_decomps = (num_subbands - 1) / 3;
+
       ui32 B = 0;
       int irrev = Sqcd & 0x1F;
       if (irrev == 0) //reversible
-        for (ui32 i = 0; i < 3 * num_decomps + 1; ++i)
+        for (ui32 i = 0; i < num_subbands; ++i)
           B = ojph_max(B, (u8_SPqcd[i] >> 3) + get_num_guard_bits() - 1u);
       else if (irrev == 2) //scalar expounded
-        for (ui32 i = 0; i < 3 * num_decomps + 1; ++i)
+        for (ui32 i = 0; i < num_subbands; ++i)
         {
           ui32 nb = num_decomps - (i ? (i - 1) / 3 : 0); //decompsition level
           B = ojph_max(B, (u16_SPqcd[i] >> 11) + get_num_guard_bits() - nb);
@@ -877,14 +891,24 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    float param_qcd::irrev_get_delta(ui32 resolution, ui32 subband) const
+    float param_qcd::irrev_get_delta(const param_dfs* dfs, 
+                                     ui32 num_decompositions,
+                                     ui32 resolution, ui32 subband) const
     {
-      assert((resolution == 0 && subband == 0) ||
-             (resolution <= num_decomps && subband > 0 && subband<4));
-      assert((Sqcd & 0x1F) == 2);
       float arr[] = { 1.0f, 2.0f, 2.0f, 4.0f };
+      assert((Sqcd & 0x1F) == 2);
 
-      ui32 idx = resolution == 0 ? 0 : (resolution - 1) * 3 + subband;
+      ui32 idx = 
+        dfs->get_subband_idx(num_decompositions, resolution, subband);
+      if (idx >= num_subbands) {
+        OJPH_INFO(0x00050101, "Trying to access quantization step size for "
+          "subband %d when the QCD/QCC marker segment specifies "
+          "quantization step sizes for %d subbands only.  To continue "
+          "decoding, we are using the step size for subband %d, which can "
+          "produce incorrect results", 
+          idx + 1, num_subbands, num_subbands - 1);
+        idx = num_subbands - 1;
+      }
       int eps = u16_SPqcd[idx] >> 11;
       float mantissa;
       mantissa = (float)((u16_SPqcd[idx] & 0x7FF) | 0x800) * arr[subband];
@@ -900,12 +924,22 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    ui32 param_qcd::get_Kmax(ui32 resolution, ui32 subband) const
+    ui32 param_qcd::get_Kmax(const param_dfs* dfs, ui32 num_decompositions,
+                             ui32 resolution, ui32 subband) const
     {
-      assert((resolution == 0 && subband == 0) ||
-             (resolution <= num_decomps && subband > 0 && subband<4));
       ui32 num_bits = get_num_guard_bits();
-      ui32 idx = resolution == 0 ? 0 : (resolution - 1) * 3 + subband;
+      ui32 idx = 
+        dfs->get_subband_idx(num_decompositions, resolution, subband);
+      if (idx >= num_subbands) {
+        OJPH_INFO(0x00050111, "Trying to access quantization step size for "
+          "subband %d when the QCD/QCC marker segment specifies "
+          "quantization step sizes for %d subbands only.  To continue "
+          "decoding, we are using the step size for subband %d, which can "
+          "produce incorrect results", 
+          idx + 1, num_subbands, num_subbands - 1);
+        idx = num_subbands - 1;
+      }
+
       int irrev = Sqcd & 0x1F;
       if (irrev == 0) //reversible; this is (10.22) from the J2K book
       {
@@ -926,7 +960,6 @@ namespace ojph {
     bool param_qcd::write(outfile_base *file)
     {
       int irrev = Sqcd & 0x1F;
-      ui32 num_subbands = 1 + 3 * num_decomps;
 
       //marker size excluding header
       Lqcd = 3;
@@ -976,16 +1009,16 @@ namespace ojph {
         OJPH_ERROR(0x00050082, "error reading QCD marker");
       if ((Sqcd & 0x1F) == 0)
       {
-        num_decomps = (Lqcd - 4) / 3;
-        if (Lqcd != 4 + 3 * num_decomps)
+        num_subbands = (Lqcd - 3);
+        if (Lqcd != 3 + num_subbands)
           OJPH_ERROR(0x00050083, "wrong Lqcd value in QCD marker");
-        for (ui32 i = 0; i < 1 + 3 * num_decomps; ++i)
+        for (ui32 i = 0; i < num_subbands; ++i)
           if (file->read(&u8_SPqcd[i], 1) != 1)
             OJPH_ERROR(0x00050084, "error reading QCD marker");
       }
       else if ((Sqcd & 0x1F) == 1)
       {
-        num_decomps = 0;
+        num_subbands = 0;
         OJPH_ERROR(0x00050089, 
           "Scalar derived quantization is not supported yet in QCD marker");
         if (Lqcd != 5)
@@ -993,10 +1026,10 @@ namespace ojph {
       }
       else if ((Sqcd & 0x1F) == 2)
       {
-        num_decomps = (Lqcd - 5) / 6;
-        if (Lqcd != 5 + 6 * num_decomps)
+        num_subbands = (Lqcd - 3) / 2;
+        if (Lqcd != 3 + 2 * num_subbands)
           OJPH_ERROR(0x00050086, "wrong Lqcd value in QCD marker");
-        for (ui32 i = 0; i < 1 + 3 * num_decomps; ++i)
+        for (ui32 i = 0; i < num_subbands; ++i)
         {
           if (file->read(&u16_SPqcd[i], 2) != 2)
             OJPH_ERROR(0x00050087, "error reading QCD marker");
@@ -1036,20 +1069,19 @@ namespace ojph {
       }
       if (file->read(&Sqcd, 1) != 1)
         OJPH_ERROR(0x000500A4, "error reading QCC marker");
+      ui32 offset = num_comps < 257 ? 4 : 5;
       if ((Sqcd & 0x1F) == 0)
       {
-        ui32 offset = num_comps < 257 ? 5 : 6;
-        num_decomps = (Lqcd - offset) / 3;
-        if (Lqcd != offset + 3 * num_decomps)
+        num_subbands = (Lqcd - offset);
+        if (Lqcd != offset + num_subbands)
           OJPH_ERROR(0x000500A5, "wrong Lqcd value in QCC marker");
-        for (ui32 i = 0; i < 1 + 3 * num_decomps; ++i)
+        for (ui32 i = 0; i < num_subbands; ++i)
           if (file->read(&u8_SPqcd[i], 1) != 1)
             OJPH_ERROR(0x000500A6, "error reading QCC marker");
       }
       else if ((Sqcd & 0x1F) == 1)
       {
-        ui32 offset = num_comps < 257 ? 6 : 7;
-        num_decomps = 0;
+        num_subbands = 0;
         OJPH_ERROR(0x000500AB, 
           "Scalar derived quantization is not supported yet in QCC marker");
         if (Lqcd != offset)
@@ -1057,11 +1089,10 @@ namespace ojph {
       }
       else if ((Sqcd & 0x1F) == 2)
       {
-        ui32 offset = num_comps < 257 ? 6 : 7;
-        num_decomps = (Lqcd - offset) / 6;
-        if (Lqcd != offset + 6 * num_decomps)
+        num_subbands = (Lqcd - offset) / 2;
+        if (Lqcd != offset + 2 * num_subbands)
           OJPH_ERROR(0x000500A8, "wrong Lqcc value in QCC marker");
-        for (ui32 i = 0; i < 1 + 3 * num_decomps; ++i)
+        for (ui32 i = 0; i < num_subbands; ++i)
         {
           if (file->read(&u16_SPqcd[i], 2) != 2)
             OJPH_ERROR(0x000500A9, "error reading QCC marker");
@@ -1260,6 +1291,280 @@ namespace ojph {
       return result;
     }
 
-  }
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
 
-}
+    //////////////////////////////////////////////////////////////////////////
+    const param_dfs* param_dfs::get_dfs(int index) const
+    { // linear search of the DFS chain; yields NULL when nothing matches
+      for (const param_dfs* node = this; node; node = node->next)
+        if (node->Sdfs == index)
+          return node;
+      return NULL;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    param_dfs::dfs_dwt_type param_dfs::get_dwt_type(ui32 decomp_level) const
+    { // Ddfs packs four 2-bit entries per byte, first entry in the top bits
+      assert(decomp_level > 0 && decomp_level <= Ids);
+      // without the assert, levels above Ids fall back to entry number Ids
+      const ui32 level = ojph_min(decomp_level, Ids);
+      const ui8 entry = (ui8)(level - 1);         // levels are 1-based
+      const ui8 byte_pos = (ui8)(entry >> 2);     // which byte of Ddfs
+      const int shift = 6 - 2 * (entry & 0x3);    // position inside that byte
+      const ui8 code = (ui8)((Ddfs[byte_pos] >> shift) & 0x3);
+      return (dfs_dwt_type)code;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    int param_dfs::get_subband_idx(ui32 num_decompositions, ui32 resolution,
+                                   ui32 subband) const
+    { // Returns the position of (resolution, subband) in the SPqcd list.
+      // NOTE(review): testing `this` against NULL is undefined behaviour; it
+      // stays because callers pass a NULL dfs -- move the test to the callers.
+      int idx;
+      if (this != NULL)
+      {
+        assert((resolution == 0 && subband == 0) || 
+               (resolution > 0 && resolution <= Ids && 
+                subband > 0 && subband < 4));
+        // detail subbands added per level, indexed by dfs_dwt_type
+        const ui32 ns[4] = { 0, 3, 1, 1 };
+
+        idx = 0;  // resolution 0 only has subband 0
+        if (resolution > 0)
+        {
+          ui32 i = 1;
+          for (; i < resolution; ++i)  // skip subbands of lower resolutions
+            idx += ns[get_dwt_type(num_decompositions - i + 1)];
+          dfs_dwt_type t = get_dwt_type(num_decompositions - i + 1);
+          idx += subband;
+          if (t == VERT_DWT && subband == 2) // its only subband is number 2
+            --idx;
+        }
+      }
+      else 
+      { // no DFS marker: every resolution above 0 has 3 subbands
+        assert(subband < 4);
+        idx = resolution ? (resolution - 1) * 3 + subband : 0;
+      }
+
+      return idx;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_dfs::read(infile_base *file)
+    { // parses one DFS segment; failures are reported through OJPH_ERROR
+      if (Ldfs != 0) { // this param_dfs is used
+        param_dfs* p = this;
+        while (p->next != NULL)
+          p = p->next;
+        p->next = new param_dfs;  // append a node; ~param_dfs deletes it
+        p = p->next;
+        return p->read(file);
+      }
+      // 16-bit fields are byte-swapped right after they are read
+      if (file->read(&Ldfs, 2) != 2)
+        OJPH_ERROR(0x000500D1, "error reading DFS-Ldfs parameter");
+      Ldfs = swap_byte(Ldfs);
+      if (file->read(&Sdfs, 2) != 2)
+        OJPH_ERROR(0x000500D2, "error reading DFS-Sdfs parameter");
+      Sdfs = swap_byte(Sdfs);
+      if (Sdfs > 15)
+        OJPH_ERROR(0x000500D3, "The DFS-Sdfs parameter is %d, which is "
+          "larger than the permissible 15", Sdfs);
+      ui8 t, l_Ids = 0;
+      if (file->read(&l_Ids, 1) != 1)
+        OJPH_ERROR(0x000500D4, "error reading DFS-Ids parameter");
+      constexpr int max_Ddfs = sizeof(Ddfs) * 4;  // four entries per byte
+      if (l_Ids > max_Ddfs)
+        OJPH_INFO(0x000500D5, "The DFS-Ids parameter is %d; while this is "
+          "valid, the number is unnecessarily large -- you do not need more "
+          "than %d.  Please contact me regarding this issue.", 
+          l_Ids, max_Ddfs);
+      Ids = l_Ids < max_Ddfs ? l_Ids : max_Ddfs;  // keep only what fits
+      for (int i = 0; i < Ids; i += 4)
+        if (file->read(&Ddfs[i / 4], 1) != 1)
+          OJPH_ERROR(0x000500D6, "error reading DFS-Ddfs parameters");
+      for (int i = Ids; i < l_Ids; i += 4)  // consume the bytes we do not keep
+        if (file->read(&t, 1) != 1)
+          OJPH_ERROR(0x000500D7, "error reading DFS-Ddfs parameters");
+      return true;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    const param_atk* param_atk::get_atk(int index) const
+    { // scan the ATK chain; the result is NULL if no segment has this index
+      for (const param_atk* node = this; node; node = node->next)
+        if (node->get_index() == index)
+          return node;
+      return NULL;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_atk::read_coefficient(infile_base *file, float &K)
+    { // loads one coefficient into K; false on short read or unknown type
+      int coeff_type = get_coeff_type();
+      if (coeff_type == 0) { // 8bit
+        ui8 v;
+        if (file->read(&v, 1) != 1) return false;
+        K = v;                 // NOTE(review): taken as unsigned -- confirm
+      }
+      else if (coeff_type == 1) { // 16bit
+        ui16 v;
+        if (file->read(&v, 2) != 2) return false;
+        K = swap_byte(v);      // NOTE(review): taken as unsigned -- confirm
+      }
+      else if (coeff_type == 2) { // float
+        if (file->read(&K, 4) != 4) return false;
+        ui32 t = swap_byte(*(ui32*)&K);
+        K = *(float*)&t;
+      }
+      else if (coeff_type == 3) { // double
+        double v;
+        if (file->read(&v, 8) != 8) return false;
+        ui64 t = swap_byte(*(ui64*)&v);
+        double u = *(double*)&t;  // all 64 swapped bits form the double
+        K = (float)u;
+      }
+      else if (coeff_type == 4) { // 128 bit float
+        ui64 v, v1;
+        if (file->read(&v, 8) != 8) return false;
+        if (file->read(&v1, 8) != 8) return false; // not needed
+        v = swap_byte(v);
+        // convert the MSB of 128b float to 32b float
+        // 32b float has 1 sign bit, 8 exponent (offset 127), 23 mantissa
+        // 128b float has 1 sign bit, 15 exponent (offset 16383), 112 mantissa
+        si32 t1 = (si32)((v >> 48) & 0x7FFF); // exponent
+        t1 -= 16383;
+        t1 += 127;
+        t1 = t1 & 0xFF;                      // removes MSBs if negative
+        t1 <<= 23;                           // move bits to their location
+        ui32 t = 0;
+        t |= ((ui32)(v >> 32) & 0x80000000); // copy sign bit
+        t |= t1;                             // copy exponent
+        t |= (ui32)((v >> 25) & 0x007FFFFF); // copy 23 mantissa
+        K = *(float*)&t;
+      }
+      else return false;                     // not a known coefficient type
+      return true;
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_atk::read_coefficient(infile_base *file, si16 &K)
+    { // integer coefficients only; any other coefficient type yields false
+      int coeff_type = get_coeff_type();
+      if (coeff_type == 0) {
+        ui8 v;
+        if (file->read(&v, 1) != 1) return false;
+        K = (si16)(v < 0x80 ? v : v - 0x100); // sign-extend the stored byte
+      }
+      else if (coeff_type == 1) {
+        ui16 v;
+        if (file->read(&v, 2) != 2) return false;
+        v = swap_byte(v);
+        K = (si16)v;
+      }
+      else
+        return false;
+      return true;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    bool param_atk::read(infile_base *file)
+    { // parses one ATK segment; failures are reported through OJPH_ERROR
+      if (Latk != 0) { // this param_atk is used
+        param_atk *p = this;
+        while (p->next != NULL)
+          p = p->next;
+        p->next = new param_atk;  // append a node; ~param_atk deletes it
+        p = p->next;
+        return p->read(file);
+      }
+      // 16-bit fields are byte-swapped right after they are read
+      if (file->read(&Latk, 2) != 2)
+        OJPH_ERROR(0x000500E1, "error reading ATK-Latk parameter");
+      Latk = swap_byte(Latk);
+      if (file->read(&Satk, 2) != 2)
+        OJPH_ERROR(0x000500E2, "error reading ATK-Satk parameter");
+      Satk = swap_byte(Satk);
+      if (is_m_init0() == false)  // only even-indexed is supported
+        OJPH_ERROR(0x000500E3, "ATK-Satk parameter sets m_init to 1, "
+          "requiring odd-indexed subsequence in first reconstruction step, "
+          "which is not supported yet.");
+      if (is_whole_sample() == false)  // ARB filter not supported
+        OJPH_ERROR(0x000500E4, "ATK-Satk parameter specified ARB filter, "
+          "which is not supported yet.");
+      if (is_reversible() && get_coeff_type() >= 2) // reversible & float
+        OJPH_ERROR(0x000500E5, "ATK-Satk parameter does not make sense. "
+          "It employs floats with reversible filtering.");
+      if (is_reversible() == false)  // Katk is only read when irreversible
+        if (read_coefficient(file, Katk) == false)
+          OJPH_ERROR(0x000500E6, "error reading ATK-Katk parameter");
+      if (file->read(&Natk, 1) != 1)
+        OJPH_ERROR(0x000500E7, "error reading ATK-Natk parameter");
+      if (Natk > max_steps) {  // current step storage is too small
+        if (d != d_store) // was this allocated -- very unlikely
+          delete[] d;
+        d = new data[Natk];
+        max_steps = Natk;
+      }
+      // each step is stored as Eatk, Batk, LCatk, Aatk when reversible
+      if (is_reversible())
+      {
+        for (int s = 0; s < Natk; ++s)
+        {
+          if (file->read(&d[s].rev.Eatk, 1) != 1)
+            OJPH_ERROR(0x000500E8, "error reading ATK-Eatk parameter");
+          if (file->read(&d[s].rev.Batk, 2) != 2)
+            OJPH_ERROR(0x000500E9, "error reading ATK-Batk parameter");
+          d[s].rev.Batk = (si16)swap_byte((ui16)d[s].rev.Batk);
+          ui8 LCatk;
+          if (file->read(&LCatk, 1) != 1)
+            OJPH_ERROR(0x000500EA, "error reading ATK-LCatk parameter");
+          if (LCatk == 0)
+            OJPH_ERROR(0x000500EB, "Encountered a ATK-LCatk value of zero; "
+              "something is wrong.");
+          if (LCatk > 1)
+            OJPH_ERROR(0x000500EC, "ATK-LCatk value greater than 1; "
+              "that is, a multitap filter is not supported");
+          if (read_coefficient(file, d[s].rev.Aatk) == false)
+            OJPH_ERROR(0x000500ED, "Error reading ATK-Aatk parameter");
+        }
+      }
+      else
+      {
+        for (int s = 0; s < Natk; ++s)  // each step is LCatk then Aatk
+        {
+          ui8 LCatk;
+          if (file->read(&LCatk, 1) != 1)
+            OJPH_ERROR(0x000500EE, "error reading ATK-LCatk parameter");
+          if (LCatk == 0)
+            OJPH_ERROR(0x000500EF, "Encountered a ATK-LCatk value of zero; "
+              "something is wrong.");
+          if (LCatk > 1)
+            OJPH_ERROR(0x000500F0, "ATK-LCatk value greater than 1; "
+              "that is, a multitap filter is not supported.");
+          if (read_coefficient(file, d[s].irv.Aatk) == false)
+            OJPH_ERROR(0x000500F1, "Error reading ATK-Aatk parameter");
+        }
+      }
+
+      return true;
+    }
+  } // !local namespace
+}  // !ojph namespace
diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index bac0c359..acfd0347 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -96,14 +96,26 @@ namespace ojph {
 
   ////////////////////////////////////////////////////////////////////////////
   enum OJPH_TILEPART_DIVISIONS: ui32 {
-    OJPH_TILEPART_NO_DIVISIONS  = 0x0, // no divisions to tile parts
-    OJPH_TILEPART_RESOLUTIONS = 0x1,
-    OJPH_TILEPART_COMPONENTS  = 0x2,
-    OJPH_TILEPART_LAYERS      = 0x4, // these are meaningless with HTJ2K
+    OJPH_TILEPART_NO_DIVISIONS = 0x0, // no divisions to tile parts
+    OJPH_TILEPART_RESOLUTIONS  = 0x1,
+    OJPH_TILEPART_COMPONENTS   = 0x2,
+    OJPH_TILEPART_LAYERS       = 0x4, // these are meaningless with HTJ2K
   };
 
   namespace local {
 
+    //defined here
+    struct param_siz;
+    struct param_cod;
+    struct param_qcd;
+    struct param_qcc;
+    struct param_cap;
+    struct param_sot;
+    struct param_tlm;
+    struct param_coc;
+    struct param_dfs;
+    struct param_atk;
+
     //////////////////////////////////////////////////////////////////////////
     enum JP2K_MARKER : ui16
     {
@@ -111,6 +123,7 @@ namespace ojph {
       CAP = 0xFF50, //extended capability
       SIZ = 0xFF51, //image and tile size (required)
       COD = 0xFF52, //coding style default (required)
+      COC = 0xFF53, //coding style component
       TLM = 0xFF55, //tile-part lengths
       PRF = 0xFF56, //profile
       PLM = 0xFF57, //packet length, main header
@@ -118,19 +131,20 @@ namespace ojph {
       CPF = 0xFF59, //corresponding profile values
       QCD = 0xFF5C, //qunatization default (required)
       QCC = 0xFF5D, //quantization component
+      RGN = 0xFF5E, //region of interest
+      POC = 0xFF5F, //progression order change
+      PPM = 0xFF60, //packed packet headers, main header
+      PPT = 0xFF61, //packed packet headers, tile-part header
+      CRG = 0xFF63, //component registration
       COM = 0xFF64, //comment
+      DFS = 0xFF72, //downsampling factor styles
+      ADS = 0xFF73, //arbitrary decomposition styles
+      ATK = 0xFF79, //arbitrary transformation kernels
       SOT = 0xFF90, //start of tile-part
       SOP = 0xFF91, //start of packet
       EPH = 0xFF92, //end of packet
       SOD = 0xFF93, //start of data
       EOC = 0xFFD9, //end of codestream (required)
-
-      COC = 0xFF53, //coding style component
-      RGN = 0xFF5E, //region of interest
-      POC = 0xFF5F, //progression order change
-      PPM = 0xFF60, //packed packet headers, main header
-      PPT = 0xFF61, //packed packet headers, tile-part header
-      CRG = 0xFF63, //component registration
     };
 
     //////////////////////////////////////////////////////////////////////////
@@ -442,23 +456,23 @@ namespace ojph {
         Sqcd = 0;
         for (int i = 0; i < 97; ++i)
           u16_SPqcd[i] = 0;
-        num_decomps = 0;
-        base_delta = -1.0f; 
+        num_subbands = 0;
+        base_delta = -1.0f;
       }
 
       void set_delta(float delta) { base_delta = delta; }
-      void set_rev_quant(ui32 bit_depth, bool is_employing_color_transform);
-      void set_irrev_quant();
 
       void check_validity(const param_siz& siz, const param_cod& cod)
       {
-        num_decomps = cod.get_num_decompositions();
+        int num_decomps = cod.get_num_decompositions();
+        num_subbands = 1 + 3 * num_decomps;
         if (cod.is_reversible())
         {
           ui32 bit_depth = 0;
           for (ui32 i = 0; i < siz.get_num_components(); ++i)
             bit_depth = ojph_max(bit_depth, siz.get_bit_depth(i));
-          set_rev_quant(bit_depth, cod.is_employing_color_transform());
+          set_rev_quant(num_decomps, bit_depth,
+            cod.is_employing_color_transform());
         }
         else
         {
@@ -466,21 +480,28 @@ namespace ojph {
             ui32 bit_depth = 0;
             for (ui32 i = 0; i < siz.get_num_components(); ++i)
               bit_depth =
-                ojph_max(bit_depth, siz.get_bit_depth(i) + siz.is_signed(i));
+              ojph_max(bit_depth, siz.get_bit_depth(i) + siz.is_signed(i));
             base_delta = 1.0f / (float)(1 << bit_depth);
           }
-          set_irrev_quant();
-         }
+          set_irrev_quant(num_decomps);
+        }
       }
-
       ui32 get_num_guard_bits() const;
       ui32 get_MAGBp() const;
-      ui32 get_Kmax(ui32 resolution, ui32 subband) const;
-      float irrev_get_delta(ui32 resolution, ui32 subband) const;
+      ui32 get_Kmax(const param_dfs* dfs, ui32 num_decompositions,
+                    ui32 resolution, ui32 subband) const;
+      float irrev_get_delta(const param_dfs* dfs,
+                            ui32 num_decompositions,
+                            ui32 resolution, ui32 subband) const;
 
       bool write(outfile_base *file);
       void read(infile_base *file);
 
+    protected:
+      void set_rev_quant(int num_decomps, ui32 bit_depth, 
+                         bool is_employing_color_transform);
+      void set_irrev_quant(int num_decomps);
+
     protected:
       ui16 Lqcd;
       ui8 Sqcd;
@@ -489,8 +510,9 @@ namespace ojph {
         ui8 u8_SPqcd[97];
         ui16 u16_SPqcd[97];
       };
-      ui32 num_decomps;
-      float base_delta;
+      ui32 num_subbands;        // number of subbands
+      float base_delta;         // base quantization step size -- all other
+                                // step sizes are derived from it.
     };
 
     ///////////////////////////////////////////////////////////////////////////
@@ -502,7 +524,6 @@ namespace ojph {
     ///////////////////////////////////////////////////////////////////////////
     struct param_qcc : public param_qcd
     {
-      //friend ::ojph::param_qcc;
     public:
       param_qcc() : param_qcd()
       { comp_idx = 0; }
@@ -627,9 +648,136 @@ namespace ojph {
       Ttlm_Ptlm_pair* pairs;
       ui32 num_pairs;
       ui32 next_pair_index;
-      
     };
-  }
-}
+
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    ///////////////////////////////////////////////////////////////////////////
+    struct param_coc : public param_cod
+    {
+      // placeholder: adds nothing to param_cod so far
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    ///////////////////////////////////////////////////////////////////////////
+    struct param_dfs
+    {
+    public:
+      enum dfs_dwt_type : ui8 {
+        NO_DWT    = 0,  // no wavelet transform
+        BIDIR_DWT = 1,  // bidirectional DWT (this is the conventional DWT)
+        HORZ_DWT  = 2,  // horizontal only DWT transform
+        VERT_DWT  = 3,  // vertical only DWT transform
+      };
+
+    public: // member functions
+      param_dfs() { memset(this, 0, sizeof(param_dfs)); }
+      ~param_dfs() { if (next) delete next; }  // frees the rest of the chain
+      void init() { memset(this, 0, sizeof(param_dfs)); } // next not deleted
+      bool read(infile_base *file);
+      bool exists() const { return Ldfs != 0; }  // Ldfs is set by read()
+
+      // get_dfs returns the dfs structure with Sdfs == index, or NULL if
+      // there is none; the search starts at this object and follows next
+      const param_dfs* get_dfs(int index) const;
+      // decomp_level is the decomposition level, starting from 1 for highest
+      // resolution to num_decomps for the coarsest resolution
+      dfs_dwt_type get_dwt_type(ui32 decomp_level) const;
+      int get_subband_idx(ui32 num_decompositions, ui32 resolution, 
+                          ui32 subband) const;
+
+    private: // member variables
+      ui16 Ldfs;       // length of the segment marker
+      ui16 Sdfs;       // index of this DFS marker segment
+      ui8 Ids;         // number of elements in Ddfs, 2 bits per sub-level
+      ui8 Ddfs[8];     // a string defining number of decomposition sub-levels
+                       // 8 bytes should be enough for 32 levels
+      param_dfs* next; // used for linking other dfs segments
+    };
+
+    ///////////////////////////////////////////////////////////////////////////
+    //
+    //
+    //
+    //
+    //
+    ///////////////////////////////////////////////////////////////////////////
+    struct param_atk
+    {
+      // Limitations:
+      // Arbitrary filters (ARB) are not supported
+      // Up to 6 steps are supported -- more than 6 are not supported
+      // Only one coefficient per step -- first order filter
+      // Only even-indexed subsequence in first reconstruction step,
+      //   m_init = 0 is supported
+
+    public: // data structures used by this object
+      struct irv_data {
+        // si8 Oatk;     // only for arbitrary filter
+        // ui8 LCatk;    // number of lifting coefficients in a step
+        float Aatk;      // lifting coefficient
+      };
+
+      struct rev_data {
+        // si8 Oatk;     // only for arbitrary filter, offset of filter
+        ui8 Eatk;        // only for reversible, epsilon, the power of 2
+        si16 Batk;       // only for reversible, beta, the additive residue
+        // ui8 LCatk;    // number of lifting coefficients in a step
+        si16 Aatk;       // lifting coefficient
+      };
+
+      union data {       // one step; read() fills rev if is_reversible()
+        irv_data irv;
+        rev_data rev;
+      };
+
+    public: // member functions
+      param_atk() { init(); }
+      ~param_atk() {
+        if (next) delete next;             // frees the rest of the chain
+        if (d != NULL && d != d_store) {   // d was replaced by a new[] array
+          delete[] d;
+          init(false);
+        }
+      }
+      bool read(infile_base *file);
+      bool read_coefficient(infile_base *file, float &K);
+      bool read_coefficient(infile_base *file, si16 &K);
+      void init(bool clear_all = true) { 
+        if (clear_all)
+          memset(this, 0, sizeof(param_atk));
+        d = d_store; max_steps = sizeof(d_store) / sizeof(data); 
+      }
+
+      ui8 get_index() const { return (ui8)(Satk & 0xFF); }  // low byte
+      int get_coeff_type() const { return (Satk >> 8) & 0x7; }  // bits 8-10
+      bool is_whole_sample() const { return (Satk & 0x800) != 0; }
+      bool is_reversible() const { return (Satk & 0x1000) != 0; }
+      bool is_m_init0() const { return (Satk & 0x2000) == 0; }
+      bool is_using_ws_extension() const { return (Satk & 0x4000) != 0x4000; }
+      const param_atk* get_atk(int index) const;
+      const data* get_step(ui32 s) const { assert(s < Natk); return d + s; }
+      // NOTE(review): is_using_ws_extension() is true if 0x4000 is clear
+    private: // member variables
+      ui16 Latk;         // structure length
+      ui16 Satk;         // carries a variety of information
+      float Katk;        // only for irreversible scaling factor K
+      ui8 Natk;          // number of lifting steps
+      data* d;           // pointer to data, initialized to d_store
+      int max_steps;     // maximum number of steps without memory allocation
+      data d_store[6];   // step coefficient
+      param_atk* next;   // used for chaining if more than one atk segment
+                         // exist in the codestream
+    };
+  } // !local namespace
+} // !ojph namespace
 
 #endif // !OJPH_PARAMS_LOCAL_H
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index fc83bf2b..eb958bfb 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -124,11 +124,14 @@ namespace ojph {
       cur_cb_row = 0;
       cur_line = 0;
       cur_cb_height = 0;
-      param_qcd *qcd = codestream->access_qcd(parent->get_comp_num());
-      this->K_max = qcd->get_Kmax(this->res_num, band_num);
+      param_qcd* qcd = codestream->access_qcd(parent->get_comp_num());
+      const param_cod* cod = codestream->get_cod();
+      int num_decomps = cod->get_num_decompositions();
+      this->K_max = qcd->get_Kmax(NULL, num_decomps, this->res_num, band_num);
       if (!reversible)
       {
-        float d = qcd->irrev_get_delta(res_num, subband_num);
+        float d = 
+          qcd->irrev_get_delta(NULL, num_decomps, res_num, subband_num);
         d /= (float)(1u << (31 - this->K_max));
         delta = d;
         delta_inv = (1.0f/d);
diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h
index fdf28bc2..ff62f0aa 100644
--- a/src/core/common/ojph_version.h
+++ b/src/core/common/ojph_version.h
@@ -34,5 +34,5 @@
 //***************************************************************************/
 
 #define OPENJPH_VERSION_MAJOR 0
-#define OPENJPH_VERSION_MINOR 10
-#define OPENJPH_VERSION_PATCH 5
+#define OPENJPH_VERSION_MINOR 11
+#define OPENJPH_VERSION_PATCH 0

From 4648f913599bde67b2c4763ddfd357adc68b1124 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Fri, 29 Mar 2024 08:04:15 +1100
Subject: [PATCH 02/37] This fixes an issue with the previous commit

---
 src/core/codestream/ojph_params.cpp | 60 ++++++++++++++---------------
 1 file changed, 28 insertions(+), 32 deletions(-)

diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 5243762f..b2b1980e 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -898,8 +898,11 @@ namespace ojph {
       float arr[] = { 1.0f, 2.0f, 2.0f, 4.0f };
       assert((Sqcd & 0x1F) == 2);
 
-      ui32 idx = 
-        dfs->get_subband_idx(num_decompositions, resolution, subband);
+      ui32 idx;
+      if (dfs != NULL && dfs->exists())
+        idx = dfs->get_subband_idx(num_decompositions, resolution, subband);
+      else
+        idx = resolution ? (resolution - 1) * 3 + subband : 0;
       if (idx >= num_subbands) {
         OJPH_INFO(0x00050101, "Trying to access quantization step size for "
           "subband %d when the QCD/QCC marker segment specifies "
@@ -928,8 +931,11 @@ namespace ojph {
                              ui32 resolution, ui32 subband) const
     {
       ui32 num_bits = get_num_guard_bits();
-      ui32 idx = 
-        dfs->get_subband_idx(num_decompositions, resolution, subband);
+      ui32 idx;
+      if (dfs != NULL && dfs->exists())
+        idx = dfs->get_subband_idx(num_decompositions, resolution, subband);
+      else
+        idx = resolution ? (resolution - 1) * 3 + subband : 0;
       if (idx >= num_subbands) {
         OJPH_INFO(0x00050111, "Trying to access quantization step size for "
           "subband %d when the QCD/QCC marker segment specifies "
@@ -1314,10 +1320,10 @@ namespace ojph {
       assert(decomp_level > 0 && decomp_level <= Ids);
 
       decomp_level = ojph_min(decomp_level, Ids);
-      ui8 d = decomp_level - 1;          // decomp_level starts from 1
-      ui8 idx = d >> 2;                  // complete bytes
-      ui8 bits = d & 0x3;                // bit within the bytes
-      ui8 val = (Ddfs[idx] >> (6 - 2 * bits)) & 0x3;
+      ui32 d = decomp_level - 1;          // decomp_level starts from 1
+      ui32 idx = d >> 2;                  // complete bytes
+      ui32 bits = d & 0x3;                // bit within the bytes
+      ui32 val = (Ddfs[idx] >> (6 - 2 * bits)) & 0x3;
       return (dfs_dwt_type)val;
     }
 
@@ -1325,33 +1331,23 @@ namespace ojph {
     int param_dfs::get_subband_idx(ui32 num_decompositions, ui32 resolution,
                                    ui32 subband) const
     {
-      int idx;
-      if (this != NULL)
-      {
-        assert((resolution == 0 && subband == 0) || 
-               (resolution > 0 && resolution <= Ids && 
-                subband > 0 && subband < 4));
+      assert((resolution == 0 && subband == 0) || 
+              (resolution > 0 && resolution <= Ids && 
+              subband > 0 && subband < 4));
 
-        ui32 ns[4] = { 0, 3, 2, 2 };
-        ui32 off[4] = {};
+      ui32 ns[4] = { 0, 3, 2, 2 };
 
-        idx = 0;
-        if (resolution > 0)
-        {
-          idx = 0;
-          ui32 i = 1;
-          for (; i < resolution; ++i)
-            idx += ns[get_dwt_type(num_decompositions - i + 1)];
-          dfs_dwt_type t = get_dwt_type(num_decompositions - i + 1);
-          idx += subband;
-          if (t == VERT_DWT && subband == 2)
-            --idx;
-        }
-      }
-      else 
+      int idx = 0;
+      if (resolution > 0)
       {
-        assert(subband >= 0 && subband < 4);
-        idx = resolution ? (resolution - 1) * 3 + subband : 0;
+        idx = 0;
+        ui32 i = 1;
+        for (; i < resolution; ++i)
+          idx += ns[get_dwt_type(num_decompositions - i + 1)];
+        dfs_dwt_type t = get_dwt_type(num_decompositions - i + 1);
+        idx += subband;
+        if (t == VERT_DWT && subband == 2)
+          --idx;
       }
 
       return idx;

From 1a5925f44c8a4f43d4205885d5bb67ba36d4fdef Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Fri, 29 Mar 2024 08:17:51 +1100
Subject: [PATCH 03/37] More fixes

---
 src/core/codestream/ojph_params.cpp | 48 +++++++++++++++++------------
 1 file changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index b2b1980e..8c2169c3 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -1425,36 +1425,46 @@ namespace ojph {
         K = swap_byte(v);
       }
       else if (coeff_type == 2) { // float
-        if (file->read(&K, 4) != 4) return false;
-        ui32 t = swap_byte(*(ui32*)&K);
-        K = *(float*)&t;
+        union {
+          float f;
+          ui32 i;
+        } v;
+        if (file->read(&v.i, 4) != 4) return false;
+        v.i = swap_byte(v.i);
+        K = v.f;
       }
       else if (coeff_type == 3) { // double
-        double v;
-        if (file->read(&v, 8) != 8) return false;
-        ui64 t = swap_byte(*(ui64*)&v);
-        double u = *(float*)&t;
-        K = (float)u;
+        union {
+          double d;
+          ui64 i;
+        } v;
+        if (file->read(&v.i, 8) != 8) return false;
+        v.i = swap_byte(v.i);
+        K = (float)v.d;
       }
       else if (coeff_type == 4) { // 128 bit float
         ui64 v, v1;
         if (file->read(&v, 8) != 8) return false;
-        if (file->read(&v1, 8) != 8) return false; // not needed
+        if (file->read(&v1, 8) != 8) return false; // v1 not needed
         v = swap_byte(v);
 
+        union {
+          float f;
+          ui32 i;
+        } s;
         // convert the MSB of 128b float to 32b float
         // 32b float has 1 sign bit, 8 exponent (offset 127), 23 mantissa
         // 128b float has 1 sign bit, 15 exponent (offset 16383), 112 mantissa
-        si32 t1 = (si32)((v >> 48) & 0x7FFF); // exponent
-        t1 -= 16383;
-        t1 += 127;
-        t1 = t1 & 0xFF;                      // removes MSBs if negative
-        t1 <<= 23;                           // move bits to their location
-        ui32 t = 0;
-        t |= ((ui32)(v >> 32) & 0x80000000); // copy sign bit
-        t |= t1;                             // copy exponent
-        t |= (ui32)((v >> 25) & 0x007FFFFF); // copy 23 mantissa
-        K = *(float*)&t;
+        si32 e = (si32)((v >> 48) & 0x7FFF);   // exponent
+        e -= 16383;
+        e += 127;
+        e = e & 0xFF;                          // removes MSBs if negative
+        e <<= 23;                              // move bits to their location
+        s.i = 0;
+        s.i |= ((ui32)(v >> 32) & 0x80000000); // copy sign bit
+        s.i |= e;                              // copy exponent
+        s.i |= (ui32)((v >> 25) & 0x007FFFFF); // copy 23 mantissa
+        K = s.f;
       }
       return true;
     }

From 2b38785cd1111072f9e7f43a1caa69670bd677a4 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sat, 30 Mar 2024 18:19:02 +1100
Subject: [PATCH 04/37] Implemented COC. Linked ATK to COD/COC.

---
 src/core/codestream/ojph_codestream_local.cpp |  40 ++--
 src/core/codestream/ojph_codestream_local.h   |   4 +-
 src/core/codestream/ojph_params.cpp           | 112 +++++++++--
 src/core/codestream/ojph_params_local.h       | 178 ++++++++++++------
 src/core/codestream/ojph_resolution.cpp       |   4 +-
 src/core/codestream/ojph_subband.cpp          |   2 +-
 src/core/codestream/ojph_tile.cpp             |   2 +-
 7 files changed, 254 insertions(+), 88 deletions(-)

diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp
index c2154fa0..d4d20a38 100644
--- a/src/core/codestream/ojph_codestream_local.cpp
+++ b/src/core/codestream/ojph_codestream_local.cpp
@@ -758,7 +758,8 @@ namespace ojph {
           skip_marker(file, "CPF", NULL, OJPH_MSG_LEVEL::NO_MSG, false);
         else if (marker_idx == 3)
         { 
-          cod.read(file); received_markers |= 1; 
+          cod.read(file, param_cod::COD_MAIN); 
+          received_markers |= 1;
           ojph::param_cod c(&cod);
           int num_qlayers = c.get_num_layers();
           if (num_qlayers != 1)
@@ -766,21 +767,32 @@ namespace ojph {
               "1 quality layer only.  This codestream has %d quality layers",
               num_qlayers);
         }
-        else if (marker_idx == 4)
-          skip_marker(file, "COC", "COC is not supported yet",
-            OJPH_MSG_LEVEL::WARN, false);
+        else if (marker_idx == 4) 
+        {
+          ui32 num_comps = siz.get_num_components();
+          if (coc == coc_store && 
+              num_comps * sizeof(param_cod) > sizeof(coc_store))
+          {
+            coc = new param_cod[num_comps];
+          }
+          coc[used_coc_fields++].read(
+            file, param_cod::COC_MAIN, num_comps, &cod);
+        }
         else if (marker_idx == 5)
-        { qcd.read(file); received_markers |= 2; }
+        { 
+          qcd.read(file); 
+          received_markers |= 2; 
+        }
         else if (marker_idx == 6)
+        {
+          ui32 num_comps = siz.get_num_components();
+          if (qcc == qcc_store && 
+              num_comps * sizeof(param_qcc) > sizeof(qcc_store))
           {
-            ui32 num_comps = siz.get_num_components();
-            if (qcc == qcc_store && 
-                num_comps * sizeof(param_qcc) > sizeof(qcc_store))
-            {
-              qcc = new param_qcc[num_comps];
-            }
-            qcc[used_qcc_fields++].read(file, num_comps);
+            qcc = new param_qcc[num_comps];
           }
+          qcc[used_qcc_fields++].read(file, num_comps);
+        }
         else if (marker_idx == 7)
           skip_marker(file, "RGN", "RGN is not supported yet",
             OJPH_MSG_LEVEL::WARN, false);
@@ -816,7 +828,9 @@ namespace ojph {
           OJPH_ERROR(0x00030051, "File ended before finding a tile segment");
       }
 
-      //qcd.update(&dfs);
+      cod.update_atk(&atk);
+      for (int i = 0; i < used_coc_fields; ++i)
+        coc[i].update_atk(&atk);
 
       if (received_markers != 3)
         OJPH_ERROR(0x00030052, "markers error, COD and QCD are required");
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index 035b534f..34ffc355 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -163,8 +163,8 @@ namespace ojph {
       param_qcc *qcc;         // quantization component
       param_qcc qcc_store[4]; // we allocate 4, we allocate more if needed
       int used_coc_fields;
-      param_coc *coc;         // coding style component
-      param_coc coc_store[4]; // we allocate 4, we allocate more if needed
+      param_cod *coc;         // coding style component
+      param_cod coc_store[4]; // we allocate 4, we allocate more if needed
 
     private:  // these are from Part 2 of the standard
       param_dfs dfs;         // downsmapling factor styles
diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 8c2169c3..07446c0f 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -275,7 +275,12 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   bool param_cod::is_reversible() const
   {
-    return state->is_reversible();
+    if (state->SPcod.wavelet_trans <= 1)
+      return state->get_wavelet_kern() == local::param_cod::DWT_REV53;
+    else {
+      assert(state->atk != NULL);
+      return state->atk->is_reversible();
+    }
   }
 
   ////////////////////////////////////////////////////////////////////////////
@@ -604,8 +609,9 @@ namespace ojph {
         OJPH_ERROR(0x00050043, "error reading SIZ marker");
       Rsiz = swap_byte(Rsiz);
       if ((Rsiz & 0x4000) == 0)
-        OJPH_ERROR(0x00050044, "Rsiz bit 14 not set (this is not a JPH file)");
-      if (Rsiz & 0xBFFF)
+        OJPH_ERROR(0x00050044, 
+          "Rsiz bit 14 is not set (this is not a JPH file)");
+      if ((Rsiz & 0x8000) != 0 && (Rsiz & 0xF5F) != 0)
         OJPH_WARN(0x00050001, "Rsiz in SIZ has unimplemented fields");
       if (file->read(&Xsiz, 4) != 4)
         OJPH_ERROR(0x00050045, "error reading SIZ marker");
@@ -652,6 +658,9 @@ namespace ojph {
         if (file->read(&cptr[c].YRsiz, 1) != 1)
           OJPH_ERROR(0x00050053, "error reading SIZ marker");
       }
+
+      ws_kern_support_needed = (Rsiz & 0x20) != 0;
+      dfs_support_needed = (Rsiz & 0x80) != 0;
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -720,6 +729,8 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     bool param_cod::write(outfile_base *file)
     {
+      assert(type == COD_MAIN);
+
       //marker size excluding header
       Lcod = 12;
       Lcod = (ui16)(Lcod + (Scod & 1 ? 1 + SPcod.num_decomp : 0));
@@ -758,37 +769,106 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void param_cod::read(infile_base *file)
+    void param_cod::read(infile_base *file, param_cod::cod_type type)
     {
+      assert(this->type == UNDEFINED);
+      assert(type == COD_MAIN);
+
+      this->type = type;
       if (file->read(&Lcod, 2) != 2)
-        OJPH_ERROR(0x00050071, "error reading COD marker");
+        OJPH_ERROR(0x00050071, "error reading COD segment");
       Lcod = swap_byte(Lcod);
       if (file->read(&Scod, 1) != 1)
-        OJPH_ERROR(0x00050072, "error reading COD marker");
+        OJPH_ERROR(0x00050072, "error reading COD segment");
       if (file->read(&SGCod.prog_order, 1) != 1)
-        OJPH_ERROR(0x00050073, "error reading COD marker");
+        OJPH_ERROR(0x00050073, "error reading COD segment");
       if (file->read(&SGCod.num_layers, 2) != 2)
-      { OJPH_ERROR(0x00050074, "error reading COD marker"); }
+      { OJPH_ERROR(0x00050074, "error reading COD segment"); }
       else
         SGCod.num_layers = swap_byte(SGCod.num_layers);
       if (file->read(&SGCod.mc_trans, 1) != 1)
-        OJPH_ERROR(0x00050075, "error reading COD marker");
+        OJPH_ERROR(0x00050075, "error reading COD segment");
       if (file->read(&SPcod.num_decomp, 1) != 1)
-        OJPH_ERROR(0x00050076, "error reading COD marker");
+        OJPH_ERROR(0x00050076, "error reading COD segment");
       if (file->read(&SPcod.block_width, 1) != 1)
-        OJPH_ERROR(0x00050077, "error reading COD marker");
+        OJPH_ERROR(0x00050077, "error reading COD segment");
       if (file->read(&SPcod.block_height, 1) != 1)
-        OJPH_ERROR(0x00050078, "error reading COD marker");
+        OJPH_ERROR(0x00050078, "error reading COD segment");
       if (file->read(&SPcod.block_style, 1) != 1)
-        OJPH_ERROR(0x00050079, "error reading COD marker");
+        OJPH_ERROR(0x00050079, "error reading COD segment");
       if (file->read(&SPcod.wavelet_trans, 1) != 1)
-        OJPH_ERROR(0x0005007A, "error reading COD marker");
+        OJPH_ERROR(0x0005007A, "error reading COD segment");
       if (Scod & 1)
         for (int i = 0; i <= SPcod.num_decomp; ++i)
           if (file->read(&SPcod.precinct_size[i], 1) != 1)
-            OJPH_ERROR(0x0005007B, "error reading COD marker");
+            OJPH_ERROR(0x0005007B, "error reading COD segment");
       if (Lcod != 12 + ((Scod & 1) ? 1 + SPcod.num_decomp : 0))
-        OJPH_ERROR(0x0005007C, "error in COD marker length");
+        OJPH_ERROR(0x0005007C, "error in COD segment length");
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_cod::read(infile_base* file, param_cod::cod_type type, 
+                         ui32 num_comps, param_cod *cod)
+    {
+      assert(this->type == UNDEFINED);
+      assert(type == COC_MAIN);
+      assert(cod != NULL);
+
+      this->type = type;
+      this->SGCod = cod->SGCod;
+      this->parent = cod;
+      if (file->read(&Lcod, 2) != 2)
+        OJPH_ERROR(0x00050121, "error reading COC segment");
+      Lcod = swap_byte(Lcod);
+      if (num_comps < 257) {
+        ui8 t;
+        if (file->read(&t, 1) != 1)
+          OJPH_ERROR(0x00050122, "error reading COC segment");
+        comp_idx = t;
+      }
+      else {
+        if (file->read(&comp_idx, 2) != 2)
+          OJPH_ERROR(0x00050123, "error reading COC segment");
+        comp_idx = swap_byte(comp_idx);
+      }
+      if (file->read(&Scod, 1) != 1)
+        OJPH_ERROR(0x00050124, "error reading COC segment");
+      if (Scod & 0xF8)
+        OJPH_WARN(0x00050011, 
+          "Unsupported options in Scoc field of the COC segment");
+      if (file->read(&SPcod.num_decomp, 1) != 1)
+        OJPH_ERROR(0x00050125, "error reading COC segment");
+      if (file->read(&SPcod.block_width, 1) != 1)
+        OJPH_ERROR(0x00050126, "error reading COC segment");
+      if (file->read(&SPcod.block_height, 1) != 1)
+        OJPH_ERROR(0x00050127, "error reading COC segment");
+      if (file->read(&SPcod.block_style, 1) != 1)
+        OJPH_ERROR(0x00050128, "error reading COC segment");
+      if (file->read(&SPcod.wavelet_trans, 1) != 1)
+        OJPH_ERROR(0x00050129, "error reading COC segment");
+      if (Scod & 1)
+        for (int i = 0; i <= get_num_decompositions(); ++i)
+          if (file->read(&SPcod.precinct_size[i], 1) != 1)
+            OJPH_ERROR(0x0005012A, "error reading COC segment");
+      ui16 t = 9;
+      t += num_comps < 257 ? 0 : 1;
+      t += (Scod & 1) ? 1 + get_num_decompositions() : 0;
+      if (Lcod != t)
+        OJPH_ERROR(0x0005012B, "error in COC segment length");
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_cod::update_atk(const param_atk* atk)
+    {
+      if (SPcod.wavelet_trans > 1) {
+        this->atk = atk->get_atk(SPcod.wavelet_trans);
+        if (this->atk == NULL)
+          OJPH_ERROR(0x00050131, "A COD/COC segment employs the DWT kernel "
+            "atk=%d, but a corresponding ATK segment cannot be found", 
+            SPcod.wavelet_trans);
+      }
+      else
+        this->atk = NULL;
     }
 
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index acfd0347..91447f15 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -112,7 +112,6 @@ namespace ojph {
     struct param_cap;
     struct param_sot;
     struct param_tlm;
-    struct param_coc;
     struct param_dfs;
     struct param_atk;
 
@@ -173,6 +172,7 @@ namespace ojph {
         cptr = store;
         old_Csiz = 4;
         Rsiz = 0x4000; //for jph, bit 14 of Rsiz is 1
+        ws_kern_support_needed = dfs_support_needed = false;
       }
 
       ~param_siz()
@@ -270,6 +270,8 @@ namespace ojph {
         ui32 t = ojph_div_ceil(Ysiz, ds) - ojph_div_ceil(YOsiz, ds);
         return t;
       }
+      bool is_ws_kern_support_needed() { return ws_kern_support_needed; }
+      bool is_dfs_support_needed() { return dfs_support_needed; }
 
     private:
       ui16 Lsiz;
@@ -289,6 +291,8 @@ namespace ojph {
       ui32 skipped_resolutions;
       int old_Csiz;
       siz_comp_info store[4];
+      bool ws_kern_support_needed;
+      bool dfs_support_needed;
       param_siz(const param_siz&) = delete; //prevent copy constructor
       param_siz& operator=(const param_siz&) = delete; //prevent copy
     };
@@ -308,10 +312,18 @@ namespace ojph {
       ui8 block_style;
       ui8 wavelet_trans;
       ui8 precinct_size[33]; //num_decomp is in [0,32]
-    };
 
-    ///////////////////////////////////////////////////////////////////////////
-    typedef cod_SPcod cod_SPcoc;
+      size get_log_block_dims() const
+      { return size(block_width + 2, block_height + 2); }
+      size get_block_dims() const 
+      { size t = get_log_block_dims(); return size(1 << t.w, 1 << t.h); }
+      size get_log_precinct_size(ui32 res_num) const
+      {
+        assert(res_num <= num_decomp);
+        size ps(precinct_size[res_num] & 0xF, precinct_size[res_num] >> 4);
+        return ps;
+      }
+    };
 
     ///////////////////////////////////////////////////////////////////////////
     struct cod_SGcod
@@ -324,38 +336,65 @@ namespace ojph {
     ///////////////////////////////////////////////////////////////////////////
     struct param_cod
     {
+      // serves for both COD and COC markers
+
       friend ::ojph::param_cod;
+      ////////////////////////////////////////
       enum BLOCK_CODING_STYLES {
         VERT_CAUSAL_MODE = 0x8,
         HT_MODE = 0x40
       };
-    public:
+      ////////////////////////////////////////
+      enum cod_type : ui8 {
+        UNDEFINED = 0,
+        COD_MAIN  = 1,
+        COC_MAIN  = 2,
+        COD_TILE  = 3,
+        COC_TILE  = 4
+      };
+      ////////////////////////////////////////
+      enum dwt_type : ui8 {
+        DWT_IRV97 = 0,
+        DWT_REV53 = 1,
+      };
+
+    public: // COD_MAIN and COC_MAIN common functions
+      ////////////////////////////////////////
       param_cod()
       {
         memset(this, 0, sizeof(param_cod));
         SPcod.block_style = HT_MODE;
-        SGCod.prog_order = 2;
+        SGCod.prog_order = OJPH_PO_RPCL;
         SGCod.num_layers = 1;
         SGCod.mc_trans = 0;
         SPcod.num_decomp = 5;
         SPcod.block_width = 4; //64
         SPcod.block_height = 4; //64
-        set_reversible(false);
       }
 
+      ////////////////////////////////////////
       void set_reversible(bool reversible)
       {
-        SPcod.wavelet_trans = reversible ? 1 : 0;
+        assert(type == UNDEFINED || type == COD_MAIN);
+        type = COD_MAIN;
+        SPcod.wavelet_trans = reversible ? DWT_REV53 : DWT_IRV97;
       }
 
+      ////////////////////////////////////////
       void employ_color_transform(ui8 val)
       {
         assert(val == 0 || val == 1);
+        assert(type == UNDEFINED || type == COD_MAIN);
+        type = COD_MAIN;
         SGCod.mc_trans = val;
       }
 
+      ////////////////////////////////////////
       void check_validity(const param_siz& siz)
       {
+        assert(type == UNDEFINED || type == COD_MAIN);
+        type = COD_MAIN;
+
         //check that colour transform and match number of components and
         // downsampling
         int num_comps = siz.get_num_components();
@@ -393,50 +432,97 @@ namespace ojph {
         }
       }
 
+      ////////////////////////////////////////
       ui8 get_num_decompositions() const
-      { return SPcod.num_decomp; }
-      size get_block_dims() const
       {
-        return size(1 << (SPcod.block_width + 2),
-                    1 << (SPcod.block_height + 2));
+        if (type == COD_MAIN)
+          return SPcod.num_decomp;
+        else if (type == COC_MAIN)
+        {
+          if (is_dfs_defined())
+            return parent->get_num_decompositions();
+          else
+            return SPcod.num_decomp;
+        }
+        else {
+          assert(0);
+          return 0; // just in case
+        }
       }
-      bool is_reversible() const
-      { return (SPcod.wavelet_trans == 1); }
+
+      ////////////////////////////////////////
+      size get_block_dims() const
+      { return SPcod.get_block_dims(); }
+
+      ////////////////////////////////////////
+      size get_log_block_dims() const
+      { return SPcod.get_log_block_dims(); }
+
+      ////////////////////////////////////////
+      ui8 get_wavelet_kern() const
+      { return SPcod.wavelet_trans; }
+
+      ////////////////////////////////////////
       bool is_employing_color_transform() const
       { return (SGCod.mc_trans == 1); }
-      size get_log_block_dims() const
-      { return size(SPcod.block_width + 2, SPcod.block_height + 2); }
+
+      ////////////////////////////////////////
       size get_precinct_size(ui32 res_num) const
       {
         size t = get_log_precinct_size(res_num);
-        t.w = 1 << t.w;
-        t.h = 1 << t.h;
-        return t;
+        return size(1 << t.w, 1 << t.h);
       }
+
+      ////////////////////////////////////////
       size get_log_precinct_size(ui32 res_num) const
-      {
-        assert(res_num <= SPcod.num_decomp);
-        size ps(15, 15);
+      { 
         if (Scod & 1)
-        {
-          ps.w = SPcod.precinct_size[res_num] & 0xF;
-          ps.h = SPcod.precinct_size[res_num] >> 4;
-        }
-        return ps;
+          return SPcod.get_log_precinct_size(res_num);
+        else
+          return size(15, 15);
       }
+
+      ////////////////////////////////////////
       bool packets_may_use_sop() const
       { return (Scod & 2) == 2; }
+
+      ////////////////////////////////////////
       bool packets_use_eph() const
       { return (Scod & 4) == 4; }
 
+      ////////////////////////////////////////
       bool write(outfile_base *file);
-      void read(infile_base *file);
 
-    private:
-      ui16 Lcod;
-      ui8 Scod;
-      cod_SGcod SGCod;
-      cod_SPcod SPcod;
+      ////////////////////////////////////////
+      void read(infile_base *file, cod_type type);
+
+      ////////////////////////////////////////
+      void read(infile_base* file, cod_type type, ui32 num_comps, 
+                param_cod* cod);
+
+      ////////////////////////////////////////
+      void update_atk(const param_atk* atk);
+
+    public: // COC_MAIN only functions
+      ////////////////////////////////////////
+      bool is_dfs_defined() const 
+      { return (SPcod.num_decomp & 0x80) != 0; }
+
+      ////////////////////////////////////////
+      ui16 get_dfs_index() const  // cannot be more than 15
+      { return SPcod.num_decomp & 0xF; }
+
+    private: // Common variables
+      cod_type type;        // The type of this cod structure
+      ui16 Lcod;            // serves as Lcod and Lcoc
+      ui8 Scod;             // serves as Scod and Scoc
+      cod_SGcod SGCod;      // Used in COD and copied to COC
+      cod_SPcod SPcod;      // serves as SPcod and SPcoc
+
+    private: // COC only variables
+      param_cod* parent;    // parent COD structure
+      ui16 comp_idx;        // component index of this COC structure
+      const param_atk* atk; // useful when SPcod.wavelet_trans > 1
     };
 
     ///////////////////////////////////////////////////////////////////////////
@@ -452,11 +538,7 @@ namespace ojph {
     public:
       param_qcd()
       { 
-        Lqcd = 0;
-        Sqcd = 0;
-        for (int i = 0; i < 97; ++i)
-          u16_SPqcd[i] = 0;
-        num_subbands = 0;
+        memset(this, 0, sizeof(param_qcd));
         base_delta = -1.0f;
       }
 
@@ -466,7 +548,7 @@ namespace ojph {
       {
         int num_decomps = cod.get_num_decompositions();
         num_subbands = 1 + 3 * num_decomps;
-        if (cod.is_reversible())
+        if (cod.get_wavelet_kern() == param_cod::DWT_REV53)
         {
           ui32 bit_depth = 0;
           for (ui32 i = 0; i < siz.get_num_components(); ++i)
@@ -474,7 +556,7 @@ namespace ojph {
           set_rev_quant(num_decomps, bit_depth,
             cod.is_employing_color_transform());
         }
-        else
+        else if (cod.get_wavelet_kern() == param_cod::DWT_IRV97)
         {
           if (base_delta == -1.0f) {
             ui32 bit_depth = 0;
@@ -485,6 +567,8 @@ namespace ojph {
           }
           set_irrev_quant(num_decomps);
         }
+        else
+          assert(0);
       }
       ui32 get_num_guard_bits() const;
       ui32 get_MAGBp() const;
@@ -554,7 +638,7 @@ namespace ojph {
 
       void check_validity(const param_cod& cod, const param_qcd& qcd)
       {
-        if (cod.is_reversible())
+        if (cod.get_wavelet_kern() == param_cod::DWT_REV53)
           Ccap[0] &= 0xFFDF;
         else
           Ccap[0] |= 0x0020;
@@ -650,18 +734,6 @@ namespace ojph {
       ui32 next_pair_index;
     };
 
-    ///////////////////////////////////////////////////////////////////////////
-    //
-    //
-    //
-    //
-    //
-    ///////////////////////////////////////////////////////////////////////////
-    struct param_coc : public param_cod
-    {
-
-    };
-
     ///////////////////////////////////////////////////////////////////////////
     //
     //
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 82371bd7..0cc7e3b9 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -166,7 +166,7 @@ namespace ojph {
       //allocate lines
       if (skipped_res_for_recon == false)
       {
-        bool reversible = cdp->is_reversible();
+        bool reversible = (cdp->get_wavelet_kern() == param_cod::DWT_REV53);
         ui32 num_lines = reversible ? 4 : 6;
         allocator->pre_alloc_obj<line_buf>(num_lines);
 
@@ -321,7 +321,7 @@ namespace ojph {
       //allocate lines
       if (skipped_res_for_recon == false)
       {
-        this->reversible = cdp->is_reversible();
+        this->reversible = cdp->get_wavelet_kern() == param_cod::DWT_REV53;
         this->num_lines = this->reversible ? 4 : 6;
         lines = allocator->post_alloc_obj<line_buf>(num_lines);
 
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index eb958bfb..ba6c5b96 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -112,7 +112,7 @@ namespace ojph {
       this->parent = res;
 
       const param_cod* cdp = codestream->get_cod();
-      this->reversible = cdp->is_reversible();
+      this->reversible = cdp->get_wavelet_kern() == param_cod::DWT_REV53;
       size log_cb = cdp->get_log_block_dims();
       log_PP = cdp->get_log_precinct_size(res_num);
 
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 0ad4acd3..38bcd686 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -214,7 +214,7 @@ namespace ojph {
 
       //allocate lines
       const param_cod* cdp = codestream->get_cod();
-      this->reversible = cdp->is_reversible();
+      this->reversible = cdp->get_wavelet_kern() == param_cod::DWT_REV53;
       this->employ_color_transform = cdp->is_employing_color_transform();
       if (this->employ_color_transform)
       {

From 0363e40896ed45f8ce826d50053ae2fe17e448d5 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sat, 30 Mar 2024 18:24:20 +1100
Subject: [PATCH 05/37] Warning fix.

---
 src/core/codestream/ojph_params.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 07446c0f..5a76f24c 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -850,7 +850,7 @@ namespace ojph {
         for (int i = 0; i <= get_num_decompositions(); ++i)
           if (file->read(&SPcod.precinct_size[i], 1) != 1)
             OJPH_ERROR(0x0005012A, "error reading COC segment");
-      ui16 t = 9;
+      ui32 t = 9;
       t += num_comps < 257 ? 0 : 1;
       t += (Scod & 1) ? 1 + get_num_decompositions() : 0;
       if (Lcod != t)

From 02f6967ecbb9472227f0154ef93b6ce65e3266a5 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sun, 31 Mar 2024 13:38:21 +1100
Subject: [PATCH 06/37] More param_atk changes: built-in 9/7 and 5/3 kernels.

---
 src/apps/ojph_compress/ojph_compress.cpp      | 98 +++++++++----------
 src/core/codestream/ojph_codestream_local.cpp | 15 ++-
 src/core/codestream/ojph_codestream_local.h   |  5 +-
 src/core/codestream/ojph_params.cpp           | 58 ++++++++---
 src/core/codestream/ojph_params_local.h       | 16 ++-
 src/core/codestream/ojph_resolution.cpp       |  4 +-
 src/core/codestream/ojph_subband.cpp          |  2 +-
 src/core/codestream/ojph_tile.cpp             |  2 +-
 8 files changed, 127 insertions(+), 73 deletions(-)

diff --git a/src/apps/ojph_compress/ojph_compress.cpp b/src/apps/ojph_compress/ojph_compress.cpp
index 42befaff..0ee86f7f 100644
--- a/src/apps/ojph_compress/ojph_compress.cpp
+++ b/src/apps/ojph_compress/ojph_compress.cpp
@@ -917,55 +917,55 @@ int main(int argc, char * argv[]) {
       }
       else if (is_matching(".dpx", v))
       {
-      dpx.open(input_filename);
-      ojph::param_siz siz = codestream.access_siz();
-      siz.set_image_extent(ojph::point(image_offset.x + dpx.get_size().w,
-        image_offset.y + dpx.get_size().h));
-      ojph::ui32 num_comps = dpx.get_num_components();
-      siz.set_num_components(num_comps);
-      //if (num_bit_depths > 0)
-      //  dpx.set_bit_depth(num_bit_depths, bit_depth);
-      for (ojph::ui32 c = 0; c < num_comps; ++c)
-        siz.set_component(c, dpx.get_comp_subsampling(c),
-          dpx.get_bit_depth(c), dpx.get_is_signed(c));
-      siz.set_image_offset(image_offset);
-      siz.set_tile_size(tile_size);
-      siz.set_tile_offset(tile_offset);
-
-      ojph::param_cod cod = codestream.access_cod();
-      cod.set_num_decomposition(num_decompositions);
-      cod.set_block_dims(block_size.w, block_size.h);
-      if (num_precincts != -1)
-        cod.set_precinct_size(num_precincts, precinct_size);
-      cod.set_progression_order(prog_order);
-      if (employ_color_transform == -1 && num_comps >= 3)
-        cod.set_color_transform(true);
-      else
-        cod.set_color_transform(employ_color_transform == 1);
-      cod.set_reversible(reversible);
-      if (!reversible && quantization_step != -1)
-        codestream.access_qcd().set_irrev_quant(quantization_step);
-      codestream.set_planar(false);
-      if (profile_string[0] != '\0')
-        codestream.set_profile(profile_string);
-      codestream.set_tilepart_divisions(tileparts_at_resolutions,
-        tileparts_at_components);
-      codestream.request_tlm_marker(tlm_marker);
-
-      if (dims.w != 0 || dims.h != 0)
-        OJPH_WARN(0x01000071,
-          "-dims option is not needed and was not used\n");
-      if (num_components != 0)
-        OJPH_WARN(0x01000072,
-          "-num_comps is not needed and was not used\n");
-      if (is_signed[0] != -1)
-        OJPH_WARN(0x01000073,
-          "-signed is not needed and was not used\n");
-      if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0)
-        OJPH_WARN(0x01000075,
-          "-downsamp is not needed and was not used\n");
-
-      base = &dpx;
+        dpx.open(input_filename);
+        ojph::param_siz siz = codestream.access_siz();
+        siz.set_image_extent(ojph::point(image_offset.x + dpx.get_size().w,
+          image_offset.y + dpx.get_size().h));
+        ojph::ui32 num_comps = dpx.get_num_components();
+        siz.set_num_components(num_comps);
+        //if (num_bit_depths > 0)
+        //  dpx.set_bit_depth(num_bit_depths, bit_depth);
+        for (ojph::ui32 c = 0; c < num_comps; ++c)
+          siz.set_component(c, dpx.get_comp_subsampling(c),
+            dpx.get_bit_depth(c), dpx.get_is_signed(c));
+        siz.set_image_offset(image_offset);
+        siz.set_tile_size(tile_size);
+        siz.set_tile_offset(tile_offset);
+
+        ojph::param_cod cod = codestream.access_cod();
+        cod.set_num_decomposition(num_decompositions);
+        cod.set_block_dims(block_size.w, block_size.h);
+        if (num_precincts != -1)
+          cod.set_precinct_size(num_precincts, precinct_size);
+        cod.set_progression_order(prog_order);
+        if (employ_color_transform == -1 && num_comps >= 3)
+          cod.set_color_transform(true);
+        else
+          cod.set_color_transform(employ_color_transform == 1);
+        cod.set_reversible(reversible);
+        if (!reversible && quantization_step != -1)
+          codestream.access_qcd().set_irrev_quant(quantization_step);
+        codestream.set_planar(false);
+        if (profile_string[0] != '\0')
+          codestream.set_profile(profile_string);
+        codestream.set_tilepart_divisions(tileparts_at_resolutions,
+          tileparts_at_components);
+        codestream.request_tlm_marker(tlm_marker);
+
+        if (dims.w != 0 || dims.h != 0)
+          OJPH_WARN(0x01000071,
+            "-dims option is not needed and was not used\n");
+        if (num_components != 0)
+          OJPH_WARN(0x01000072,
+            "-num_comps is not needed and was not used\n");
+        if (is_signed[0] != -1)
+          OJPH_WARN(0x01000073,
+            "-signed is not needed and was not used\n");
+        if (comp_downsampling[0].x != 0 || comp_downsampling[0].y != 0)
+          OJPH_WARN(0x01000075,
+            "-downsamp is not needed and was not used\n");
+
+        base = &dpx;
       }
       else
 #if defined( OJPH_ENABLE_TIFF_SUPPORT)
diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp
index d4d20a38..737daffb 100644
--- a/src/core/codestream/ojph_codestream_local.cpp
+++ b/src/core/codestream/ojph_codestream_local.cpp
@@ -84,6 +84,12 @@ namespace ojph {
       used_coc_fields = 0;
       coc = coc_store;
 
+      atk = atk_store;
+      atk[0].init_irv97();
+      atk[0].link(atk_store + 1);
+      atk[1].init_rev53();
+      atk[1].link(atk_store + 2);
+
       allocator = new mem_fixed_allocator;
       elastic_alloc = new mem_elastic_allocator(1048576); //1 megabyte
 
@@ -557,7 +563,8 @@ namespace ojph {
     {
       //finalize
       siz.check_validity();
-      cod.check_validity(siz);
+      cod.check_validity(siz);  
+      cod.update_atk(atk);
       qcd.check_validity(siz, cod);
       cap.check_validity(cod, qcd);
       if (profile == OJPH_PN_IMF)
@@ -821,16 +828,16 @@ namespace ojph {
         else if (marker_idx == 14)
           dfs.read(file);
         else if (marker_idx == 15)
-          atk.read(file);
+          atk[2].read(file);
         else if (marker_idx == 16)
           break;
         else
           OJPH_ERROR(0x00030051, "File ended before finding a tile segment");
       }
 
-      cod.update_atk(&atk);
+      cod.update_atk(atk);
       for (int i = 0; i < used_coc_fields; ++i)
-        coc[i].update_atk(&atk);
+        coc[i].update_atk(atk);
 
       if (received_markers != 3)
         OJPH_ERROR(0x00030052, "markers error, COD and QCD are required");
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index 34ffc355..5bfa09d4 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -168,7 +168,10 @@ namespace ojph {
 
     private:  // these are from Part 2 of the standard
       param_dfs dfs;         // downsmapling factor styles
-      param_atk atk;         // arbitrary transformation kernels
+      param_atk* atk;        // a pointer to atk
+      param_atk atk_store[3];// 0 and 1 are for DWT from Part 1, 2 onward are
+                             // for arbitrary transformation kernels
+
 
     private:
       mem_fixed_allocator *allocator;
diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 5a76f24c..1735c819 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -279,7 +279,7 @@ namespace ojph {
       return state->get_wavelet_kern() == local::param_cod::DWT_REV53;
     else {
       assert(state->atk != NULL);
-      return state->atk->is_reversible();
+      return state->access_atk()->is_reversible();
     }
   }
 
@@ -1578,6 +1578,7 @@ namespace ojph {
         while (p->next != NULL)
           p = p->next;
         p->next = new param_atk;
+        p->alloced_next = true;
         p = p->next;
         return p->read(file);
       }
@@ -1598,11 +1599,14 @@ namespace ojph {
       if (is_reversible() && get_coeff_type() >= 2) // reversible & float
         OJPH_ERROR(0x000500E5, "ATK-Satk parameter does not make sense. "
           "It employs floats with reversible filtering."); 
+      if (is_using_ws_extension() == false)  // only sym. ext is supported
+        OJPH_ERROR(0x000500E6, "ATK-Satk parameter requires constant "
+          "boundary extension, which is not supported yet.");
       if (is_reversible() == false) 
         if (read_coefficient(file, Katk) == false)
-          OJPH_ERROR(0x000500E6, "error reading ATK-Katk parameter"); 
+          OJPH_ERROR(0x000500E7, "error reading ATK-Katk parameter"); 
       if (file->read(&Natk, 1) != 1)
-        OJPH_ERROR(0x000500E7, "error reading ATK-Natk parameter");
+        OJPH_ERROR(0x000500E8, "error reading ATK-Natk parameter");
       if (Natk > max_steps) {
         if (d != d_store) // was this allocated -- very unlikely
           delete[] d;
@@ -1615,21 +1619,21 @@ namespace ojph {
         for (int s = 0; s < Natk; ++s)
         {
           if (file->read(&d[s].rev.Eatk, 1) != 1)
-            OJPH_ERROR(0x000500E8, "error reading ATK-Eatk parameter");           
+            OJPH_ERROR(0x000500E9, "error reading ATK-Eatk parameter");           
           if (file->read(&d[s].rev.Batk, 2) != 2)
-            OJPH_ERROR(0x000500E9, "error reading ATK-Batk parameter");           
+            OJPH_ERROR(0x000500EA, "error reading ATK-Batk parameter");           
           d[s].rev.Batk = (si16)swap_byte((ui16)d[s].rev.Batk);
           ui8 LCatk;
           if (file->read(&LCatk, 1) != 1)
-            OJPH_ERROR(0x000500EA, "error reading ATK-LCatk parameter");
+            OJPH_ERROR(0x000500EB, "error reading ATK-LCatk parameter");
           if (LCatk == 0)
-            OJPH_ERROR(0x000500EB, "Encountered a ATK-LCatk value of zero; "
+            OJPH_ERROR(0x000500EC, "Encountered a ATK-LCatk value of zero; "
               "something is wrong.");
           if (LCatk > 1)
-            OJPH_ERROR(0x000500EC, "ATK-LCatk value greater than 1; "
+            OJPH_ERROR(0x000500ED, "ATK-LCatk value greater than 1; "
               "that is, a multitap filter is not supported");
           if (read_coefficient(file, d[s].rev.Aatk) == false)
-            OJPH_ERROR(0x000500ED, "Error reding ATK-Aatk parameter");
+            OJPH_ERROR(0x000500EE, "Error reding ATK-Aatk parameter");
         }
       }
       else
@@ -1638,19 +1642,47 @@ namespace ojph {
         {
           ui8 LCatk;
           if (file->read(&LCatk, 1) != 1)
-            OJPH_ERROR(0x000500EE, "error reading ATK-LCatk parameter");
+            OJPH_ERROR(0x000500EF, "error reading ATK-LCatk parameter");
           if (LCatk == 0)
-            OJPH_ERROR(0x000500EF, "Encountered a ATK-LCatk value of zero; "
+            OJPH_ERROR(0x000500F0, "Encountered a ATK-LCatk value of zero; "
               "something is wrong.");
           if (LCatk > 1)
-            OJPH_ERROR(0x000500F0, "ATK-LCatk value greater than 1; "
+            OJPH_ERROR(0x000500F1, "ATK-LCatk value greater than 1; "
               "that is, a multitap filter is not supported.");
           if (read_coefficient(file, d[s].irv.Aatk) == false)
-            OJPH_ERROR(0x000500F1, "Error reding ATK-Aatk parameter");
+            OJPH_ERROR(0x000500F2, "Error reding ATK-Aatk parameter");
         }
       }
 
       return true;
     }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_atk::init_irv97()
+    {
+      Satk = 0x4a00;     // illegal because ATK = 0
+      Katk = (float)1.230174104914001;
+      Natk = 4;
+      Latk = 5 + Natk + sizeof(float) * (1 + Natk); // (A-4) in T.801
+      d[0].irv.Aatk = (float)-1.586134342059924;
+      d[1].irv.Aatk = (float)-0.052980118572961;
+      d[2].irv.Aatk = (float)0.882911075530934;
+      d[3].irv.Aatk = (float)0.443506852043971;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void param_atk::init_rev53()
+    {
+      Satk = 0x5801;     // illegal because ATK = 1
+      Natk = 2;
+      Latk = 5 + 2 * Natk + sizeof(ui8) * (Natk + Natk); // (A-4) in T.801
+      d[0].rev.Aatk = -1;
+      d[0].rev.Batk = 0;
+      d[0].rev.Eatk = 1;
+      d[1].rev.Aatk = 1;
+      d[1].rev.Batk = 2;
+      d[1].rev.Eatk = 2;
+    }
+
   } // !local namespace
 }  // !ojph namespace
diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index 91447f15..59425da3 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -503,6 +503,9 @@ namespace ojph {
       ////////////////////////////////////////
       void update_atk(const param_atk* atk);
 
+      ////////////////////////////////////////
+      const param_atk* access_atk() const { return atk; }
+
     public: // COC_MAIN only functions
       ////////////////////////////////////////
       bool is_dfs_defined() const 
@@ -814,7 +817,10 @@ namespace ojph {
     public: // member functions
       param_atk() { init(); }
       ~param_atk() {
-        if (next) delete next;
+        if (next && alloced_next) {
+          delete next;
+          next = NULL;
+        }
         if (d != NULL && d != d_store) {
           delete[] d;
           init(false);
@@ -828,13 +834,17 @@ namespace ojph {
           memset(this, 0, sizeof(param_atk));
         d = d_store; max_steps = sizeof(d_store) / sizeof(data); 
       }
+      void init_irv97();
+      void init_rev53();
+      void link(param_atk* next) 
+      { assert(this->next == NULL); this->next = next; alloced_next = false; }
 
       ui8 get_index() const { return (ui8)(Satk & 0xFF); }
       int get_coeff_type() const { return (Satk >> 8) & 0x7; }
       bool is_whole_sample() const { return (Satk & 0x800) != 0; }
       bool is_reversible() const { return (Satk & 0x1000) != 0; }
       bool is_m_init0() const { return (Satk & 0x2000) == 0; }
-      bool is_using_ws_extension() const { return (Satk & 0x4000) != 0x4000; }
+      bool is_using_ws_extension() const { return (Satk & 0x4000) != 0; }
       const param_atk* get_atk(int index) const;
       const data* get_step(ui32 s) const { assert(s < Natk); return d + s; }
 
@@ -848,6 +858,8 @@ namespace ojph {
       data d_store[6];   // step coefficient
       param_atk* next;   // used for chaining if more than one atk segment
                          // exist in the codestream
+      bool alloced_next; // true if next was allocated, not just set to an
+                         // existing object
     };
   } // !local namespace
 } // !ojph namespace
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 0cc7e3b9..03d1278d 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -166,7 +166,7 @@ namespace ojph {
       //allocate lines
       if (skipped_res_for_recon == false)
       {
-        bool reversible = (cdp->get_wavelet_kern() == param_cod::DWT_REV53);
+        bool reversible = cdp->access_atk()->is_reversible();
         ui32 num_lines = reversible ? 4 : 6;
         allocator->pre_alloc_obj<line_buf>(num_lines);
 
@@ -321,7 +321,7 @@ namespace ojph {
       //allocate lines
       if (skipped_res_for_recon == false)
       {
-        this->reversible = cdp->get_wavelet_kern() == param_cod::DWT_REV53;
+        this->reversible = cdp->access_atk()->is_reversible();
         this->num_lines = this->reversible ? 4 : 6;
         lines = allocator->post_alloc_obj<line_buf>(num_lines);
 
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index ba6c5b96..c65a2ebb 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -112,7 +112,7 @@ namespace ojph {
       this->parent = res;
 
       const param_cod* cdp = codestream->get_cod();
-      this->reversible = cdp->get_wavelet_kern() == param_cod::DWT_REV53;
+      this->reversible = cdp->access_atk()->is_reversible();
       size log_cb = cdp->get_log_block_dims();
       log_PP = cdp->get_log_precinct_size(res_num);
 
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 38bcd686..b7cb52cd 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -214,7 +214,7 @@ namespace ojph {
 
       //allocate lines
       const param_cod* cdp = codestream->get_cod();
-      this->reversible = cdp->get_wavelet_kern() == param_cod::DWT_REV53;
+      this->reversible = cdp->access_atk()->is_reversible();
       this->employ_color_transform = cdp->is_employing_color_transform();
       if (this->employ_color_transform)
       {

From 32f17b5d4d9a95f202b52061c899d2b25fa926a6 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sun, 31 Mar 2024 13:55:11 +1100
Subject: [PATCH 07/37] Bug fix: update_atk always resolves the ATK entry.

---
 src/core/codestream/ojph_params.cpp | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 1735c819..67c8fad3 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -860,15 +860,11 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void param_cod::update_atk(const param_atk* atk)
     {
-      if (SPcod.wavelet_trans > 1) {
-        this->atk = atk->get_atk(SPcod.wavelet_trans);
-        if (this->atk == NULL)
-          OJPH_ERROR(0x00050131, "A COD/COC segment employs the DWT kernel "
-            "atk=%d, but a corresponding ATK segment cannot be found", 
-            SPcod.wavelet_trans);
-      }
-      else
-        this->atk = NULL;
+      this->atk = atk->get_atk(SPcod.wavelet_trans);
+      if (this->atk == NULL)
+        OJPH_ERROR(0x00050131, "A COD/COC segment employs the DWT kernel "
+          "atk=%d, but a corresponding ATK segment cannot be found", 
+          SPcod.wavelet_trans);
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -1663,7 +1659,8 @@ namespace ojph {
       Satk = 0x4a00;     // illegal because ATK = 0
       Katk = (float)1.230174104914001;
       Natk = 4;
-      Latk = 5 + Natk + sizeof(float) * (1 + Natk); // (A-4) in T.801
+      // next is (A-4) in T.801 second line
+      Latk = (ui16)(5 + Natk + sizeof(float) * (1 + Natk));
       d[0].irv.Aatk = (float)-1.586134342059924;
       d[1].irv.Aatk = (float)-0.052980118572961;
       d[2].irv.Aatk = (float)0.882911075530934;
@@ -1675,7 +1672,8 @@ namespace ojph {
     {
       Satk = 0x5801;     // illegal because ATK = 1
       Natk = 2;
-      Latk = 5 + 2 * Natk + sizeof(ui8) * (Natk + Natk); // (A-4) in T.801
+      // next is (A-4) in T.801 fourth line
+      Latk = (ui16)(5 + 2 * Natk + sizeof(ui8) * (Natk + Natk));
       d[0].rev.Aatk = -1;
       d[0].rev.Batk = 0;
       d[0].rev.Eatk = 1;

From abe4ccf67b05eb1356261fed35965300cde13c47 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sun, 31 Mar 2024 14:06:58 +1100
Subject: [PATCH 08/37] Remove warnings.

---
 src/core/codestream/ojph_params.cpp     | 12 ++++++------
 src/core/codestream/ojph_params_local.h | 15 +++++++++------
 src/core/codestream/ojph_subband.cpp    |  2 +-
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 67c8fad3..affa222a 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -876,7 +876,7 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void param_qcd::set_rev_quant(int num_decomps, ui32 bit_depth,
+    void param_qcd::set_rev_quant(ui32 num_decomps, ui32 bit_depth,
                                   bool is_employing_color_transform)
     {
       int guard_bits = 1;
@@ -901,7 +901,7 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void param_qcd::set_irrev_quant(int num_decomps)
+    void param_qcd::set_irrev_quant(ui32 num_decomps)
     {
       int guard_bits = 1;
       Sqcd = (ui8)((guard_bits<<5)|0x2);//one guard bit, scalar quantization
@@ -1404,8 +1404,8 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    int param_dfs::get_subband_idx(ui32 num_decompositions, ui32 resolution,
-                                   ui32 subband) const
+    ui32 param_dfs::get_subband_idx(ui32 num_decompositions, ui32 resolution,
+                                    ui32 subband) const
     {
       assert((resolution == 0 && subband == 0) || 
               (resolution > 0 && resolution <= Ids && 
@@ -1413,7 +1413,7 @@ namespace ojph {
 
       ui32 ns[4] = { 0, 3, 2, 2 };
 
-      int idx = 0;
+      ui32 idx = 0;
       if (resolution > 0)
       {
         idx = 0;
@@ -1538,7 +1538,7 @@ namespace ojph {
         e <<= 23;                              // move bits to their location
         s.i = 0;
         s.i |= ((ui32)(v >> 32) & 0x80000000); // copy sign bit
-        s.i |= e;                              // copy exponent
+        s.i |= (ui32)e;                        // copy exponent
         s.i |= (ui32)((v >> 25) & 0x007FFFFF); // copy 23 mantissa
         K = s.f;
       }
diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index 59425da3..e8e43f8f 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -541,7 +541,10 @@ namespace ojph {
     public:
       param_qcd()
       { 
-        memset(this, 0, sizeof(param_qcd));
+        Lqcd = 0;
+        Sqcd = 0;
+        memset(u16_SPqcd, 0, sizeof(u16_SPqcd));
+        num_subbands = 0;
         base_delta = -1.0f;
       }
 
@@ -549,7 +552,7 @@ namespace ojph {
 
       void check_validity(const param_siz& siz, const param_cod& cod)
       {
-        int num_decomps = cod.get_num_decompositions();
+        ui32 num_decomps = cod.get_num_decompositions();
         num_subbands = 1 + 3 * num_decomps;
         if (cod.get_wavelet_kern() == param_cod::DWT_REV53)
         {
@@ -585,9 +588,9 @@ namespace ojph {
       void read(infile_base *file);
 
     protected:
-      void set_rev_quant(int num_decomps, ui32 bit_depth, 
+      void set_rev_quant(ui32 num_decomps, ui32 bit_depth, 
                          bool is_employing_color_transform);
-      void set_irrev_quant(int num_decomps);
+      void set_irrev_quant(ui32 num_decomps);
 
     protected:
       ui16 Lqcd;
@@ -766,8 +769,8 @@ namespace ojph {
       // decomp_level is the decomposition level, starting from 1 for highest
       // resolution to num_decomps for the coarsest resolution
       dfs_dwt_type get_dwt_type(ui32 decomp_level) const;
-      int get_subband_idx(ui32 num_decompositions, ui32 resolution, 
-                          ui32 subband) const;
+      ui32 get_subband_idx(ui32 num_decompositions, ui32 resolution,
+                           ui32 subband) const;
 
     private: // member variables
       ui16 Ldfs;       // length of the segment marker
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index c65a2ebb..6348e98b 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -126,7 +126,7 @@ namespace ojph {
       cur_cb_height = 0;
       param_qcd* qcd = codestream->access_qcd(parent->get_comp_num());
       const param_cod* cod = codestream->get_cod();
-      int num_decomps = cod->get_num_decompositions();
+      ui32 num_decomps = cod->get_num_decompositions();
       this->K_max = qcd->get_Kmax(NULL, num_decomps, this->res_num, band_num);
       if (!reversible)
       {

From fc377de1de6eabca5b193b6da8b36c5f189a2800 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sun, 31 Mar 2024 14:19:09 +1100
Subject: [PATCH 09/37] Warning/bug fix: read ATK coefficients as signed.

---
 src/core/codestream/ojph_params.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index affa222a..3795d4b8 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -1551,15 +1551,14 @@ namespace ojph {
     {
       int coeff_type = get_coeff_type();
       if (coeff_type == 0) {
-        ui8 v;
+        si8 v;
         if (file->read(&v, 1) != 1) return false;
         K = v;
       }
       else if (coeff_type == 1) {
-        ui16 v;
+        si16 v;
         if (file->read(&v, 2) != 2) return false;
-        v = swap_byte(v);
-        K = v;
+        K = (si16)swap_byte((ui16)v);
       }
       else
         return false;

From 1c08cf3b7a3842611a18d633c76b6c5932f1b600 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Tue, 2 Apr 2024 21:07:54 +1100
Subject: [PATCH 10/37] Added COC. ATK is used for wavelet transform. Modified
 wavelet synthesis (no acceleration yet). Wavelet analysis is broken.  Close
 to enabling DFS in synthesis.

---
 src/core/codestream/ojph_codestream_local.h |  15 +-
 src/core/codestream/ojph_params.cpp         |  10 +-
 src/core/codestream/ojph_params_local.h     |  44 +-
 src/core/codestream/ojph_resolution.cpp     | 811 +++++++++++---------
 src/core/codestream/ojph_resolution.h       |  15 +-
 src/core/codestream/ojph_subband.cpp        |   9 +-
 src/core/codestream/ojph_subband.h          |   2 +-
 src/core/codestream/ojph_tile.cpp           |   2 +-
 src/core/codestream/ojph_tile_comp.cpp      |   7 +-
 src/core/codestream/ojph_tile_comp.h        |   3 +-
 src/core/common/ojph_mem.h                  |  10 +
 src/core/transform/ojph_transform.cpp       | 235 ++++++
 src/core/transform/ojph_transform.h         |  42 +
 src/core/transform/ojph_transform_local.h   |  38 +
 14 files changed, 839 insertions(+), 404 deletions(-)

diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index 5bfa09d4..8e77eb17 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -82,8 +82,19 @@ namespace ojph {
       { return &siz; }
       ojph::param_cod access_cod()            //return externally wrapped cod
       { return ojph::param_cod(&cod); }
-      const param_cod* get_cod() //return internal code
+      const param_cod* get_cod()              //return internal code
       { return &cod; }
+      const param_cod* get_cod(ui32 comp_num) //return internal code
+      { 
+        if (used_coc_fields == 0)
+          return &cod;
+        else {
+          for (int i = 0; i < used_coc_fields; ++i)
+            if (coc[i].get_comp_num() == comp_num)
+              return coc + i;
+          return &cod;
+        }
+      }
       param_qcd* access_qcd(ui32 comp_num)
       { 
         if (used_qcc_fields > 0)
@@ -92,6 +103,8 @@ namespace ojph {
               return qcc + v;
         return &qcd; 
       }
+      const param_dfs* access_dfs()
+      { if (dfs.exists()) return &dfs; else return NULL; }
       mem_fixed_allocator* get_allocator() { return allocator; }
       mem_elastic_allocator* get_elastic_alloc() { return elastic_alloc; }
       outfile_base* get_file() { return outfile; }
diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 3795d4b8..ef652651 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -824,12 +824,12 @@ namespace ojph {
         ui8 t;
         if (file->read(&t, 1) != 1)
           OJPH_ERROR(0x00050122, "error reading COC segment");
-        comp_idx = t;
+        comp_num = t;
       }
       else {
-        if (file->read(&comp_idx, 2) != 2)
+        if (file->read(&comp_num, 2) != 2)
           OJPH_ERROR(0x00050123, "error reading COC segment");
-        comp_idx = swap_byte(comp_idx);
+        comp_num = swap_byte(comp_num);
       }
       if (file->read(&Scod, 1) != 1)
         OJPH_ERROR(0x00050124, "error reading COC segment");
@@ -1393,8 +1393,6 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     param_dfs::dfs_dwt_type param_dfs::get_dwt_type(ui32 decomp_level) const
     { 
-      assert(decomp_level > 0 && decomp_level <= Ids);
-
       decomp_level = ojph_min(decomp_level, Ids);
       ui32 d = decomp_level - 1;          // decomp_level starts from 1
       ui32 idx = d >> 2;                  // complete bytes
@@ -1605,7 +1603,7 @@ namespace ojph {
       if (Natk > max_steps) {
         if (d != d_store) // was this allocated -- very unlikely
           delete[] d;
-        d = new data[Natk];
+        d = new lifting_step[Natk];
         max_steps = Natk;
       }
 
diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index e8e43f8f..c08d750e 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -515,6 +515,10 @@ namespace ojph {
       ui16 get_dfs_index() const  // cannot be more than 15
       { return SPcod.num_decomp & 0xF; }
 
+      ////////////////////////////////////////
+      ui32 get_comp_num()
+      { assert(type == COC_MAIN); return comp_num; }
+
     private: // Common variables
       cod_type type;        // The type of this cod structure
       ui16 Lcod;            // serves as Lcod and Scod
@@ -524,7 +528,7 @@ namespace ojph {
 
     private: // COC only variables
       param_cod* parent;    // parent COD structure
-      ui16 comp_idx;        // component index of this COC structure
+      ui16 comp_num;        // component index of this COC structure
       const param_atk* atk; // useful when SPcod.wavelet_trans > 1
     };
 
@@ -788,16 +792,9 @@ namespace ojph {
     //
     //
     ///////////////////////////////////////////////////////////////////////////
-    struct param_atk
-    {
-      // Limitations:
-      // Arbitrary filters (ARB) are not supported
-      // Up to 6 steps are supported -- more than 6 are not supported
-      // Only one coefficient per step -- first order filter
-      // Only even-indexed subsequence in first reconstruction step,
-      //   m_init = 0 is supported
+    // data structures used by param_atk
 
-    public: // data structures used by this object
+    union lifting_step {
       struct irv_data {
         // si8 Oatk;     // only for arbitrary filter
         // ui8 LCatk;    // number of lifting coefficients in a step
@@ -812,10 +809,18 @@ namespace ojph {
         si16 Aatk;       // lifting coefficient
       };
 
-      union data {
-        irv_data irv;
-        rev_data rev;
-      };
+      irv_data irv;
+      rev_data rev;
+    };
+
+    struct param_atk
+    {
+      // Limitations:
+      // Arbitrary filters (ARB) are not supported
+      // Up to 6 steps are supported -- more than 6 are not supported
+      // Only one coefficient per step -- first order filter
+      // Only even-indexed subsequence in first reconstruction step,
+      //   m_init = 0 is supported
 
     public: // member functions
       param_atk() { init(); }
@@ -835,7 +840,7 @@ namespace ojph {
       void init(bool clear_all = true) { 
         if (clear_all)
           memset(this, 0, sizeof(param_atk));
-        d = d_store; max_steps = sizeof(d_store) / sizeof(data); 
+        d = d_store; max_steps = sizeof(d_store) / sizeof(lifting_step);
       }
       void init_irv97();
       void init_rev53();
@@ -849,16 +854,19 @@ namespace ojph {
       bool is_m_init0() const { return (Satk & 0x2000) == 0; }
       bool is_using_ws_extension() const { return (Satk & 0x4000) != 0; }
       const param_atk* get_atk(int index) const;
-      const data* get_step(ui32 s) const { assert(s < Natk); return d + s; }
+      const lifting_step* get_step(ui32 s) const // s-th lifting step
+      { assert(s < Natk); return d + s; }
+      ui32 get_num_steps() const { return Natk; } // number of lifting steps
+      float get_K() const { return Katk; } // irreversible scaling factor K
 
     private: // member variables
       ui16 Latk;         // structure length
       ui16 Satk;         // carries a variety of information
       float Katk;        // only for irreversible scaling factor K
       ui8 Natk;          // number of lifting steps
-      data* d;           // pointer to data, initialized to d_store
+      lifting_step* d;   // pointer to data, initialized to d_store
       int max_steps;     // maximum number of steps without memory allocation
-      data d_store[6];   // step coefficient
+      lifting_step d_store[6];   // lifting step coefficient
       param_atk* next;   // used for chaining if more than one atk segment
                          // exist in the codestream
       bool alloced_next; // true if next was allocated, not just set to an
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 03d1278d..105c57de 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -98,14 +98,39 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void resolution::pre_alloc(codestream* codestream, const rect& res_rect,
-                               const rect& recon_res_rect, ui32 res_num)
+                               const rect& recon_res_rect, 
+                               ui32 comp_num, ui32 res_num)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
-      const param_cod* cdp = codestream->get_cod();
-      ui32 t = codestream->get_cod()->get_num_decompositions()
+      const param_cod* cdp = codestream->get_cod(comp_num);
+      ui32 t = cdp->get_num_decompositions()
              - codestream->get_skipped_res_for_recon();
       bool skipped_res_for_recon = res_num > t;
 
+      const param_atk* atk = cdp->access_atk(); // param_atk of this COD/COC
+      param_dfs::dfs_dwt_type downsampling_style = param_dfs::BIDIR_DWT;
+      if (cdp->is_dfs_defined()) { // COD/COC refers to a DFS marker
+        const param_dfs* dfs = codestream->access_dfs();
+        if (dfs == NULL) {
+          OJPH_ERROR(0x00070001, "There is a problem with codestream "
+            "marker segments. COD/COC specifies the use of a DFS marker "
+            "but there are no DFS markers within the main codestream "
+            "headers");
+        }
+        else {
+          ui16 dfs_idx = cdp->get_dfs_index();
+          dfs = dfs->get_dfs(dfs_idx); // look up the DFS with this index
+          if (dfs == NULL) {
+            OJPH_ERROR(0x00070002, "There is a problem with codestream "
+              "marker segments. COD/COC specifies the use of a DFS marker "
+              "with index %d, but there are no such marker within the "
+              "main codestream headers", dfs_idx);
+          } // NOTE(review): presumably OJPH_ERROR does not return -- confirm
+          ui32 num_decomps = cdp->get_num_decompositions();
+          downsampling_style = dfs->get_dwt_type(num_decomps - res_num + 1);
+        }
+      }
+
       //create next resolution
       if (res_num > 0)
       {
@@ -122,7 +147,8 @@ namespace ojph {
         next_res_rect.siz.h = try1 - try0;
 
         resolution::pre_alloc(codestream, next_res_rect,
-          skipped_res_for_recon ? recon_res_rect : next_res_rect, res_num - 1);
+          skipped_res_for_recon ? recon_res_rect : next_res_rect, 
+          comp_num, res_num - 1);
       }
 
       //allocate subbands
@@ -145,11 +171,11 @@ namespace ojph {
           band_rect.org.y = tby0;
           band_rect.siz.w = tbx1 - tbx0;
           band_rect.siz.h = tby1 - tby0;
-          subband::pre_alloc(codestream, band_rect, res_num);
+          subband::pre_alloc(codestream, band_rect, comp_num, res_num);
         }
       }
       else
-        subband::pre_alloc(codestream, res_rect, res_num);
+        subband::pre_alloc(codestream, res_rect, comp_num, res_num);
 
       //prealloc precincts
       size log_PP = cdp->get_log_precinct_size(res_num);
@@ -166,13 +192,15 @@ namespace ojph {
       //allocate lines
       if (skipped_res_for_recon == false)
       {
-        bool reversible = cdp->access_atk()->is_reversible();
-        ui32 num_lines = reversible ? 4 : 6;
-        allocator->pre_alloc_obj<line_buf>(num_lines);
+        ui32 num_steps = atk->get_num_steps();
+        allocator->pre_alloc_obj<line_buf>(num_steps + 2); // steps + 2 extra
+        allocator->pre_alloc_obj<lifting_buf>(num_steps + 2);
 
         ui32 width = res_rect.siz.w + 1;
-        for (ui32 i = 0; i < num_lines; ++i)
+        for (ui32 i = 0; i < num_steps; ++i) // one line per lifting step
           allocator->pre_alloc_data<si32>(width, 1);
+        allocator->pre_alloc_data<si32>(width, 1); // plus two more lines,
+        allocator->pre_alloc_data<si32>(width, 1); // matching the + 2 above
       }
     }
 
@@ -187,12 +215,12 @@ namespace ojph {
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
       elastic = codestream->get_elastic_alloc();
-      ui32 t, num_decomps = codestream->get_cod()->get_num_decompositions();
+      const param_cod* cdp = codestream->get_cod(comp_num);
+      ui32 t, num_decomps = cdp->get_num_decompositions();
       t = num_decomps - codestream->get_skipped_res_for_recon();
       skipped_res_for_recon = res_num > t;
       t = num_decomps - codestream->get_skipped_res_for_read();
       skipped_res_for_read = res_num > t;
-      const param_cod* cdp = codestream->get_cod();
 
       this->comp_downsamp = comp_downsamp;
       this->parent_comp = parent_tile_comp;
@@ -201,6 +229,31 @@ namespace ojph {
       this->comp_num = comp_num;
       this->res_num = res_num;
       this->num_bytes = 0;
+      this->atk = cdp->access_atk();
+      this->downsampling_style = param_dfs::BIDIR_DWT; // default style
+      if (cdp->is_dfs_defined()) { // COD/COC refers to a DFS marker
+        const param_dfs* dfs = codestream->access_dfs();
+        if (dfs == NULL) {
+          OJPH_ERROR(0x00070011, "There is a problem with codestream "
+            "marker segments. COD/COC specifies the use of a DFS marker "
+            "but there are no DFS markers within the main codestream "
+            "headers");
+        }
+        else {
+          ui16 dfs_idx = cdp->get_dfs_index();
+          dfs = dfs->get_dfs(dfs_idx); // look up the DFS with this index
+          if (dfs == NULL) {
+            OJPH_ERROR(0x00070012, "There is a problem with codestream "
+              "marker segments. COD/COC specifies the use of a DFS marker "
+              "with index %d, but there are no such marker within the "
+              "main codestream headers", dfs_idx);
+          } // NOTE(review): presumably OJPH_ERROR does not return -- confirm
+          // reuse num_decomps read from cdp above; no shadowing redeclaration
+          this->downsampling_style =
+            dfs->get_dwt_type(num_decomps - res_num + 1);
+        }
+      }
+
       //finalize next resolution
       if (res_num > 0)
       {
@@ -321,13 +374,33 @@ namespace ojph {
       //allocate lines
       if (skipped_res_for_recon == false)
       {
-        this->reversible = cdp->access_atk()->is_reversible();
-        this->num_lines = this->reversible ? 4 : 6;
-        lines = allocator->post_alloc_obj<line_buf>(num_lines);
-
+        this->atk = cdp->access_atk();
+        this->reversible = atk->is_reversible();
+        this->num_steps = atk->get_num_steps();
+        // a line_buf and a lifting_buf per lifting step, plus sig and aug
+        lines = allocator->post_alloc_obj<line_buf>(num_steps + 2);
+        ssp = allocator->post_alloc_obj<lifting_buf>(num_steps + 2);
+        sig = ssp + num_steps;     // second-to-last lifting_buf
+        aug = ssp + num_steps + 1; // last lifting_buf
+
+        // construct each lifting_buf in place and attach its line_buf
+        for (ui32 i = 0; i < num_steps; ++i) {
+          new (ssp + i) lifting_buf;
+          ssp[i].line = lines + i;
+        }
+        new (sig) lifting_buf;
+        sig->line = lines + num_steps;
+        new (aug) lifting_buf;
+        aug->line = lines + num_steps + 1;
+
+        // initialize the storage of every line_buf
         ui32 width = res_rect.siz.w + 1;
-        for (ui32 i = 0; i < num_lines; ++i)
-          lines[i].wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        for (ui32 i = 0; i < num_steps; ++i)
+          ssp[i].line->wrap(
+            allocator->post_alloc_data<si32>(width, 1), width, 1);
+        sig->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        aug->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+
         cur_line = 0;
         vert_even = (res_rect.org.y & 1) == 0;
         horz_even = (res_rect.org.x & 1) == 0;
@@ -340,271 +413,271 @@ namespace ojph {
       if (res_num == 0)
       {
         assert(num_bands == 1 && child_res == NULL);
-        bands[0].exchange_buf(lines + 0);//line at location 0
+        bands[0].exchange_buf(ssp[0].line);//line at location 0
         bands[0].push_line();
         return;
       }
 
-      ui32 width = res_rect.siz.w;
-      if (width == 0)
-        return;
-      if (reversible)
-      {
-        //vertical transform
-        assert(num_lines >= 4);
-        if (vert_even)
-        {
-          rev_vert_wvlt_fwd_predict(lines,
-                                    cur_line > 1 ? lines + 2 : lines,
-                                    lines + 1, width);
-          rev_vert_wvlt_fwd_update(lines + 1,
-                                   cur_line > 2 ? lines + 3 : lines + 1,
-                                   lines + 2, width);
-
-          // push to horizontal transform lines[2](L) and lines[1] (H)
-          if (cur_line >= 1)
-          {
-            rev_horz_wvlt_fwd_tx(lines + 1, bands[2].get_line(),
-              bands[3].get_line(), width, horz_even);
-            bands[2].push_line();
-            bands[3].push_line();
-          }
-          if (cur_line >= 2)
-          {
-            rev_horz_wvlt_fwd_tx(lines + 2, child_res->get_line(),
-              bands[1].get_line(), width, horz_even);
-            bands[1].push_line();
-            child_res->push_line();
-          }
-        }
-
-        if (cur_line >= res_rect.siz.h - 1)
-        { //finished, so we need to process any lines left
-          if (cur_line)
-          {
-            if (vert_even)
-            {
-              rev_vert_wvlt_fwd_update(lines + 1, lines + 1,
-                                       lines, width);
-              //push lines[0] to L
-              rev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
-            }
-            else
-            {
-              rev_vert_wvlt_fwd_predict(lines + 1, lines + 1,
-                                        lines, width);
-              rev_vert_wvlt_fwd_update(lines,
-                                       cur_line > 1 ? lines + 2 : lines,
-                                       lines + 1, width);
-
-              // push to horizontal transform lines[1](L) and line[0] (H)
-              //line[0] to H
-              rev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-              //line[1] to L
-              rev_horz_wvlt_fwd_tx(lines + 1, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
-            }
-          }
-          else
-          { //only one line
-            if (vert_even)
-            {
-              //push to L
-              rev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
-            }
-            else
-            {
-              si32* sp = lines[0].i32;
-              for (ui32 i = width; i > 0; --i)
-                *sp++ <<= 1;
-              //push to H
-              rev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-            }
-          }
-        }
-
-        rotate_buffers(lines, lines + 1, lines + 2, lines + 3);
-
-        ++cur_line;
-        vert_even = !vert_even;
-      }
-      else
-      {
-        //vertical transform
-        assert(num_lines >= 6);
-        if (vert_even)
-        {
-          irrev_vert_wvlt_step(lines + 0,
-                               cur_line > 1 ? lines + 2 : lines,
-                               lines + 1, 0, width);
-          irrev_vert_wvlt_step(lines + 1,
-                               cur_line > 2 ? lines + 3 : lines + 1,
-                               lines + 2, 1, width);
-          irrev_vert_wvlt_step(lines + 2,
-                               cur_line > 3 ? lines + 4 : lines + 2,
-                               lines + 3, 2, width);
-          irrev_vert_wvlt_step(lines + 3,
-                               cur_line > 4 ? lines + 5 : lines + 3,
-                               lines + 4, 3, width);
-
-          // push to horizontal transform lines[4](L) and lines[3] (H)
-          if (cur_line >= 3)
-          {
-            irrev_vert_wvlt_K(lines + 3, lines + 5,
-                              false, width);
-            irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-              bands[3].get_line(), width, horz_even);
-            bands[2].push_line();
-            bands[3].push_line();
-          }
-          if (cur_line >= 4)
-          {
-            irrev_vert_wvlt_K(lines + 4, lines + 5,
-                              true, width);
-            irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-              bands[1].get_line(), width, horz_even);
-            bands[1].push_line();
-            child_res->push_line();
-          }
-        }
-
-        if (cur_line >= res_rect.siz.h - 1)
-        { //finished, so we need to process any left line
-          if (cur_line)
-          {
-            if (vert_even)
-            {
-              irrev_vert_wvlt_step(lines + 1, lines + 1,
-                                   lines, 1, width);
-              irrev_vert_wvlt_step(lines,
-                                   cur_line > 1 ? lines + 2 : lines,
-                                   lines + 1, 2, width);
-              irrev_vert_wvlt_step(lines + 1,
-                                   cur_line > 2 ? lines + 3 : lines + 1,
-                                   lines + 2, 3, width);
-              irrev_vert_wvlt_step(lines + 1, lines + 1,
-                                   lines, 3, width);
-              //push lines[2] to L, lines[1] to H, and lines[0] to L
-              if (cur_line >= 2)
-              {
-                irrev_vert_wvlt_K(lines + 2, lines + 5,
-                                  true, width);
-                irrev_horz_wvlt_fwd_tx(lines + 5,
-                  child_res->get_line(), bands[1].get_line(),
-                  width, horz_even);
-                bands[1].push_line();
-                child_res->push_line();
-              }
-              irrev_vert_wvlt_K(lines + 1, lines + 5,
-                                false, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-              irrev_vert_wvlt_K(lines, lines + 5,
-                                true, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
-            }
-            else
-            {
-              irrev_vert_wvlt_step(lines + 1, lines + 1,
-                                   lines, 0, width);
-              irrev_vert_wvlt_step(lines,
-                                   cur_line > 1 ? lines + 2 : lines,
-                                   lines + 1, 1, width);
-              irrev_vert_wvlt_step(lines + 1,
-                                   cur_line > 2 ? lines + 3 : lines + 1,
-                                   lines + 2, 2, width);
-              irrev_vert_wvlt_step(lines + 2,
-                                   cur_line > 3 ? lines + 4 : lines + 2,
-                                   lines + 3, 3, width);
-
-              irrev_vert_wvlt_step(lines + 1, lines + 1,
-                                   lines, 2, width);
-              irrev_vert_wvlt_step(lines,
-                                   cur_line > 1 ? lines + 2 : lines,
-                                   lines + 1, 3, width);
-
-              //push lines[3] L, lines[2] H, lines[1] L, and lines[0] H
-              if (cur_line >= 3)
-              {
-                irrev_vert_wvlt_K(lines + 3, lines + 5,
-                                  true, width);
-                irrev_horz_wvlt_fwd_tx(lines + 5,
-                  child_res->get_line(), bands[1].get_line(),
-                  width, horz_even);
-                bands[1].push_line();
-                child_res->push_line();
-              }
-              if (cur_line >= 2)
-                irrev_vert_wvlt_K(lines + 2, lines + 5, false, width);
-              else
-                irrev_vert_wvlt_K(lines, lines + 5, false, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-              irrev_vert_wvlt_K(lines + 1, lines + 5,
-                                true, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
-              irrev_vert_wvlt_K(lines, lines + 5,
-                                false, width);
-              irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-            }
-          }
-          else
-          { //only one line
-            if (vert_even)
-            {
-              //push to L
-              irrev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-                bands[1].get_line(), width, horz_even);
-              bands[1].push_line();
-              child_res->push_line();
-            }
-            else
-            {
-              float* sp = lines[0].f32;
-              for (ui32 i = width; i > 0; --i)
-                *sp++ *= 2.0f;
-              //push to H
-              irrev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-                bands[3].get_line(), width, horz_even);
-              bands[2].push_line();
-              bands[3].push_line();
-            }
-          }
-        }
-
-        rotate_buffers(lines, lines + 1, lines + 2, lines + 3, lines + 4, 
-                       lines + 5);
-
-        ++cur_line;
-        vert_even = !vert_even;
-      }
+      //ui32 width = res_rect.siz.w;
+      //if (width == 0)
+      //  return;
+      //if (reversible)
+      //{
+      //  //vertical transform
+      //  assert(num_lines >= 4);
+      //  if (vert_even)
+      //  {
+      //    rev_vert_wvlt_fwd_predict(lines,
+      //                              cur_line > 1 ? lines + 2 : lines,
+      //                              lines + 1, width);
+      //    rev_vert_wvlt_fwd_update(lines + 1,
+      //                             cur_line > 2 ? lines + 3 : lines + 1,
+      //                             lines + 2, width);
+
+      //    // push to horizontal transform lines[2](L) and lines[1] (H)
+      //    if (cur_line >= 1)
+      //    {
+      //      rev_horz_wvlt_fwd_tx(lines + 1, bands[2].get_line(),
+      //        bands[3].get_line(), width, horz_even);
+      //      bands[2].push_line();
+      //      bands[3].push_line();
+      //    }
+      //    if (cur_line >= 2)
+      //    {
+      //      rev_horz_wvlt_fwd_tx(lines + 2, child_res->get_line(),
+      //        bands[1].get_line(), width, horz_even);
+      //      bands[1].push_line();
+      //      child_res->push_line();
+      //    }
+      //  }
+
+      //  if (cur_line >= res_rect.siz.h - 1)
+      //  { //finished, so we need to process any lines left
+      //    if (cur_line)
+      //    {
+      //      if (vert_even)
+      //      {
+      //        rev_vert_wvlt_fwd_update(lines + 1, lines + 1,
+      //                                 lines, width);
+      //        //push lines[0] to L
+      //        rev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
+      //          bands[1].get_line(), width, horz_even);
+      //        bands[1].push_line();
+      //        child_res->push_line();
+      //      }
+      //      else
+      //      {
+      //        rev_vert_wvlt_fwd_predict(lines + 1, lines + 1,
+      //                                  lines, width);
+      //        rev_vert_wvlt_fwd_update(lines,
+      //                                 cur_line > 1 ? lines + 2 : lines,
+      //                                 lines + 1, width);
+
+      //        // push to horizontal transform lines[1](L) and line[0] (H)
+      //        //line[0] to H
+      //        rev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
+      //          bands[3].get_line(), width, horz_even);
+      //        bands[2].push_line();
+      //        bands[3].push_line();
+      //        //line[1] to L
+      //        rev_horz_wvlt_fwd_tx(lines + 1, child_res->get_line(),
+      //          bands[1].get_line(), width, horz_even);
+      //        bands[1].push_line();
+      //        child_res->push_line();
+      //      }
+      //    }
+      //    else
+      //    { //only one line
+      //      if (vert_even)
+      //      {
+      //        //push to L
+      //        rev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
+      //          bands[1].get_line(), width, horz_even);
+      //        bands[1].push_line();
+      //        child_res->push_line();
+      //      }
+      //      else
+      //      {
+      //        si32* sp = lines[0].i32;
+      //        for (ui32 i = width; i > 0; --i)
+      //          *sp++ <<= 1;
+      //        //push to H
+      //        rev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
+      //          bands[3].get_line(), width, horz_even);
+      //        bands[2].push_line();
+      //        bands[3].push_line();
+      //      }
+      //    }
+      //  }
+
+      //  rotate_buffers(lines, lines + 1, lines + 2, lines + 3);
+
+      //  ++cur_line;
+      //  vert_even = !vert_even;
+      //}
+      //else
+      //{
+      //  //vertical transform
+      //  assert(num_lines >= 6);
+      //  if (vert_even)
+      //  {
+      //    irrev_vert_wvlt_step(lines + 0,
+      //                         cur_line > 1 ? lines + 2 : lines,
+      //                         lines + 1, 0, width);
+      //    irrev_vert_wvlt_step(lines + 1,
+      //                         cur_line > 2 ? lines + 3 : lines + 1,
+      //                         lines + 2, 1, width);
+      //    irrev_vert_wvlt_step(lines + 2,
+      //                         cur_line > 3 ? lines + 4 : lines + 2,
+      //                         lines + 3, 2, width);
+      //    irrev_vert_wvlt_step(lines + 3,
+      //                         cur_line > 4 ? lines + 5 : lines + 3,
+      //                         lines + 4, 3, width);
+
+      //    // push to horizontal transform lines[4](L) and lines[3] (H)
+      //    if (cur_line >= 3)
+      //    {
+      //      irrev_vert_wvlt_K(lines + 3, lines + 5,
+      //                        false, width);
+      //      irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
+      //        bands[3].get_line(), width, horz_even);
+      //      bands[2].push_line();
+      //      bands[3].push_line();
+      //    }
+      //    if (cur_line >= 4)
+      //    {
+      //      irrev_vert_wvlt_K(lines + 4, lines + 5,
+      //                        true, width);
+      //      irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
+      //        bands[1].get_line(), width, horz_even);
+      //      bands[1].push_line();
+      //      child_res->push_line();
+      //    }
+      //  }
+
+      //  if (cur_line >= res_rect.siz.h - 1)
+      //  { //finished, so we need to process any left line
+      //    if (cur_line)
+      //    {
+      //      if (vert_even)
+      //      {
+      //        irrev_vert_wvlt_step(lines + 1, lines + 1,
+      //                             lines, 1, width);
+      //        irrev_vert_wvlt_step(lines,
+      //                             cur_line > 1 ? lines + 2 : lines,
+      //                             lines + 1, 2, width);
+      //        irrev_vert_wvlt_step(lines + 1,
+      //                             cur_line > 2 ? lines + 3 : lines + 1,
+      //                             lines + 2, 3, width);
+      //        irrev_vert_wvlt_step(lines + 1, lines + 1,
+      //                             lines, 3, width);
+      //        //push lines[2] to L, lines[1] to H, and lines[0] to L
+      //        if (cur_line >= 2)
+      //        {
+      //          irrev_vert_wvlt_K(lines + 2, lines + 5,
+      //                            true, width);
+      //          irrev_horz_wvlt_fwd_tx(lines + 5,
+      //            child_res->get_line(), bands[1].get_line(),
+      //            width, horz_even);
+      //          bands[1].push_line();
+      //          child_res->push_line();
+      //        }
+      //        irrev_vert_wvlt_K(lines + 1, lines + 5,
+      //                          false, width);
+      //        irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
+      //          bands[3].get_line(), width, horz_even);
+      //        bands[2].push_line();
+      //        bands[3].push_line();
+      //        irrev_vert_wvlt_K(lines, lines + 5,
+      //                          true, width);
+      //        irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
+      //          bands[1].get_line(), width, horz_even);
+      //        bands[1].push_line();
+      //        child_res->push_line();
+      //      }
+      //      else
+      //      {
+      //        irrev_vert_wvlt_step(lines + 1, lines + 1,
+      //                             lines, 0, width);
+      //        irrev_vert_wvlt_step(lines,
+      //                             cur_line > 1 ? lines + 2 : lines,
+      //                             lines + 1, 1, width);
+      //        irrev_vert_wvlt_step(lines + 1,
+      //                             cur_line > 2 ? lines + 3 : lines + 1,
+      //                             lines + 2, 2, width);
+      //        irrev_vert_wvlt_step(lines + 2,
+      //                             cur_line > 3 ? lines + 4 : lines + 2,
+      //                             lines + 3, 3, width);
+
+      //        irrev_vert_wvlt_step(lines + 1, lines + 1,
+      //                             lines, 2, width);
+      //        irrev_vert_wvlt_step(lines,
+      //                             cur_line > 1 ? lines + 2 : lines,
+      //                             lines + 1, 3, width);
+
+      //        //push lines[3] L, lines[2] H, lines[1] L, and lines[0] H
+      //        if (cur_line >= 3)
+      //        {
+      //          irrev_vert_wvlt_K(lines + 3, lines + 5,
+      //                            true, width);
+      //          irrev_horz_wvlt_fwd_tx(lines + 5,
+      //            child_res->get_line(), bands[1].get_line(),
+      //            width, horz_even);
+      //          bands[1].push_line();
+      //          child_res->push_line();
+      //        }
+      //        if (cur_line >= 2)
+      //          irrev_vert_wvlt_K(lines + 2, lines + 5, false, width);
+      //        else
+      //          irrev_vert_wvlt_K(lines, lines + 5, false, width);
+      //        irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
+      //          bands[3].get_line(), width, horz_even);
+      //        bands[2].push_line();
+      //        bands[3].push_line();
+      //        irrev_vert_wvlt_K(lines + 1, lines + 5,
+      //                          true, width);
+      //        irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
+      //          bands[1].get_line(), width, horz_even);
+      //        bands[1].push_line();
+      //        child_res->push_line();
+      //        irrev_vert_wvlt_K(lines, lines + 5,
+      //                          false, width);
+      //        irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
+      //          bands[3].get_line(), width, horz_even);
+      //        bands[2].push_line();
+      //        bands[3].push_line();
+      //      }
+      //    }
+      //    else
+      //    { //only one line
+      //      if (vert_even)
+      //      {
+      //        //push to L
+      //        irrev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
+      //          bands[1].get_line(), width, horz_even);
+      //        bands[1].push_line();
+      //        child_res->push_line();
+      //      }
+      //      else
+      //      {
+      //        float* sp = lines[0].f32;
+      //        for (ui32 i = width; i > 0; --i)
+      //          *sp++ *= 2.0f;
+      //        //push to H
+      //        irrev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
+      //          bands[3].get_line(), width, horz_even);
+      //        bands[2].push_line();
+      //        bands[3].push_line();
+      //      }
+      //    }
+      //  }
+
+      //  rotate_buffers(lines, lines + 1, lines + 2, lines + 3, lines + 4, 
+      //                 lines + 5);
+
+      //  ++cur_line;
+      //  vert_even = !vert_even;
+      //}
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -621,147 +694,159 @@ namespace ojph {
 
       ui32 width = res_rect.siz.w;
       if (width == 0)
-        return lines;
+        return NULL;
       if (reversible)
       {
-        assert(num_lines >= 4);
         if (res_rect.siz.h > 1)
         {
-          do
+          if (sig->active) {
+            sig->active = false;
+            return sig->line;
+          };
+          for (;;)
           {
             //horizontal transform
             if (cur_line < res_rect.siz.h)
             {
-              if (vert_even)
-                rev_horz_wvlt_bwd_tx(lines,
+              if (vert_even) { // even
+                rev_horz_syn(atk, aug->line,
                   child_res->pull_line(), bands[1].pull_line(),
                   width, horz_even);
-              else
-                rev_horz_wvlt_bwd_tx(lines,
+                aug->active = true;
+                vert_even = !vert_even;
+                ++cur_line;
+                continue;
+              }
+              else {
+                rev_horz_syn(atk, sig->line,
                   bands[2].pull_line(), bands[3].pull_line(),
                   width, horz_even);
+                sig->active = true;
+                vert_even = !vert_even;
+                ++cur_line;
+              }
             }
 
             //vertical transform
-            if (!vert_even)
+            for (ui32 i = 0; i < num_steps; ++i)
             {
-              rev_vert_wvlt_bwd_update(
-                cur_line > 1 ? lines + 2 : lines,
-                cur_line < res_rect.siz.h ? lines : lines + 2,
-                lines + 1, width);
-              rev_vert_wvlt_bwd_predict(
-                cur_line > 2 ? lines + 3 : lines + 1,
-                cur_line < res_rect.siz.h + 1 ? lines + 1 : lines + 3,
-                lines + 2, width);
+              if (aug->active &&
+                (sig->active == true || ssp[i].active == true))
+              {
+                line_buf* dp = aug->line;
+                line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                const lifting_step* s = atk->get_step(num_steps - i - 1);
+                rev_vert_syn_step(s, dp, sp1, sp2, width);
+              }
+              lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
             }
 
-            vert_even = !vert_even;
-            rotate_buffers(lines, lines + 1, lines + 2, lines + 3);
-            ++cur_line;
-          } while (cur_line < 3);
-          memcpy(lines[0].i32, lines[3].i32, res_rect.siz.w * sizeof(si32));
-          return lines;
+            if (aug->active) {
+              aug->active = false;
+              return aug->line;
+            }
+            if (sig->active) {
+              sig->active = false;
+              return sig->line;
+            };
+          }
         }
-        else if (res_rect.siz.h == 1)
+        else
         {
           if (vert_even)
-          {
-            rev_horz_wvlt_bwd_tx(lines, child_res->pull_line(),
+            rev_horz_syn(atk, aug->line, child_res->pull_line(),
               bands[1].pull_line(), width, horz_even);
-          }
           else
           {
-            rev_horz_wvlt_bwd_tx(lines, bands[2].pull_line(),
+            rev_horz_syn(atk, aug->line, bands[2].pull_line(),
               bands[3].pull_line(), width, horz_even);
-            if (width)
-            {
-              si32* sp = lines[0].i32;
-              for (ui32 i = width; i > 0; --i)
-                *sp++ >>= 1;
-            }
+            si32* sp = aug->line->i32;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ >>= 1;
           }
-          return lines;
+          return aug->line;
         }
-        else
-          return lines;
       }
       else
       {
-        assert(num_lines >= 6);
         if (res_rect.siz.h > 1)
         {
-          do
+          if (sig->active) {
+            sig->active = false;
+            return sig->line;
+          };
+          for (;;)
           {
             //horizontal transform
             if (cur_line < res_rect.siz.h)
             {
-              if (vert_even)
-              {
-                irrev_horz_wvlt_bwd_tx(lines,
+              if (vert_even) { // even
+                irv_horz_syn(atk, aug->line,
                   child_res->pull_line(), bands[1].pull_line(),
                   width, horz_even);
-                irrev_vert_wvlt_K(lines, lines, false, width);
+                aug->active = true;
+                vert_even = !vert_even;
+                ++cur_line;
+
+                const float K = atk->get_K();
+                irv_vert_syn_K(K, aug->line, width);
+
+                continue;
               }
-              else
-              {
-                irrev_horz_wvlt_bwd_tx(lines,
+              else {
+                irv_horz_syn(atk, sig->line,
                   bands[2].pull_line(), bands[3].pull_line(),
                   width, horz_even);
-                irrev_vert_wvlt_K(lines, lines, true, width);
+                sig->active = true;
+                vert_even = !vert_even;
+                ++cur_line;
+
+                const float K_inv = 1.0f / atk->get_K();
+                irv_vert_syn_K(K_inv, sig->line, width);
               }
             }
 
             //vertical transform
-            if (!vert_even)
+            for (ui32 i = 0; i < num_steps; ++i)
             {
-              irrev_vert_wvlt_step(
-                cur_line > 1 ? lines + 2 : lines,
-                cur_line < res_rect.siz.h ? lines : lines + 2,
-                lines + 1, 7, width);
-              irrev_vert_wvlt_step(
-                cur_line > 2 ? lines + 3 : lines + 1,
-                cur_line < res_rect.siz.h + 1 ? lines + 1 : lines + 3,
-                lines + 2, 6, width);
-              irrev_vert_wvlt_step(
-                cur_line > 3 ? lines + 4 : lines + 2,
-                cur_line < res_rect.siz.h + 2 ? lines + 2 : lines + 4,
-                lines + 3, 5, width);
-              irrev_vert_wvlt_step(
-                cur_line > 4 ? lines + 5 : lines + 3,
-                cur_line < res_rect.siz.h + 3 ? lines + 3 : lines + 5,
-                lines + 4, 4, width);
+              if (aug->active &&
+                (sig->active == true || ssp[i].active == true))
+              {
+                line_buf* dp = aug->line;
+                line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                const lifting_step* s = atk->get_step(num_steps - i - 1);
+                irv_vert_syn_step(s, dp, sp1, sp2, width);
+              }
+              lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
             }
 
-            vert_even = !vert_even;
-            rotate_buffers(lines, lines + 1, lines + 2, lines + 3, lines + 4, 
-                           lines + 5);
-            ++cur_line;
-          } while (cur_line < 5);
-          memcpy(lines[0].f32, lines[5].f32, res_rect.siz.w * sizeof(float));
-          return lines;
+            if (aug->active) {
+              aug->active = false;
+              return aug->line;
+            }
+            if (sig->active) {
+              sig->active = false;
+              return sig->line;
+            };
+          }
         }
-        else if (res_rect.siz.h == 1)
+        else
         {
           if (vert_even)
-          {
-            irrev_horz_wvlt_bwd_tx(lines, child_res->pull_line(),
+            irv_horz_syn(atk, aug->line, child_res->pull_line(),
               bands[1].pull_line(), width, horz_even);
-          }
           else
           {
-            irrev_horz_wvlt_bwd_tx(lines, bands[2].pull_line(),
+            irv_horz_syn(atk, aug->line, bands[2].pull_line(),
               bands[3].pull_line(), width, horz_even);
-            if (width)
-            {
-              float* sp = lines[0].f32;
-              for (ui32 i = width; i > 0; --i)
-                *sp++ *= 0.5f;
-            }
+            float *sp = aug->line->f32;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ *= 0.5f;
           }
-          return lines;
+          return aug->line;
         }
-        else
-          return lines;
       }
     }
 
diff --git a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h
index e110811b..7a7d43d5 100644
--- a/src/core/codestream/ojph_resolution.h
+++ b/src/core/codestream/ojph_resolution.h
@@ -64,14 +64,15 @@ namespace ojph {
 
     public:
       static void pre_alloc(codestream *codestream, const rect& res_rect,
-                            const rect& recon_res_rect, ui32 res_num);
+                            const rect& recon_res_rect, 
+                            ui32 comp_num, ui32 res_num);
       void finalize_alloc(codestream *codestream, const rect& res_rect,
                           const rect& recon_res_rect, ui32 comp_num,
                           ui32 res_num, point comp_downsamp,
                           tile_comp *parent_tile_comp,
                           resolution *parent_res);
 
-      line_buf* get_line() { return lines + 0; }
+      line_buf* get_line() { return ssp[0].line; }
       void push_line();
       line_buf* pull_line();
       rect get_rect() { return res_rect; }
@@ -90,14 +91,16 @@ namespace ojph {
 
     private:
       bool reversible, skipped_res_for_read, skipped_res_for_recon;
-      ui32 num_lines;
+      ui32 num_steps;
       ui32 num_bands, res_num;
       ui32 comp_num;
       ui32 num_bytes; // number of bytes in this resolution 
                       // used for tilepart length
       point comp_downsamp;
-      rect res_rect;
-      line_buf *lines;
+      rect res_rect;                             // resolution rectangle
+      line_buf* lines;                           // used to store lines
+      lifting_buf *ssp;                          // step state pointer
+      lifting_buf *aug, *sig;
       subband *bands;
       tile_comp *parent_comp;
       resolution *parent_res, *child_res;
@@ -109,6 +112,8 @@ namespace ojph {
       int tag_tree_size;
       ui32 level_index[20]; //more than enough
       point cur_precinct_loc; //used for progressing spatial modes (2, 3, 4)
+      const param_atk* atk;
+      param_dfs::dfs_dwt_type downsampling_style;
       //wavelet machinery
       ui32 cur_line;
       bool vert_even, horz_even;
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index 6348e98b..dbef3b75 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -55,7 +55,7 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void subband::pre_alloc(codestream *codestream, const rect &band_rect,
-                            ui32 res_num)
+                            ui32 comp_num, ui32 res_num)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
 
@@ -63,7 +63,7 @@ namespace ojph {
       if (empty)
         return;
 
-      const param_cod* cdp = codestream->get_cod();
+      const param_cod* cdp = codestream->get_cod(comp_num);
       size log_cb = cdp->get_log_block_dims();
       size log_PP = cdp->get_log_precinct_size(res_num);
 
@@ -111,7 +111,7 @@ namespace ojph {
       this->band_rect = band_rect;
       this->parent = res;
 
-      const param_cod* cdp = codestream->get_cod();
+      const param_cod* cdp = codestream->get_cod(parent->get_comp_num());
       this->reversible = cdp->access_atk()->is_reversible();
       size log_cb = cdp->get_log_block_dims();
       log_PP = cdp->get_log_precinct_size(res_num);
@@ -125,8 +125,7 @@ namespace ojph {
       cur_line = 0;
       cur_cb_height = 0;
       param_qcd* qcd = codestream->access_qcd(parent->get_comp_num());
-      const param_cod* cod = codestream->get_cod();
-      ui32 num_decomps = cod->get_num_decompositions();
+      ui32 num_decomps = cdp->get_num_decompositions();
       this->K_max = qcd->get_Kmax(NULL, num_decomps, this->res_num, band_num);
       if (!reversible)
       {
diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h
index 3bcc6edb..9928c5ef 100644
--- a/src/core/codestream/ojph_subband.h
+++ b/src/core/codestream/ojph_subband.h
@@ -64,7 +64,7 @@ namespace ojph {
       friend struct precinct;
     public:
       static void pre_alloc(codestream *codestream, const rect& band_rect,
-                            ui32 res_num);
+                            ui32 comp_num, ui32 res_num);
       void finalize_alloc(codestream *codestream, const rect& band_rect,
                           resolution* res, ui32 res_num, ui32 subband_num);
 
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index b7cb52cd..48f8bb56 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -116,7 +116,7 @@ namespace ojph {
         recon_comp_rect.siz.w = recon_tcx1 - recon_tcx0;
         recon_comp_rect.siz.h = recon_tcy1 - recon_tcy0;
 
-        tile_comp::pre_alloc(codestream, comp_rect, recon_comp_rect);
+        tile_comp::pre_alloc(codestream, i, comp_rect, recon_comp_rect);
         width = ojph_max(width, recon_comp_rect.siz.w);
       }
 
diff --git a/src/core/codestream/ojph_tile_comp.cpp b/src/core/codestream/ojph_tile_comp.cpp
index a2124e8b..69ed0bcb 100644
--- a/src/core/codestream/ojph_tile_comp.cpp
+++ b/src/core/codestream/ojph_tile_comp.cpp
@@ -51,7 +51,8 @@ namespace ojph {
   {
 
     //////////////////////////////////////////////////////////////////////////
-    void tile_comp::pre_alloc(codestream *codestream, const rect& comp_rect,
+    void tile_comp::pre_alloc(codestream *codestream, ui32 comp_num, 
+                              const rect& comp_rect,
                               const rect& recon_comp_rect)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
@@ -60,7 +61,7 @@ namespace ojph {
       ui32 num_decomps = codestream->access_cod().get_num_decompositions();
       allocator->pre_alloc_obj<resolution>(1);
 
-      resolution::pre_alloc(codestream, comp_rect, recon_comp_rect, 
+      resolution::pre_alloc(codestream, comp_rect, recon_comp_rect, comp_num, 
                             num_decomps);
     }
 
@@ -72,7 +73,7 @@ namespace ojph {
       mem_fixed_allocator* allocator = codestream->get_allocator();
 
       //allocate a resolution
-      num_decomps = codestream->get_cod()->get_num_decompositions();
+      num_decomps = codestream->get_cod(comp_num)->get_num_decompositions();
 
       comp_downsamp = codestream->get_siz()->get_downsampling(comp_num);
       this->comp_rect = comp_rect;
diff --git a/src/core/codestream/ojph_tile_comp.h b/src/core/codestream/ojph_tile_comp.h
index d7304d96..def39e55 100644
--- a/src/core/codestream/ojph_tile_comp.h
+++ b/src/core/codestream/ojph_tile_comp.h
@@ -62,7 +62,8 @@ namespace ojph {
     class tile_comp
     {
     public:
-      static void pre_alloc(codestream *codestream, const rect& comp_rect,
+      static void pre_alloc(codestream *codestream, ui32 comp_num, 
+                            const rect& comp_rect,
                             const rect& recon_comp_rect);
       void finalize_alloc(codestream *codestream, tile *parent,
                           ui32 comp_num, const rect& comp_rect,
diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h
index 712727c0..d7497cdb 100644
--- a/src/core/common/ojph_mem.h
+++ b/src/core/common/ojph_mem.h
@@ -134,6 +134,8 @@ namespace ojph {
   /////////////////////////////////////////////////////////////////////////////
   struct line_buf
   {
+    line_buf() : size(0), pre_size(0), i32(0) {}
+
     template<typename T>
     void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size)
     {
@@ -157,6 +159,14 @@ namespace ojph {
     };
   };
 
+  /////////////////////////////////////////////////////////////////////////////
+  struct lifting_buf // pairs a line buffer pointer with an "active" flag
+  {
+    lifting_buf() : line(NULL), active(false) {}
+    line_buf *line;  // NULL until a line is attached
+    bool active;     // starts out false
+  };
+
   /////////////////////////////////////////////////////////////////////////////
   struct coded_lists
   {
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index b6919032..46231d63 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -41,6 +41,8 @@
 #include "ojph_mem.h"
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
 
 namespace ojph {
   struct line_buf;
@@ -81,6 +83,24 @@ namespace ojph {
       (line_buf* dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even)
       = NULL;
 
+
+
+
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*rev_vert_syn_step) // NULL until gen_rev_vert_syn_step is assigned
+      (const lifting_step* s, line_buf* aug, const line_buf* sig, 
+        line_buf* other, ui32 repeat) = NULL;
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*rev_horz_syn) // NULL until gen_rev_horz_syn is assigned
+      (const param_atk* atk, line_buf* dst, line_buf* lsrc,
+        line_buf* hsrc, ui32 width, bool even) = NULL;
+
+
+
+
+    
     /////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     /////////////////////////////////////////////////////////////////////////
@@ -105,6 +125,27 @@ namespace ojph {
       (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
       = NULL;
 
+
+
+
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*irv_vert_syn_step) // NULL until gen_irv_vert_syn_step is assigned
+      (const lifting_step* s, line_buf* aug, const line_buf* sig,
+        line_buf* other, ui32 repeat) = NULL;
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*irv_vert_syn_K)(const float K, line_buf* aug, ui32 repeat) = NULL;
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*irv_horz_syn) // NULL until gen_irv_horz_syn is assigned
+      (const param_atk* atk, line_buf* dst, line_buf* lsrc,
+        line_buf* hsrc, ui32 width, bool even) = NULL;
+
+
+
+
+
     ////////////////////////////////////////////////////////////////////////////
     static bool wavelet_transform_functions_initialized = false;
 
@@ -122,11 +163,19 @@ namespace ojph {
       rev_vert_wvlt_bwd_predict = gen_rev_vert_wvlt_bwd_predict;
       rev_vert_wvlt_bwd_update  = gen_rev_vert_wvlt_bwd_update;
       rev_horz_wvlt_bwd_tx      = gen_rev_horz_wvlt_bwd_tx;
+
+      rev_vert_syn_step         = gen_rev_vert_syn_step;
+      rev_horz_syn              = gen_rev_horz_syn;
+
       irrev_vert_wvlt_step      = gen_irrev_vert_wvlt_step;
       irrev_vert_wvlt_K         = gen_irrev_vert_wvlt_K;
       irrev_horz_wvlt_fwd_tx    = gen_irrev_horz_wvlt_fwd_tx;
       irrev_horz_wvlt_bwd_tx    = gen_irrev_horz_wvlt_bwd_tx;
 
+      irv_vert_syn_step         = gen_irv_vert_syn_step;
+      irv_vert_syn_K            = gen_irv_vert_syn_K;
+      irv_horz_syn              = gen_irv_horz_syn;
+
 #ifndef OJPH_DISABLE_INTEL_SIMD
       int level = get_cpu_ext_level();
 
@@ -326,6 +375,96 @@ namespace ojph {
     }
 
 
+
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // Undoes one reversible lifting step on a line: for each of the repeat
+    // samples, aug -= (Batk + Aatk * (sig + other)) >> Eatk. The same
+    // expression is used for either sign of Aatk, because negating the
+    // operand of >> changes how the result is rounded.
+    void gen_rev_vert_syn_step(const lifting_step* s, line_buf* aug,
+                               const line_buf* sig, line_buf* other,
+                               ui32 repeat)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui32 e = s->rev.Eatk;
+
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      for (ui32 i = repeat; i > 0; --i)
+        *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Reversible horizontal synthesis: the atk lifting steps are undone, last
+    // step first, in place on lsrc and hsrc (each step also writes one sample
+    // just outside each end of its source line); dst is their interleaving.
+    void gen_rev_horz_syn(const param_atk *atk, line_buf* dst, line_buf *lsrc,
+                          line_buf *hsrc, ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        bool ev = even;
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // steps are undone in reverse order
+          const lifting_step* s = atk->get_step(j - 1);
+          si32 a = s->rev.Aatk;
+          si32 b = s->rev.Batk;
+          ui32 e = s->rev.Eatk;
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step; one expression serves either sign of Aatk, since
+          // negating the operand of >> would change how the result rounds
+          const si32* sp = oth + (ev ? 0 : 1);
+          si32* dp = aug;
+          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+            *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+
+          // swap buffers
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        si32* sph = hsrc->i32;
+        si32* spl = lsrc->i32;
+        si32* dp = dst->i32;
+        ui32 w = width;
+        if (!even)
+        {
+          *dp++ = *sph++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dp++ = *spl++; *dp++ = *sph++;
+        }
+        if (w)
+        {
+          *dp++ = *spl++; --w;
+        }
+      }
+      else {
+        if (even)
+          dst->i32[0] = lsrc->i32[0];
+        else
+          dst->i32[0] = hsrc->i32[0] >> 1;
+      }
+    }
+
+
+
+
+
     //////////////////////////////////////////////////////////////////////////
     void gen_irrev_vert_wvlt_step(const line_buf* line_src1,
                                   const line_buf* line_src2,
@@ -499,6 +638,102 @@ namespace ojph {
       }
     }
 
+
+
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // Irreversible step: aug[i] -= Aatk * (sig[i] + other[i]), i < repeat
+    void gen_irv_vert_syn_step(const lifting_step* s, line_buf* aug,
+                               const line_buf* sig, line_buf* other,
+                               ui32 repeat)
+    {
+      const float coeff = s->irv.Aatk;
+      float* out = aug->f32;
+      const float* in1 = sig->f32, * in2 = other->f32;
+      for (ui32 k = 0; k < repeat; ++k)
+        out[k] -= coeff * (in1[k] + in2[k]);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_syn_K(const float K, line_buf* aug, ui32 repeat)
+    { // scales the first repeat float samples of aug by K, in place
+      float* const samples = aug->f32;
+      for (ui32 k = 0; k < repeat; ++k)
+        samples[k] *= K;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Irreversible horizontal synthesis. In place, lsrc is scaled by K and
+    // hsrc by 1/K, then the atk lifting steps are undone, last step first;
+    // the two lines are finally interleaved into dst.
+    void gen_irv_horz_syn(const param_atk* atk, line_buf* dst, line_buf* lsrc,
+                          line_buf* hsrc, ui32 width, bool even)
+    {
+      if (width <= 1)
+      { // a lone sample skips the lifting steps altogether
+        dst->f32[0] = even ? lsrc->f32[0] : hsrc->f32[0] * 0.5f;
+        return;
+      }
+
+      float* target = lsrc->f32;  // line modified by the current step
+      float* source = hsrc->f32;  // line read by the current step
+      ui32 target_len = (width + (even ? 1 : 0)) >> 1;  // low pass
+      ui32 source_len = (width + (even ? 0 : 1)) >> 1;  // high pass
+      bool target_leads = even;
+
+      // scaling: low pass by K, high pass by 1/K
+      const float K = atk->get_K();
+      const float K_inv = 1.0f / K;
+      for (ui32 i = 0; i < target_len; ++i)
+        target[i] *= K;
+      for (ui32 i = 0; i < source_len; ++i)
+        source[i] *= K_inv;
+
+      for (ui32 j = atk->get_num_steps(); j > 0; --j)
+      {
+        const float a = atk->get_step(j - 1)->irv.Aatk;
+
+        // extend the source line by one sample on either side
+        source[-1] = source[0];
+        source[source_len] = source[source_len - 1];
+
+        // nb[i] and nb[i + 1] are the source samples around target[i]
+        const float* nb = source + (target_leads ? 0 : 1) - 1;
+        for (ui32 i = 0; i < target_len; ++i)
+          target[i] -= a * (nb[i] + nb[i + 1]);
+
+        // the two lines trade roles for the next step
+        float* tp = target; target = source; source = tp;
+        ui32 tl = target_len; target_len = source_len; source_len = tl;
+        target_leads = !target_leads;
+      }
+
+      // interleave lsrc and hsrc into dst; hsrc goes first when !even
+      const float* hp = hsrc->f32;
+      const float* lp = lsrc->f32;
+      float* out = dst->f32;
+      ui32 left = width;
+      if (!even)
+      {
+        *out++ = *hp++;
+        --left;
+      }
+      while (left > 1)
+      {
+        *out++ = *lp++;
+        *out++ = *hp++;
+        left -= 2;
+      }
+      // an odd remainder ends on a low-pass sample
+      if (left)
+        *out++ = *lp++;
+    }
+
+
+
+
+
 #endif // !OJPH_ENABLE_WASM_SIMD
 
   }
diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h
index 002235d3..77ede96f 100644
--- a/src/core/transform/ojph_transform.h
+++ b/src/core/transform/ojph_transform.h
@@ -44,6 +44,8 @@
 namespace ojph {
   struct line_buf;
   namespace local {
+    union lifting_step;
+    struct param_atk;
 
     //////////////////////////////////////////////////////////////////////////
     void init_wavelet_transform_functions();
@@ -80,6 +82,24 @@ namespace ojph {
     extern void (*rev_horz_wvlt_bwd_tx)
       (line_buf* dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even);
 
+
+
+
+
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*rev_vert_syn_step)
+      (const lifting_step* s, line_buf* aug, const line_buf* sig, 
+        line_buf* other, ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*rev_horz_syn)
+      (const param_atk* atk, line_buf* dst, line_buf* lsrc,
+        line_buf* hsrc, ui32 width, bool even);
+
+
+
+
+
     /////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     /////////////////////////////////////////////////////////////////////////
@@ -102,6 +122,28 @@ namespace ojph {
     extern void (*irrev_horz_wvlt_bwd_tx)
       (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even);
 
+
+
+
+
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*irv_vert_syn_step)
+      (const lifting_step* s, line_buf* aug, const line_buf* sig, 
+        line_buf* other, ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*irv_vert_syn_K)
+      (const float K, line_buf* aug, ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*irv_horz_syn)
+      (const param_atk* atk, line_buf* dst, line_buf* lsrc,
+        line_buf* hsrc, ui32 width, bool even);
+
+
+
+
+
   }
 }
 
diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h
index 2bf041c8..42cec378 100644
--- a/src/core/transform/ojph_transform_local.h
+++ b/src/core/transform/ojph_transform_local.h
@@ -44,6 +44,7 @@
 namespace ojph {
   struct line_buf;
   namespace local {
+    struct param_atk;
 
     //////////////////////////////////////////////////////////////////////////
     struct LIFTING_FACTORS
@@ -93,6 +94,23 @@ namespace ojph {
     void gen_rev_horz_wvlt_bwd_tx(line_buf* dst, line_buf *lsrc,
                                   line_buf *hsrc, ui32 width, bool even);
 
+
+
+
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_vert_syn_step(const lifting_step* s, line_buf* aug,
+                               const line_buf* sig, line_buf* other,
+                               ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_horz_syn(const param_atk *atk, line_buf* dst, line_buf *lsrc,
+                          line_buf *hsrc, ui32 width, bool even);
+
+
+
+
+
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
@@ -113,6 +131,26 @@ namespace ojph {
     void gen_irrev_horz_wvlt_bwd_tx(line_buf* src, line_buf *ldst,
                                     line_buf *hdst, ui32 width, bool even);
 
+
+
+
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_syn_step(const lifting_step* s, line_buf* aug,
+                               const line_buf* sig, line_buf* other,
+                               ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_syn_K(const float K, line_buf* aug, ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_horz_syn(const param_atk *atk, line_buf* dst, line_buf *lsrc,
+                          line_buf *hsrc, ui32 width, bool even);
+
+
+
+
+
     //////////////////////////////////////////////////////////////////////////
     //
     //

From 2c74db3ab52487b2d74c63e58bce5c89f0547e02 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Tue, 2 Apr 2024 21:16:57 +1100
Subject: [PATCH 11/37] Syntax error fix.

---
 src/core/codestream/ojph_params_local.h |  4 +--
 src/core/codestream/ojph_resolution.cpp | 43 +------------------------
 2 files changed, 3 insertions(+), 44 deletions(-)

diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index c08d750e..43c1181d 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -856,8 +856,8 @@ namespace ojph {
       const param_atk* get_atk(int index) const;
       const lifting_step* get_step(ui32 s) const 
       { assert(s < Natk); return d + s; }
-      const ui32 get_num_steps() const { return Natk; }
-      const float get_K() const { return Katk; }
+      ui32 get_num_steps() const { return Natk; }
+      float get_K() const { return Katk; }
 
     private: // member variables
       ui16 Latk;         // structure length
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 105c57de..7f226445 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -38,6 +38,7 @@
 
 #include <climits>
 #include <cmath>
+#include <new>
 
 #include "ojph_mem.h"
 #include "ojph_params.h"
@@ -54,48 +55,6 @@ namespace ojph {
 
   namespace local
   {
-
-    //////////////////////////////////////////////////////////////////////////
-    static void rotate_buffers(line_buf* line1, line_buf* line2,
-                               line_buf* line3, line_buf* line4)
-    {
-      assert(line1->size == line2->size &&
-             line1->pre_size == line2->pre_size &&
-             line1->size == line3->size &&
-             line1->pre_size == line3->pre_size &&
-             line1->size == line4->size &&
-             line1->pre_size == line4->pre_size);
-      si32* p = line4->i32;
-      line4->i32 = line3->i32;
-      line3->i32 = line2->i32;
-      line2->i32 = line1->i32;
-      line1->i32 = p;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    static void rotate_buffers(line_buf* line1, line_buf* line2,
-                               line_buf* line3, line_buf* line4,
-                               line_buf* line5, line_buf* line6)
-    {
-      assert(line1->size == line2->size &&
-             line1->pre_size == line2->pre_size &&
-             line1->size == line3->size &&
-             line1->pre_size == line3->pre_size &&
-             line1->size == line4->size &&
-             line1->pre_size == line4->pre_size &&
-             line1->size == line5->size &&
-             line1->pre_size == line5->pre_size &&
-             line1->size == line6->size &&
-             line1->pre_size == line6->pre_size);
-      si32* p = line6->i32;
-      line6->i32 = line5->i32;
-      line5->i32 = line4->i32;
-      line4->i32 = line3->i32;
-      line3->i32 = line2->i32;
-      line2->i32 = line1->i32;
-      line1->i32 = p;
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void resolution::pre_alloc(codestream* codestream, const rect& res_rect,
                                const rect& recon_res_rect, 

From be39386e13e426e5868fce6563f2520d6b4cd10a Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Thu, 4 Apr 2024 06:49:08 +1100
Subject: [PATCH 12/37] reversible analysis is working; irreversible not.  More
 testing is needed.

---
 src/core/codestream/ojph_resolution.cpp   | 430 ++++++++--------------
 src/core/codestream/ojph_resolution.h     |   2 +-
 src/core/transform/ojph_transform.cpp     | 242 ++++++++++--
 src/core/transform/ojph_transform.h       |  41 ++-
 src/core/transform/ojph_transform_local.h |  39 +-
 5 files changed, 442 insertions(+), 312 deletions(-)

diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 105c57de..f28cfd5d 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -407,277 +407,175 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    line_buf* resolution::get_line()
+    {
+      // Returns the buffer for the next input row and counts that row in
+      // cur_line; vert_even selects sig, otherwise aug.  The chosen buffer
+      // is flagged active, which push_line() checks before lifting.
+      ++cur_line;
+
+      if (!vert_even)
+      {
+        aug->active = true;
+        return aug->line;
+      }
+      sig->active = true;
+      return sig->line;
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void resolution::push_line()
     {
       if (res_num == 0)
       {
         assert(num_bands == 1 && child_res == NULL);
-        bands[0].exchange_buf(ssp[0].line);//line at location 0
+        bands[0].exchange_buf(vert_even ? sig->line : aug->line);
         bands[0].push_line();
         return;
       }
 
-      //ui32 width = res_rect.siz.w;
-      //if (width == 0)
-      //  return;
-      //if (reversible)
-      //{
-      //  //vertical transform
-      //  assert(num_lines >= 4);
-      //  if (vert_even)
-      //  {
-      //    rev_vert_wvlt_fwd_predict(lines,
-      //                              cur_line > 1 ? lines + 2 : lines,
-      //                              lines + 1, width);
-      //    rev_vert_wvlt_fwd_update(lines + 1,
-      //                             cur_line > 2 ? lines + 3 : lines + 1,
-      //                             lines + 2, width);
-
-      //    // push to horizontal transform lines[2](L) and lines[1] (H)
-      //    if (cur_line >= 1)
-      //    {
-      //      rev_horz_wvlt_fwd_tx(lines + 1, bands[2].get_line(),
-      //        bands[3].get_line(), width, horz_even);
-      //      bands[2].push_line();
-      //      bands[3].push_line();
-      //    }
-      //    if (cur_line >= 2)
-      //    {
-      //      rev_horz_wvlt_fwd_tx(lines + 2, child_res->get_line(),
-      //        bands[1].get_line(), width, horz_even);
-      //      bands[1].push_line();
-      //      child_res->push_line();
-      //    }
-      //  }
-
-      //  if (cur_line >= res_rect.siz.h - 1)
-      //  { //finished, so we need to process any lines left
-      //    if (cur_line)
-      //    {
-      //      if (vert_even)
-      //      {
-      //        rev_vert_wvlt_fwd_update(lines + 1, lines + 1,
-      //                                 lines, width);
-      //        //push lines[0] to L
-      //        rev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-      //          bands[1].get_line(), width, horz_even);
-      //        bands[1].push_line();
-      //        child_res->push_line();
-      //      }
-      //      else
-      //      {
-      //        rev_vert_wvlt_fwd_predict(lines + 1, lines + 1,
-      //                                  lines, width);
-      //        rev_vert_wvlt_fwd_update(lines,
-      //                                 cur_line > 1 ? lines + 2 : lines,
-      //                                 lines + 1, width);
-
-      //        // push to horizontal transform lines[1](L) and line[0] (H)
-      //        //line[0] to H
-      //        rev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-      //          bands[3].get_line(), width, horz_even);
-      //        bands[2].push_line();
-      //        bands[3].push_line();
-      //        //line[1] to L
-      //        rev_horz_wvlt_fwd_tx(lines + 1, child_res->get_line(),
-      //          bands[1].get_line(), width, horz_even);
-      //        bands[1].push_line();
-      //        child_res->push_line();
-      //      }
-      //    }
-      //    else
-      //    { //only one line
-      //      if (vert_even)
-      //      {
-      //        //push to L
-      //        rev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-      //          bands[1].get_line(), width, horz_even);
-      //        bands[1].push_line();
-      //        child_res->push_line();
-      //      }
-      //      else
-      //      {
-      //        si32* sp = lines[0].i32;
-      //        for (ui32 i = width; i > 0; --i)
-      //          *sp++ <<= 1;
-      //        //push to H
-      //        rev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-      //          bands[3].get_line(), width, horz_even);
-      //        bands[2].push_line();
-      //        bands[3].push_line();
-      //      }
-      //    }
-      //  }
-
-      //  rotate_buffers(lines, lines + 1, lines + 2, lines + 3);
-
-      //  ++cur_line;
-      //  vert_even = !vert_even;
-      //}
-      //else
-      //{
-      //  //vertical transform
-      //  assert(num_lines >= 6);
-      //  if (vert_even)
-      //  {
-      //    irrev_vert_wvlt_step(lines + 0,
-      //                         cur_line > 1 ? lines + 2 : lines,
-      //                         lines + 1, 0, width);
-      //    irrev_vert_wvlt_step(lines + 1,
-      //                         cur_line > 2 ? lines + 3 : lines + 1,
-      //                         lines + 2, 1, width);
-      //    irrev_vert_wvlt_step(lines + 2,
-      //                         cur_line > 3 ? lines + 4 : lines + 2,
-      //                         lines + 3, 2, width);
-      //    irrev_vert_wvlt_step(lines + 3,
-      //                         cur_line > 4 ? lines + 5 : lines + 3,
-      //                         lines + 4, 3, width);
-
-      //    // push to horizontal transform lines[4](L) and lines[3] (H)
-      //    if (cur_line >= 3)
-      //    {
-      //      irrev_vert_wvlt_K(lines + 3, lines + 5,
-      //                        false, width);
-      //      irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-      //        bands[3].get_line(), width, horz_even);
-      //      bands[2].push_line();
-      //      bands[3].push_line();
-      //    }
-      //    if (cur_line >= 4)
-      //    {
-      //      irrev_vert_wvlt_K(lines + 4, lines + 5,
-      //                        true, width);
-      //      irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-      //        bands[1].get_line(), width, horz_even);
-      //      bands[1].push_line();
-      //      child_res->push_line();
-      //    }
-      //  }
-
-      //  if (cur_line >= res_rect.siz.h - 1)
-      //  { //finished, so we need to process any left line
-      //    if (cur_line)
-      //    {
-      //      if (vert_even)
-      //      {
-      //        irrev_vert_wvlt_step(lines + 1, lines + 1,
-      //                             lines, 1, width);
-      //        irrev_vert_wvlt_step(lines,
-      //                             cur_line > 1 ? lines + 2 : lines,
-      //                             lines + 1, 2, width);
-      //        irrev_vert_wvlt_step(lines + 1,
-      //                             cur_line > 2 ? lines + 3 : lines + 1,
-      //                             lines + 2, 3, width);
-      //        irrev_vert_wvlt_step(lines + 1, lines + 1,
-      //                             lines, 3, width);
-      //        //push lines[2] to L, lines[1] to H, and lines[0] to L
-      //        if (cur_line >= 2)
-      //        {
-      //          irrev_vert_wvlt_K(lines + 2, lines + 5,
-      //                            true, width);
-      //          irrev_horz_wvlt_fwd_tx(lines + 5,
-      //            child_res->get_line(), bands[1].get_line(),
-      //            width, horz_even);
-      //          bands[1].push_line();
-      //          child_res->push_line();
-      //        }
-      //        irrev_vert_wvlt_K(lines + 1, lines + 5,
-      //                          false, width);
-      //        irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-      //          bands[3].get_line(), width, horz_even);
-      //        bands[2].push_line();
-      //        bands[3].push_line();
-      //        irrev_vert_wvlt_K(lines, lines + 5,
-      //                          true, width);
-      //        irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-      //          bands[1].get_line(), width, horz_even);
-      //        bands[1].push_line();
-      //        child_res->push_line();
-      //      }
-      //      else
-      //      {
-      //        irrev_vert_wvlt_step(lines + 1, lines + 1,
-      //                             lines, 0, width);
-      //        irrev_vert_wvlt_step(lines,
-      //                             cur_line > 1 ? lines + 2 : lines,
-      //                             lines + 1, 1, width);
-      //        irrev_vert_wvlt_step(lines + 1,
-      //                             cur_line > 2 ? lines + 3 : lines + 1,
-      //                             lines + 2, 2, width);
-      //        irrev_vert_wvlt_step(lines + 2,
-      //                             cur_line > 3 ? lines + 4 : lines + 2,
-      //                             lines + 3, 3, width);
-
-      //        irrev_vert_wvlt_step(lines + 1, lines + 1,
-      //                             lines, 2, width);
-      //        irrev_vert_wvlt_step(lines,
-      //                             cur_line > 1 ? lines + 2 : lines,
-      //                             lines + 1, 3, width);
-
-      //        //push lines[3] L, lines[2] H, lines[1] L, and lines[0] H
-      //        if (cur_line >= 3)
-      //        {
-      //          irrev_vert_wvlt_K(lines + 3, lines + 5,
-      //                            true, width);
-      //          irrev_horz_wvlt_fwd_tx(lines + 5,
-      //            child_res->get_line(), bands[1].get_line(),
-      //            width, horz_even);
-      //          bands[1].push_line();
-      //          child_res->push_line();
-      //        }
-      //        if (cur_line >= 2)
-      //          irrev_vert_wvlt_K(lines + 2, lines + 5, false, width);
-      //        else
-      //          irrev_vert_wvlt_K(lines, lines + 5, false, width);
-      //        irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-      //          bands[3].get_line(), width, horz_even);
-      //        bands[2].push_line();
-      //        bands[3].push_line();
-      //        irrev_vert_wvlt_K(lines + 1, lines + 5,
-      //                          true, width);
-      //        irrev_horz_wvlt_fwd_tx(lines + 5, child_res->get_line(),
-      //          bands[1].get_line(), width, horz_even);
-      //        bands[1].push_line();
-      //        child_res->push_line();
-      //        irrev_vert_wvlt_K(lines, lines + 5,
-      //                          false, width);
-      //        irrev_horz_wvlt_fwd_tx(lines + 5, bands[2].get_line(),
-      //          bands[3].get_line(), width, horz_even);
-      //        bands[2].push_line();
-      //        bands[3].push_line();
-      //      }
-      //    }
-      //    else
-      //    { //only one line
-      //      if (vert_even)
-      //      {
-      //        //push to L
-      //        irrev_horz_wvlt_fwd_tx(lines, child_res->get_line(),
-      //          bands[1].get_line(), width, horz_even);
-      //        bands[1].push_line();
-      //        child_res->push_line();
-      //      }
-      //      else
-      //      {
-      //        float* sp = lines[0].f32;
-      //        for (ui32 i = width; i > 0; --i)
-      //          *sp++ *= 2.0f;
-      //        //push to H
-      //        irrev_horz_wvlt_fwd_tx(lines, bands[2].get_line(),
-      //          bands[3].get_line(), width, horz_even);
-      //        bands[2].push_line();
-      //        bands[3].push_line();
-      //      }
-      //    }
-      //  }
-
-      //  rotate_buffers(lines, lines + 1, lines + 2, lines + 3, lines + 4, 
-      //                 lines + 5);
-
-      //  ++cur_line;
-      //  vert_even = !vert_even;
-      //}
+      ui32 width = res_rect.siz.w;
+      if (width == 0) // zero-width resolution: nothing to do
+        return;
+      if (reversible)
+      {
+        if (res_rect.siz.h > 1)
+        {
+          if (!vert_even && cur_line < res_rect.siz.h) {
+            vert_even = !vert_even; // flip parity; no transform this call
+            return;
+          }
+
+          bool finished;
+          do // once cur_line reaches the height, loop until nothing is emitted
+          {
+            // vertical transform: run every lifting step of the kernel (atk)
+            for (ui32 i = 0; i < num_steps; ++i)
+            {
+              if (aug->active && (sig->active || ssp[i].active))
+              { // an inactive source line is replaced by the active one
+                line_buf* dp = aug->line;
+                line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                const lifting_step* s = atk->get_step(i);
+                rev_vert_ana_step(s, sp1, sp2, dp, width);
+              } // rotate: aug takes ssp[i], ssp[i] takes sig, sig takes aug
+              lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
+            }
+
+            finished = true; // cleared below whenever a row is emitted
+            if (aug->active) { // aug row is split into bands[2] and bands[3]
+              rev_horz_ana(atk, bands[2].get_line(),
+                bands[3].get_line(), aug->line, width, horz_even);
+              bands[2].push_line();
+              bands[3].push_line();
+              aug->active = false;
+              finished = false;
+            }
+            if (sig->active) { // sig row is split into child_res and bands[1]
+              rev_horz_ana(atk, child_res->get_line(),
+                bands[1].get_line(), sig->line, width, horz_even);
+              bands[1].push_line();
+              child_res->push_line();
+              sig->active = false;
+              finished = false;
+            };
+            vert_even = !vert_even;
+          } while (cur_line >= res_rect.siz.h && !finished);
+        }
+        else // height of at most one row: no vertical lifting steps
+        {
+          if (vert_even) {
+            rev_horz_ana(atk, child_res->get_line(),
+              bands[1].get_line(), sig->line, width, horz_even);
+            bands[1].push_line();
+            child_res->push_line();
+          }
+          else
+          {
+            si32* sp = aug->line->i32;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ <<= 1; // double every sample of the row
+            rev_horz_ana(atk, bands[2].get_line(),
+              bands[3].get_line(), aug->line, width, horz_even);
+            bands[2].push_line();
+            bands[3].push_line();
+          }
+        }
+      }
+      else // irreversible path: same flow on float samples, plus K scaling
+      {
+        if (res_rect.siz.h > 1)
+        {
+          if (!vert_even && cur_line < res_rect.siz.h) {
+            vert_even = !vert_even; // flip parity; no transform this call
+            return;
+          }
+
+          bool finished;
+          do // once cur_line reaches the height, loop until nothing is emitted
+          {
+            // vertical transform: run every lifting step of the kernel (atk)
+            for (ui32 i = 0; i < num_steps; ++i)
+            {
+              if (aug->active && (sig->active || ssp[i].active))
+              { // an inactive source line is replaced by the active one
+                line_buf* dp = aug->line;
+                line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                const lifting_step* s = atk->get_step(i);
+                irv_vert_ana_step(s, sp1, sp2, dp, width);
+              } // rotate: aug takes ssp[i], ssp[i] takes sig, sig takes aug
+              lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
+            }
+
+            finished = true; // cleared below whenever a row is emitted
+            if (aug->active) { // aug row: scale by K, then bands[2]/bands[3]
+              const float K = atk->get_K();
+              irv_vert_times_K(K, aug->line, width);
+
+              irv_horz_ana(atk, bands[2].get_line(),
+                bands[3].get_line(), aug->line, width, horz_even);
+              bands[2].push_line();
+              bands[3].push_line();
+              aug->active = false;
+              finished = false;
+            }
+            if (sig->active) { // sig row: scale by 1/K, then child/bands[1]
+              const float K_inv = 1.0f / atk->get_K();
+              irv_vert_times_K(K_inv, sig->line, width);
+
+              irv_horz_ana(atk, child_res->get_line(),
+                bands[1].get_line(), sig->line, width, horz_even);
+              bands[1].push_line();
+              child_res->push_line();
+              sig->active = false;
+              finished = false;
+            };
+            vert_even = !vert_even;
+          } while (cur_line >= res_rect.siz.h && !finished);
+        }
+        else // height of at most one row: no vertical lifting steps
+        {
+          if (vert_even) {
+            irv_horz_ana(atk, child_res->get_line(),
+              bands[1].get_line(), sig->line, width, horz_even);
+            bands[1].push_line();
+            child_res->push_line();
+          }
+          else
+          {
+            float* sp = aug->line->f32;
+            for (ui32 i = width; i > 0; --i)
+              *sp++ *= 2.0f; // double every sample of the row
+            irv_horz_ana(atk, bands[2].get_line(),
+              bands[3].get_line(), aug->line, width, horz_even);
+            bands[2].push_line();
+            bands[3].push_line();
+          }
+        }
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -730,8 +628,7 @@ namespace ojph {
             //vertical transform
             for (ui32 i = 0; i < num_steps; ++i)
             {
-              if (aug->active &&
-                (sig->active == true || ssp[i].active == true))
+              if (aug->active && (sig->active || ssp[i].active))
               {
                 line_buf* dp = aug->line;
                 line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
@@ -790,7 +687,7 @@ namespace ojph {
                 ++cur_line;
 
                 const float K = atk->get_K();
-                irv_vert_syn_K(K, aug->line, width);
+                irv_vert_times_K(K, aug->line, width);
 
                 continue;
               }
@@ -803,15 +700,14 @@ namespace ojph {
                 ++cur_line;
 
                 const float K_inv = 1.0f / atk->get_K();
-                irv_vert_syn_K(K_inv, sig->line, width);
+                irv_vert_times_K(K_inv, sig->line, width);
               }
             }
 
             //vertical transform
             for (ui32 i = 0; i < num_steps; ++i)
             {
-              if (aug->active &&
-                (sig->active == true || ssp[i].active == true))
+              if (aug->active && (sig->active || ssp[i].active))
               {
                 line_buf* dp = aug->line;
                 line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
diff --git a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h
index 7a7d43d5..36ae5d00 100644
--- a/src/core/codestream/ojph_resolution.h
+++ b/src/core/codestream/ojph_resolution.h
@@ -72,7 +72,7 @@ namespace ojph {
                           tile_comp *parent_tile_comp,
                           resolution *parent_res);
 
-      line_buf* get_line() { return ssp[0].line; }
+      line_buf* get_line();
       void push_line();
       line_buf* pull_line();
       rect get_rect() { return res_rect; }
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index 46231d63..4f7f8cc1 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -87,15 +87,25 @@ namespace ojph {
 
 
 
+    /////////////////////////////////////////////////////////////////////////
+    void (*rev_vert_ana_step)
+      (const lifting_step* s, const line_buf* sig, const line_buf* other,
+        const line_buf* aug, ui32 repeat) = NULL;
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*rev_horz_ana)
+      (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
+        const line_buf* src, ui32 width, bool even) = NULL;
+
     /////////////////////////////////////////////////////////////////////////
     void (*rev_vert_syn_step)
-      (const lifting_step* s, line_buf* aug, const line_buf* sig, 
-        line_buf* other, ui32 repeat) = NULL;
+      (const lifting_step* s, const line_buf* aug, const line_buf* sig,
+        const line_buf* other, ui32 repeat) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
     void (*rev_horz_syn)
-      (const param_atk* atk, line_buf* dst, line_buf* lsrc,
-        line_buf* hsrc, ui32 width, bool even) = NULL;
+      (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
+        const line_buf* hsrc, ui32 width, bool even) = NULL;
 
 
 
@@ -130,17 +140,28 @@ namespace ojph {
 
 
     /////////////////////////////////////////////////////////////////////////
-    void (*irv_vert_syn_step)
-      (const lifting_step* s, line_buf* aug, const line_buf* sig,
-        line_buf* other, ui32 repeat) = NULL;
+    void (*irv_vert_ana_step)
+      (const lifting_step* s, const line_buf* sig, const line_buf* other,
+        const line_buf* aug, ui32 repeat) = NULL;
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*irv_horz_ana)
+      (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
+        const line_buf* src, ui32 width, bool even) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
-    void (*irv_vert_syn_K)(const float K, line_buf* aug, ui32 repeat) = NULL;
+    void (*irv_vert_syn_step)
+      (const lifting_step* s, const line_buf* aug, const line_buf* sig,
+        const line_buf* other, ui32 repeat) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
     void (*irv_horz_syn)
-      (const param_atk* atk, line_buf* dst, line_buf* lsrc,
-        line_buf* hsrc, ui32 width, bool even) = NULL;
+      (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
+        const line_buf* hsrc, ui32 width, bool even) = NULL;
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*irv_vert_times_K)
+      (float K, const line_buf* aug, ui32 repeat) = NULL;
 
 
 
@@ -164,6 +185,8 @@ namespace ojph {
       rev_vert_wvlt_bwd_update  = gen_rev_vert_wvlt_bwd_update;
       rev_horz_wvlt_bwd_tx      = gen_rev_horz_wvlt_bwd_tx;
 
+      rev_vert_ana_step         = gen_rev_vert_ana_step;
+      rev_horz_ana              = gen_rev_horz_ana;
       rev_vert_syn_step         = gen_rev_vert_syn_step;
       rev_horz_syn              = gen_rev_horz_syn;
 
@@ -172,9 +195,11 @@ namespace ojph {
       irrev_horz_wvlt_fwd_tx    = gen_irrev_horz_wvlt_fwd_tx;
       irrev_horz_wvlt_bwd_tx    = gen_irrev_horz_wvlt_bwd_tx;
 
+      irv_vert_ana_step         = gen_irv_vert_ana_step;
+      irv_horz_ana              = gen_irv_horz_ana;      
       irv_vert_syn_step         = gen_irv_vert_syn_step;
-      irv_vert_syn_K            = gen_irv_vert_syn_K;
       irv_horz_syn              = gen_irv_horz_syn;
+      irv_vert_times_K          = gen_irv_vert_times_K;
 
 #ifndef OJPH_DISABLE_INTEL_SIMD
       int level = get_cpu_ext_level();
@@ -378,9 +403,92 @@ namespace ojph {
 
 
 
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_vert_ana_step(const lifting_step* s, const line_buf* sig,
+                               const line_buf* other, const line_buf* aug,
+                               ui32 repeat)
+    { // one integer lifting step: aug is updated in place from sig + other
+      const si32 coef = s->rev.Aatk;
+      const si32 offset = s->rev.Batk;
+      const ui32 shift = s->rev.Eatk;
+      const si32* in1 = sig->i32;
+      const si32* in2 = other->i32;
+      si32* out = aug->i32;
+      if (coef < 0)
+        for (ui32 k = 0; k < repeat; ++k)
+          out[k] -= (offset - coef * (in1[k] + in2[k])) >> shift;
+      else
+        for (ui32 k = 0; k < repeat; ++k)
+          out[k] += (offset + coef * (in1[k] + in2[k])) >> shift;
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even)
+    { // horizontal analysis of one si32 line: src is split into ldst/hdst
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        si32* dph = hdst->i32;
+        si32* dpl = ldst->i32;
+        si32* sp = src->i32;
+        ui32 w = width;
+        if (!even) // the first sample goes to hdst when even is false
+        {
+          *dph++ = *sp++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dpl++ = *sp++; *dph++ = *sp++;
+        }
+        if (w) // one trailing sample left over: it goes to ldst
+        {
+          *dpl++ = *sp++; --w;
+        }
+
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          // parameters of lifting step j
+          const lifting_step* s = atk->get_step(j);
+          si32 a = s->rev.Aatk;
+          si32 b = s->rev.Batk;
+          ui32 e = s->rev.Eatk;
+
+          // extension: writes lp[-1] and lp[l_width] (assumes buffer padding)
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step: update hp in place from adjacent pairs of lp
+          const si32* sp = lp + (even ? 1 : 0);
+          si32* dp = hp;
+          if (a >= 0)
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          else
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp -= (b - a * (sp[-1] + sp[0])) >> e;
+
+          // swap roles so that the next step updates the other buffer
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else { // at most one sample: copy it (even) or double it (odd)
+        if (even)
+          ldst->i32[0] = src->i32[0];
+        else
+          hdst->i32[0] = src->i32[0] << 1;
+      }
+    }
+    
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_syn_step(const lifting_step* s, line_buf* aug, 
-                               const line_buf* sig, line_buf* other, 
+    void gen_rev_vert_syn_step(const lifting_step* s, const line_buf* aug, 
+                               const line_buf* sig, const line_buf* other, 
                                ui32 repeat)
     {
       si32 a = s->rev.Aatk;
@@ -398,8 +506,9 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_syn(const param_atk *atk, line_buf* dst, line_buf *lsrc,
-                          line_buf *hsrc, ui32 width, bool even)
+    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even)
     {
       if (width > 1)
       {
@@ -643,8 +752,8 @@ namespace ojph {
 
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_syn_step(const lifting_step* s, line_buf* aug,
-                               const line_buf* sig, line_buf* other,
+    void gen_irv_vert_ana_step(const lifting_step* s, const line_buf* sig,
+                               const line_buf* other, const line_buf* aug,
                                ui32 repeat)
     {
       float a = s->irv.Aatk;
@@ -652,20 +761,100 @@ namespace ojph {
       float* dst = aug->f32;
       const float* src1 = sig->f32, * src2 = other->f32;
       for (ui32 i = repeat; i > 0; --i)
-        *dst++ -= a * (*src1++ + *src2++);
+        *dst++ += a * (*src1++ + *src2++);
     }
+    
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even)
+    { // horizontal analysis of one float line: src is split into ldst/hdst
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        float* dph = hdst->f32;
+        float* dpl = ldst->f32;
+        float* sp = src->f32;
+        ui32 w = width;
+        if (!even) // the first sample goes to hdst when even is false
+        {
+          *dph++ = *sp++; --w;
+        }
+        for (; w > 1; w -= 2)
+        {
+          *dpl++ = *sp++; *dph++ = *sp++;
+        }
+        if (w) // one trailing sample left over: it goes to ldst
+        {
+          *dpl++ = *sp++; --w;
+        }
+
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          // coefficient of lifting step j
+          const lifting_step* s = atk->get_step(j);
+          float a = s->irv.Aatk;
+
+          // extension: writes lp[-1] and lp[l_width] (assumes buffer padding)
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step: update hp in place from adjacent pairs of lp
+          const float* sp = lp + (even ? 1 : 0);
+          float* dp = hp;
+          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+            *dp += a * (sp[-1] + sp[0]);
+
+          // swap roles so that the next step updates the other buffer
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+
+        { // NOTE(review): lp/hp are swapped vs ldst/hdst if num_steps is odd
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          float* dp;
+
+          dp = lp; // samples reached through lp are scaled by 1/K
+          for (ui32 i = l_width; i > 0; --i)
+            *dp++ *= K_inv;
 
+          dp = hp; // samples reached through hp are scaled by K
+          for (ui32 i = h_width; i > 0; --i)
+            *dp++ *= K;
+        }
+      }
+      else { // at most one sample: copy it (even) or double it (odd)
+        if (even)
+          ldst->f32[0] = src->f32[0];
+        else
+          hdst->f32[0] = src->f32[0] * 2.0f;
+      }
+
+
+    }
+    
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_syn_K(const float K, line_buf* aug, ui32 repeat)
+    void gen_irv_vert_syn_step(const lifting_step* s, const line_buf* aug, 
+                               const line_buf* sig, const line_buf* other, 
+                               ui32 repeat)
     {
+      float a = s->irv.Aatk;
+
       float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
       for (ui32 i = repeat; i > 0; --i)
-        *dst++ *= K;
+        *dst++ -= a * (*src1++ + *src2++);
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_horz_syn(const param_atk* atk, line_buf* dst, line_buf* lsrc,
-                          line_buf* hsrc, ui32 width, bool even)
+    void gen_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even)
     {
       if (width > 1)
       {
@@ -691,7 +880,6 @@ namespace ojph {
         ui32 num_steps = atk->get_num_steps();
         for (ui32 j = num_steps; j > 0; --j)
         {
-          // first lifting step
           const lifting_step* s = atk->get_step(j - 1);
           float a = s->irv.Aatk;
 
@@ -730,7 +918,13 @@ namespace ojph {
       }
     }
 
-
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    {
+      float* dst = aug->f32;
+      for (ui32 i = repeat; i > 0; --i)
+        *dst++ *= K;
+    }
 
 
 
diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h
index 77ede96f..b31df0ef 100644
--- a/src/core/transform/ojph_transform.h
+++ b/src/core/transform/ojph_transform.h
@@ -85,16 +85,25 @@ namespace ojph {
 
 
 
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*rev_vert_ana_step)
+      (const lifting_step* s, const line_buf* sig, const line_buf* other,
+        const line_buf* aug, ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*rev_horz_ana)
+      (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
+        const line_buf* src, ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
     extern void (*rev_vert_syn_step)
-      (const lifting_step* s, line_buf* aug, const line_buf* sig, 
-        line_buf* other, ui32 repeat);
+      (const lifting_step* s, const line_buf* aug, const line_buf* sig,
+        const line_buf* other, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
     extern void (*rev_horz_syn)
-      (const param_atk* atk, line_buf* dst, line_buf* lsrc,
-        line_buf* hsrc, ui32 width, bool even);
+      (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
+        const line_buf* hsrc, ui32 width, bool even);
 
 
 
@@ -126,20 +135,30 @@ namespace ojph {
 
 
 
+
     /////////////////////////////////////////////////////////////////////////
-    extern void (*irv_vert_syn_step)
-      (const lifting_step* s, line_buf* aug, const line_buf* sig, 
-        line_buf* other, ui32 repeat);
+    extern void (*irv_vert_ana_step)
+      (const lifting_step* s, const line_buf* sig, const line_buf* other, 
+        const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*irv_vert_syn_K)
-      (const float K, line_buf* aug, ui32 repeat);
+    extern void (*irv_horz_ana)
+      (const param_atk* atk, const line_buf* ldst, const line_buf* hdst, 
+        const line_buf* src, ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*irv_vert_syn_step)
+      (const lifting_step* s, const line_buf* aug, const line_buf* sig, 
+        const line_buf* other, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
     extern void (*irv_horz_syn)
-      (const param_atk* atk, line_buf* dst, line_buf* lsrc,
-        line_buf* hsrc, ui32 width, bool even);
+      (const param_atk* atk, const line_buf* dst, const line_buf* lsrc, 
+        const line_buf* hsrc, ui32 width, bool even);
 
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*irv_vert_times_K)
+      (float K, const line_buf* aug, ui32 repeat);
 
 
 
diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h
index 42cec378..c484d279 100644
--- a/src/core/transform/ojph_transform_local.h
+++ b/src/core/transform/ojph_transform_local.h
@@ -99,13 +99,24 @@ namespace ojph {
 
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_syn_step(const lifting_step* s, line_buf* aug,
-                               const line_buf* sig, line_buf* other,
+    void gen_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                               const line_buf* other, const line_buf* aug, 
                                ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_syn(const param_atk *atk, line_buf* dst, line_buf *lsrc,
-                          line_buf *hsrc, ui32 width, bool even);
+    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_vert_syn_step(const lifting_step* s, const line_buf* aug, 
+                               const line_buf* sig, const line_buf* other, 
+                               ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even);
 
 
 
@@ -134,18 +145,28 @@ namespace ojph {
 
 
 
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                               const line_buf* other, const line_buf* aug, 
+                               ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_syn_step(const lifting_step* s, line_buf* aug,
-                               const line_buf* sig, line_buf* other,
+    void gen_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                               const line_buf* sig, const line_buf* other,
                                ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_syn_K(const float K, line_buf* aug, ui32 repeat);
+    void gen_irv_horz_syn(const param_atk *atk, const line_buf* dst, 
+                          const line_buf *lsrc, const line_buf *hsrc, 
+                          ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_horz_syn(const param_atk *atk, line_buf* dst, line_buf *lsrc,
-                          line_buf *hsrc, ui32 width, bool even);
+    void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
 
 

From 12c3bf57624704daf3493e95eac40ae2327c3137 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Thu, 4 Apr 2024 16:04:52 +1100
Subject: [PATCH 13/37] Fixed 97 analysis.

---
 src/core/transform/ojph_transform.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index 4f7f8cc1..028ac013 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -752,8 +752,8 @@ namespace ojph {
 
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_ana_step(const lifting_step* s, const line_buf* aug, 
-                               const line_buf* sig, const line_buf* other, 
+    void gen_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                               const line_buf* other, const line_buf* aug, 
                                ui32 repeat)
     {
       float a = s->irv.Aatk;

From 4dc10b6abb0e0379ef24e5d79a2e3d598a2fe2b3 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Fri, 5 Apr 2024 06:54:32 +1100
Subject: [PATCH 14/37] A bug fix.  Still buggy.

---
 src/core/codestream/ojph_codeblock.cpp  | 12 +++--
 src/core/codestream/ojph_resolution.cpp | 23 ++++----
 src/core/codestream/ojph_resolution.h   |  1 +
 tests/test_executables.cpp              | 70 ++++++++++++++++++++++++-
 tests/test_helpers/ht_cmdlines.txt      | 48 +++++++++--------
 5 files changed, 116 insertions(+), 38 deletions(-)

diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp
index a95cbef5..25bdc2ae 100644
--- a/src/core/codestream/ojph_codeblock.cpp
+++ b/src/core/codestream/ojph_codeblock.cpp
@@ -150,12 +150,14 @@ namespace ojph {
             cb_size.w, cb_size.h, stride, stripe_causal);
 
         if (result == false)
-          {
-            if (resilient == true)
-              zero_block = true;
-            else
-              OJPH_ERROR(0x000300A1, "Error decoding a codeblock\n");
+        {
+          if (resilient == true) {
+            OJPH_INFO(0x000300A1, "Error decoding a codeblock\n");
+            zero_block = true;
           }
+          else
+            OJPH_ERROR(0x000300A1, "Error decoding a codeblock\n");
+        }
       }
       else
         zero_block = true;
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index ff148400..c4507707 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -361,6 +361,7 @@ namespace ojph {
         aug->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
 
         cur_line = 0;
+        rows_to_produce = res_rect.siz.h;
         vert_even = (res_rect.org.y & 1) == 0;
         horz_even = (res_rect.org.x & 1) == 0;
       }
@@ -406,7 +407,6 @@ namespace ojph {
             return;
           }
 
-          bool finished;
           do
           {
             //vertical transform
@@ -423,14 +423,13 @@ namespace ojph {
               lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
             }
 
-            finished = true;
             if (aug->active) {
               rev_horz_ana(atk, bands[2].get_line(),
                 bands[3].get_line(), aug->line, width, horz_even);
               bands[2].push_line();
               bands[3].push_line();
               aug->active = false;
-              finished = false;
+              --rows_to_produce;
             }
             if (sig->active) {
               rev_horz_ana(atk, child_res->get_line(),
@@ -438,14 +437,15 @@ namespace ojph {
               bands[1].push_line();
               child_res->push_line();
               sig->active = false;
-              finished = false;
+              --rows_to_produce;
             };
             vert_even = !vert_even;
-          } while (cur_line >= res_rect.siz.h && !finished);
+          } while (cur_line >= res_rect.siz.h && rows_to_produce > 0);
         }
         else
         {
           if (vert_even) {
+            // horizontal transform
             rev_horz_ana(atk, child_res->get_line(),
               bands[1].get_line(), sig->line, width, horz_even);
             bands[1].push_line();
@@ -453,9 +453,11 @@ namespace ojph {
           }
           else
           {
+            // vertical transform
             si32* sp = aug->line->i32;
             for (ui32 i = width; i > 0; --i)
               *sp++ <<= 1;
+            // horizontal transform
             rev_horz_ana(atk, bands[2].get_line(),
               bands[3].get_line(), aug->line, width, horz_even);
             bands[2].push_line();
@@ -472,7 +474,6 @@ namespace ojph {
             return;
           }
 
-          bool finished;
           do
           {
             //vertical transform
@@ -489,7 +490,6 @@ namespace ojph {
               lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
             }
 
-            finished = true;
             if (aug->active) {
               const float K = atk->get_K();
               irv_vert_times_K(K, aug->line, width);
@@ -499,7 +499,7 @@ namespace ojph {
               bands[2].push_line();
               bands[3].push_line();
               aug->active = false;
-              finished = false;
+              --rows_to_produce;
             }
             if (sig->active) {
               const float K_inv = 1.0f / atk->get_K();
@@ -510,14 +510,15 @@ namespace ojph {
               bands[1].push_line();
               child_res->push_line();
               sig->active = false;
-              finished = false;
+              --rows_to_produce;
             };
             vert_even = !vert_even;
-          } while (cur_line >= res_rect.siz.h && !finished);
+          } while (cur_line >= res_rect.siz.h && rows_to_produce > 0);
         }
         else
         {
           if (vert_even) {
+            // horizontal transform
             irv_horz_ana(atk, child_res->get_line(),
               bands[1].get_line(), sig->line, width, horz_even);
             bands[1].push_line();
@@ -525,9 +526,11 @@ namespace ojph {
           }
           else
           {
+            // vertical transform
             float* sp = aug->line->f32;
             for (ui32 i = width; i > 0; --i)
               *sp++ *= 2.0f;
+            // horizontal transform
             irv_horz_ana(atk, bands[2].get_line(),
               bands[3].get_line(), aug->line, width, horz_even);
             bands[2].push_line();
diff --git a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h
index 36ae5d00..72e0b91a 100644
--- a/src/core/codestream/ojph_resolution.h
+++ b/src/core/codestream/ojph_resolution.h
@@ -116,6 +116,7 @@ namespace ojph {
       param_dfs::dfs_dwt_type downsampling_style;
       //wavelet machinery
       ui32 cur_line;
+      ui32 rows_to_produce;
       bool vert_even, horz_even;
       mem_elastic_allocator *elastic;
     };
diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 7e6a00cb..4c3a12b9 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -1015,6 +1015,40 @@ TEST(TestExecutables, SimpleEncIrv9732x128) {
               "Malamute.ppm", "", 3, mse, pae);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_compress with 33x33 tiles and 5 decompositions (irv97 wavelet).
+// We test by comparing MSE and PAE of decoded images. 
+// The compressed file is obtained using these command-line options:
+// -o simple_enc_irv97_64x64_tiles_33x33_d5.j2c -qstep 0.01 -tile_size {33,33}
+// -num_decomps 5
+TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D5) {
+  double mse[3] = { 46.2004, 43.622, 56.7452};
+  int pae[3] = { 48, 46, 52};
+  run_ojph_compress("Malamute.ppm",
+                    "simple_enc_irv97_64x64_tiles_33x33_d5", "", "j2c",
+                    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 5");
+  run_ojph_compress_expand("simple_enc_irv97_64x64_tiles_33x33_d5", "j2c", "ppm");
+  run_mse_pae("simple_enc_irv97_64x64_tiles_33x33_d5", "ppm",
+              "Malamute.ppm", "", 3, mse, pae);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_compress with 33x33 tiles and 6 decompositions (irv97 wavelet).
+// We test by comparing MSE and PAE of decoded images. 
+// The compressed file is obtained using these command-line options:
+// -o simple_enc_irv97_64x64_tiles_33x33_d6.j2c -qstep 0.01 -tile_size {33,33}
+// -num_decomps 6
+TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D6) {
+  double mse[3] = { 46.2004, 43.622, 56.7452};
+  int pae[3] = { 48, 46, 52};
+  run_ojph_compress("Malamute.ppm",
+                    "simple_enc_irv97_64x64_tiles_33x33_d6", "", "j2c",
+                    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 6");
+  run_ojph_compress_expand("simple_enc_irv97_64x64_tiles_33x33_d6", "j2c", "ppm");
+  run_mse_pae("simple_enc_irv97_64x64_tiles_33x33_d6", "ppm",
+              "Malamute.ppm", "", 3, mse, pae);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Test ojph_compress with codeblocks when the irv97 wavelet is used.
 // We test by comparing MSE and PAE of decoded images. 
@@ -1159,6 +1193,40 @@ TEST(TestExecutables, SimpleEncRev534x1024) {
               "Malamute.ppm", "", 3, mse, pae);
 }
 
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_compress with tiling and 5 decompositions (rev53 wavelet).
+// We test by comparing MSE and PAE of decoded images. 
+// The compressed file is obtained using these command-line options:
+// -o simple_enc_rev53_64x64_tiles_33x33_d5.j2c -reversible true -tile_size
+// {32,32} -num_decomps 5
+TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D5) {
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
+  run_ojph_compress("Malamute.ppm",
+                    "simple_enc_rev53_64x64_tiles_33x33_d5", "", "j2c",
+                    "-reversible true -tile_size \"{32,32}\" -num_decomps 5");
+  run_ojph_compress_expand("simple_enc_rev53_64x64_tiles_33x33_d5", "j2c", "ppm");
+  run_mse_pae("simple_enc_rev53_64x64_tiles_33x33_d5", "ppm",
+              "Malamute.ppm", "", 3, mse, pae);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_compress with tiling and 6 decompositions (rev53 wavelet).
+// We test by comparing MSE and PAE of decoded images. 
+// The compressed file is obtained using these command-line options:
+// -o simple_enc_rev53_64x64_tiles_33x33_d6.j2c -reversible true -tile_size
+// {32,32} -num_decomps 6
+TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D6) {
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
+  run_ojph_compress("Malamute.ppm",
+                    "simple_enc_rev53_64x64_tiles_33x33_d6", "", "j2c",
+                    "-reversible true -tile_size \"{32,32}\" -num_decomps 6");
+  run_ojph_compress_expand("simple_enc_rev53_64x64_tiles_33x33_d6", "j2c", "ppm");
+  run_mse_pae("simple_enc_rev53_64x64_tiles_33x33_d6", "ppm",
+              "Malamute.ppm", "", 3, mse, pae);
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Test ojph_compress with codeblocks when the irv97 wavelet is used.
 // We test by comparing MSE and PAE of decoded images. 
@@ -1220,7 +1288,7 @@ TEST(TestExecutables, SimpleEncIrv97TallNarrow) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_tall_narrow1.j2c -image_offset {1,0} -qstep 0.1
 TEST(TestExecutables, SimpleEncIrv97TallNarrow1) {
-  double mse[3] = { 100.905762, 76.113037, 72.834717};
+  double mse[3] = { 100.906, 76.113, 72.8347};
   int pae[3] = { 39, 35, 34};
   run_ojph_compress("tall_narrow.ppm",
                     "simple_enc_irv97_tall_narrow1", "", "j2c",
diff --git a/tests/test_helpers/ht_cmdlines.txt b/tests/test_helpers/ht_cmdlines.txt
index 55b8e865..c8590611 100644
--- a/tests/test_helpers/ht_cmdlines.txt
+++ b/tests/test_helpers/ht_cmdlines.txt
@@ -57,28 +57,32 @@ add_test(NAME simple_dec_rev53_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_D
 # Encoding
 #############################################################
 
-add_test(NAME simple_enc_irv97_64x64  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_64x64.j2c            -qstep 0.1"                         "-i simple_enc_irv97_64x64.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_64x64.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_32x32.j2c            -qstep 0.01 -block_size \{32,32\}"  "-i simple_enc_irv97_32x32.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_32x32.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_16x16  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_16x16.j2c            -qstep 0.01 -block_size \{16,16\}"  "-i simple_enc_irv97_16x16.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_16x16.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_4x4.j2c              -qstep 0.01 -block_size \{4,4\}"    "-i simple_enc_irv97_4x4.j2c              -o test1.ppm -precise -quiet" "-i simple_enc_irv97_4x4.j2c              -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_1024x4.j2c           -qstep 0.01 -block_size \{4,1024\}" "-i simple_enc_irv97_1024x4.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_1024x4.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_4x1024.j2c           -qstep 0.01 -block_size \{1024,4\}" "-i simple_enc_irv97_4x1024.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_4x1024.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_512x8  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_512x8.j2c            -qstep 0.01 -block_size \{8,512\}"  "-i simple_enc_irv97_512x8.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_512x8.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_8x512  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_8x512.j2c            -qstep 0.01 -block_size \{512,8\}"  "-i simple_enc_irv97_8x512.j2c            -o test1.ppm -precise -quiet" "-i simple_enc_irv97_8x512.j2c            -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_256x16 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_256x16.j2c           -qstep 0.01 -block_size \{16,256\}" "-i simple_enc_irv97_256x16.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_256x16.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_16x256 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_16x256.j2c           -qstep 0.01 -block_size \{256,16\}" "-i simple_enc_irv97_16x256.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_16x256.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_128x32 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_128x32.j2c           -qstep 0.01 -block_size \{32,128\}" "-i simple_enc_irv97_128x32.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_128x32.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_32x128 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc             "-i ${images_folder}/mm.ppm -o simple_enc_irv97_32x128.j2c           -qstep 0.01 -block_size \{128,32\}" "-i simple_enc_irv97_32x128.j2c           -o test1.ppm -precise -quiet" "-i simple_enc_irv97_32x128.j2c           -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_64x64_16bit COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc        "-i ${images_folder}/mm.ppm  -o simple_enc_irv97_64x64_16bit.j2c      -qstep 0.01"                        "-i simple_enc_irv97_64x64_16bit.j2c      -o test1.ppm -precise -quiet" "-i simple_enc_irv97_64x64_16bit.j2c      -o test2.ppm" "${images_folder}/mm.ppm"  "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_irv97_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc   "-i ${images_folder}/mm.pgm  -o simple_enc_irv97_64x64_16bit_gray.j2c -qstep 0.01"                        "-i simple_enc_irv97_64x64_16bit_gray.j2c -o test1.pgm -precise -quiet" "-i simple_enc_irv97_64x64_16bit_gray.j2c -o test2.pgm" "${images_folder}/mm.pgm"  "test1.pgm" "test2.pgm")
-add_test(NAME simple_enc_rev53_64x64_16bit  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc      "-i ${images_folder}/mm.ppm  -o simple_enc_rev53_64x64_16bit.j2c      -reversible true"                   "-i simple_enc_rev53_64x64_16bit.j2c      -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64_16bit.j2c      -o test2.ppm" "${images_folder}/mm.ppm"  "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_64x64_16bit_gray  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.pgm  -o simple_enc_rev53_64x64_16bit_gray.j2c -reversible true"                   "-i simple_enc_rev53_64x64_16bit_gray.j2c -o test1.pgm -precise -quiet" "-i simple_enc_rev53_64x64_16bit_gray.j2c -o test2.pgm" "${images_folder}/mm.pgm"  "test1.pgm" "test2.pgm")
-
-add_test(NAME simple_enc_rev53_64x64  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64.j2c  -reversible true"                        "-i simple_enc_rev53_64x64.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_32x32.j2c  -reversible true -block_size \{32,32\}"  "-i simple_enc_rev53_32x32.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_32x32.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x4.j2c    -reversible true -block_size \{4,4\}"    "-i simple_enc_rev53_4x4.j2c    -o test1.ppm -precise -quiet" "-i simple_enc_rev53_4x4.j2c    -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_1024x4.j2c -reversible true -block_size \{4,1024\}" "-i simple_enc_rev53_1024x4.j2c -o test1.ppm -precise -quiet" "-i simple_enc_rev53_1024x4.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x1024.j2c -reversible true -block_size \{1024,4\}" "-i simple_enc_rev53_4x1024.j2c -o test1.ppm -precise -quiet" "-i simple_enc_rev53_4x1024.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_64x64.j2c             -qstep 0.1"                                           "-i simple_enc_irv97_64x64.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_64x64.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_32x32.j2c             -qstep 0.01 -block_size \{32,32\}"                    "-i simple_enc_irv97_32x32.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_32x32.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_16x16  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_16x16.j2c             -qstep 0.01 -block_size \{16,16\}"                    "-i simple_enc_irv97_16x16.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_16x16.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_4x4.j2c               -qstep 0.01 -block_size \{4,4\}"                      "-i simple_enc_irv97_4x4.j2c               -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_4x4.j2c                  -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_1024x4.j2c            -qstep 0.01 -block_size \{4,1024\}"                   "-i simple_enc_irv97_1024x4.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_1024x4.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_4x1024.j2c            -qstep 0.01 -block_size \{1024,4\}"                   "-i simple_enc_irv97_4x1024.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_4x1024.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_512x8  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_512x8.j2c             -qstep 0.01 -block_size \{8,512\}"                    "-i simple_enc_irv97_512x8.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_512x8.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_8x512  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_8x512.j2c             -qstep 0.01 -block_size \{512,8\}"                    "-i simple_enc_irv97_8x512.j2c             -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_8x512.j2c                -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_256x16 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_256x16.j2c            -qstep 0.01 -block_size \{16,256\}"                   "-i simple_enc_irv97_256x16.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_256x16.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_16x256 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_16x256.j2c            -qstep 0.01 -block_size \{256,16\}"                   "-i simple_enc_irv97_16x256.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_16x256.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_128x32 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_128x32.j2c            -qstep 0.01 -block_size \{32,128\}"                   "-i simple_enc_irv97_128x32.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_128x32.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_32x128 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc                 "-i ${images_folder}/mm.ppm -o simple_enc_irv97_32x128.j2c            -qstep 0.01 -block_size \{128,32\}"                   "-i simple_enc_irv97_32x128.j2c            -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_32x128.j2c               -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64_tiles_33x33_d5 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc   "-i ${images_folder}/mm.ppm -o simple_enc_irv97_64x64_tiles_33x33_d5.j2c -qstep 0.01 -tile_size \{33,33\} -num_decomps 5"   "-i simple_enc_irv97_64x64_tiles_33x33_d5.j2c -o test1.ppm -precise -quiet" "-i simple_enc_irv97_64x64_tiles_33x33_d5.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64_tiles_33x33_d6 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc   "-i ${images_folder}/mm.ppm -o simple_enc_irv97_64x64_tiles_33x33_d6.j2c -qstep 0.01 -tile_size \{33,33\} -num_decomps 6"   "-i simple_enc_irv97_64x64_tiles_33x33_d6.j2c -o test1.ppm -precise -quiet" "-i simple_enc_irv97_64x64_tiles_33x33_d6.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64_16bit COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc            "-i ${images_folder}/mm.ppm  -o simple_enc_irv97_64x64_16bit.j2c      -qstep 0.01"                                          "-i simple_enc_irv97_64x64_16bit.j2c       -o test1.ppm -precise -quiet"    "-i simple_enc_irv97_64x64_16bit.j2c          -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_irv97_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -enc       "-i ${images_folder}/mm.pgm  -o simple_enc_irv97_64x64_16bit_gray.j2c -qstep 0.01"                                          "-i simple_enc_irv97_64x64_16bit_gray.j2c  -o test1.pgm -precise -quiet"    "-i simple_enc_irv97_64x64_16bit_gray.j2c     -o test2.pgm"    "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
+add_test(NAME simple_enc_rev53_64x64_16bit  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc          "-i ${images_folder}/mm.ppm  -o simple_enc_rev53_64x64_16bit.j2c      -reversible true"                                     "-i simple_enc_rev53_64x64_16bit.j2c       -o test1.ppm -precise -quiet"    "-i simple_enc_rev53_64x64_16bit.j2c          -o test2.ppm"    "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_64x64_16bit_gray  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc     "-i ${images_folder}/mm.pgm  -o simple_enc_rev53_64x64_16bit_gray.j2c -reversible true"                                     "-i simple_enc_rev53_64x64_16bit_gray.j2c  -o test1.pgm -precise -quiet"    "-i simple_enc_rev53_64x64_16bit_gray.j2c     -o test2.pgm"    "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
+
+add_test(NAME simple_enc_rev53_64x64  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64.j2c  -reversible true"                                                 "-i simple_enc_rev53_64x64.j2c  -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_64x64.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_32x32.j2c  -reversible true -block_size \{32,32\}"                           "-i simple_enc_rev53_32x32.j2c  -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_32x32.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x4.j2c    -reversible true -block_size \{4,4\}"                             "-i simple_enc_rev53_4x4.j2c    -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_4x4.j2c    -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_1024x4.j2c -reversible true -block_size \{4,1024\}"                          "-i simple_enc_rev53_1024x4.j2c -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_1024x4.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x1024.j2c -reversible true -block_size \{1024,4\}"                          "-i simple_enc_rev53_4x1024.j2c -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_4x1024.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_64x64_tiles_33x33_d5 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64_tiles_33x33_d5.j2c -reversible true -tile_size \{32,32\} -num_decomps 5" "-i simple_enc_rev53_64x64_tiles_33x33_d5.j2c -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64_tiles_33x33_d5.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_64x64_tiles_33x33_d6 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64_tiles_33x33_d6.j2c -reversible true -tile_size \{32,32\} -num_decomps 6" "-i simple_enc_rev53_64x64_tiles_33x33_d6.j2c -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64_tiles_33x33_d6.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 
 add_test(NAME simple_enc_irv97_64x64_yuv COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom_yuv.sh -enc  "-i ${images_folder}/foreman_420.yuv -o simple_enc_irv97_64x64_yuv.j2c -qstep 0.1 -dims \{352,288\} -num_comps 3 -downsamp \{1,1\},\{2,2\},\{2,2\} -bit_depth 8,8,8 -signed false,false,false"                   "-i simple_enc_irv97_64x64_yuv.j2c -o test1y.rawl,test1u.rawl,test1v.rawl -precise -quiet" "-i simple_enc_irv97_64x64_yuv.j2c -o test2.yuv" "${images_folder}/foreman_420.yuv:352x288x8x420" "test1.yuv:352x288x8x420" "test2.yuv:352x288x8x420")
 add_test(NAME simple_enc_rev53_64x64_yuv COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom_yuv.sh -renc "-i ${images_folder}/foreman_420.yuv -o simple_enc_rev53_64x64_yuv.j2c -reversible true -qstep 0.1 -dims \{352,288\} -num_comps 3 -downsamp \{1,1\},\{2,2\},\{2,2\} -bit_depth 8,8,8 -signed false,false,false"  "-i simple_enc_rev53_64x64_yuv.j2c -o test1y.rawl,test1u.rawl,test1v.rawl -precise -quiet" "-i simple_enc_rev53_64x64_yuv.j2c -o test2.yuv" "${images_folder}/foreman_420.yuv:352x288x8x420" "test1.yuv:352x288x8x420" "test2.yuv:352x288x8x420")

From 9846f01b5d40a38116cd871754678a110d9d837b Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Fri, 5 Apr 2024 14:07:32 +1100
Subject: [PATCH 15/37] Small touchup for error messages

---
 src/core/codestream/ojph_codeblock.cpp       |  4 ++--
 src/core/coding/ojph_block_decoder.cpp       | 12 ++++++------
 src/core/coding/ojph_block_decoder_ssse3.cpp | 12 ++++++------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/core/codestream/ojph_codeblock.cpp b/src/core/codestream/ojph_codeblock.cpp
index 25bdc2ae..9a63ca19 100644
--- a/src/core/codestream/ojph_codeblock.cpp
+++ b/src/core/codestream/ojph_codeblock.cpp
@@ -152,11 +152,11 @@ namespace ojph {
         if (result == false)
         {
           if (resilient == true) {
-            OJPH_INFO(0x000300A1, "Error decoding a codeblock\n");
+            OJPH_INFO(0x000300A1, "Error decoding a codeblock");
             zero_block = true;
           }
           else
-            OJPH_ERROR(0x000300A1, "Error decoding a codeblock\n");
+            OJPH_ERROR(0x000300A1, "Error decoding a codeblock");
         }
       }
       else
diff --git a/src/core/coding/ojph_block_decoder.cpp b/src/core/coding/ojph_block_decoder.cpp
index 9a121876..5be5430a 100644
--- a/src/core/coding/ojph_block_decoder.cpp
+++ b/src/core/coding/ojph_block_decoder.cpp
@@ -753,14 +753,14 @@ namespace ojph {
       {
         OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
                               "one coding pass, but zero length for "
-                              "2nd and potential 3rd pass.\n");
+                              "2nd and potential 3rd pass");
         num_passes = 1;
       }
 
       if (num_passes > 3)
       {
         OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
-                              "This codeblocks has %d passes.\n",
+                              "This codeblocks has %d passes",
                               num_passes);
         return false;
       }
@@ -772,7 +772,7 @@ namespace ojph {
           insufficient_precision = true;
           OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
                                 "codeblock. This message will not be "
-                                "displayed again.\n");
+                                "displayed again");
         }
         return false;
       }       
@@ -783,7 +783,7 @@ namespace ojph {
           OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
                                 "pass. The code can be modified to support "
                                 "this case. This message will not be "
-                                "displayed again.\n");
+                                "displayed again");
         }
          return false;         // 32 bits are not enough to decode this
        }
@@ -796,7 +796,7 @@ namespace ojph {
             OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
                                   "nor MagRef passes; both will be skipped. "
                                   "This message will not be displayed "
-                                  "again.\n");
+                                  "again");
           }
         }
       }
@@ -806,7 +806,7 @@ namespace ojph {
 
       if (lengths1 < 2)
       {
-        OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
+        OJPH_WARN(0x00010006, "Wrong codeblock length");
         return false;
       }
 
diff --git a/src/core/coding/ojph_block_decoder_ssse3.cpp b/src/core/coding/ojph_block_decoder_ssse3.cpp
index a8f89138..99ae38cb 100644
--- a/src/core/coding/ojph_block_decoder_ssse3.cpp
+++ b/src/core/coding/ojph_block_decoder_ssse3.cpp
@@ -1033,14 +1033,14 @@ namespace ojph {
       {
         OJPH_WARN(0x00010001, "A malformed codeblock that has more than "
                               "one coding pass, but zero length for "
-                              "2nd and potential 3rd pass.\n");
+                              "2nd and potential 3rd pass");
         num_passes = 1;
       }
 
       if (num_passes > 3)
       {
         OJPH_WARN(0x00010002, "We do not support more than 3 coding passes; "
-                              "This codeblocks has %d passes.\n",
+                              "This codeblocks has %d passes",
                               num_passes);
         return false;
       }
@@ -1052,7 +1052,7 @@ namespace ojph {
           insufficient_precision = true;
           OJPH_WARN(0x00010003, "32 bits are not enough to decode this "
                                 "codeblock. This message will not be "
-                                "displayed again.\n");
+                                "displayed again");
         }
         return false;
       }       
@@ -1063,7 +1063,7 @@ namespace ojph {
           OJPH_WARN(0x00010004, "Not enough precision to decode the cleanup "
                                 "pass. The code can be modified to support "
                                 "this case. This message will not be "
-                                "displayed again.\n");
+                                "displayed again");
         }
          return false;         // 32 bits are not enough to decode this
        }
@@ -1076,7 +1076,7 @@ namespace ojph {
             OJPH_WARN(0x00010005, "Not enough precision to decode the SgnProp "
                                   "nor MagRef passes; both will be skipped. "
                                   "This message will not be displayed "
-                                  "again.\n");
+                                  "again");
           }
         }
       }
@@ -1086,7 +1086,7 @@ namespace ojph {
 
       if (lengths1 < 2)
       {
-        OJPH_WARN(0x00010006, "Wrong codeblock length.\n");
+        OJPH_WARN(0x00010006, "Wrong codeblock length");
         return false;
       }
 

From 86b139d62f6246ca686801a7da43d198b82e02db Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sat, 6 Apr 2024 09:13:39 +1100
Subject: [PATCH 16/37] This is a very important bug fix -- Empty
 subbands/precincts.

---
 src/core/codestream/ojph_precinct.cpp | 24 ++++++++++++++++++++++++
 src/core/codestream/ojph_precinct.h   |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp
index 71b61de8..f8d69fbe 100644
--- a/src/core/codestream/ojph_precinct.cpp
+++ b/src/core/codestream/ojph_precinct.cpp
@@ -332,6 +332,30 @@ namespace ojph {
       if (may_use_sop)
         bb_skip_sop(&bb);
 
+      if (num_bands == 3)
+      {
+        if (bands[1].empty && bands[2].empty && bands[3].empty)
+        {
+          ui32 bit = 0;
+          bb_read_bit(&bb, bit);
+          bb_terminate(&bb, uses_eph);
+          assert(bit == 0);
+          return;
+        }
+      }
+      else
+      {
+        if (bands[0].empty)
+        {
+          ui32 bit = 0;
+          bb_read_bit(&bb, bit);
+          bb_terminate(&bb, uses_eph);
+          assert(bit == 0);
+          return;
+        }
+      }
+
+
       int sst = num_bands == 3 ? 1 : 0;
       int send = num_bands == 3 ? 4 : 1;
       bool empty_packet = true;
diff --git a/src/core/codestream/ojph_precinct.h b/src/core/codestream/ojph_precinct.h
index 4641ed68..d8e880a9 100644
--- a/src/core/codestream/ojph_precinct.h
+++ b/src/core/codestream/ojph_precinct.h
@@ -69,7 +69,7 @@ namespace ojph {
                  ui32& data_left, infile_base *file, bool skipped);
 
       ui8 *scratch;
-      point img_point;   //the precinct projected to full resolution
+      point img_point; //the precinct projected to full resolution
       rect cb_idxs[4]; //indices of codeblocks
       subband *bands;  //the subbands
       coded_lists* coded;

From 55993264b15ee2efba172d40d6e626e5c6f2ff06 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sat, 6 Apr 2024 14:33:56 +1100
Subject: [PATCH 17/37] A small improvement.

---
 src/core/codestream/ojph_precinct.cpp   | 27 ++++++-------------------
 src/core/codestream/ojph_resolution.cpp |  2 ++
 src/core/codestream/ojph_subband.h      |  2 ++
 3 files changed, 10 insertions(+), 21 deletions(-)

diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp
index f8d69fbe..b7e25aa0 100644
--- a/src/core/codestream/ojph_precinct.cpp
+++ b/src/core/codestream/ojph_precinct.cpp
@@ -332,30 +332,15 @@ namespace ojph {
       if (may_use_sop)
         bb_skip_sop(&bb);
 
-      if (num_bands == 3)
+      if (bands[0].empty && bands[1].empty && bands[2].empty && bands[3].empty)
       {
-        if (bands[1].empty && bands[2].empty && bands[3].empty)
-        {
-          ui32 bit = 0;
-          bb_read_bit(&bb, bit);
-          bb_terminate(&bb, uses_eph);
-          assert(bit == 0);
-          return;
-        }
-      }
-      else
-      {
-        if (bands[0].empty)
-        {
-          ui32 bit = 0;
-          bb_read_bit(&bb, bit);
-          bb_terminate(&bb, uses_eph);
-          assert(bit == 0);
-          return;
-        }
+        ui32 bit = 0;
+        bb_read_bit(&bb, bit);
+        bb_terminate(&bb, uses_eph);
+        assert(bit == 0);
+        return;
       }
 
-
       int sst = num_bands == 3 ? 1 : 0;
       int send = num_bands == 3 ? 4 : 1;
       bool empty_packet = true;
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index c4507707..a0413b76 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -241,6 +241,8 @@ namespace ojph {
       ui32 trx1 = res_rect.org.x + res_rect.siz.w;
       ui32 try1 = res_rect.org.y + res_rect.siz.h;
       bands = allocator->post_alloc_obj<subband>(4);
+      for (int i = 0; i < 4; ++i)
+        new (bands + i) subband;
       if (res_num > 0)
       {
         this->num_bands = 3;
diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h
index 9928c5ef..34cc7396 100644
--- a/src/core/codestream/ojph_subband.h
+++ b/src/core/codestream/ojph_subband.h
@@ -63,6 +63,8 @@ namespace ojph {
     {
       friend struct precinct;
     public:
+      subband() { memset(this, 0, sizeof(subband)); empty = true; }
+
       static void pre_alloc(codestream *codestream, const rect& band_rect,
                             ui32 comp_num, ui32 res_num);
       void finalize_alloc(codestream *codestream, const rect& band_rect,

From 0e0d41ddd4d3770df81fd3e71f1091af4d7ae9bb Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sat, 6 Apr 2024 15:42:00 +1100
Subject: [PATCH 18/37] Corrected Tests.

---
 src/core/codestream/ojph_precinct.cpp |  11 +-
 src/core/codestream/ojph_subband.h    |  20 +-
 tests/test_executables.cpp            | 744 +++++++++++++-------------
 tests/test_helpers/ht_cmdlines.txt    |   4 +-
 4 files changed, 399 insertions(+), 380 deletions(-)

diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp
index b7e25aa0..c20c8589 100644
--- a/src/core/codestream/ojph_precinct.cpp
+++ b/src/core/codestream/ojph_precinct.cpp
@@ -341,11 +341,12 @@ namespace ojph {
         return;
       }
 
-      int sst = num_bands == 3 ? 1 : 0;
-      int send = num_bands == 3 ? 4 : 1;
       bool empty_packet = true;
-      for (int s = sst; s < send; ++s)
+      for (int s = 0; s < 4; ++s)
       {
+        if (bands[s].empty)
+          continue;
+
         if (cb_idxs[s].siz.w == 0 || cb_idxs[s].siz.h == 0)
           continue;
 
@@ -505,8 +506,10 @@ namespace ojph {
       }
       bb_terminate(&bb, uses_eph);
       //read codeblock data
-      for (int s = sst; s < send; ++s)
+      for (int s = 0; s < 4; ++s)
       {
+        if (bands[s].empty)
+          continue;
         ui32 band_width = bands[s].num_blocks.w;
         ui32 width = cb_idxs[s].siz.w;
         ui32 height = cb_idxs[s].siz.h;
diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h
index 34cc7396..5dd145e6 100644
--- a/src/core/codestream/ojph_subband.h
+++ b/src/core/codestream/ojph_subband.h
@@ -63,7 +63,22 @@ namespace ojph {
     {
       friend struct precinct;
     public:
-      subband() { memset(this, 0, sizeof(subband)); empty = true; }
+      subband() { 
+        res_num = band_num = 0;
+        reversible = false;
+        empty = true;             // <---- true
+        lines = NULL;
+        parent = NULL;
+        blocks = NULL;
+        xcb_prime = ycb_prime = 0;
+        cur_cb_row = 0;
+        cur_line = 0;
+        cur_cb_height = 0;
+        delta = delta_inv = 0.0f;
+        K_max = 0;
+        coded_cbs = NULL;
+        elastic = NULL;
+      }
 
       static void pre_alloc(codestream *codestream, const rect& band_rect,
                             ui32 comp_num, ui32 res_num);
@@ -80,9 +95,10 @@ namespace ojph {
       line_buf* pull_line();
 
     private:
+      bool empty;                  // true if the subband has no pixels or
+                                   // the subband is NOT USED
       ui32 res_num, band_num;
       bool reversible;
-      bool empty;
       rect band_rect;
       line_buf *lines;
       resolution* parent;
diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 4c3a12b9..f42174f6 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -44,7 +44,7 @@
 // STATIC                         ojph_popen
 ////////////////////////////////////////////////////////////////////////////////
 static inline
-FILE *ojph_popen(const char *command, const char *modes) 
+FILE* ojph_popen(const char* command, const char* modes)
 {
 #ifdef OJPH_COMPILER_MSVC
   return _popen(command, modes);
@@ -57,7 +57,7 @@ FILE *ojph_popen(const char *command, const char *modes)
 // STATIC                         ojph_pclose
 ////////////////////////////////////////////////////////////////////////////////
 static inline
-int ojph_pclose(FILE *stream) 
+int ojph_pclose(FILE* stream)
 {
 #ifdef OJPH_COMPILER_MSVC
   return _pclose(stream);
@@ -69,16 +69,16 @@ int ojph_pclose(FILE *stream)
 ////////////////////////////////////////////////////////////////////////////////
 // STATIC                           execute
 ////////////////////////////////////////////////////////////////////////////////
-static 
-int execute(const std::string& cmd, std::string& result) 
+static
+int execute(const std::string& cmd, std::string& result)
 {
   std::array<char, 128> buffer;
   result.clear();
 
   FILE* pipe = ojph_popen(cmd.c_str(), "r");
-  if (!pipe) 
+  if (!pipe)
     throw std::runtime_error("ojph_popen() failed!");
-  
+
   while (!feof(pipe))
     if (fgets(buffer.data(), 128, pipe) != nullptr)
       result += buffer.data();
@@ -94,21 +94,21 @@ int execute(const std::string& cmd, std::string& result)
 ////////////////////////////////////////////////////////////////////////////////
 
 #ifdef OJPH_OS_WINDOWS
-	#define SRC_FILE_DIR ".\\jp2k_test_codestreams\\openjph\\"
-	#define OUT_FILE_DIR ".\\"
-	#define REF_FILE_DIR ".\\jp2k_test_codestreams\\openjph\\references\\"
-	#define MSE_PAE_PATH  ".\\mse_pae"
-	#define COMPARE_FILES_PATH  ".\\compare_files"
-	#define EXPAND_EXECUTABLE ".\\ojph_expand.exe"
-	#define COMPRESS_EXECUTABLE ".\\ojph_compress.exe"
+#define SRC_FILE_DIR ".\\jp2k_test_codestreams\\openjph\\"
+#define OUT_FILE_DIR ".\\"
+#define REF_FILE_DIR ".\\jp2k_test_codestreams\\openjph\\references\\"
+#define MSE_PAE_PATH  ".\\mse_pae"
+#define COMPARE_FILES_PATH  ".\\compare_files"
+#define EXPAND_EXECUTABLE ".\\ojph_expand.exe"
+#define COMPRESS_EXECUTABLE ".\\ojph_compress.exe"
 #else
-	#define SRC_FILE_DIR "./jp2k_test_codestreams/openjph/"
-	#define OUT_FILE_DIR "./"
-	#define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/"
-	#define MSE_PAE_PATH  "./mse_pae"
-	#define COMPARE_FILES_PATH  "./compare_files"
-	#define EXPAND_EXECUTABLE "./ojph_expand"
-	#define COMPRESS_EXECUTABLE "./ojph_compress"
+#define SRC_FILE_DIR "./jp2k_test_codestreams/openjph/"
+#define OUT_FILE_DIR "./"
+#define REF_FILE_DIR "./jp2k_test_codestreams/openjph/references/"
+#define MSE_PAE_PATH  "./mse_pae"
+#define COMPARE_FILES_PATH  "./compare_files"
+#define EXPAND_EXECUTABLE "./ojph_expand"
+#define COMPRESS_EXECUTABLE "./ojph_compress"
 #endif
 #define TOL_DOUBLE 0.01
 #define TOL_INTEGER 1
@@ -116,22 +116,22 @@ int execute(const std::string& cmd, std::string& result)
 ////////////////////////////////////////////////////////////////////////////////
 //                            run_ojph_compress
 ////////////////////////////////////////////////////////////////////////////////
-void run_ojph_compress(const std::string& ref_filename, 
-                       const std::string& base_filename, 
-                       const std::string& extended_base_fname, 
-                       const std::string& out_ext,
-                       const std::string& extra_options)
+void run_ojph_compress(const std::string& ref_filename,
+  const std::string& base_filename,
+  const std::string& extended_base_fname,
+  const std::string& out_ext,
+  const std::string& extra_options)
 {
   try {
     std::string result, command;
-    command = std::string(COMPRESS_EXECUTABLE) 
+    command = std::string(COMPRESS_EXECUTABLE)
       + " -i " + REF_FILE_DIR + ref_filename
-      + " -o " + OUT_FILE_DIR + base_filename + extended_base_fname + 
+      + " -o " + OUT_FILE_DIR + base_filename + extended_base_fname +
       "." + out_ext + " " + extra_options;
     std::cerr << command << std::endl;
     EXPECT_EQ(execute(command, result), 0);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -139,18 +139,18 @@ void run_ojph_compress(const std::string& ref_filename,
 ////////////////////////////////////////////////////////////////////////////////
 //                            run_ojph_expand
 ////////////////////////////////////////////////////////////////////////////////
-void run_ojph_expand(const std::string& base_filename, 
-                     const std::string& src_ext,
-                     const std::string& out_ext)
+void run_ojph_expand(const std::string& base_filename,
+  const std::string& src_ext,
+  const std::string& out_ext)
 {
   try {
     std::string result, command;
-    command = std::string(EXPAND_EXECUTABLE) 
+    command = std::string(EXPAND_EXECUTABLE)
       + " -i " + SRC_FILE_DIR + base_filename + "." + src_ext
       + " -o " + OUT_FILE_DIR + base_filename + "." + out_ext;
     EXPECT_EQ(execute(command, result), 0);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -158,34 +158,34 @@ void run_ojph_expand(const std::string& base_filename,
 ////////////////////////////////////////////////////////////////////////////////
 //                            run_ojph_compress
 ////////////////////////////////////////////////////////////////////////////////
-void run_ojph_compress_expand(const std::string& base_filename, 
-                              const std::string& out_ext,
-                              const std::string& decode_ext)
+void run_ojph_compress_expand(const std::string& base_filename,
+  const std::string& out_ext,
+  const std::string& decode_ext)
 {
   try {
     std::string result, command;
-    command = std::string(EXPAND_EXECUTABLE) 
+    command = std::string(EXPAND_EXECUTABLE)
       + " -i " + OUT_FILE_DIR + base_filename + "." + out_ext
       + " -o " + OUT_FILE_DIR + base_filename + "." + decode_ext;
     EXPECT_EQ(execute(command, result), 0);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
-  }  
+  }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 //                             run_mse_pae
 ////////////////////////////////////////////////////////////////////////////////
-void run_mse_pae(const std::string& base_filename, 
-                 const std::string& out_ext, 
-                 const std::string& ref_filename, 
-                 const std::string& yuv_specs,
-                 int num_components, double* mse, int* pae) 
+void run_mse_pae(const std::string& base_filename,
+  const std::string& out_ext,
+  const std::string& ref_filename,
+  const std::string& yuv_specs,
+  int num_components, double* mse, int* pae)
 {
   try {
     std::string result, command;
-    command = std::string(MSE_PAE_PATH) 
+    command = std::string(MSE_PAE_PATH)
       + " " + OUT_FILE_DIR + base_filename + "." + out_ext + yuv_specs
       + " " + REF_FILE_DIR + ref_filename + yuv_specs;
     EXPECT_EQ(execute(command, result), 0);
@@ -214,7 +214,7 @@ void run_mse_pae(const std::string& base_filename,
         ++pos;
     }
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -222,20 +222,20 @@ void run_mse_pae(const std::string& base_filename,
 ////////////////////////////////////////////////////////////////////////////////
 //                             compare_files
 ////////////////////////////////////////////////////////////////////////////////
-void compare_files(const std::string& base_filename, 
-                   const std::string& extended_base_fname, 
-                   const std::string& ext) 
+void compare_files(const std::string& base_filename,
+  const std::string& extended_base_fname,
+  const std::string& ext)
 {
   try {
     std::string result, command;
-    command = std::string(COMPARE_FILES_PATH) 
+    command = std::string(COMPARE_FILES_PATH)
       + " " + OUT_FILE_DIR + base_filename + extended_base_fname + "." + ext
       + " " + SRC_FILE_DIR + base_filename + "." + ext;
     EXPECT_EQ(execute(command, result), 0);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
-  }  
+  }
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -249,7 +249,7 @@ TEST(TestExecutables, OpenJPHCompressNoArguments) {
     std::string result;
     EXPECT_EQ(execute(COMPRESS_EXECUTABLE, result), 1);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -261,7 +261,7 @@ TEST(TestExecutables, OpenJPHExpandNoArguments) {
     std::string result;
     EXPECT_EQ(execute(EXPAND_EXECUTABLE, result), 1);
   }
-  catch(const std::runtime_error& error) {
+  catch (const std::runtime_error& error) {
     FAIL() << error.what();
   }
 }
@@ -275,11 +275,11 @@ TEST(TestExecutables, OpenJPHExpandNoArguments) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_64x64.jph -precise -quiet -rate 0.5 -full
 TEST(TestExecutables, SimpleDecIrv9764x64) {
-  double mse[3] = { 39.2812, 36.3819, 47.642};
-  int pae[3] = { 74, 77, 73};
+  double mse[3] = { 39.2812, 36.3819, 47.642 };
+  int pae[3] = { 74, 77, 73 };
   run_ojph_expand("simple_dec_irv97_64x64", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -287,11 +287,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_32x32.jph -precise -quiet -rate 1 Cblk={32,32} -full
 TEST(TestExecutables, SimpleDecIrv9732x32) {
-  double mse[3] = { 18.6979, 17.1208, 22.7539};
-  int pae[3] = { 51, 48, 46};
+  double mse[3] = { 18.6979, 17.1208, 22.7539 };
+  int pae[3] = { 51, 48, 46 };
   run_ojph_expand("simple_dec_irv97_32x32", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_32x32", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -299,11 +299,11 @@ TEST(TestExecutables, SimpleDecIrv9732x32) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_16x16.jph -precise -quiet -rate 1 Cblk={16,16} -full
 TEST(TestExecutables, SimpleDecIrv9716x16) {
-  double mse[3] = { 20.1706, 18.5427, 24.6146};
-  int pae[3] = { 53, 51, 47};
+  double mse[3] = { 20.1706, 18.5427, 24.6146 };
+  int pae[3] = { 53, 51, 47 };
   run_ojph_expand("simple_dec_irv97_16x16", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_16x16", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -311,11 +311,11 @@ TEST(TestExecutables, SimpleDecIrv9716x16) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_4x4.jph -precise -quiet -rate 1 Cblk={4,4} -full
 TEST(TestExecutables, SimpleDecIrv974x4) {
-  double mse[3] = { 40.8623, 37.9308, 49.7276};
-  int pae[3] = { 75, 77, 80};
+  double mse[3] = { 40.8623, 37.9308, 49.7276 };
+  int pae[3] = { 75, 77, 80 };
   run_ojph_expand("simple_dec_irv97_4x4", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_4x4", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -323,11 +323,11 @@ TEST(TestExecutables, SimpleDecIrv974x4) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_1024x4.jph -precise -quiet -rate 1 Cblk={1024,4} -full
 TEST(TestExecutables, SimpleDecIrv971024x4) {
-  double mse[3] = { 19.8275, 18.2511, 24.2832};
-  int pae[3] = { 53, 52, 50};
+  double mse[3] = { 19.8275, 18.2511, 24.2832 };
+  int pae[3] = { 53, 52, 50 };
   run_ojph_expand("simple_dec_irv97_1024x4", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_1024x4", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -335,11 +335,11 @@ TEST(TestExecutables, SimpleDecIrv971024x4) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_4x1024.jph -precise -quiet -rate 1 Cblk={4,1024} -full
 TEST(TestExecutables, SimpleDecIrv974x1024) {
-  double mse[3] = { 19.9635, 18.4063, 24.1719};
-  int pae[3] = { 51, 48, 51};
+  double mse[3] = { 19.9635, 18.4063, 24.1719 };
+  int pae[3] = { 51, 48, 51 };
   run_ojph_expand("simple_dec_irv97_4x1024", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_4x1024", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -347,11 +347,11 @@ TEST(TestExecutables, SimpleDecIrv974x1024) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_512x8.jph -precise -quiet -rate 1 Cblk={512,8} -full
 TEST(TestExecutables, SimpleDecIrv97512x8) {
-  double mse[3] = { 18.7929, 17.2026, 22.9922};
-  int pae[3] = { 53, 52, 50};
+  double mse[3] = { 18.7929, 17.2026, 22.9922 };
+  int pae[3] = { 53, 52, 50 };
   run_ojph_expand("simple_dec_irv97_512x8", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_512x8", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -359,11 +359,11 @@ TEST(TestExecutables, SimpleDecIrv97512x8) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_8x512.jph -precise -quiet -rate 1 Cblk={8,512} -full
 TEST(TestExecutables, SimpleDecIrv978x512) {
-  double mse[3] = { 19.3661, 17.8067, 23.4574};
-  int pae[3] = { 51, 48, 52};
+  double mse[3] = { 19.3661, 17.8067, 23.4574 };
+  int pae[3] = { 51, 48, 52 };
   run_ojph_expand("simple_dec_irv97_8x512", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_8x512", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -371,11 +371,11 @@ TEST(TestExecutables, SimpleDecIrv978x512) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_256x16.jph -precise -quiet -rate 1 Cblk={256,16} -full
 TEST(TestExecutables, SimpleDecIrv97256x16) {
-  double mse[3] = { 18.6355, 17.0963, 22.6076};
-  int pae[3] = { 54, 51, 48};
+  double mse[3] = { 18.6355, 17.0963, 22.6076 };
+  int pae[3] = { 54, 51, 48 };
   run_ojph_expand("simple_dec_irv97_256x16", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_256x16", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -383,11 +383,11 @@ TEST(TestExecutables, SimpleDecIrv97256x16) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_16x256.jph -precise -quiet -rate 1 Cblk={16,256} -full
 TEST(TestExecutables, SimpleDecIrv9716x256) {
-  double mse[3] = { 18.5933, 17.0208, 22.5709};
-  int pae[3] = { 51, 48, 47};
+  double mse[3] = { 18.5933, 17.0208, 22.5709 };
+  int pae[3] = { 51, 48, 47 };
   run_ojph_expand("simple_dec_irv97_16x256", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_16x256", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -395,11 +395,11 @@ TEST(TestExecutables, SimpleDecIrv9716x256) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_128x32.jph -precise -quiet -rate 1 Cblk={128,32} -full
 TEST(TestExecutables, SimpleDecIrv97128x32) {
-  double mse[3] = { 18.4443, 16.9133, 22.4193};
-  int pae[3] = { 52, 50, 46};
+  double mse[3] = { 18.4443, 16.9133, 22.4193 };
+  int pae[3] = { 52, 50, 46 };
   run_ojph_expand("simple_dec_irv97_128x32", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_128x32", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -407,11 +407,11 @@ TEST(TestExecutables, SimpleDecIrv97128x32) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_32x128.jph -precise -quiet -rate 1 Cblk={32,128} -full
 TEST(TestExecutables, SimpleDecIrv9732x128) {
-  double mse[3] = { 18.4874, 16.9379, 22.4855};
-  int pae[3] = { 51, 48, 45};
+  double mse[3] = { 18.4874, 16.9379, 22.4855 };
+  int pae[3] = { 51, 48, 45 };
   run_ojph_expand("simple_dec_irv97_32x128", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_32x128", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -419,11 +419,11 @@ TEST(TestExecutables, SimpleDecIrv9732x128) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_rev53_64x64.jph -precise -quiet Creversible=yes -full
 TEST(TestExecutables, SimpleDecRev5364x64) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_expand("simple_dec_rev53_64x64", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_64x64", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -432,11 +432,11 @@ TEST(TestExecutables, SimpleDecRev5364x64) {
 // -o simple_dec_rev53_32x32.jph -precise -quiet Creversible=yes Cblk={32,32}
 // -full
 TEST(TestExecutables, SimpleDecRev5332x32) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_expand("simple_dec_rev53_32x32", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_32x32", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -445,11 +445,11 @@ TEST(TestExecutables, SimpleDecRev5332x32) {
 // -o simple_dec_rev53_4x4.jph -precise -quiet Creversible=yes Cblk={4,4}
 // -full
 TEST(TestExecutables, SimpleDecRev534x4) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_expand("simple_dec_rev53_4x4", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_4x4", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -458,11 +458,11 @@ TEST(TestExecutables, SimpleDecRev534x4) {
 // -o simple_dec_rev53_1024x4.jph -precise -quiet Creversible=yes
 // Cblk={1024,4} -full
 TEST(TestExecutables, SimpleDecRev531024x4) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_expand("simple_dec_rev53_1024x4", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_1024x4", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -471,11 +471,11 @@ TEST(TestExecutables, SimpleDecRev531024x4) {
 // -o simple_dec_rev53_4x1024.jph -precise -quiet Creversible=yes
 // Cblk={4,1024} -full
 TEST(TestExecutables, SimpleDecRev534x1024) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_expand("simple_dec_rev53_4x1024", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_4x1024", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -486,11 +486,11 @@ TEST(TestExecutables, SimpleDecRev534x1024) {
 // Sdims={288,352},{144,176},{144,176} Ssampling={1,1},{2,2},{2,2}
 // Nprecision={8} Nsigned={no} -full
 TEST(TestExecutables, SimpleDecIrv9764x64Yuv) {
-  double mse[3] = { 20.2778, 6.27912, 4.15937};
-  int pae[3] = { 52, 22, 31};
+  double mse[3] = { 20.2778, 6.27912, 4.15937 };
+  int pae[3] = { 52, 22, 31 };
   run_ojph_expand("simple_dec_irv97_64x64_yuv", "jph", "yuv");
   run_mse_pae("simple_dec_irv97_64x64_yuv", "yuv", "foreman_420.yuv",
-              ":352x288x8x420", 3, mse, pae);
+    ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -501,11 +501,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64Yuv) {
 // Sdims={288,352},{144,176},{144,176} Ssampling={1,1},{2,2},{2,2}
 // Nprecision={8} Nsigned={no} -full
 TEST(TestExecutables, SimpleDecRev5364x64Yuv) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_expand("simple_dec_rev53_64x64_yuv", "jph", "yuv");
   run_mse_pae("simple_dec_rev53_64x64_yuv", "yuv", "foreman_420.yuv",
-              ":352x288x8x420", 3, mse, pae);
+    ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -516,11 +516,11 @@ TEST(TestExecutables, SimpleDecRev5364x64Yuv) {
 // Sdims={288,352},{144,176},{144,176} Ssampling={1,1},{2,2},{2,2}
 // Nprecision={8} Nsigned={no} Stiles={33,257} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesYuv) {
-  double mse[3] = { 34.4972, 10.1112, 7.96331};
-  int pae[3] = { 67, 30, 39};
+  double mse[3] = { 34.4972, 10.1112, 7.96331 };
+  int pae[3] = { 67, 30, 39 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_yuv", "jph", "yuv");
   run_mse_pae("simple_dec_irv97_64x64_tiles_yuv", "yuv", "foreman_420.yuv",
-              ":352x288x8x420", 3, mse, pae);
+    ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -531,11 +531,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesYuv) {
 // Sdims={288,352},{144,176},{144,176} Ssampling={1,1},{2,2},{2,2}
 // Nprecision={8} Nsigned={no} Stiles={33,257} -full
 TEST(TestExecutables, SimpleDecRev5364x64TilesYuv) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_expand("simple_dec_rev53_64x64_tiles_yuv", "jph", "yuv");
   run_mse_pae("simple_dec_rev53_64x64_tiles_yuv", "yuv", "foreman_420.yuv",
-              ":352x288x8x420", 3, mse, pae);
+    ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -545,11 +545,11 @@ TEST(TestExecutables, SimpleDecRev5364x64TilesYuv) {
 // Clevels=5 Corder=LRCP Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001};
-  int pae[3] = { 78, 78, 83};
+  double mse[3] = { 71.8149, 68.7115, 89.4001 };
+  int pae[3] = { 78, 78, 83 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_LRCP", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_LRCP", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -559,11 +559,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP) {
 // Clevels=5 Corder=RLCP Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001};
-  int pae[3] = { 78, 78, 83};
+  double mse[3] = { 71.8149, 68.7115, 89.4001 };
+  int pae[3] = { 78, 78, 83 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RLCP", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RLCP", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -573,11 +573,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP) {
 // Clevels=5 Corder=RPCL Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001};
-  int pae[3] = { 78, 78, 83};
+  double mse[3] = { 71.8149, 68.7115, 89.4001 };
+  int pae[3] = { 78, 78, 83 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RPCL", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RPCL", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -587,11 +587,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL) {
 // Clevels=5 Corder=PCRL Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001};
-  int pae[3] = { 78, 78, 83};
+  double mse[3] = { 71.8149, 68.7115, 89.4001 };
+  int pae[3] = { 78, 78, 83 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_PCRL", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_PCRL", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -601,11 +601,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL) {
 // Clevels=5 Corder=CPRL Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001};
-  int pae[3] = { 78, 78, 83};
+  double mse[3] = { 71.8149, 68.7115, 89.4001 };
+  int pae[3] = { 78, 78, 83 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_CPRL", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_CPRL", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -615,11 +615,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL) {
 // Clevels=5 Corder=LRCP Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107};
-  int pae[3] = { 80, 81, 98};
+  double mse[3] = { 56.2139, 51.4121, 69.0107 };
+  int pae[3] = { 80, 81, 98 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_LRCP33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_LRCP33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -629,11 +629,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP33) {
 // Clevels=5 Corder=RLCP Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107};
-  int pae[3] = { 80, 81, 98};
+  double mse[3] = { 56.2139, 51.4121, 69.0107 };
+  int pae[3] = { 80, 81, 98 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RLCP33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RLCP33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -643,11 +643,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP33) {
 // Clevels=5 Corder=RPCL Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107};
-  int pae[3] = { 80, 81, 98};
+  double mse[3] = { 56.2139, 51.4121, 69.0107 };
+  int pae[3] = { 80, 81, 98 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RPCL33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RPCL33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -657,11 +657,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL33) {
 // Clevels=5 Corder=PCRL Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107};
-  int pae[3] = { 80, 81, 98};
+  double mse[3] = { 56.2139, 51.4121, 69.0107 };
+  int pae[3] = { 80, 81, 98 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_PCRL33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_PCRL33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -671,11 +671,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL33) {
 // Clevels=5 Corder=CPRL Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107};
-  int pae[3] = { 80, 81, 98};
+  double mse[3] = { 56.2139, 51.4121, 69.0107 };
+  int pae[3] = { 80, 81, 98 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_CPRL33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_CPRL33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -685,11 +685,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL33) {
 // Clevels=5 Corder=LRCP Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276};
-  int pae[3] = { 165, 161, 166};
+  double mse[3] = { 210.283, 210.214, 257.276 };
+  int pae[3] = { 165, 161, 166 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_LRCP33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_LRCP33x33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -699,11 +699,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP33x33) {
 // Clevels=5 Corder=RLCP Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276};
-  int pae[3] = { 165, 161, 166};
+  double mse[3] = { 210.283, 210.214, 257.276 };
+  int pae[3] = { 165, 161, 166 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RLCP33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RLCP33x33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -713,11 +713,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP33x33) {
 // Clevels=5 Corder=RPCL Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276};
-  int pae[3] = { 165, 161, 166};
+  double mse[3] = { 210.283, 210.214, 257.276 };
+  int pae[3] = { 165, 161, 166 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RPCL33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RPCL33x33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -727,11 +727,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL33x33) {
 // Clevels=5 Corder=PCRL Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276};
-  int pae[3] = { 165, 161, 166};
+  double mse[3] = { 210.283, 210.214, 257.276 };
+  int pae[3] = { 165, 161, 166 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_PCRL33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_PCRL33x33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -741,11 +741,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL33x33) {
 // Clevels=5 Corder=CPRL Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276};
-  int pae[3] = { 165, 161, 166};
+  double mse[3] = { 210.283, 210.214, 257.276 };
+  int pae[3] = { 165, 161, 166 };
   run_ojph_expand("simple_dec_irv97_64x64_tiles_CPRL33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_CPRL33x33", "ppm", "Malamute.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -754,11 +754,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL33x33) {
 // -o simple_dec_rev53_64x64_gray_tiles.jph -precise -quiet Creversible=yes
 // Clevels=5 Stiles={33,257} -full
 TEST(TestExecutables, SimpleDecRev5364x64GrayTiles) {
-  double mse[1] = { 0};
-  int pae[1] = { 0};
+  double mse[1] = { 0 };
+  int pae[1] = { 0 };
   run_ojph_expand("simple_dec_rev53_64x64_gray_tiles", "jph", "pgm");
   run_mse_pae("simple_dec_rev53_64x64_gray_tiles", "pgm", "monarch.pgm",
-              "", 1, mse, pae);
+    "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -767,11 +767,11 @@ TEST(TestExecutables, SimpleDecRev5364x64GrayTiles) {
 // -o simple_dec_irv97_64x64_gray_tiles.jph -precise -quiet -rate 0.5
 // Clevels=5 Stiles={33,257} -full
 TEST(TestExecutables, SimpleDecIrv9764x64GrayTiles) {
-  double mse[1] = { 18.9601};
-  int pae[1] = { 56};
+  double mse[1] = { 18.9601 };
+  int pae[1] = { 56 };
   run_ojph_expand("simple_dec_irv97_64x64_gray_tiles", "jph", "pgm");
   run_mse_pae("simple_dec_irv97_64x64_gray_tiles", "pgm", "monarch.pgm",
-              "", 1, mse, pae);
+    "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -779,11 +779,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64GrayTiles) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_64x64_16bit.jph -precise -quiet -rate 0.5 -full
 TEST(TestExecutables, SimpleDecIrv9764x6416bit) {
-  double mse[3] = { 60507.2, 36672.5, 64809.8};
-  int pae[3] = { 2547, 1974, 1922};
+  double mse[3] = { 60507.2, 36672.5, 64809.8 };
+  int pae[3] = { 2547, 1974, 1922 };
   run_ojph_expand("simple_dec_irv97_64x64_16bit", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_16bit", "ppm", "mm.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -791,11 +791,11 @@ TEST(TestExecutables, SimpleDecIrv9764x6416bit) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_64x64_16bit_gray.jph -precise -quiet -rate 0.5 -full
 TEST(TestExecutables, SimpleDecIrv9764x6416bitGray) {
-  double mse[1] = { 19382.9};
-  int pae[1] = { 1618};
+  double mse[1] = { 19382.9 };
+  int pae[1] = { 1618 };
   run_ojph_expand("simple_dec_irv97_64x64_16bit_gray", "jph", "pgm");
   run_mse_pae("simple_dec_irv97_64x64_16bit_gray", "pgm", "mm.pgm",
-              "", 1, mse, pae);
+    "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -803,11 +803,11 @@ TEST(TestExecutables, SimpleDecIrv9764x6416bitGray) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_rev53_64x64_16bit.jph -precise -quiet Creversible=yes -full
 TEST(TestExecutables, SimpleDecRev5364x6416bit) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_expand("simple_dec_rev53_64x64_16bit", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_64x64_16bit", "ppm", "mm.ppm",
-              "", 3, mse, pae);
+    "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -816,11 +816,11 @@ TEST(TestExecutables, SimpleDecRev5364x6416bit) {
 // -o simple_dec_rev53_64x64_16bit_gray.jph -precise -quiet Creversible=yes
 // -full
 TEST(TestExecutables, SimpleDecRev5364x6416bitGray) {
-  double mse[1] = { 0};
-  int pae[1] = { 0};
+  double mse[1] = { 0 };
+  int pae[1] = { 0 };
   run_ojph_expand("simple_dec_rev53_64x64_16bit_gray", "jph", "pgm");
   run_mse_pae("simple_dec_rev53_64x64_16bit_gray", "pgm", "mm.pgm",
-              "", 1, mse, pae);
+    "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -829,14 +829,14 @@ TEST(TestExecutables, SimpleDecRev5364x6416bitGray) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_64x64.j2c -qstep 0.1
 TEST(TestExecutables, SimpleEncIrv9764x64) {
-  double mse[3] = { 46.2004, 43.622, 56.7452};
-  int pae[3] = { 48, 46, 52};
+  double mse[3] = { 46.2004, 43.622, 56.7452 };
+  int pae[3] = { 48, 46, 52 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_64x64", "", "j2c",
-                    "-qstep 0.1");
+    "simple_enc_irv97_64x64", "", "j2c",
+    "-qstep 0.1");
   run_ojph_compress_expand("simple_enc_irv97_64x64", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_64x64", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -845,14 +845,14 @@ TEST(TestExecutables, SimpleEncIrv9764x64) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_32x32.j2c -qstep 0.01 -block_size {32,32}
 TEST(TestExecutables, SimpleEncIrv9732x32) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_32x32", "", "j2c",
-                    "-qstep 0.01 -block_size \"{32,32}\"");
+    "simple_enc_irv97_32x32", "", "j2c",
+    "-qstep 0.01 -block_size \"{32,32}\"");
   run_ojph_compress_expand("simple_enc_irv97_32x32", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_32x32", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -861,14 +861,14 @@ TEST(TestExecutables, SimpleEncIrv9732x32) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_16x16.j2c -qstep 0.01 -block_size {16,16}
 TEST(TestExecutables, SimpleEncIrv9716x16) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_16x16", "", "j2c",
-                    "-qstep 0.01 -block_size \"{16,16}\"");
+    "simple_enc_irv97_16x16", "", "j2c",
+    "-qstep 0.01 -block_size \"{16,16}\"");
   run_ojph_compress_expand("simple_enc_irv97_16x16", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_16x16", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -877,14 +877,14 @@ TEST(TestExecutables, SimpleEncIrv9716x16) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_4x4.j2c -qstep 0.01 -block_size {4,4}
 TEST(TestExecutables, SimpleEncIrv974x4) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_4x4", "", "j2c",
-                    "-qstep 0.01 -block_size \"{4,4}\"");
+    "simple_enc_irv97_4x4", "", "j2c",
+    "-qstep 0.01 -block_size \"{4,4}\"");
   run_ojph_compress_expand("simple_enc_irv97_4x4", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_4x4", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -893,14 +893,14 @@ TEST(TestExecutables, SimpleEncIrv974x4) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_1024x4.j2c -qstep 0.01 -block_size {4,1024}
 TEST(TestExecutables, SimpleEncIrv971024x4) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_1024x4", "", "j2c",
-                    "-qstep 0.01 -block_size \"{4,1024}\"");
+    "simple_enc_irv97_1024x4", "", "j2c",
+    "-qstep 0.01 -block_size \"{4,1024}\"");
   run_ojph_compress_expand("simple_enc_irv97_1024x4", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_1024x4", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -909,14 +909,14 @@ TEST(TestExecutables, SimpleEncIrv971024x4) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_4x1024.j2c -qstep 0.01 -block_size {1024,4}
 TEST(TestExecutables, SimpleEncIrv974x1024) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_4x1024", "", "j2c",
-                    "-qstep 0.01 -block_size \"{1024,4}\"");
+    "simple_enc_irv97_4x1024", "", "j2c",
+    "-qstep 0.01 -block_size \"{1024,4}\"");
   run_ojph_compress_expand("simple_enc_irv97_4x1024", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_4x1024", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -925,14 +925,14 @@ TEST(TestExecutables, SimpleEncIrv974x1024) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_512x8.j2c -qstep 0.01 -block_size {8,512}
 TEST(TestExecutables, SimpleEncIrv97512x8) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_512x8", "", "j2c",
-                    "-qstep 0.01 -block_size \"{8,512}\"");
+    "simple_enc_irv97_512x8", "", "j2c",
+    "-qstep 0.01 -block_size \"{8,512}\"");
   run_ojph_compress_expand("simple_enc_irv97_512x8", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_512x8", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -941,14 +941,14 @@ TEST(TestExecutables, SimpleEncIrv97512x8) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_8x512.j2c -qstep 0.01 -block_size {512,8}
 TEST(TestExecutables, SimpleEncIrv978x512) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_8x512", "", "j2c",
-                    "-qstep 0.01 -block_size \"{512,8}\"");
+    "simple_enc_irv97_8x512", "", "j2c",
+    "-qstep 0.01 -block_size \"{512,8}\"");
   run_ojph_compress_expand("simple_enc_irv97_8x512", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_8x512", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -957,14 +957,14 @@ TEST(TestExecutables, SimpleEncIrv978x512) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_256x16.j2c -qstep 0.01 -block_size {16,256}
 TEST(TestExecutables, SimpleEncIrv97256x16) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_256x16", "", "j2c",
-                    "-qstep 0.01 -block_size \"{16,256}\"");
+    "simple_enc_irv97_256x16", "", "j2c",
+    "-qstep 0.01 -block_size \"{16,256}\"");
   run_ojph_compress_expand("simple_enc_irv97_256x16", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_256x16", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -973,14 +973,14 @@ TEST(TestExecutables, SimpleEncIrv97256x16) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_16x256.j2c -qstep 0.01 -block_size {256,16}
 TEST(TestExecutables, SimpleEncIrv9716x256) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_16x256", "", "j2c",
-                    "-qstep 0.01 -block_size \"{256,16}\"");
+    "simple_enc_irv97_16x256", "", "j2c",
+    "-qstep 0.01 -block_size \"{256,16}\"");
   run_ojph_compress_expand("simple_enc_irv97_16x256", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_16x256", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -989,14 +989,14 @@ TEST(TestExecutables, SimpleEncIrv9716x256) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_128x32.j2c -qstep 0.01 -block_size {32,128}
 TEST(TestExecutables, SimpleEncIrv97128x32) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_128x32", "", "j2c",
-                    "-qstep 0.01 -block_size \"{32,128}\"");
+    "simple_enc_irv97_128x32", "", "j2c",
+    "-qstep 0.01 -block_size \"{32,128}\"");
   run_ojph_compress_expand("simple_enc_irv97_128x32", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_128x32", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1005,14 +1005,14 @@ TEST(TestExecutables, SimpleEncIrv97128x32) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_32x128.j2c -qstep 0.01 -block_size {128,32}
 TEST(TestExecutables, SimpleEncIrv9732x128) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395};
-  int pae[3] = { 7, 6, 9};
+  double mse[3] = { 1.78779, 1.26001, 2.38395 };
+  int pae[3] = { 7, 6, 9 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_32x128", "", "j2c",
-                    "-qstep 0.01 -block_size \"{128,32}\"");
+    "simple_enc_irv97_32x128", "", "j2c",
+    "-qstep 0.01 -block_size \"{128,32}\"");
   run_ojph_compress_expand("simple_enc_irv97_32x128", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_32x128", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1022,14 +1022,14 @@ TEST(TestExecutables, SimpleEncIrv9732x128) {
 // -o simple_enc_irv97_64x64_tiles_33x33_d5.j2c -qstep 0.01 -tile_size {33,33}
 // -num_decomps 5
 TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D5) {
-  double mse[3] = { 46.2004, 43.622, 56.7452};
-  int pae[3] = { 48, 46, 52};
+  double mse[3] = { 1.88906, 1.30757, 2.5347 };
+  int pae[3] = { 9, 6, 10 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_64x64_tiles_33x33_d5", "", "j2c",
-                    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 5");
+    "simple_enc_irv97_64x64_tiles_33x33_d5", "", "j2c",
+    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 5");
   run_ojph_compress_expand("simple_enc_irv97_64x64_tiles_33x33_d5", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_64x64_tiles_33x33_d5", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1039,14 +1039,14 @@ TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D5) {
 // -o simple_enc_irv97_64x64_tiles_33x33_d6.j2c -qstep 0.01 -tile_size {33,33}
 // -num_decomps 6
 TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D6) {
-  double mse[3] = { 46.2004, 43.622, 56.7452};
-  int pae[3] = { 48, 46, 52};
+  double mse[3] = { 1.88751, 1.30673, 2.53378 };
+  int pae[3] = { 8, 6, 10 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_irv97_64x64_tiles_33x33_d6", "", "j2c",
-                    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 6");
+    "simple_enc_irv97_64x64_tiles_33x33_d6", "", "j2c",
+    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 6");
   run_ojph_compress_expand("simple_enc_irv97_64x64_tiles_33x33_d6", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_64x64_tiles_33x33_d6", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1055,14 +1055,14 @@ TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D6) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_64x64_16bit.j2c -qstep 0.01
 TEST(TestExecutables, SimpleEncIrv9764x6416bit) {
-  double mse[3] = { 51727.3, 32596.4, 45897.8};
-  int pae[3] = { 1512, 1481, 1778};
+  double mse[3] = { 51727.3, 32596.4, 45897.8 };
+  int pae[3] = { 1512, 1481, 1778 };
   run_ojph_compress("mm.ppm",
-                    "simple_enc_irv97_64x64_16bit", "", "j2c",
-                    "-qstep 0.01");
+    "simple_enc_irv97_64x64_16bit", "", "j2c",
+    "-qstep 0.01");
   run_ojph_compress_expand("simple_enc_irv97_64x64_16bit", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_64x64_16bit", "ppm",
-              "mm.ppm", "", 3, mse, pae);
+    "mm.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1071,14 +1071,14 @@ TEST(TestExecutables, SimpleEncIrv9764x6416bit) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_64x64_16bit_gray.j2c -qstep 0.01
 TEST(TestExecutables, SimpleEncIrv9764x6416bitGray) {
-  double mse[1] = { 25150.6};
-  int pae[1] = { 1081};
+  double mse[1] = { 25150.6 };
+  int pae[1] = { 1081 };
   run_ojph_compress("mm.pgm",
-                    "simple_enc_irv97_64x64_16bit_gray", "", "j2c",
-                    "-qstep 0.01");
+    "simple_enc_irv97_64x64_16bit_gray", "", "j2c",
+    "-qstep 0.01");
   run_ojph_compress_expand("simple_enc_irv97_64x64_16bit_gray", "j2c", "pgm");
   run_mse_pae("simple_enc_irv97_64x64_16bit_gray", "pgm",
-              "mm.pgm", "", 1, mse, pae);
+    "mm.pgm", "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1087,14 +1087,14 @@ TEST(TestExecutables, SimpleEncIrv9764x6416bitGray) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_64x64_16bit.j2c -reversible true
 TEST(TestExecutables, SimpleEncRev5364x6416bit) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("mm.ppm",
-                    "simple_enc_rev53_64x64_16bit", "", "j2c",
-                    "-reversible true");
+    "simple_enc_rev53_64x64_16bit", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("simple_enc_rev53_64x64_16bit", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_64x64_16bit", "ppm",
-              "mm.ppm", "", 3, mse, pae);
+    "mm.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1103,14 +1103,14 @@ TEST(TestExecutables, SimpleEncRev5364x6416bit) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_64x64_16bit_gray.j2c -reversible true
 TEST(TestExecutables, SimpleEncRev5364x6416bitGray) {
-  double mse[1] = { 0};
-  int pae[1] = { 0};
+  double mse[1] = { 0 };
+  int pae[1] = { 0 };
   run_ojph_compress("mm.pgm",
-                    "simple_enc_rev53_64x64_16bit_gray", "", "j2c",
-                    "-reversible true");
+    "simple_enc_rev53_64x64_16bit_gray", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("simple_enc_rev53_64x64_16bit_gray", "j2c", "pgm");
   run_mse_pae("simple_enc_rev53_64x64_16bit_gray", "pgm",
-              "mm.pgm", "", 1, mse, pae);
+    "mm.pgm", "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1119,14 +1119,14 @@ TEST(TestExecutables, SimpleEncRev5364x6416bitGray) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_64x64_16bit.j2c -reversible true
 TEST(TestExecutables, SimpleEncRev5364x64) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_rev53_64x64", "", "j2c",
-                    "-reversible true");
+    "simple_enc_rev53_64x64", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("simple_enc_rev53_64x64", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_64x64", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1135,14 +1135,14 @@ TEST(TestExecutables, SimpleEncRev5364x64) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_32x32.j2c -reversible true -block_size {32,32}
 TEST(TestExecutables, SimpleEncRev5332x32) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_rev53_32x32", "", "j2c",
-                    "-reversible true -block_size \"{32,32}\"");
+    "simple_enc_rev53_32x32", "", "j2c",
+    "-reversible true -block_size \"{32,32}\"");
   run_ojph_compress_expand("simple_enc_rev53_32x32", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_32x32", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1151,14 +1151,14 @@ TEST(TestExecutables, SimpleEncRev5332x32) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_4x4.j2c -reversible true -block_size {4,4}
 TEST(TestExecutables, SimpleEncRev534x4) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_rev53_4x4", "", "j2c",
-                    "-reversible true -block_size \"{4,4}\"");
+    "simple_enc_rev53_4x4", "", "j2c",
+    "-reversible true -block_size \"{4,4}\"");
   run_ojph_compress_expand("simple_enc_rev53_4x4", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_4x4", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1167,14 +1167,14 @@ TEST(TestExecutables, SimpleEncRev534x4) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_1024x4.j2c -reversible true -block_size {4,1024}
 TEST(TestExecutables, SimpleEncRev531024x4) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_rev53_1024x4", "", "j2c",
-                    "-reversible true -block_size \"{4,1024}\"");
+    "simple_enc_rev53_1024x4", "", "j2c",
+    "-reversible true -block_size \"{4,1024}\"");
   run_ojph_compress_expand("simple_enc_rev53_1024x4", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_1024x4", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1183,48 +1183,48 @@ TEST(TestExecutables, SimpleEncRev531024x4) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_4x1024.j2c -reversible true -block_size {1024,4}
 TEST(TestExecutables, SimpleEncRev534x1024) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_rev53_4x1024", "", "j2c",
-                    "-reversible true -block_size \"{1024,4}\"");
+    "simple_enc_rev53_4x1024", "", "j2c",
+    "-reversible true -block_size \"{1024,4}\"");
   run_ojph_compress_expand("simple_enc_rev53_4x1024", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_4x1024", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 // Test ojph_compress with codeblocks when the rev53 wavelet is used.
 // We test by comparing MSE and PAE of decoded images. 
 // The compressed file is obtained using these command-line options:
-// -o simple_enc_rev53_64x64_tiles_33x33.j2c -reversible true -tile_size
+// -o simple_enc_rev53_64x64_tiles_33x33_d5.j2c -reversible true -tile_size
 // {32,32} -num_decomps 5
 TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D5) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_rev53_64x64_tiles_33x33_d5", "", "j2c",
-                    "-reversible true -tile_size \"{32,32}\" -num_decomps 5");
+    "simple_enc_rev53_64x64_tiles_33x33_d5", "", "j2c",
+    "-reversible true -tile_size \"{32,32}\" -num_decomps 5");
   run_ojph_compress_expand("simple_enc_rev53_64x64_tiles_33x33_d5", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_64x64_tiles_33x33_d5", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
 // Test ojph_compress with codeblocks when the rev53 wavelet is used.
 // We test by comparing MSE and PAE of decoded images. 
 // The compressed file is obtained using these command-line options:
-// -o simple_enc_rev53_64x64_tiles_33x33.j2c -reversible true -tile_size
+// -o simple_enc_rev53_64x64_tiles_33x33_d6.j2c -reversible true -tile_size
 // {32,32} -num_decomps 6
 TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D6) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("Malamute.ppm",
-                    "simple_enc_rev53_64x64_tiles_33x33_d6", "", "j2c",
-                    "-reversible true -tile_size \"{32,32}\" -num_decomps 6");
+    "simple_enc_rev53_64x64_tiles_33x33_d6", "", "j2c",
+    "-reversible true -tile_size \"{32,32}\" -num_decomps 6");
   run_ojph_compress_expand("simple_enc_rev53_64x64_tiles_33x33_d6", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_64x64_tiles_33x33_d6", "ppm",
-              "Malamute.ppm", "", 3, mse, pae);
+    "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1234,16 +1234,16 @@ TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D6) {
 // -o simple_enc_irv97_64x64_yuv.j2c -qstep 0.1 -dims {352,288} -num_comps 3
 // -downsamp {1,1},{2,2},{2,2} -bit_depth 8,8,8 -signed false,false,false
 TEST(TestExecutables, SimpleEncIrv9764x64Yuv) {
-  double mse[3] = { 30.3548, 7.69602, 5.22246};
-  int pae[3] = { 49, 27, 26};
+  double mse[3] = { 30.3548, 7.69602, 5.22246 };
+  int pae[3] = { 49, 27, 26 };
   run_ojph_compress("foreman_420.yuv",
-                    "simple_enc_irv97_64x64_yuv", "", "j2c",
-                    "-qstep 0.1 -dims \"{352,288}\" -num_comps 3 -downsamp"
-                    " \"{1,1}\",\"{2,2}\",\"{2,2}\" -bit_depth 8,8,8"
-                    " -signed false,false,false");
+    "simple_enc_irv97_64x64_yuv", "", "j2c",
+    "-qstep 0.1 -dims \"{352,288}\" -num_comps 3 -downsamp"
+    " \"{1,1}\",\"{2,2}\",\"{2,2}\" -bit_depth 8,8,8"
+    " -signed false,false,false");
   run_ojph_compress_expand("simple_enc_irv97_64x64_yuv", "j2c", "yuv");
   run_mse_pae("simple_enc_irv97_64x64_yuv", "yuv",
-              "foreman_420.yuv", ":352x288x8x420", 3, mse, pae);
+    "foreman_420.yuv", ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1254,16 +1254,16 @@ TEST(TestExecutables, SimpleEncIrv9764x64Yuv) {
 // {352,288} -num_comps 3 -downsamp {1,1},{2,2},{2,2} -bit_depth 8,8,8 -signed
 // false,false,false
 TEST(TestExecutables, SimpleEncRev5364x64Yuv) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("foreman_420.yuv",
-                    "simple_enc_rev53_64x64_yuv", "", "j2c",
-                    "-reversible true -qstep 0.1 -dims \"{352,288}\""
-                    " -num_comps 3 -downsamp \"{1,1}\",\"{2,2}\",\"{2,2}\""
-                    " -bit_depth 8,8,8 -signed false,false,false");
+    "simple_enc_rev53_64x64_yuv", "", "j2c",
+    "-reversible true -qstep 0.1 -dims \"{352,288}\""
+    " -num_comps 3 -downsamp \"{1,1}\",\"{2,2}\",\"{2,2}\""
+    " -bit_depth 8,8,8 -signed false,false,false");
   run_ojph_compress_expand("simple_enc_rev53_64x64_yuv", "j2c", "yuv");
   run_mse_pae("simple_enc_rev53_64x64_yuv", "yuv",
-              "foreman_420.yuv", ":352x288x8x420", 3, mse, pae);
+    "foreman_420.yuv", ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1272,14 +1272,14 @@ TEST(TestExecutables, SimpleEncRev5364x64Yuv) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_tall_narrow.j2c -qstep 0.1
 TEST(TestExecutables, SimpleEncIrv97TallNarrow) {
-  double mse[3] = { 112.097, 79.2214, 71.1367};
-  int pae[3] = { 56, 41, 32};
+  double mse[3] = { 112.097, 79.2214, 71.1367 };
+  int pae[3] = { 56, 41, 32 };
   run_ojph_compress("tall_narrow.ppm",
-                    "simple_enc_irv97_tall_narrow", "", "j2c",
-                    "-qstep 0.1");
+    "simple_enc_irv97_tall_narrow", "", "j2c",
+    "-qstep 0.1");
   run_ojph_compress_expand("simple_enc_irv97_tall_narrow", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_tall_narrow", "ppm",
-              "tall_narrow.ppm", "", 3, mse, pae);
+    "tall_narrow.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1288,14 +1288,14 @@ TEST(TestExecutables, SimpleEncIrv97TallNarrow) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_tall_narrow1.j2c -image_offset {1,0} -qstep 0.1
 TEST(TestExecutables, SimpleEncIrv97TallNarrow1) {
-  double mse[3] = { 100.906, 76.113, 72.8347};
-  int pae[3] = { 39, 35, 34};
+  double mse[3] = { 100.906, 76.113, 72.8347 };
+  int pae[3] = { 39, 35, 34 };
   run_ojph_compress("tall_narrow.ppm",
-                    "simple_enc_irv97_tall_narrow1", "", "j2c",
-                    "-image_offset \"{1,0}\" -qstep 0.1");
+    "simple_enc_irv97_tall_narrow1", "", "j2c",
+    "-image_offset \"{1,0}\" -qstep 0.1");
   run_ojph_compress_expand("simple_enc_irv97_tall_narrow1", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_tall_narrow1", "ppm",
-              "tall_narrow.ppm", "", 3, mse, pae);
+    "tall_narrow.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1304,14 +1304,14 @@ TEST(TestExecutables, SimpleEncIrv97TallNarrow1) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_tall_narrow.j2c -reversible true
 TEST(TestExecutables, SimpleEncRev53TallNarrow) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("tall_narrow.ppm",
-                    "simple_enc_rev53_tall_narrow", "", "j2c",
-                    "-reversible true");
+    "simple_enc_rev53_tall_narrow", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("simple_enc_rev53_tall_narrow", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_tall_narrow", "ppm",
-              "tall_narrow.ppm", "", 3, mse, pae);
+    "tall_narrow.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1320,14 +1320,14 @@ TEST(TestExecutables, SimpleEncRev53TallNarrow) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_tall_narrow1.j2c -image_offset {1,0} -reversible true
 TEST(TestExecutables, SimpleEncRev53TallNarrow1) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("tall_narrow.ppm",
-                    "simple_enc_rev53_tall_narrow1", "", "j2c",
-                    "-image_offset \"{1,0}\" -reversible true");
+    "simple_enc_rev53_tall_narrow1", "", "j2c",
+    "-image_offset \"{1,0}\" -reversible true");
   run_ojph_compress_expand("simple_enc_rev53_tall_narrow1", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_tall_narrow1", "ppm",
-              "tall_narrow.ppm", "", 3, mse, pae);
+    "tall_narrow.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1336,14 +1336,14 @@ TEST(TestExecutables, SimpleEncRev53TallNarrow1) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_10bit_le_nuke11.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72010bitLeNuke11) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("dpx_1280x720_10bit.ppm",
-                    "dpx_enc_1280x720_10bit_le_nuke11", "", "j2c",
-                    "-reversible true");
+    "dpx_enc_1280x720_10bit_le_nuke11", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_10bit_le_nuke11", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_10bit_le_nuke11", "ppm",
-              "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
+    "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1352,14 +1352,14 @@ TEST(TestExecutables, DpxEnc1280x72010bitLeNuke11) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_10bit_be_nuke11.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72010bitBeNuke11) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("dpx_1280x720_10bit.ppm",
-                    "dpx_enc_1280x720_10bit_be_nuke11", "", "j2c",
-                    "-reversible true");
+    "dpx_enc_1280x720_10bit_be_nuke11", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_10bit_be_nuke11", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_10bit_be_nuke11", "ppm",
-              "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
+    "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1368,14 +1368,14 @@ TEST(TestExecutables, DpxEnc1280x72010bitBeNuke11) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_16bit_le_nuke11.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72016bitLeNuke11) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("dpx_1280x720_16bit.ppm",
-                    "dpx_enc_1280x720_16bit_le_nuke11", "", "j2c",
-                    "-reversible true");
+    "dpx_enc_1280x720_16bit_le_nuke11", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_16bit_le_nuke11", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_16bit_le_nuke11", "ppm",
-              "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
+    "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1384,14 +1384,14 @@ TEST(TestExecutables, DpxEnc1280x72016bitLeNuke11) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_16bit_be_nuke11.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72016bitBeNuke11) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("dpx_1280x720_16bit.ppm",
-                    "dpx_enc_1280x720_16bit_be_nuke11", "", "j2c",
-                    "-reversible true");
+    "dpx_enc_1280x720_16bit_be_nuke11", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_16bit_be_nuke11", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_16bit_be_nuke11", "ppm",
-              "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
+    "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1400,14 +1400,14 @@ TEST(TestExecutables, DpxEnc1280x72016bitBeNuke11) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_10bit_resolve18.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72010bitResolve18) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("dpx_1280x720_10bit.ppm",
-                    "dpx_enc_1280x720_10bit_resolve18", "", "j2c",
-                    "-reversible true");
+    "dpx_enc_1280x720_10bit_resolve18", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_10bit_resolve18", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_10bit_resolve18", "ppm",
-              "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
+    "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1416,20 +1416,20 @@ TEST(TestExecutables, DpxEnc1280x72010bitResolve18) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_16bit_resolve18.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72016bitResolve18) {
-  double mse[3] = { 0, 0, 0};
-  int pae[3] = { 0, 0, 0};
+  double mse[3] = { 0, 0, 0 };
+  int pae[3] = { 0, 0, 0 };
   run_ojph_compress("dpx_1280x720_16bit.ppm",
-                    "dpx_enc_1280x720_16bit_resolve18", "", "j2c",
-                    "-reversible true");
+    "dpx_enc_1280x720_16bit_resolve18", "", "j2c",
+    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_16bit_resolve18", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_16bit_resolve18", "ppm",
-              "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
+    "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 //                                   main
 ////////////////////////////////////////////////////////////////////////////////
-int main(int argc, char **argv) {
+int main(int argc, char** argv) {
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/tests/test_helpers/ht_cmdlines.txt b/tests/test_helpers/ht_cmdlines.txt
index c8590611..a8c0987d 100644
--- a/tests/test_helpers/ht_cmdlines.txt
+++ b/tests/test_helpers/ht_cmdlines.txt
@@ -81,8 +81,8 @@ add_test(NAME simple_enc_rev53_32x32  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_de
 add_test(NAME simple_enc_rev53_4x4    COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x4.j2c    -reversible true -block_size \{4,4\}"                             "-i simple_enc_rev53_4x4.j2c    -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_4x4.j2c    -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 add_test(NAME simple_enc_rev53_1024x4 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_1024x4.j2c -reversible true -block_size \{4,1024\}"                          "-i simple_enc_rev53_1024x4.j2c -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_1024x4.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 add_test(NAME simple_enc_rev53_4x1024 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc               "-i ${images_folder}/mm.ppm -o simple_enc_rev53_4x1024.j2c -reversible true -block_size \{1024,4\}"                          "-i simple_enc_rev53_4x1024.j2c -o test1.ppm -precise -quiet"             "-i simple_enc_rev53_4x1024.j2c -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_64x64_tiles_33x33_d5 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64_tiles_33x33.j2c  -reversible true -tile_size \{32,32\} -num_decomps 5" "-i simple_enc_rev53_64x64_tiles_33x33.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
-add_test(NAME simple_enc_rev53_64x64_tiles_33x33_d6 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64_tiles_33x33.j2c  -reversible true -tile_size \{32,32\} -num_decomps 6" "-i simple_enc_rev53_64x64_tiles_33x33.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_64x64_tiles_33x33_d5 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64_tiles_33x33_d5.j2c  -reversible true -tile_size \{32,32\} -num_decomps 5" "-i simple_enc_rev53_64x64_tiles_33x33_d5.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64_tiles_33x33_d5.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
+add_test(NAME simple_enc_rev53_64x64_tiles_33x33_d6 COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -renc "-i ${images_folder}/mm.ppm -o simple_enc_rev53_64x64_tiles_33x33_d6.j2c  -reversible true -tile_size \{32,32\} -num_decomps 6" "-i simple_enc_rev53_64x64_tiles_33x33_d6.j2c  -o test1.ppm -precise -quiet" "-i simple_enc_rev53_64x64_tiles_33x33_d6.j2c  -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 
 add_test(NAME simple_enc_irv97_64x64_yuv COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom_yuv.sh -enc  "-i ${images_folder}/foreman_420.yuv -o simple_enc_irv97_64x64_yuv.j2c -qstep 0.1 -dims \{352,288\} -num_comps 3 -downsamp \{1,1\},\{2,2\},\{2,2\} -bit_depth 8,8,8 -signed false,false,false"                   "-i simple_enc_irv97_64x64_yuv.j2c -o test1y.rawl,test1u.rawl,test1v.rawl -precise -quiet" "-i simple_enc_irv97_64x64_yuv.j2c -o test2.yuv" "${images_folder}/foreman_420.yuv:352x288x8x420" "test1.yuv:352x288x8x420" "test2.yuv:352x288x8x420")
 add_test(NAME simple_enc_rev53_64x64_yuv COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom_yuv.sh -renc "-i ${images_folder}/foreman_420.yuv -o simple_enc_rev53_64x64_yuv.j2c -reversible true -qstep 0.1 -dims \{352,288\} -num_comps 3 -downsamp \{1,1\},\{2,2\},\{2,2\} -bit_depth 8,8,8 -signed false,false,false"  "-i simple_enc_rev53_64x64_yuv.j2c -o test1y.rawl,test1u.rawl,test1v.rawl -precise -quiet" "-i simple_enc_rev53_64x64_yuv.j2c -o test2.yuv" "${images_folder}/foreman_420.yuv:352x288x8x420" "test1.yuv:352x288x8x420" "test2.yuv:352x288x8x420")

From 6e9cfdc60d7d8da0ae1ef2e7cdf707623f1ef136 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Mon, 8 Apr 2024 23:50:35 +1000
Subject: [PATCH 19/37] All changes needed for DFS and ATK are done. Still some
 bugs.

---
 src/core/codestream/ojph_codestream_local.cpp |  31 +-
 src/core/codestream/ojph_codestream_local.h   |  15 +-
 src/core/codestream/ojph_params.cpp           |  75 ++-
 src/core/codestream/ojph_params_local.h       |  53 +-
 src/core/codestream/ojph_precinct.cpp         |  14 +-
 src/core/codestream/ojph_precinct.h           |   3 +-
 src/core/codestream/ojph_resolution.cpp       | 569 +++++++++++-------
 src/core/codestream/ojph_resolution.h         |  14 +-
 src/core/codestream/ojph_subband.cpp          |  38 +-
 src/core/codestream/ojph_subband.h            |   3 +-
 src/core/codestream/ojph_tile.cpp             |  22 +-
 src/core/codestream/ojph_tile.h               |   5 +-
 src/core/codestream/ojph_tile_comp.cpp        |   3 +-
 src/core/transform/ojph_transform.cpp         |  53 +-
 14 files changed, 566 insertions(+), 332 deletions(-)

diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp
index 737daffb..5f72d3e8 100644
--- a/src/core/codestream/ojph_codestream_local.cpp
+++ b/src/core/codestream/ojph_codestream_local.cpp
@@ -186,8 +186,6 @@ namespace ojph {
       for (ui32 r = 0; r <= num_decomps; ++r)
       {
         size log_PP = cod.get_log_precinct_size(r);
-        log_PP.w -= (r ? 1 : 0);
-        log_PP.h -= (r ? 1 : 0);
         ratio.w = ojph_max(ratio.w, log_PP.w - ojph_min(log_cb.w, log_PP.w));
         ratio.h = ojph_max(ratio.h, log_PP.h - ojph_min(log_cb.h, log_PP.h));
       }
@@ -200,7 +198,7 @@ namespace ojph {
       // We need 4 such tables. These tables store
       // 1. missing msbs and 2. their flags, 
       // 3. number of layers and 4. their flags
-      precinct_scratch_needed_bytes = 
+      precinct_scratch_needed_bytes =
         4 * ((max_ratio * max_ratio * 4 + 2) / 3);
 
       allocator->pre_alloc_obj<ui8>(precinct_scratch_needed_bytes);
@@ -220,7 +218,7 @@ namespace ojph {
 
       ui32 num_tileparts = 0;
       point index;
-      rect tile_rect, recon_tile_rect;
+      rect tile_rect;
       ojph::param_siz sz = access_siz();
       ui32 ds = 1 << skipped_res_for_recon;
       for (index.y = 0; index.y < num_tiles.h; ++index.y)
@@ -233,12 +231,6 @@ namespace ojph {
         tile_rect.siz.h = 
           ojph_min(y1, sz.get_image_extent().y) - tile_rect.org.y;
 
-        recon_tile_rect.org.y = ojph_max(ojph_div_ceil(y0, ds), 
-          ojph_div_ceil(sz.get_image_offset().y, ds));
-        recon_tile_rect.siz.h = ojph_min(ojph_div_ceil(y1, ds),
-          ojph_div_ceil(sz.get_image_extent().y, ds))
-          - recon_tile_rect.org.y;
-
         ui32 offset = 0;
         for (index.x = 0; index.x < num_tiles.w; ++index.x)
         {
@@ -250,17 +242,9 @@ namespace ojph {
           tile_rect.siz.w = 
             ojph_min(x1, sz.get_image_extent().x) - tile_rect.org.x;
 
-          recon_tile_rect.org.x = ojph_max(ojph_div_ceil(x0, ds),
-            ojph_div_ceil(sz.get_image_offset().x, ds));
-          recon_tile_rect.siz.w = ojph_min(ojph_div_ceil(x1, ds),
-            ojph_div_ceil(sz.get_image_extent().x, ds))
-            - recon_tile_rect.org.x;
-
           ui32 tps = 0; // number of tileparts for this tile
           ui32 idx = index.y * num_tiles.w + index.x;
-          tiles[idx].finalize_alloc(this, tile_rect, recon_tile_rect,
-            idx, offset, tps);
-          offset += recon_tile_rect.siz.w;
+          tiles[idx].finalize_alloc(this, tile_rect, idx, offset, tps);
           num_tileparts += tps;
         }
       }
@@ -836,8 +820,15 @@ namespace ojph {
       }
 
       cod.update_atk(atk);
-      for (int i = 0; i < used_coc_fields; ++i)
+      for (int i = 0; i < used_coc_fields; ++i) 
+      {
+        if (i == 0) cod.link_cod(coc);
+        else coc[i - 1].link_cod(coc + i);
         coc[i].update_atk(atk);
+      }
+      siz.link(&cod);
+      if (dfs.exists())
+        siz.link(&dfs);
 
       if (received_markers != 3)
         OJPH_ERROR(0x00030052, "markers error, COD and QCD are required");
diff --git a/src/core/codestream/ojph_codestream_local.h b/src/core/codestream/ojph_codestream_local.h
index 8e77eb17..8ca8c717 100644
--- a/src/core/codestream/ojph_codestream_local.h
+++ b/src/core/codestream/ojph_codestream_local.h
@@ -82,19 +82,10 @@ namespace ojph {
       { return &siz; }
       ojph::param_cod access_cod()            //return externally wrapped cod
       { return ojph::param_cod(&cod); }
-      const param_cod* get_cod()              //return internal code
+      const param_cod* get_cod()              //return internal cod
       { return &cod; }
-      const param_cod* get_cod(ui32 comp_num) //return internal code
-      { 
-        if (used_coc_fields == 0)
-          return &cod;
-        else {
-          for (int i = 0; i < used_coc_fields; ++i)
-            if (coc[i].get_comp_num() == comp_num)
-              return coc + i;
-          return &cod;
-        }
-      }
+      const param_cod* get_cod(ui32 comp_num) //return internal cod
+      { return cod.get_cod(comp_num); }
       param_qcd* access_qcd(ui32 comp_num)
       { 
         if (used_qcc_fields > 0)
diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index ef652651..268135c4 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -663,6 +663,35 @@ namespace ojph {
       dfs_support_needed = (Rsiz & 0x80) != 0;
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    point param_siz::get_recon_downsampling(ui32 comp_num) const
+    {
+      assert(comp_num < get_num_components());
+
+      point factor(1u << skipped_resolutions, 1u << skipped_resolutions);
+      const param_cod* cdp = cod->get_cod(comp_num);
+      if (dfs && cdp && cdp->is_dfs_defined()) {
+        const param_dfs* d = dfs->get_dfs(cdp->get_dfs_index());
+        factor = d->get_res_downsamp(skipped_resolutions);
+      }
+      factor.x *= (ui32)cptr[comp_num].XRsiz;
+      factor.y *= (ui32)cptr[comp_num].YRsiz;
+      return factor;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    point param_siz::get_recon_size(ui32 comp_num) const
+    {
+      assert(comp_num < get_num_components());
+
+      point factor = get_recon_downsampling(comp_num);
+      point r;
+      r.x = ojph_div_ceil(Xsiz, factor.x) - ojph_div_ceil(XOsiz, factor.x);
+      r.y = ojph_div_ceil(Ysiz, factor.y) - ojph_div_ceil(YOsiz, factor.y);
+      return r;
+    }
+
+
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -1406,10 +1435,9 @@ namespace ojph {
                                     ui32 subband) const
     {
       assert((resolution == 0 && subband == 0) || 
-              (resolution > 0 && resolution <= Ids && 
-              subband > 0 && subband < 4));
+              (resolution > 0 && subband > 0 && subband < 4));
 
-      ui32 ns[4] = { 0, 3, 2, 2 };
+      ui32 ns[4] = { 0, 3, 1, 1 };
 
       ui32 idx = 0;
       if (resolution > 0)
@@ -1427,6 +1455,27 @@ namespace ojph {
       return idx;
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    point param_dfs::get_res_downsamp(ui32 skipped_resolutions) const
+    {
+      point factor(1, 1);
+      ui32 decomp_level = 1;
+      while (skipped_resolutions > 0)
+      {
+        param_dfs::dfs_dwt_type type = get_dwt_type(decomp_level);
+        if (type == BIDIR_DWT)
+        { factor.x *= 2; factor.y *= 2; }
+        else if (type == HORZ_DWT)
+          factor.x *= 2;
+        else if (type == VERT_DWT)
+          factor.y *= 2;
+
+        ++decomp_level;
+        --skipped_resolutions;
+      }
+      return factor;
+    }
+
     //////////////////////////////////////////////////////////////////////////
     bool param_dfs::read(infile_base *file)
     {
@@ -1658,10 +1707,10 @@ namespace ojph {
       Natk = 4;
       // next is (A-4) in T.801 second line
       Latk = (ui16)(5 + Natk + sizeof(float) * (1 + Natk));
-      d[0].irv.Aatk = (float)-1.586134342059924;
-      d[1].irv.Aatk = (float)-0.052980118572961;
-      d[2].irv.Aatk = (float)0.882911075530934;
-      d[3].irv.Aatk = (float)0.443506852043971;
+      d[0].irv.Aatk = (float)0.443506852043971;
+      d[1].irv.Aatk = (float)0.882911075530934;
+      d[2].irv.Aatk = (float)-0.052980118572961;
+      d[3].irv.Aatk = (float)-1.586134342059924;
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -1671,12 +1720,12 @@ namespace ojph {
       Natk = 2;
       // next is (A-4) in T.801 fourth line
       Latk = (ui16)(5 + 2 * Natk + sizeof(ui8) * (Natk + Natk));
-      d[0].rev.Aatk = -1;
-      d[0].rev.Batk = 0;
-      d[0].rev.Eatk = 1;
-      d[1].rev.Aatk = 1;
-      d[1].rev.Batk = 2;
-      d[1].rev.Eatk = 2;
+      d[0].rev.Aatk = 1;
+      d[0].rev.Batk = 2;
+      d[0].rev.Eatk = 2;
+      d[1].rev.Aatk = -1;
+      d[1].rev.Batk = 0;
+      d[1].rev.Eatk = 1;
     }
 
   } // !local namespace
diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index 43c1181d..1ee508dc 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -172,7 +172,6 @@ namespace ojph {
         cptr = store;
         old_Csiz = 4;
         Rsiz = 0x4000; //for jph, bit 14 of Rsiz is 1
-        ws_kern_support_needed = dfs_support_needed = false;
       }
 
       ~param_siz()
@@ -238,10 +237,15 @@ namespace ojph {
       bool write(outfile_base *file);
       void read(infile_base *file);
 
+      void link(const param_cod* cod)
+      { this->cod = cod; }
+
+      void link(const param_dfs* dfs)
+      { this->dfs = dfs; }
+
       void set_skipped_resolutions(ui32 skipped_resolutions)
-      {
-        this->skipped_resolutions = skipped_resolutions;
-      }
+      { this->skipped_resolutions = skipped_resolutions; }
+      
       ui32 get_width(ui32 comp_num) const
       {
         assert(comp_num < get_num_components());
@@ -256,20 +260,14 @@ namespace ojph {
         ui32 t = ojph_div_ceil(Ysiz, ds) - ojph_div_ceil(YOsiz, ds);
         return t;
       }
+
+      point get_recon_downsampling(ui32 comp_num) const;
+      point get_recon_size(ui32 comp_num) const;
       ui32 get_recon_width(ui32 comp_num) const
-      {
-        assert(comp_num < get_num_components());
-        ui32 ds = (ui32)cptr[comp_num].XRsiz * (1u << skipped_resolutions);
-        ui32 t = ojph_div_ceil(Xsiz, ds) - ojph_div_ceil(XOsiz, ds);
-        return t;
-      }
+      { return get_recon_size(comp_num).x; }
       ui32 get_recon_height(ui32 comp_num) const
-      {
-        assert(comp_num < get_num_components());
-        ui32 ds = (ui32)cptr[comp_num].YRsiz * (1u << skipped_resolutions);
-        ui32 t = ojph_div_ceil(Ysiz, ds) - ojph_div_ceil(YOsiz, ds);
-        return t;
-      }
+      { return get_recon_size(comp_num).y; }
+
       bool is_ws_kern_support_needed() { return ws_kern_support_needed; }
       bool is_dfs_support_needed() { return dfs_support_needed; }
 
@@ -293,6 +291,8 @@ namespace ojph {
       siz_comp_info store[4];
       bool ws_kern_support_needed;
       bool dfs_support_needed;
+      const param_cod* cod;
+      const param_dfs* dfs;
       param_siz(const param_siz&) = delete; //prevent copy constructor
       param_siz& operator=(const param_siz&) = delete; //prevent copy
     };
@@ -370,6 +370,7 @@ namespace ojph {
         SPcod.num_decomp = 5;
         SPcod.block_width = 4; //64
         SPcod.block_height = 4; //64
+        next = NULL;
       }
 
       ////////////////////////////////////////
@@ -503,6 +504,22 @@ namespace ojph {
       ////////////////////////////////////////
       void update_atk(const param_atk* atk);
 
+      ////////////////////////////////////////
+      void link_cod(const param_cod* cod)
+      { this->next = cod; }
+
+      ////////////////////////////////////////
+      const param_cod* get_cod(ui32 comp_num) const
+      {
+        const param_cod* result = this->next;
+        while (result != NULL && result->get_comp_num() != comp_num)
+          result = result->next;
+        if (result)
+          return result;
+        else
+          return this;
+      }
+
       ////////////////////////////////////////
       const param_atk* access_atk() const { return atk; }
 
@@ -516,7 +533,7 @@ namespace ojph {
       { return SPcod.num_decomp & 0xF; }
 
       ////////////////////////////////////////
-      ui32 get_comp_num()
+      ui32 get_comp_num() const
       { assert(type == COC_MAIN); return comp_num; }
 
     private: // Common variables
@@ -525,6 +542,7 @@ namespace ojph {
       ui8 Scod;             // serves as Scod and Scoc
       cod_SGcod SGCod;      // Used in COD and copied to COC
       cod_SPcod SPcod;      // serves as SPcod and SPcoc
+      const param_cod* next;// to link cod parameters
 
     private: // COC only variables
       param_cod* parent;    // parent COD structure
@@ -775,6 +793,7 @@ namespace ojph {
       dfs_dwt_type get_dwt_type(ui32 decomp_level) const;
       ui32 get_subband_idx(ui32 num_decompositions, ui32 resolution,
                            ui32 subband) const;
+      point get_res_downsamp(ui32 skipped_resolutions) const;
 
     private: // member variables
       ui16 Ldfs;       // length of the segment marker
diff --git a/src/core/codestream/ojph_precinct.cpp b/src/core/codestream/ojph_precinct.cpp
index c20c8589..813e33b8 100644
--- a/src/core/codestream/ojph_precinct.cpp
+++ b/src/core/codestream/ojph_precinct.cpp
@@ -98,11 +98,12 @@ namespace ojph {
       coded_lists *cur_coded_list = NULL;
       ui32 cb_bytes = 0; //cb_bytes;
       ui32 ph_bytes = 0; //precinct header size
-      int sst = num_bands == 3 ? 1 : 0;
-      int send = num_bands == 3 ? 4 : 1;
       int num_skipped_subbands = 0;
-      for (int s = sst; s < send; ++s)
+      for (int s = 0; s < 4; ++s)
       {
+        if (bands[s].empty)
+          continue;
+
         if (cb_idxs[s].siz.w == 0 || cb_idxs[s].siz.h == 0)
           continue;
 
@@ -288,10 +289,11 @@ namespace ojph {
         }
 
         //write codeblocks
-        int sst = num_bands == 3 ? 1 : 0;
-        int send = num_bands == 3 ? 4 : 1;
-        for (int s = sst; s < send; ++s)
+        for (int s = 0; s < 4; ++s)
         {
+          if (bands[s].empty)
+            continue;
+
           ui32 band_width = bands[s].num_blocks.w;
           ui32 width = cb_idxs[s].siz.w;
           ui32 height = cb_idxs[s].siz.h;
diff --git a/src/core/codestream/ojph_precinct.h b/src/core/codestream/ojph_precinct.h
index d8e880a9..47ec4736 100644
--- a/src/core/codestream/ojph_precinct.h
+++ b/src/core/codestream/ojph_precinct.h
@@ -59,7 +59,7 @@ namespace ojph {
     {
       precinct() {
         scratch = NULL; bands = NULL; coded = NULL;
-        num_bands = 0; may_use_sop = uses_eph = false;
+        may_use_sop = uses_eph = false;
       }
       ui32 prepare_precinct(int tag_tree_size, ui32* lev_idx,
                             mem_elastic_allocator *elastic);
@@ -73,7 +73,6 @@ namespace ojph {
       rect cb_idxs[4]; //indices of codeblocks
       subband *bands;  //the subbands
       coded_lists* coded;
-      ui32 num_bands;
       bool may_use_sop, uses_eph;
     };
 
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index a0413b76..14743249 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -67,7 +67,7 @@ namespace ojph {
       bool skipped_res_for_recon = res_num > t;
 
       const param_atk* atk = cdp->access_atk();
-      param_dfs::dfs_dwt_type downsampling_style = param_dfs::BIDIR_DWT;
+      param_dfs::dfs_dwt_type ds = param_dfs::BIDIR_DWT;
       if (cdp->is_dfs_defined()) {
         const param_dfs* dfs = codestream->access_dfs();
         if (dfs == NULL) {
@@ -86,31 +86,22 @@ namespace ojph {
               "main codestream headers", dfs_idx);
           }
           ui32 num_decomps = cdp->get_num_decompositions();
-          downsampling_style = dfs->get_dwt_type(num_decomps - res_num + 1);
+          ds = dfs->get_dwt_type(num_decomps - res_num + 1);
         }
       }
 
-      //create next resolution
+      ui32 transform_flags = 0;
       if (res_num > 0)
       {
-        //allocate a resolution
-        allocator->pre_alloc_obj<resolution>(1);
-        ui32 trx0 = ojph_div_ceil(res_rect.org.x, 2);
-        ui32 try0 = ojph_div_ceil(res_rect.org.y, 2);
-        ui32 trx1 = ojph_div_ceil(res_rect.org.x + res_rect.siz.w, 2);
-        ui32 try1 = ojph_div_ceil(res_rect.org.y + res_rect.siz.h, 2);
-        rect next_res_rect;
-        next_res_rect.org.x = trx0;
-        next_res_rect.org.y = try0;
-        next_res_rect.siz.w = trx1 - trx0;
-        next_res_rect.siz.h = try1 - try0;
-
-        resolution::pre_alloc(codestream, next_res_rect,
-          skipped_res_for_recon ? recon_res_rect : next_res_rect, 
-          comp_num, res_num - 1);
+        if (ds == param_dfs::BIDIR_DWT)
+          transform_flags = HORZ_TRX | VERT_TRX;
+        else if (ds == param_dfs::HORZ_DWT)
+          transform_flags = HORZ_TRX;
+        else if (ds == param_dfs::VERT_DWT)
+          transform_flags = VERT_TRX;
       }
 
-      //allocate subbands
+      //allocate resolution/subbands
       ui32 trx0 = res_rect.org.x;
       ui32 try0 = res_rect.org.y;
       ui32 trx1 = res_rect.org.x + res_rect.siz.w;
@@ -118,23 +109,83 @@ namespace ojph {
       allocator->pre_alloc_obj<subband>(4);
       if (res_num > 0)
       {
-        for (ui32 i = 1; i < 4; ++i)
+        if (ds == param_dfs::BIDIR_DWT)
         {
-          ui32 tbx0 = (trx0 - (i & 1) + 1) >> 1;
-          ui32 tbx1 = (trx1 - (i & 1) + 1) >> 1;
-          ui32 tby0 = (try0 - (i >> 1) + 1) >> 1;
-          ui32 tby1 = (try1 - (i >> 1) + 1) >> 1;
-
-          rect band_rect;
-          band_rect.org.x = tbx0;
-          band_rect.org.y = tby0;
-          band_rect.siz.w = tbx1 - tbx0;
-          band_rect.siz.h = tby1 - tby0;
-          subband::pre_alloc(codestream, band_rect, comp_num, res_num);
+          for (ui32 i = 0; i < 4; ++i)
+          {
+            ui32 tbx0 = (trx0 - (i & 1) + 1) >> 1;
+            ui32 tbx1 = (trx1 - (i & 1) + 1) >> 1;
+            ui32 tby0 = (try0 - (i >> 1) + 1) >> 1;
+            ui32 tby1 = (try1 - (i >> 1) + 1) >> 1;
+
+            rect re;
+            re.org.x = tbx0;
+            re.org.y = tby0;
+            re.siz.w = tbx1 - tbx0;
+            re.siz.h = tby1 - tby0;
+            if (i == 0) {
+              allocator->pre_alloc_obj<resolution>(1);
+              resolution::pre_alloc(codestream, re,
+                skipped_res_for_recon ? recon_res_rect : re,
+                comp_num, res_num - 1);
+            }
+            else
+              subband::pre_alloc(codestream, re, comp_num, res_num,
+                                 transform_flags);
+          }
+        }
+        else if (ds == param_dfs::VERT_DWT)
+        {
+          ui32 tby0, tby1;
+          rect re = res_rect;
+          tby0 = (try0 + 1) >> 1;
+          tby1 = (try1 + 1) >> 1;
+          re.org.y = tby0;
+          re.siz.h = tby1 - tby0;
+          allocator->pre_alloc_obj<resolution>(1);
+          resolution::pre_alloc(codestream, re,
+            skipped_res_for_recon ? recon_res_rect : re,
+            comp_num, res_num - 1);
+
+          tby0 = try0 >> 1;
+          tby1 = try1 >> 1;
+          re.org.y = tby0;
+          re.siz.h = tby1 - tby0;
+          subband::pre_alloc(codestream, re, comp_num, res_num, 
+                             transform_flags);
+        }
+        else if (ds == param_dfs::HORZ_DWT)
+        {
+          ui32 tbx0, tbx1;
+          rect re = res_rect;
+          tbx0 = (trx0 + 1) >> 1;
+          tbx1 = (trx1 + 1) >> 1;
+          re.org.x = tbx0;
+          re.siz.w = tbx1 - tbx0;
+          allocator->pre_alloc_obj<resolution>(1);
+          resolution::pre_alloc(codestream, re,
+            skipped_res_for_recon ? recon_res_rect : re,
+            comp_num, res_num - 1);
+
+          tbx0 = trx0 >> 1;
+          tbx1 = trx1 >> 1;
+          re.org.x = tbx0;
+          re.siz.w = tbx1 - tbx0;
+          subband::pre_alloc(codestream, re, comp_num, res_num, 
+                             transform_flags);
+        }
+        else
+        {
+          assert(ds == param_dfs::NO_DWT);
+          allocator->pre_alloc_obj<resolution>(1);
+          resolution::pre_alloc(codestream, res_rect,
+            skipped_res_for_recon ? recon_res_rect : res_rect,
+            comp_num, res_num - 1);
         }
       }
       else
-        subband::pre_alloc(codestream, res_rect, comp_num, res_num);
+        subband::pre_alloc(codestream, res_rect, comp_num, res_num, 
+                           transform_flags);
 
       //prealloc precincts
       size log_PP = cdp->get_log_precinct_size(res_num);
@@ -168,7 +219,7 @@ namespace ojph {
                                     const rect& res_rect,
                                     const rect& recon_res_rect,
                                     ui32 comp_num, ui32 res_num,
-                                    point comp_downsamp,
+                                    point comp_downsamp, point res_downsamp,
                                     tile_comp* parent_tile_comp,
                                     resolution* parent_res)
     {
@@ -189,7 +240,7 @@ namespace ojph {
       this->res_num = res_num;
       this->num_bytes = 0;
       this->atk = cdp->access_atk();
-      this->downsampling_style = param_dfs::BIDIR_DWT;
+      param_dfs::dfs_dwt_type ds = param_dfs::BIDIR_DWT;
       if (cdp->is_dfs_defined()) {
         const param_dfs* dfs = codestream->access_dfs();
         if (dfs == NULL) {
@@ -208,34 +259,22 @@ namespace ojph {
               "main codestream headers", dfs_idx);
           }
           ui32 num_decomps = cdp->get_num_decompositions();
-          this->downsampling_style = 
-            dfs->get_dwt_type(num_decomps - res_num + 1);
+          ds = dfs->get_dwt_type(num_decomps - res_num + 1);
         }
       }
 
-      //finalize next resolution
+      transform_flags = 0;
       if (res_num > 0)
       {
-        //allocate a resolution
-        child_res = allocator->post_alloc_obj<resolution>(1);
-        ui32 trx0 = ojph_div_ceil(res_rect.org.x, 2);
-        ui32 try0 = ojph_div_ceil(res_rect.org.y, 2);
-        ui32 trx1 = ojph_div_ceil(res_rect.org.x + res_rect.siz.w, 2);
-        ui32 try1 = ojph_div_ceil(res_rect.org.y + res_rect.siz.h, 2);
-        rect next_res_rect;
-        next_res_rect.org.x = trx0;
-        next_res_rect.org.y = try0;
-        next_res_rect.siz.w = trx1 - trx0;
-        next_res_rect.siz.h = try1 - try0;
-
-        child_res->finalize_alloc(codestream, next_res_rect,
-          skipped_res_for_recon ? recon_res_rect : next_res_rect, comp_num,
-          res_num - 1, comp_downsamp, parent_tile_comp, this);
+        if (ds == param_dfs::BIDIR_DWT)
+          transform_flags = HORZ_TRX | VERT_TRX;
+        else if (ds == param_dfs::HORZ_DWT)
+          transform_flags = HORZ_TRX;
+        else if (ds == param_dfs::VERT_DWT)
+          transform_flags = VERT_TRX;
       }
-      else
-        child_res = NULL;
 
-      //allocate subbands
+      //allocate resolution/subbands
       ui32 trx0 = res_rect.org.x;
       ui32 try0 = res_rect.org.y;
       ui32 trx1 = res_rect.org.x + res_rect.siz.w;
@@ -245,24 +284,94 @@ namespace ojph {
         new (bands + i) subband;
       if (res_num > 0)
       {
-        this->num_bands = 3;
-        for (ui32 i = 1; i < 4; ++i)
+        if (ds == param_dfs::BIDIR_DWT)
+        {
+          for (ui32 i = 0; i < 4; ++i)
+          {
+            ui32 tbx0 = (trx0 - (i & 1) + 1) >> 1;
+            ui32 tbx1 = (trx1 - (i & 1) + 1) >> 1;
+            ui32 tby0 = (try0 - (i >> 1) + 1) >> 1;
+            ui32 tby1 = (try1 - (i >> 1) + 1) >> 1;
+
+            rect re;
+            re.org.x = tbx0;
+            re.org.y = tby0;
+            re.siz.w = tbx1 - tbx0;
+            re.siz.h = tby1 - tby0;
+            if (i == 0) {
+              point next_res_downsamp;
+              next_res_downsamp.x = res_downsamp.x * 2;
+              next_res_downsamp.y = res_downsamp.y * 2;
+
+              child_res = allocator->post_alloc_obj<resolution>(1);
+              child_res->finalize_alloc(codestream, re,
+                skipped_res_for_recon ? recon_res_rect : re, comp_num,
+                res_num - 1, comp_downsamp, next_res_downsamp, 
+                parent_tile_comp, this);
+            }
+            else
+              bands[i].finalize_alloc(codestream, re, this, res_num, i);
+          }
+        }
+        else if (ds == param_dfs::VERT_DWT)
+        {
+          ui32 tby0, tby1;
+          rect re = res_rect;
+          tby0 = (try0 + 1) >> 1;
+          tby1 = (try1 + 1) >> 1;
+          re.org.y = tby0;
+          re.siz.h = tby1 - tby0;
+
+          point next_res_downsamp;
+          next_res_downsamp.x = res_downsamp.x;
+          next_res_downsamp.y = res_downsamp.y * 2;
+          child_res = allocator->post_alloc_obj<resolution>(1);
+          child_res->finalize_alloc(codestream, re,
+            skipped_res_for_recon ? recon_res_rect : re, comp_num,
+            res_num - 1, comp_downsamp, next_res_downsamp,
+            parent_tile_comp, this);
+
+          tby0 = try0 >> 1;
+          tby1 = try1 >> 1;
+          re.org.y = tby0;
+          re.siz.h = tby1 - tby0;
+          bands[2].finalize_alloc(codestream, re, this, res_num, 2);
+        }
+        else if (ds == param_dfs::HORZ_DWT)
+        {
+          ui32 tbx0, tbx1;
+          rect re = res_rect;
+          tbx0 = (trx0 + 1) >> 1;
+          tbx1 = (trx1 + 1) >> 1;
+          re.org.x = tbx0;
+          re.siz.w = tbx1 - tbx0;
+
+          point next_res_downsamp;
+          next_res_downsamp.x = res_downsamp.x * 2;
+          next_res_downsamp.y = res_downsamp.y;
+          child_res = allocator->post_alloc_obj<resolution>(1);
+          child_res->finalize_alloc(codestream, re,
+            skipped_res_for_recon ? recon_res_rect : re, comp_num,
+            res_num - 1, comp_downsamp, next_res_downsamp,
+            parent_tile_comp, this);
+
+          tbx0 = trx0 >> 1;
+          tbx1 = trx1 >> 1;
+          re.org.x = tbx0;
+          re.siz.w = tbx1 - tbx0;
+          bands[1].finalize_alloc(codestream, re, this, res_num, 1);
+        }
+        else
         {
-          ui32 tbx0 = (trx0 - (i & 1) + 1) >> 1;
-          ui32 tbx1 = (trx1 - (i & 1) + 1) >> 1;
-          ui32 tby0 = (try0 - (i >> 1) + 1) >> 1;
-          ui32 tby1 = (try1 - (i >> 1) + 1) >> 1;
-
-          rect band_rect;
-          band_rect.org.x = tbx0;
-          band_rect.org.y = tby0;
-          band_rect.siz.w = tbx1 - tbx0;
-          band_rect.siz.h = tby1 - tby0;
-          bands[i].finalize_alloc(codestream, band_rect, this, res_num, i);
+          assert(ds == param_dfs::NO_DWT);
+          child_res = allocator->post_alloc_obj<resolution>(1);
+          child_res->finalize_alloc(codestream, res_rect,
+            skipped_res_for_recon ? recon_res_rect : res_rect, comp_num,
+            res_num - 1, comp_downsamp, res_downsamp, parent_tile_comp, this);
         }
       }
       else {
-        this->num_bands = 1;
+        child_res = NULL;
         bands[0].finalize_alloc(codestream, res_rect, this, res_num, 0);
       }
 
@@ -287,11 +396,7 @@ namespace ojph {
       ui32 x_lower_bound = (trx0 >> log_PP.w) << log_PP.w;
       ui32 y_lower_bound = (try0 >> log_PP.h) << log_PP.h;
 
-      point proj_factor;
-      proj_factor.x = comp_downsamp.x * (1 << (num_decomps - res_num));
-      proj_factor.y = comp_downsamp.y * (1 << (num_decomps - res_num));
       precinct* pp = precincts;
-
       point tile_top_left = parent_tile_comp->get_tile()->get_tile_rect().org;
       for (ui32 y = 0; y < num_precincts.h; ++y)
       {
@@ -299,11 +404,10 @@ namespace ojph {
         for (ui32 x = 0; x < num_precincts.w; ++x, ++pp)
         {
           ui32 ppx0 = x_lower_bound + (x << log_PP.w);
-          point t(proj_factor.x * ppx0, proj_factor.y * ppy0);
+          point t(res_downsamp.x * ppx0, res_downsamp.y * ppy0);
           t.x = t.x > tile_top_left.x ? t.x : tile_top_left.x;
           t.y = t.y > tile_top_left.y ? t.y : tile_top_left.y;
           pp->img_point = t;
-          pp->num_bands = num_bands;
           pp->bands = bands;
           pp->may_use_sop = cdp->packets_may_use_sop();
           pp->uses_eph = cdp->packets_use_eph();
@@ -311,15 +415,15 @@ namespace ojph {
           pp->coded = NULL;
         }
       }
-      if (num_bands == 1)
-        bands[0].get_cb_indices(num_precincts, precincts);
-      else
-        for (int i = 1; i < 4; ++i)
+      for (int i = 0; i < 4; ++i)
+        if (bands[i].exists())
           bands[i].get_cb_indices(num_precincts, precincts);
 
+      // determine how to divide scratch into multiple levels of
+      // tag trees
       size log_cb = cdp->get_log_block_dims();
-      log_PP.w -= (res_num ? 1 : 0);
-      log_PP.h -= (res_num ? 1 : 0);
+      log_PP.w -= (transform_flags & HORZ_TRX) ? 1 : 0;
+      log_PP.h -= (transform_flags & VERT_TRX) ? 1 : 0;
       size ratio;
       ratio.w = log_PP.w - ojph_min(log_cb.w, log_PP.w);
       ratio.h = log_PP.h - ojph_min(log_cb.h, log_PP.h);
@@ -391,7 +495,9 @@ namespace ojph {
     {
       if (res_num == 0)
       {
-        assert(num_bands == 1 && child_res == NULL);
+        assert(child_res == NULL);
+        assert(bands[0].exists() && !bands[1].exists() 
+          && !bands[2].exists() && !bands[3].exists());
         bands[0].exchange_buf(vert_even ? sig->line : aug->line);
         bands[0].push_line();
         return;
@@ -419,7 +525,7 @@ namespace ojph {
                 line_buf* dp = aug->line;
                 line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
                 line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
-                const lifting_step* s = atk->get_step(i);
+                const lifting_step* s = atk->get_step(num_steps - i - 1);
                 rev_vert_ana_step(s, sp1, sp2, dp, width);
               }
               lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
@@ -486,7 +592,7 @@ namespace ojph {
                 line_buf* dp = aug->line;
                 line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
                 line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
-                const lifting_step* s = atk->get_step(i);
+                const lifting_step* s = atk->get_step(num_steps - i - 1);
                 irv_vert_ana_step(s, sp1, sp2, dp, width);
               }
               lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
@@ -547,7 +653,9 @@ namespace ojph {
     {
       if (res_num == 0)
       {
-        assert(num_bands == 1 && child_res == NULL);
+        assert(child_res == NULL);
+        assert(bands[0].exists() && !bands[1].exists() 
+          && !bands[2].exists() && !bands[3].exists());
         return bands[0].pull_line();
       }
 
@@ -557,154 +665,211 @@ namespace ojph {
       ui32 width = res_rect.siz.w;
       if (width == 0)
         return NULL;
-      if (reversible)
+
+      if (transform_flags & VERT_TRX)
       {
-        if (res_rect.siz.h > 1)
+        if (reversible)
         {
-          if (sig->active) {
-            sig->active = false;
-            return sig->line;
-          };
-          for (;;)
+          if (res_rect.siz.h > 1)
           {
-            //horizontal transform
-            if (cur_line < res_rect.siz.h)
+            if (sig->active) {
+              sig->active = false;
+              return sig->line;
+            };
+            for (;;)
             {
-              if (vert_even) { // even
-                rev_horz_syn(atk, aug->line,
-                  child_res->pull_line(), bands[1].pull_line(),
-                  width, horz_even);
-                aug->active = true;
-                vert_even = !vert_even;
-                ++cur_line;
-                continue;
-              }
-              else {
-                rev_horz_syn(atk, sig->line,
-                  bands[2].pull_line(), bands[3].pull_line(),
-                  width, horz_even);
-                sig->active = true;
-                vert_even = !vert_even;
-                ++cur_line;
+              //horizontal transform
+              if (cur_line < res_rect.siz.h)
+              {
+                if (vert_even) { // even
+                  if (transform_flags & HORZ_TRX)
+                    rev_horz_syn(atk, aug->line, child_res->pull_line(), 
+                      bands[1].pull_line(), width, horz_even);
+                  else
+                    memcpy(aug->line->i32, child_res->pull_line()->i32,
+                      width * sizeof(si32));
+                  aug->active = true;
+                  vert_even = !vert_even;
+                  ++cur_line;
+                  continue;
+                }
+                else {
+                  if (transform_flags & HORZ_TRX)
+                    rev_horz_syn(atk, sig->line, bands[2].pull_line(), 
+                      bands[3].pull_line(), width, horz_even);
+                  else
+                    memcpy(sig->line->i32, bands[2].pull_line()->i32,
+                      width * sizeof(si32));
+                  sig->active = true;
+                  vert_even = !vert_even;
+                  ++cur_line;
+                }
               }
-            }
 
-            //vertical transform
-            for (ui32 i = 0; i < num_steps; ++i)
-            {
-              if (aug->active && (sig->active || ssp[i].active))
+              //vertical transform
+              for (ui32 i = 0; i < num_steps; ++i)
               {
-                line_buf* dp = aug->line;
-                line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
-                line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
-                const lifting_step* s = atk->get_step(num_steps - i - 1);
-                rev_vert_syn_step(s, dp, sp1, sp2, width);
+                if (aug->active && (sig->active || ssp[i].active))
+                {
+                  line_buf* dp = aug->line;
+                  line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                  line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                  const lifting_step* s = atk->get_step(i);
+                  rev_vert_syn_step(s, dp, sp1, sp2, width);
+                }
+                lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
               }
-              lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
-            }
 
-            if (aug->active) {
-              aug->active = false;
-              return aug->line;
+              if (aug->active) {
+                aug->active = false;
+                return aug->line;
+              }
+              if (sig->active) {
+                sig->active = false;
+                return sig->line;
+              };
             }
-            if (sig->active) {
-              sig->active = false;
-              return sig->line;
-            };
           }
-        }
-        else
-        {
-          if (vert_even)
-            rev_horz_syn(atk, aug->line, child_res->pull_line(),
-              bands[1].pull_line(), width, horz_even);
           else
           {
-            rev_horz_syn(atk, aug->line, bands[2].pull_line(),
-              bands[3].pull_line(), width, horz_even);
-            si32* sp = aug->line->i32;
-            for (ui32 i = width; i > 0; --i)
-              *sp++ >>= 1;
+            if (vert_even) {
+              if (transform_flags & HORZ_TRX)
+                rev_horz_syn(atk, aug->line, child_res->pull_line(),
+                  bands[1].pull_line(), width, horz_even);
+              else
+                memcpy(aug->line->i32, child_res->pull_line()->i32,
+                  width * sizeof(si32));
+            }
+            else
+            {
+              if (transform_flags & HORZ_TRX)
+                rev_horz_syn(atk, aug->line, bands[2].pull_line(),
+                  bands[3].pull_line(), width, horz_even);
+              else
+                memcpy(aug->line->i32, bands[2].pull_line()->i32,
+                  width * sizeof(si32));
+              si32* sp = aug->line->i32;
+              for (ui32 i = width; i > 0; --i)
+                *sp++ >>= 1;
+            }
+            return aug->line;
           }
-          return aug->line;
         }
-      }
-      else
-      {
-        if (res_rect.siz.h > 1)
+        else
         {
-          if (sig->active) {
-            sig->active = false;
-            return sig->line;
-          };
-          for (;;)
+          if (res_rect.siz.h > 1)
           {
-            //horizontal transform
-            if (cur_line < res_rect.siz.h)
+            if (sig->active) {
+              sig->active = false;
+              return sig->line;
+            };
+            for (;;)
             {
-              if (vert_even) { // even
-                irv_horz_syn(atk, aug->line,
-                  child_res->pull_line(), bands[1].pull_line(),
-                  width, horz_even);
-                aug->active = true;
-                vert_even = !vert_even;
-                ++cur_line;
-
-                const float K = atk->get_K();
-                irv_vert_times_K(K, aug->line, width);
-
-                continue;
-              }
-              else {
-                irv_horz_syn(atk, sig->line,
-                  bands[2].pull_line(), bands[3].pull_line(),
-                  width, horz_even);
-                sig->active = true;
-                vert_even = !vert_even;
-                ++cur_line;
-
-                const float K_inv = 1.0f / atk->get_K();
-                irv_vert_times_K(K_inv, sig->line, width);
+              //horizontal transform
+              if (cur_line < res_rect.siz.h)
+              {
+                if (vert_even) { // even
+                  if (transform_flags & HORZ_TRX)
+                    irv_horz_syn(atk, aug->line, child_res->pull_line(), 
+                      bands[1].pull_line(), width, horz_even);
+                  else 
+                    memcpy(aug->line->f32, child_res->pull_line()->f32,
+                      width * sizeof(float));
+                  aug->active = true;
+                  vert_even = !vert_even;
+                  ++cur_line;
+
+                  const float K = atk->get_K();
+                  irv_vert_times_K(K, aug->line, width);
+
+                  continue;
+                }
+                else {
+                  if (transform_flags & HORZ_TRX)
+                    irv_horz_syn(atk, sig->line, bands[2].pull_line(), 
+                      bands[3].pull_line(), width, horz_even);
+                  else
+                    memcpy(sig->line->f32, bands[2].pull_line()->f32,
+                      width * sizeof(float));
+                  sig->active = true;
+                  vert_even = !vert_even;
+                  ++cur_line;
+
+                  const float K_inv = 1.0f / atk->get_K();
+                  irv_vert_times_K(K_inv, sig->line, width);
+                }
               }
-            }
 
-            //vertical transform
-            for (ui32 i = 0; i < num_steps; ++i)
-            {
-              if (aug->active && (sig->active || ssp[i].active))
+              //vertical transform
+              for (ui32 i = 0; i < num_steps; ++i)
               {
-                line_buf* dp = aug->line;
-                line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
-                line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
-                const lifting_step* s = atk->get_step(num_steps - i - 1);
-                irv_vert_syn_step(s, dp, sp1, sp2, width);
+                if (aug->active && (sig->active || ssp[i].active))
+                {
+                  line_buf* dp = aug->line;
+                  line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
+                  line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
+                  const lifting_step* s = atk->get_step(i);
+                  irv_vert_syn_step(s, dp, sp1, sp2, width);
+                }
+                lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
               }
-              lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
-            }
 
-            if (aug->active) {
-              aug->active = false;
-              return aug->line;
+              if (aug->active) {
+                aug->active = false;
+                return aug->line;
+              }
+              if (sig->active) {
+                sig->active = false;
+                return sig->line;
+              };
             }
-            if (sig->active) {
-              sig->active = false;
-              return sig->line;
-            };
           }
+          else
+          {
+            if (vert_even) {
+              if (transform_flags & HORZ_TRX)
+                irv_horz_syn(atk, aug->line, child_res->pull_line(),
+                  bands[1].pull_line(), width, horz_even);
+              else
+                memcpy(aug->line->f32, child_res->pull_line()->f32,
+                  width * sizeof(float));
+            }
+            else
+            {
+              if (transform_flags & HORZ_TRX)
+                irv_horz_syn(atk, aug->line, bands[2].pull_line(),
+                  bands[3].pull_line(), width, horz_even);
+             else
+                memcpy(aug->line->f32, bands[2].pull_line()->f32,
+                  width * sizeof(float));
+              float* sp = aug->line->f32;
+              for (ui32 i = width; i > 0; --i)
+                *sp++ *= 0.5f;
+            }
+            return aug->line;
+          }
+        }
+      }
+      else
+      { 
+        if (reversible)
+        {
+          if (transform_flags & HORZ_TRX)
+            rev_horz_syn(atk, aug->line, child_res->pull_line(),
+              bands[1].pull_line(), width, horz_even);
+          else
+            memcpy(aug->line->i32, child_res->pull_line()->i32,
+              width * sizeof(si32));
+          return aug->line;
         }
         else
         {
-          if (vert_even)
+          if (transform_flags & HORZ_TRX)
             irv_horz_syn(atk, aug->line, child_res->pull_line(),
               bands[1].pull_line(), width, horz_even);
           else
-          {
-            irv_horz_syn(atk, aug->line, bands[2].pull_line(),
-              bands[3].pull_line(), width, horz_even);
-            float *sp = aug->line->f32;
-            for (ui32 i = width; i > 0; --i)
-              *sp++ *= 0.5f;
-          }
+            memcpy(aug->line->f32, child_res->pull_line()->f32,
+              width * sizeof(float));
           return aug->line;
         }
       }
diff --git a/src/core/codestream/ojph_resolution.h b/src/core/codestream/ojph_resolution.h
index 72e0b91a..635a4ced 100644
--- a/src/core/codestream/ojph_resolution.h
+++ b/src/core/codestream/ojph_resolution.h
@@ -61,6 +61,10 @@ namespace ojph {
     class resolution
     {
     public:
+      enum : ui32 { // bit flags that are combined in transform_flags
+        HORZ_TRX = 0x01,   // set when a horizontal transform is applied
+        VERT_TRX = 0x02,   // set when a vertical transform is applied
+      };
 
     public:
       static void pre_alloc(codestream *codestream, const rect& res_rect,
@@ -68,8 +72,8 @@ namespace ojph {
                             ui32 comp_num, ui32 res_num);
       void finalize_alloc(codestream *codestream, const rect& res_rect,
                           const rect& recon_res_rect, ui32 comp_num,
-                          ui32 res_num, point comp_downsamp,
-                          tile_comp *parent_tile_comp,
+                          ui32 res_num, point comp_downsamp, 
+                          point res_downsamp, tile_comp *parent_tile_comp,
                           resolution *parent_res);
 
       line_buf* get_line();
@@ -77,6 +81,8 @@ namespace ojph {
       line_buf* pull_line();
       rect get_rect() { return res_rect; }
       ui32 get_comp_num() { return comp_num; }
+      bool has_horz_transform() { return (transform_flags & HORZ_TRX) != 0; }
+      bool has_vert_transform() { return (transform_flags & VERT_TRX) != 0; }
 
       ui32 prepare_precinct();
       void write_precincts(outfile_base *file);
@@ -92,7 +98,7 @@ namespace ojph {
     private:
       bool reversible, skipped_res_for_read, skipped_res_for_recon;
       ui32 num_steps;
-      ui32 num_bands, res_num;
+      ui32 res_num;
       ui32 comp_num;
       ui32 num_bytes; // number of bytes in this resolution 
                       // used for tilepart length
@@ -113,7 +119,7 @@ namespace ojph {
       ui32 level_index[20]; //more than enough
       point cur_precinct_loc; //used for progressing spatial modes (2, 3, 4)
       const param_atk* atk;
-      param_dfs::dfs_dwt_type downsampling_style;
+      ui32 transform_flags;
       //wavelet machinery
       ui32 cur_line;
       ui32 rows_to_produce;
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index dbef3b75..cf007fc9 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -55,7 +55,7 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void subband::pre_alloc(codestream *codestream, const rect &band_rect,
-                            ui32 comp_num, ui32 res_num)
+                            ui32 comp_num, ui32 res_num, ui32 transform_flags)
     {
       mem_fixed_allocator* allocator = codestream->get_allocator();
 
@@ -67,8 +67,11 @@ namespace ojph {
       size log_cb = cdp->get_log_block_dims();
       size log_PP = cdp->get_log_precinct_size(res_num);
 
-      ui32 xcb_prime = ojph_min(log_cb.w, log_PP.w - (res_num?1:0));
-      ui32 ycb_prime = ojph_min(log_cb.h, log_PP.h - (res_num?1:0));
+      ui32 x_off = ((transform_flags & resolution::HORZ_TRX) ? 1 : 0);
+      ui32 y_off = ((transform_flags & resolution::VERT_TRX) ? 1 : 0);
+
+      ui32 xcb_prime = ojph_min(log_cb.w, log_PP.w - x_off);
+      ui32 ycb_prime = ojph_min(log_cb.h, log_PP.h - y_off);
 
       size nominal(1 << xcb_prime, 1 << ycb_prime);
 
@@ -116,21 +119,30 @@ namespace ojph {
       size log_cb = cdp->get_log_block_dims();
       log_PP = cdp->get_log_precinct_size(res_num);
 
-      xcb_prime = ojph_min(log_cb.w, log_PP.w - (res_num?1:0));
-      ycb_prime = ojph_min(log_cb.h, log_PP.h - (res_num?1:0));
+      ui32 x_off = ((parent->has_horz_transform()) ? 1 : 0);
+      ui32 y_off = ((parent->has_vert_transform()) ? 1 : 0);
+
+      xcb_prime = ojph_min(log_cb.w, log_PP.w - x_off);
+      ycb_prime = ojph_min(log_cb.h, log_PP.h - y_off);
 
       size nominal(1 << xcb_prime, 1 << ycb_prime);
 
       cur_cb_row = 0;
       cur_line = 0;
       cur_cb_height = 0;
+      const param_dfs* dfs = NULL;
+      if (cdp->is_dfs_defined()) {
+        dfs = codestream->access_dfs();
+        if (dfs != NULL)
+          dfs = dfs->get_dfs(cdp->get_dfs_index());
+      }
       param_qcd* qcd = codestream->access_qcd(parent->get_comp_num());
       ui32 num_decomps = cdp->get_num_decompositions();
-      this->K_max = qcd->get_Kmax(NULL, num_decomps, this->res_num, band_num);
+      this->K_max = qcd->get_Kmax(dfs, num_decomps, this->res_num, band_num);
       if (!reversible)
       {
         float d = 
-          qcd->irrev_get_delta(NULL, num_decomps, res_num, subband_num);
+          qcd->irrev_get_delta(dfs, num_decomps, res_num, subband_num);
         d /= (float)(1u << (31 - this->K_max));
         delta = d;
         delta_inv = (1.0f/d);
@@ -199,14 +211,16 @@ namespace ojph {
       ui32 pc_lft = (res_rect.org.x >> log_PP.w) << log_PP.w;
       ui32 pc_top = (res_rect.org.y >> log_PP.h) << log_PP.h;
 
-      ui32 pcx0, pcx1, pcy0, pcy1, shift = (band_num != 0 ? 1 : 0);
+      ui32 pcx0, pcx1, pcy0, pcy1;
+      ui32 x_shift = parent->has_horz_transform() ? 1 : 0;
+      ui32 y_shift = parent->has_vert_transform() ? 1 : 0;
       ui32 yb, xb, coly = 0, colx = 0;
       for (ui32 y = 0; y < num_precincts.h; ++y)
       {
         pcy0 = ojph_max(try0, pc_top + (y << log_PP.h));
         pcy1 = ojph_min(try1, pc_top + ((y + 1) << log_PP.h));
-        pcy0 = (pcy0 - (band_num >> 1) + (1<<shift) - 1) >> shift;
-        pcy1 = (pcy1 - (band_num >> 1) + (1<<shift) - 1) >> shift;
+        pcy0 = (pcy0 - (band_num >> 1) + (1 << y_shift) - 1) >> y_shift;
+        pcy1 = (pcy1 - (band_num >> 1) + (1 << y_shift) - 1) >> y_shift;
 
         precinct *p = precincts + y * num_precincts.w;
         yb = ((pcy1 + (1<<ycb_prime) - 1) >> ycb_prime);
@@ -217,8 +231,8 @@ namespace ojph {
         {
           pcx0 = ojph_max(trx0, pc_lft + (x << log_PP.w));
           pcx1 = ojph_min(trx1, pc_lft + ((x + 1) << log_PP.w));
-          pcx0 = (pcx0 - (band_num & 1) + (1<<shift) - 1) >> shift;
-          pcx1 = (pcx1 - (band_num & 1) + (1<<shift) - 1) >> shift;
+          pcx0 = (pcx0 - (band_num & 1) + (1 << x_shift) - 1) >> x_shift;
+          pcx1 = (pcx1 - (band_num & 1) + (1 << x_shift) - 1) >> x_shift;
 
           rect *bp = p->cb_idxs + band_num;
           xb = ((pcx1 + (1<<xcb_prime) - 1) >> xcb_prime);
diff --git a/src/core/codestream/ojph_subband.h b/src/core/codestream/ojph_subband.h
index 5dd145e6..8cadae07 100644
--- a/src/core/codestream/ojph_subband.h
+++ b/src/core/codestream/ojph_subband.h
@@ -81,7 +81,7 @@ namespace ojph {
       }
 
       static void pre_alloc(codestream *codestream, const rect& band_rect,
-                            ui32 comp_num, ui32 res_num);
+                            ui32 comp_num, ui32 res_num, ui32 transform_flags);
       void finalize_alloc(codestream *codestream, const rect& band_rect,
                           resolution* res, ui32 res_num, ui32 subband_num);
 
@@ -91,6 +91,7 @@ namespace ojph {
 
       void get_cb_indices(const size& num_precincts, precinct *precincts);
       float get_delta() { return delta; }
+      bool exists() { return !empty; } // true unless band is flagged empty
 
       line_buf* pull_line();
 
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 48f8bb56..3be907d4 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -131,8 +131,8 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void tile::finalize_alloc(codestream *codestream, const rect& tile_rect,
-                              const rect& recon_tile_rect, ui32 tile_idx, 
-                              ui32 offset, ui32 &num_tileparts)
+                              ui32 tile_idx, ui32& offset, 
+                              ui32 &num_tileparts)
     {
       //this->parent = codestream;
       mem_fixed_allocator* allocator = codestream->get_allocator();
@@ -167,33 +167,29 @@ namespace ojph {
 
       this->resilient = codestream->is_resilient();
       this->tile_rect = tile_rect;
-      this->recon_tile_rect = recon_tile_rect;
 
       ui32 tx0 = tile_rect.org.x;
       ui32 ty0 = tile_rect.org.y;
       ui32 tx1 = tile_rect.org.x + tile_rect.siz.w;
       ui32 ty1 = tile_rect.org.y + tile_rect.siz.h;
-      ui32 recon_tx0 = recon_tile_rect.org.x;
-      ui32 recon_ty0 = recon_tile_rect.org.y;
-      ui32 recon_tx1 = recon_tile_rect.org.x + recon_tile_rect.siz.w;
-      ui32 recon_ty1 = recon_tile_rect.org.y + recon_tile_rect.siz.h;
 
       ui32 width = 0;
       for (ui32 i = 0; i < num_comps; ++i)
       {
         point downsamp = szp->get_downsampling(i);
+        point recon_downsamp = szp->get_recon_downsampling(i);
 
         ui32 tcx0 = ojph_div_ceil(tx0, downsamp.x);
         ui32 tcy0 = ojph_div_ceil(ty0, downsamp.y);
         ui32 tcx1 = ojph_div_ceil(tx1, downsamp.x);
         ui32 tcy1 = ojph_div_ceil(ty1, downsamp.y);
-        ui32 recon_tcx0 = ojph_div_ceil(recon_tx0, downsamp.x);
-        ui32 recon_tcy0 = ojph_div_ceil(recon_ty0, downsamp.y);
-        ui32 recon_tcx1 = ojph_div_ceil(recon_tx1, downsamp.x);
-        ui32 recon_tcy1 = ojph_div_ceil(recon_ty1, downsamp.y);
+        ui32 recon_tcx0 = ojph_div_ceil(tx0, recon_downsamp.x);
+        ui32 recon_tcy0 = ojph_div_ceil(ty0, recon_downsamp.y);
+        ui32 recon_tcx1 = ojph_div_ceil(tx1, recon_downsamp.x);
+        ui32 recon_tcy1 = ojph_div_ceil(ty1, recon_downsamp.y);
 
         line_offsets[i] = 
-          recon_tcx0 - ojph_div_ceil(recon_tx0 - offset, downsamp.x);
+          recon_tcx0 - ojph_div_ceil(tx0 - offset, recon_downsamp.x);
         comp_rects[i].org.x = tcx0;
         comp_rects[i].org.y = tcy0;
         comp_rects[i].siz.w = tcx1 - tcx0;
@@ -212,6 +208,8 @@ namespace ojph {
         cur_line[i] = 0;
       }
 
+      offset += tile_rect.siz.w;
+
       //allocate lines
       const param_cod* cdp = codestream->get_cod();
       this->reversible = cdp->access_atk()->is_reversible();
diff --git a/src/core/codestream/ojph_tile.h b/src/core/codestream/ojph_tile.h
index b00c8181..056c7c94 100644
--- a/src/core/codestream/ojph_tile.h
+++ b/src/core/codestream/ojph_tile.h
@@ -63,8 +63,7 @@ namespace ojph {
       static void pre_alloc(codestream *codestream, const rect& tile_rect,
                             const rect& recon_tile_rect, ui32 &num_tileparts);
       void finalize_alloc(codestream *codestream, const rect& tile_rect,
-                          const rect& recon_tile_rect, ui32 tile_idx, 
-                          ui32 offset, ui32 &num_tileparts);
+                          ui32 tile_idx, ui32& offset, ui32 &num_tileparts);
 
       bool push(line_buf *line, ui32 comp_num);
       void prepare_for_flush();
@@ -77,7 +76,7 @@ namespace ojph {
 
     private:
       //codestream *parent;
-      rect tile_rect, recon_tile_rect;
+      rect tile_rect;
       ui32 num_comps;
       tile_comp *comps;
       ui32 num_lines;
diff --git a/src/core/codestream/ojph_tile_comp.cpp b/src/core/codestream/ojph_tile_comp.cpp
index 69ed0bcb..83d1b624 100644
--- a/src/core/codestream/ojph_tile_comp.cpp
+++ b/src/core/codestream/ojph_tile_comp.cpp
@@ -83,7 +83,8 @@ namespace ojph {
       this->num_bytes = 0;
       res = allocator->post_alloc_obj<resolution>(1);
       res->finalize_alloc(codestream, comp_rect, recon_comp_rect, comp_num,
-                          num_decomps, comp_downsamp, this, NULL);
+                          num_decomps, comp_downsamp, comp_downsamp, this, 
+                          NULL);
     }
 
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index 028ac013..b031860e 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -408,9 +408,9 @@ namespace ojph {
                                const line_buf* other, const line_buf* aug, 
                                ui32 repeat)
     {
-      si32 a = s->rev.Aatk;
-      si32 b = s->rev.Batk;
-      ui32 e = s->rev.Eatk;
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui32 e = s->rev.Eatk;
 
       si32* dst = aug->i32;
       const si32* src1 = sig->i32, * src2 = other->i32;
@@ -419,7 +419,7 @@ namespace ojph {
           *dst++ += (b + a * (*src1++ + *src2++)) >> e;
       else
         for (ui32 i = repeat; i > 0; --i)
-          *dst++ -= (b - a * (*src1++ + *src2++)) >> e;
+          *dst++ -= (- b - a * (*src1++ + *src2++)) >> e;
     }
 
     /////////////////////////////////////////////////////////////////////////
@@ -451,13 +451,13 @@ namespace ojph {
         ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
         ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
         ui32 num_steps = atk->get_num_steps();
-        for (ui32 j = 0; j < num_steps; ++j)
+        for (ui32 j = num_steps; j > 0; --j)
         {
           // first lifting step
-          const lifting_step* s = atk->get_step(j);
-          si32 a = s->rev.Aatk;
-          si32 b = s->rev.Batk;
-          ui32 e = s->rev.Eatk;
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui32 e = s->rev.Eatk;
 
           // extension
           lp[-1] = lp[0];
@@ -470,7 +470,7 @@ namespace ojph {
               *dp += (b + a * (sp[-1] + sp[0])) >> e;
           else
             for (ui32 i = h_width; i > 0; --i, sp++, dp++)
-              *dp -= (b - a * (sp[-1] + sp[0])) >> e;
+              *dp -= (- b - a * (sp[-1] + sp[0])) >> e;
 
           // swap buffers
           si32* t = lp; lp = hp; hp = t;
@@ -491,9 +491,9 @@ namespace ojph {
                                const line_buf* sig, const line_buf* other, 
                                ui32 repeat)
     {
-      si32 a = s->rev.Aatk;
-      si32 b = s->rev.Batk;
-      ui32 e = s->rev.Eatk;
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui32 e = s->rev.Eatk;
 
       si32* dst = aug->i32;
       const si32* src1 = sig->i32, * src2 = other->i32;
@@ -502,7 +502,7 @@ namespace ojph {
           *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
       else
         for (ui32 i = repeat; i > 0; --i)
-          *dst++ += (b - a * (*src1++ + *src2++)) >> e;
+          *dst++ += (- b - a * (*src1++ + *src2++)) >> e;
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -517,13 +517,12 @@ namespace ojph {
         ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
         ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
         ui32 num_steps = atk->get_num_steps();
-        for (ui32 j = num_steps; j > 0; --j)
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          // first lifting step
-          const lifting_step* s = atk->get_step(j - 1);
-          si32 a = s->rev.Aatk;
-          si32 b = s->rev.Batk;
-          ui32 e = s->rev.Eatk;
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui32 e = s->rev.Eatk;
 
           // extension
           oth[-1] = oth[0];
@@ -536,7 +535,7 @@ namespace ojph {
               *dp -= (b + a * (sp[-1] + sp[0])) >> e;
           else
             for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
-              *dp += (b - a * (sp[-1] + sp[0])) >> e;
+              *dp += (- b - a * (sp[-1] + sp[0])) >> e;
 
           // swap buffers
           si32* t = aug; aug = oth; oth = t;
@@ -793,11 +792,11 @@ namespace ojph {
         ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
         ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
         ui32 num_steps = atk->get_num_steps();
-        for (ui32 j = 0; j < num_steps; ++j)
+        for (ui32 j = num_steps; j > 0; --j)
         {
           // first lifting step
-          const lifting_step* s = atk->get_step(j);
-          float a = s->irv.Aatk;
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
 
           // extension
           lp[-1] = lp[0];
@@ -878,10 +877,10 @@ namespace ojph {
         }
 
         ui32 num_steps = atk->get_num_steps();
-        for (ui32 j = num_steps; j > 0; --j)
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          const lifting_step* s = atk->get_step(j - 1);
-          float a = s->irv.Aatk;
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
 
           // extension
           oth[-1] = oth[0];

From c87d3e402b262687f29c898526c035f0f57f0024 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Tue, 9 Apr 2024 00:01:28 +1000
Subject: [PATCH 20/37] Small bug fix for previous commit.

---
 src/core/codestream/ojph_codestream_local.cpp | 3 +--
 src/core/codestream/ojph_params_local.h       | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/core/codestream/ojph_codestream_local.cpp b/src/core/codestream/ojph_codestream_local.cpp
index 5f72d3e8..e9f56d04 100644
--- a/src/core/codestream/ojph_codestream_local.cpp
+++ b/src/core/codestream/ojph_codestream_local.cpp
@@ -220,7 +220,6 @@ namespace ojph {
       point index;
       rect tile_rect;
       ojph::param_siz sz = access_siz();
-      ui32 ds = 1 << skipped_res_for_recon;
       for (index.y = 0; index.y < num_tiles.h; ++index.y)
       {
         ui32 y0 = sz.get_tile_offset().y
@@ -546,7 +545,7 @@ namespace ojph {
                                    ui32 num_comments)
     {
       //finalize
-      siz.check_validity();
+      siz.check_validity(cod);
       cod.check_validity(siz);  
       cod.update_atk(atk);
       qcd.check_validity(siz, cod);
diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index 1ee508dc..f4f2c9f4 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -202,8 +202,10 @@ namespace ojph {
         cptr[comp_num].YRsiz = (ui8)downsampling.y;
       }
 
-      void check_validity()
+      void check_validity(const param_cod& cod)
       {
+        this->cod = &cod;
+
         if (XTsiz == 0 && YTsiz == 0)
         { XTsiz = Xsiz + XOsiz; YTsiz = Ysiz + YOsiz; }
         if (Xsiz == 0 || Ysiz == 0 || XTsiz == 0 || YTsiz == 0)

From b1c71574406078c24faa38e4a9c2c71a9ed8f1b3 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Tue, 9 Apr 2024 14:23:32 +1000
Subject: [PATCH 21/37] A small bug fix

---
 src/core/codestream/ojph_params_local.h | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/core/codestream/ojph_params_local.h b/src/core/codestream/ojph_params_local.h
index f4f2c9f4..1958b8e8 100644
--- a/src/core/codestream/ojph_params_local.h
+++ b/src/core/codestream/ojph_params_local.h
@@ -487,11 +487,21 @@ namespace ojph {
 
       ////////////////////////////////////////
       bool packets_may_use_sop() const
-      { return (Scod & 2) == 2; }
+      { 
+        if (parent)
+          return (parent->Scod & 2) == 2; 
+        else
+          return (Scod & 2) == 2;
+      }
 
       ////////////////////////////////////////
       bool packets_use_eph() const
-      { return (Scod & 4) == 4; }
+      { 
+        if (parent)
+          return (parent->Scod & 4) == 4;
+        else
+          return (Scod & 4) == 4;
+      }
 
       ////////////////////////////////////////
       bool write(outfile_base *file);
@@ -507,8 +517,8 @@ namespace ojph {
       void update_atk(const param_atk* atk);
 
       ////////////////////////////////////////
-      void link_cod(const param_cod* cod)
-      { this->next = cod; }
+      void link_cod(const param_cod* coc)
+      { this->next = coc; }
 
       ////////////////////////////////////////
       const param_cod* get_cod(ui32 comp_num) const
@@ -544,7 +554,7 @@ namespace ojph {
       ui8 Scod;             // serves as Scod and Scoc
       cod_SGcod SGCod;      // Used in COD and copied to COC
       cod_SPcod SPcod;      // serves as SPcod and SPcoc
-      const param_cod* next;// to link cod parameters
+      const param_cod* next;// to chain coc parameters to cod
 
     private: // COC only variables
       param_cod* parent;    // parent COD structure

From a18e7fb47ae972ebcdcb3627b25da3f72f4b9268 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Wed, 10 Apr 2024 22:56:02 +1000
Subject: [PATCH 22/37] Working on SIMD. SSE and AVX are largely done, except
 for the core horizontal transform.

---
 src/core/transform/ojph_colour_sse.cpp     |   2 +-
 src/core/transform/ojph_colour_sse2.cpp    |   2 +-
 src/core/transform/ojph_transform.cpp      | 490 +++------------------
 src/core/transform/ojph_transform.h        |  58 ---
 src/core/transform/ojph_transform_avx.cpp  | 460 +++++++++----------
 src/core/transform/ojph_transform_local.h  | 334 +++++++-------
 src/core/transform/ojph_transform_sse.cpp  | 421 +++++++++---------
 src/core/transform/ojph_transform_sse2.cpp |   2 +-
 8 files changed, 650 insertions(+), 1119 deletions(-)

diff --git a/src/core/transform/ojph_colour_sse.cpp b/src/core/transform/ojph_colour_sse.cpp
index 89cc86c2..edd1eaf2 100644
--- a/src/core/transform/ojph_colour_sse.cpp
+++ b/src/core/transform/ojph_colour_sse.cpp
@@ -42,7 +42,7 @@
 #include "ojph_colour.h"
 #include "ojph_colour_local.h"
 
-#include <immintrin.h>
+#include <xmmintrin.h>
 
 namespace ojph {
   namespace local {
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index 4bb56f29..4a3cb145 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -41,7 +41,7 @@
 #include "ojph_arch.h"
 #include "ojph_colour.h"
 
-#include <immintrin.h>
+#include <emmintrin.h>
 
 namespace ojph {
   namespace local {
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index b031860e..eba4f006 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -53,40 +53,6 @@ namespace ojph {
     // Reversible functions
     /////////////////////////////////////////////////////////////////////////
 
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_wvlt_fwd_predict)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_wvlt_fwd_update)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_horz_wvlt_fwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
-      = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_wvlt_bwd_predict)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_wvlt_bwd_update)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_horz_wvlt_bwd_tx)
-      (line_buf* dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even)
-      = NULL;
-
-
-
-
-
     /////////////////////////////////////////////////////////////////////////
     void (*rev_vert_ana_step)
       (const lifting_step* s, const line_buf* sig, const line_buf* other,
@@ -106,39 +72,11 @@ namespace ojph {
     void (*rev_horz_syn)
       (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
         const line_buf* hsrc, ui32 width, bool even) = NULL;
-
-
-
-
     
     /////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     /////////////////////////////////////////////////////////////////////////
 
-    /////////////////////////////////////////////////////////////////////////
-    void (*irrev_vert_wvlt_step)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       int step_num, ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*irrev_vert_wvlt_K)
-      (const line_buf *src, line_buf *dst, bool L_analysis_or_H_synthesis,
-       ui32 repeat) = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*irrev_horz_wvlt_fwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
-      = NULL;
-
-    /////////////////////////////////////////////////////////////////////////
-    void (*irrev_horz_wvlt_bwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
-      = NULL;
-
-
-
-
-
     /////////////////////////////////////////////////////////////////////////
     void (*irv_vert_ana_step)
       (const lifting_step* s, const line_buf* sig, const line_buf* other,
@@ -163,10 +101,6 @@ namespace ojph {
     void (*irv_vert_times_K)
       (float K, const line_buf* aug, ui32 repeat) = NULL;
 
-
-
-
-
     ////////////////////////////////////////////////////////////////////////////
     static bool wavelet_transform_functions_initialized = false;
 
@@ -178,23 +112,11 @@ namespace ojph {
 
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
-      rev_vert_wvlt_fwd_predict = gen_rev_vert_wvlt_fwd_predict;
-      rev_vert_wvlt_fwd_update  = gen_rev_vert_wvlt_fwd_update;
-      rev_horz_wvlt_fwd_tx      = gen_rev_horz_wvlt_fwd_tx;
-      rev_vert_wvlt_bwd_predict = gen_rev_vert_wvlt_bwd_predict;
-      rev_vert_wvlt_bwd_update  = gen_rev_vert_wvlt_bwd_update;
-      rev_horz_wvlt_bwd_tx      = gen_rev_horz_wvlt_bwd_tx;
-
       rev_vert_ana_step         = gen_rev_vert_ana_step;
       rev_horz_ana              = gen_rev_horz_ana;
       rev_vert_syn_step         = gen_rev_vert_syn_step;
       rev_horz_syn              = gen_rev_horz_syn;
 
-      irrev_vert_wvlt_step      = gen_irrev_vert_wvlt_step;
-      irrev_vert_wvlt_K         = gen_irrev_vert_wvlt_K;
-      irrev_horz_wvlt_fwd_tx    = gen_irrev_horz_wvlt_fwd_tx;
-      irrev_horz_wvlt_bwd_tx    = gen_irrev_horz_wvlt_bwd_tx;
-
       irv_vert_ana_step         = gen_irv_vert_ana_step;
       irv_horz_ana              = gen_irv_horz_ana;      
       irv_vert_syn_step         = gen_irv_vert_syn_step;
@@ -206,203 +128,74 @@ namespace ojph {
 
       if (level >= X86_CPU_EXT_LEVEL_SSE)
       {
-        irrev_vert_wvlt_step    = sse_irrev_vert_wvlt_step;
-        irrev_vert_wvlt_K       = sse_irrev_vert_wvlt_K;
-        irrev_horz_wvlt_fwd_tx  = sse_irrev_horz_wvlt_fwd_tx;
-        irrev_horz_wvlt_bwd_tx  = sse_irrev_horz_wvlt_bwd_tx;
+        irv_vert_ana_step         = sse_irv_vert_ana_step;
+        irv_horz_ana              = sse_irv_horz_ana;
+        irv_vert_syn_step         = sse_irv_vert_syn_step;
+        irv_horz_syn              = sse_irv_horz_syn;
+        irv_vert_times_K          = sse_irv_vert_times_K;
       }
 
-      if (level >= X86_CPU_EXT_LEVEL_SSE2)
-      {
-        rev_vert_wvlt_fwd_predict = sse2_rev_vert_wvlt_fwd_predict;
-        rev_vert_wvlt_fwd_update  = sse2_rev_vert_wvlt_fwd_update;
-        rev_horz_wvlt_fwd_tx      = sse2_rev_horz_wvlt_fwd_tx;
-        rev_vert_wvlt_bwd_predict = sse2_rev_vert_wvlt_bwd_predict;
-        rev_vert_wvlt_bwd_update  = sse2_rev_vert_wvlt_bwd_update;
-        rev_horz_wvlt_bwd_tx      = sse2_rev_horz_wvlt_bwd_tx;
-      }
+      //if (level >= X86_CPU_EXT_LEVEL_SSE2)
+      //{
+      //  rev_vert_ana_step         = sse2_rev_vert_ana_step;
+      //  rev_horz_ana              = sse2_rev_horz_ana;
+      //  rev_vert_syn_step         = sse2_rev_vert_syn_step;
+      //  rev_horz_syn              = sse2_rev_horz_syn;
+      //}
 
       if (level >= X86_CPU_EXT_LEVEL_AVX)
       {
-        irrev_vert_wvlt_step   = avx_irrev_vert_wvlt_step;
-        irrev_vert_wvlt_K      = avx_irrev_vert_wvlt_K;
-        irrev_horz_wvlt_fwd_tx = avx_irrev_horz_wvlt_fwd_tx;
-        irrev_horz_wvlt_bwd_tx = avx_irrev_horz_wvlt_bwd_tx;
+        irv_vert_ana_step         = avx_irv_vert_ana_step;
+        irv_horz_ana              = avx_irv_horz_ana;      
+        irv_vert_syn_step         = avx_irv_vert_syn_step;
+        irv_horz_syn              = avx_irv_horz_syn;
+        irv_vert_times_K          = avx_irv_vert_times_K;
       }
 
-      if (level >= X86_CPU_EXT_LEVEL_AVX2)
-      {
-        rev_vert_wvlt_fwd_predict = avx2_rev_vert_wvlt_fwd_predict;
-        rev_vert_wvlt_fwd_update  = avx2_rev_vert_wvlt_fwd_update;
-        rev_horz_wvlt_fwd_tx      = avx2_rev_horz_wvlt_fwd_tx;
-        rev_vert_wvlt_bwd_predict = avx2_rev_vert_wvlt_bwd_predict;
-        rev_vert_wvlt_bwd_update  = avx2_rev_vert_wvlt_bwd_update;
-        rev_horz_wvlt_bwd_tx      = avx2_rev_horz_wvlt_bwd_tx;
-      }
+      //if (level >= X86_CPU_EXT_LEVEL_AVX2)
+      //{
+      //  rev_vert_ana_step         = avx2_rev_vert_ana_step;
+      //  rev_horz_ana              = avx2_rev_horz_ana;
+      //  rev_vert_syn_step         = avx2_rev_vert_syn_step;
+      //  rev_horz_syn              = avx2_rev_horz_syn;
+      //}
+
+      //if (level >= X86_CPU_EXT_LEVEL_AVX512)
+      //{
+      //  rev_vert_ana_step         = avx512_rev_vert_ana_step;
+      //  rev_horz_ana              = avx512_rev_horz_ana;
+      //  rev_vert_syn_step         = avx512_rev_vert_syn_step;
+      //  rev_horz_syn              = avx512_rev_horz_syn;
+
+      //  irv_vert_ana_step         = avx512_irv_vert_ana_step;
+      //  irv_horz_ana              = avx512_irv_horz_ana;      
+      //  irv_vert_syn_step         = avx512_irv_vert_syn_step;
+      //  irv_horz_syn              = avx512_irv_horz_syn;
+      //  irv_vert_times_K          = avx512_irv_vert_times_K;
+      //}
+
 #endif // !OJPH_DISABLE_INTEL_SIMD
 
 #else // OJPH_ENABLE_WASM_SIMD
-      rev_vert_wvlt_fwd_predict = wasm_rev_vert_wvlt_fwd_predict;
-      rev_vert_wvlt_fwd_update  = wasm_rev_vert_wvlt_fwd_update;
-      rev_horz_wvlt_fwd_tx      = wasm_rev_horz_wvlt_fwd_tx;
-      rev_vert_wvlt_bwd_predict = wasm_rev_vert_wvlt_bwd_predict;
-      rev_vert_wvlt_bwd_update  = wasm_rev_vert_wvlt_bwd_update;
-      rev_horz_wvlt_bwd_tx      = wasm_rev_horz_wvlt_bwd_tx;
-      irrev_vert_wvlt_step      = wasm_irrev_vert_wvlt_step;
-      irrev_vert_wvlt_K         = wasm_irrev_vert_wvlt_K;
-      irrev_horz_wvlt_fwd_tx    = wasm_irrev_horz_wvlt_fwd_tx;
-      irrev_horz_wvlt_bwd_tx    = wasm_irrev_horz_wvlt_bwd_tx;
+      rev_vert_ana_step         = wasm_rev_vert_ana_step;
+      rev_horz_ana              = wasm_rev_horz_ana;
+      rev_vert_syn_step         = wasm_rev_vert_syn_step;
+      rev_horz_syn              = wasm_rev_horz_syn;
+
+      irv_vert_ana_step         = wasm_irv_vert_ana_step;
+      irv_horz_ana              = wasm_irv_horz_ana;      
+      irv_vert_syn_step         = wasm_irv_vert_syn_step;
+      irv_horz_syn              = wasm_irv_horz_syn;
+      irv_vert_times_K          = wasm_irv_vert_times_K;
 #endif // !OJPH_ENABLE_WASM_SIMD
 
       wavelet_transform_functions_initialized = true;
     }
     
     //////////////////////////////////////////////////////////////////////////
-    const float LIFTING_FACTORS::steps[8] =
-    {
-      -1.586134342059924f, -0.052980118572961f, +0.882911075530934f,
-      +0.443506852043971f,
-      +1.586134342059924f, +0.052980118572961f, -0.882911075530934f,
-      -0.443506852043971f
-    };
-    const float LIFTING_FACTORS::K = 1.230174104914001f;
-    const float LIFTING_FACTORS::K_inv  = (float)(1.0 / 1.230174104914001);
-
-    //////////////////////////////////////////////////////////////////////////
 
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_fwd_predict(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ -= (*src1++ + *src2++) >> 1;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_fwd_update(const line_buf* line_src1,
-                                      const line_buf* line_src2,
-                                      line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ += (*src1++ + *src2++ + 2) >> 2;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
-                                  line_buf *line_hdst, ui32 width, bool even)
-    {
-      if (width > 1)
-      {
-        si32 *src = line_src->i32;
-        si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const si32* sp = src + (even ? 1 : 0);
-        si32 *dph = hdst;
-        for (ui32 i = H_width; i > 0; --i, sp+=2)
-          *dph++ = sp[0] - ((sp[-1] + sp[1]) >> 1);
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        sp = src + (even ? 0 : 1);
-        const si32* sph = hdst + (even ? 0 : 1);
-        si32 *dpl = ldst;
-        for (ui32 i = L_width; i > 0; --i, sp+=2, sph++)
-          *dpl++ = *sp + ((2 + sph[-1] + sph[0]) >> 2);
-      }
-      else
-      {
-        if (even)
-          line_ldst->i32[0] = line_src->i32[0];
-        else
-          line_hdst->i32[0] = line_src->i32[0] << 1;
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_bwd_predict(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ += (*src1++ + *src2++) >> 1;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_bwd_update(const line_buf* line_src1,
-                                      const line_buf* line_src2,
-                                      line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ -= (2 + *src1++ + *src2++) >> 2;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                  line_buf *line_hsrc, ui32 width, bool even)
-    {
-      if (width > 1)
-      {
-        si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
-        si32 *dst = line_dst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        const si32 *sph = hsrc + (even ? 0 : 1);
-        si32 *spl = lsrc;
-        for (ui32 i = L_width; i > 0; --i, sph++, spl++)
-          *spl -= ((2 + sph[-1] + sph[0]) >> 2);
-
-        // extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width - 1];
-        // inverse predict and combine
-        si32 *dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        for (ui32 i = L_width + (even ? 0 : 1); i > 0; --i, spl++, sph++)
-        {
-          *dp++ = *spl;
-          *dp++ = *sph + ((spl[0] + spl[1]) >> 1);
-        }
-      }
-      else
-      {
-        if (even)
-          line_dst->i32[0] = line_lsrc->i32[0];
-        else
-          line_dst->i32[0] = line_hsrc->i32[0] >> 1;
-      }
-    }
-
-
-
-
-
     /////////////////////////////////////////////////////////////////////////
     void gen_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
                                const line_buf* other, const line_buf* aug, 
@@ -569,187 +362,6 @@ namespace ojph {
       }
     }
 
-
-
-
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_vert_wvlt_step(const line_buf* line_src1,
-                                  const line_buf* line_src2,
-                                  line_buf *line_dst,
-                                  int step_num, ui32 repeat)
-    {
-      float *dst = line_dst->f32;
-      const float *src1 = line_src1->f32, *src2 = line_src2->f32;
-      float factor = LIFTING_FACTORS::steps[step_num];
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ += factor * (*src1++ + *src2++);
-    }
-
-    /////////////////////////////////////////////////////////////////////////
-    void gen_irrev_vert_wvlt_K(const line_buf* line_src,
-                               line_buf* line_dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat)
-    {
-      float *dst = line_dst->f32;
-      const float *src = line_src->f32;
-      float factor = LIFTING_FACTORS::K_inv;
-      factor = L_analysis_or_H_synthesis ? factor : LIFTING_FACTORS::K;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ = *src++ * factor;
-    }
-
-
-    /////////////////////////////////////////////////////////////////////////
-    void gen_irrev_horz_wvlt_fwd_tx(line_buf* line_src,
-                                    line_buf *line_ldst,
-                                    line_buf *line_hdst,
-                                    ui32 width, bool even)
-    {
-      if (width > 1)
-      {
-        float *src = line_src->f32;
-        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        float factor = LIFTING_FACTORS::steps[0];
-        const float* sp = src + (even ? 1 : 0);
-        float *dph = hdst;
-        for (ui32 i = H_width; i > 0; --i, sp+=2)
-          *dph++ = sp[0] + factor * (sp[-1] + sp[1]);
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = LIFTING_FACTORS::steps[1];
-        sp = src + (even ? 0 : 1);
-        const float* sph = hdst + (even ? 0 : 1);
-        float *dpl = ldst;
-        for (ui32 i = L_width; i > 0; --i, sp+=2, sph++)
-          *dpl++ = sp[0] + factor * (sph[-1] + sph[0]);
-
-        //extension
-        ldst[-1] = ldst[0];
-        ldst[L_width] = ldst[L_width-1];
-        //predict
-        factor = LIFTING_FACTORS::steps[2];
-        const float* spl = ldst + (even ? 1 : 0);
-        dph = hdst;
-        for (ui32 i = H_width; i > 0; --i, spl++)
-          *dph++ += factor * (spl[-1] + spl[0]);
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = LIFTING_FACTORS::steps[3];
-        sph = hdst + (even ? 0 : 1);
-        dpl = ldst;
-        for (ui32 i = L_width; i > 0; --i, sph++)
-          *dpl++ += factor * (sph[-1] + sph[0]);
-
-        //multipliers
-        float *dp = ldst;
-        for (ui32 i = L_width; i > 0; --i, dp++)
-          *dp *= LIFTING_FACTORS::K_inv;
-        dp = hdst;
-        for (ui32 i = H_width; i > 0; --i, dp++)
-          *dp *= LIFTING_FACTORS::K;
-      }
-      else
-      {
-        if (even)
-          line_ldst->f32[0] = line_src->f32[0];
-        else
-          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
-      }
-    }
-
-    /////////////////////////////////////////////////////////////////////////
-    void gen_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                    line_buf *line_hsrc, ui32 width,
-                                    bool even)
-    {
-      if (width > 1)
-      {
-        float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
-        float *dst = line_dst->f32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //multipliers
-        float *dp = lsrc;
-        for (ui32 i = L_width; i > 0; --i, dp++)
-          *dp *= LIFTING_FACTORS::K;
-        dp = hsrc;
-        for (ui32 i = H_width; i > 0; --i, dp++)
-          *dp *= LIFTING_FACTORS::K_inv;
-
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        float factor = LIFTING_FACTORS::steps[7];
-        const float *sph = hsrc + (even ? 0 : 1);
-        float *dpl = lsrc;
-        for (ui32 i = L_width; i > 0; --i, dpl++, sph++)
-          *dpl += factor * (sph[-1] + sph[0]);
-
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict
-        factor = LIFTING_FACTORS::steps[6];
-        const float *spl = lsrc + (even ? 0 : -1);
-        float *dph = hsrc;
-        for (ui32 i = H_width; i > 0; --i, dph++, spl++)
-          *dph += factor * (spl[0] + spl[1]);
-
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = LIFTING_FACTORS::steps[5];
-        sph = hsrc + (even ? 0 : 1);
-        dpl = lsrc;
-        for (ui32 i = L_width; i > 0; --i, dpl++, sph++)
-          *dpl += factor * (sph[-1] + sph[0]);
-
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict and combine
-        factor = LIFTING_FACTORS::steps[4];
-        dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        for (ui32 i = L_width+(even?0:1); i > 0; --i, spl++, sph++)
-        {
-          *dp++ = *spl;
-          *dp++ = *sph + factor * (spl[0] + spl[1]);
-        }
-      }
-      else
-      {
-        if (even)
-          line_dst->f32[0] = line_lsrc->f32[0];
-        else
-          line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
-      }
-    }
-
-
-
-
-
     //////////////////////////////////////////////////////////////////////////
     void gen_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
                                const line_buf* other, const line_buf* aug, 
@@ -833,8 +445,6 @@ namespace ojph {
         else
           hdst->f32[0] = src->f32[0] * 2.0f;
       }
-
-
     }
     
     //////////////////////////////////////////////////////////////////////////
@@ -925,8 +535,6 @@ namespace ojph {
         *dst++ *= K;
     }
 
-
-
 #endif // !OJPH_ENABLE_WASM_SIMD
 
   }
diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h
index b31df0ef..1aae8b82 100644
--- a/src/core/transform/ojph_transform.h
+++ b/src/core/transform/ojph_transform.h
@@ -54,37 +54,6 @@ namespace ojph {
     // Reversible functions
     /////////////////////////////////////////////////////////////////////////
 
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_wvlt_fwd_predict)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_wvlt_fwd_update)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_horz_wvlt_fwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_wvlt_bwd_predict)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_wvlt_bwd_update)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_horz_wvlt_bwd_tx)
-      (line_buf* dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even);
-
-
-
-
     /////////////////////////////////////////////////////////////////////////
     extern void (*rev_vert_ana_step)
       (const lifting_step* s, const line_buf* sig, const line_buf* other,
@@ -107,35 +76,10 @@ namespace ojph {
 
 
 
-
-
     /////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     /////////////////////////////////////////////////////////////////////////
 
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*irrev_vert_wvlt_step)
-      (const line_buf* src1, const line_buf* src2, line_buf *dst,
-       int step_num, ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*irrev_vert_wvlt_K)
-      (const line_buf *src, line_buf *dst, bool L_analysis_or_H_synthesis,
-       ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*irrev_horz_wvlt_fwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even);
-
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*irrev_horz_wvlt_bwd_tx)
-      (line_buf* src, line_buf *ldst, line_buf *hdst, ui32 width, bool even);
-
-
-
-
-
-
     /////////////////////////////////////////////////////////////////////////
     extern void (*irv_vert_ana_step)
       (const lifting_step* s, const line_buf* sig, const line_buf* other, 
@@ -161,8 +105,6 @@ namespace ojph {
       (float K, const line_buf* aug, ui32 repeat);
 
 
-
-
   }
 }
 
diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp
index 725d7ce8..743ceee6 100644
--- a/src/core/transform/ojph_transform_avx.cpp
+++ b/src/core/transform/ojph_transform_avx.cpp
@@ -36,6 +36,7 @@
 //***************************************************************************/
 
 #include <cstdio>
+#include <immintrin.h>
 
 #include "ojph_defs.h"
 #include "ojph_arch.h"
@@ -43,22 +44,23 @@
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
-#include <immintrin.h>
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
 
 namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_vert_wvlt_step(const line_buf* line_src1,
-                                  const line_buf* line_src2,
-                                  line_buf *line_dst, int step_num,
-                                  ui32 repeat)
+    void avx_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                               const line_buf* other, const line_buf* aug, 
+                               ui32 repeat)
     {
-      float *dst = line_dst->f32;
-      const float *src1 = line_src1->f32, *src2 = line_src2->f32;
-    
-      __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[step_num]);
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
+      __m256 factor = _mm256_set1_ps(s->irv.Aatk);
+
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
+      repeat = (repeat + 7) >> 3;
+      for (ui32 i = repeat; i > 0; --i, dst += 8, src1 += 8, src2 += 8)
       {
         __m256 s1 = _mm256_load_ps(src1);
         __m256 s2 = _mm256_load_ps(src2);
@@ -69,261 +71,261 @@ namespace ojph {
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void avx_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat)
-    {
-      float *dst = line_dst->f32;
-      const float *src = line_src->f32;
-
-      float f = LIFTING_FACTORS::K_inv;
-      f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
-      __m256 factor = _mm256_set1_ps(f);
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src+=8)
-      {
-        __m256 s = _mm256_load_ps(src);
-        _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
-      }
-    }
-
-
-    /////////////////////////////////////////////////////////////////////////
-    void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
-                                    line_buf *line_hdst, ui32 width,
-                                    bool even)
+    void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *src = line_src->f32;
-        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+        // split src into ldst and hdst
+        if (even)
+        {
+          float* dph = hdst->f32;
+          float* dpl = ldst->f32;
+          float* sp = src->f32;
 
-        //extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const float* sp = src + (even ? 1 : 0);
-        float *dph = hdst;
-        __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[0]);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          __m256 s1 = _mm256_loadu_ps(sp - 1);
-          __m256 s2 = _mm256_loadu_ps(sp + 1);
-          __m256 d = _mm256_loadu_ps(sp);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          __m256 d1 = _mm256_add_ps(d, s1);
-          sp += 8;
-          __m128 t1 = _mm256_extractf128_ps(d1, 0);
-          __m128 t2 = _mm256_extractf128_ps(d1, 1);
-          __m128 t = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
-          _mm_store_ps(dph, t);
-          dph += 4;
+          for (int i = width; i > 0; i -= 16, sp += 16, dpl += 8, dph += 8)
+          {
+             __m256 a = _mm256_load_ps(sp);
+             __m256 b = _mm256_load_ps(sp + 8);
+             __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+             __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+             __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+             __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+             _mm256_store_ps(dpl, e);
+             _mm256_store_ps(dph, f);
+          }
         }
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        __m128 factor128 = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
-        sp = src + (even ? 0 : 1);
-        const float* sph = hdst + (even ? 0 : 1);
-        float *dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
+        else
         {
-          __m256 d1 = _mm256_loadu_ps(sp); //is there an advantage here?
-          __m128 t1 = _mm256_extractf128_ps(d1, 0);
-          __m128 t2 = _mm256_extractf128_ps(d1, 1);
-          __m128 d = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
+          float* dph = hdst->f32;
+          float* dpl = ldst->f32;
+          float* sp = src->f32;
 
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          s1 = _mm_mul_ps(factor128, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
+          for (int i = width; i > 0; i -= 16, sp += 16, dpl += 8, dph += 8)
+          {
+            __m256 a = _mm256_load_ps(sp);
+            __m256 b = _mm256_load_ps(sp + 8);
+            __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+            __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+            __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+            __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+            _mm256_store_ps(dpl, f);
+            _mm256_store_ps(dph, e);
+          }
         }
 
-        //extension
-        ldst[-1] = ldst[0];
-        ldst[L_width] = ldst[L_width-1];
-        //predict
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[2]);
-        const float* spl = ldst + (even ? 1 : 0);
-        dph = hdst;
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, spl+=8, dph+=8)
+        // the actual horizontal transform
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          __m256 s1 = _mm256_loadu_ps(spl - 1);
-          __m256 s2 = _mm256_loadu_ps(spl);
-          __m256 d = _mm256_loadu_ps(dph);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dph, d);
-        }
+          // fetch step j - 1; steps are visited from last to first
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[3]);
-        sph = hdst + (even ? 0 : 1);
-        dpl = ldst;
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
-        {
-          __m256 s1 = _mm256_loadu_ps(sph - 1);
-          __m256 s2 = _mm256_loadu_ps(sph);
-          __m256 d = _mm256_loadu_ps(dpl);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dpl, d);
-        }
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const float* sp = lp + (even ? 1 : 0);
+          float* dp = hp;
+          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+            *dp += a * (sp[-1] + sp[0]);
 
-        //multipliers
-        float *dp = ldst;
-        factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
-        {
-          __m256 d = _mm256_load_ps(dp);
-          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
+          // swap buffers
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
-        dp = hdst;
-        factor = _mm256_set1_ps(LIFTING_FACTORS::K);
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
-        {
-          __m256 d = _mm256_load_ps(dp);
-          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          float* dp;
+          __m256 factor;
+
+          factor = _mm256_set1_ps(K_inv);
+          dp = lp;
+          for (ui32 i = (l_width + 7) >> 3; i > 0; --i, dp += 8)
+          {
+            __m256 s = _mm256_load_ps(dp);
+            _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
+          }
+
+          factor = _mm256_set1_ps(K);
+          dp = hp;
+          for (ui32 i = (h_width + 7) >> 3; i > 0; --i, dp += 8)
+          {
+            __m256 s = _mm256_load_ps(dp);
+            _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
+          }
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_ldst->f32[0] = line_src->f32[0];
+          ldst->f32[0] = src->f32[0];
         else
-          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
+          hdst->f32[0] = src->f32[0] * 2.0f;
       }
     }
+    
+    //////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_syn_step(const lifting_step* s, const line_buf* aug, 
+                               const line_buf* sig, const line_buf* other, 
+                               ui32 repeat)
+    {
+      __m256 factor = _mm256_set1_ps(s->irv.Aatk);
 
-    /////////////////////////////////////////////////////////////////////////
-    void avx_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                    line_buf *line_hsrc, ui32 width,
-                                    bool even)
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
+      repeat = (repeat + 7) >> 3;
+      for (ui32 i = repeat; i > 0; --i, dst += 8, src1 += 8, src2 += 8)
+      {
+        __m256 s1 = _mm256_load_ps(src1);
+        __m256 s2 = _mm256_load_ps(src2);
+        __m256 d  = _mm256_load_ps(dst);
+        d = _mm256_sub_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
+        _mm256_store_ps(dst, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
-        float *dst = line_dst->f32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+        bool ev = even;
+        float* oth = hsrc->f32, * aug = lsrc->f32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
 
-        //multipliers
-        float *dp = lsrc;
-        __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::K);
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
-        {
-          __m256 d = _mm256_load_ps(dp);
-          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
-        }
-        dp = hsrc;
-        factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
-        {
-          __m256 d = _mm256_load_ps(dp);
-          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
-        }
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          float* dp;
+          __m256 factor;
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[7]);
-        const float *sph = hsrc + (even ? 0 : 1);
-        float *dpl = lsrc;
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
-        {
-          __m256 s1 = _mm256_loadu_ps(sph - 1);
-          __m256 s2 = _mm256_loadu_ps(sph);
-          __m256 d = _mm256_loadu_ps(dpl);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dpl, d);
+          factor = _mm256_set1_ps(K);
+          dp = aug;
+          for (ui32 i = (aug_width + 7) >> 3; i > 0; --i, dp += 8)
+          {
+            __m256 s = _mm256_load_ps(dp);
+            _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
+          }
+
+          factor = _mm256_set1_ps(K_inv);
+          dp = oth;
+          for (ui32 i = (oth_width + 7) >> 3; i > 0; --i, dp += 8)
+          {
+            __m256 s = _mm256_load_ps(dp);
+            _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
+          }
         }
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[6]);
-        const float *spl = lsrc + (even ? 0 : -1);
-        float *dph = hsrc;
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8, spl+=8)
+        // the actual horizontal transform
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          __m256 s1 = _mm256_loadu_ps(spl);
-          __m256 s2 = _mm256_loadu_ps(spl + 1);
-          __m256 d = _mm256_loadu_ps(dph);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dph, d);
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const float* sp = oth + (ev ? 0 : 1);
+          float* dp = aug;
+          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+            *dp -= a * (sp[-1] + sp[0]);
+
+          // swap buffers
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[5]);
-        sph = hsrc + (even ? 0 : 1);
-        dpl = lsrc;
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dpl+=8, sph+=8)
+        // combine both lsrc and hsrc into dst
+        if (even)
         {
-          __m256 s1 = _mm256_loadu_ps(sph - 1);
-          __m256 s2 = _mm256_loadu_ps(sph);
-          __m256 d = _mm256_loadu_ps(dpl);
-          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s1);
-          _mm256_store_ps(dpl, d);
+          float* sph = hsrc->f32;
+          float* spl = lsrc->f32;
+          float* dp = dst->f32;
+          int i = width;
+          for ( ; i >= 8; i -= 16, dp += 16, spl += 8, sph += 8)
+          {
+            __m256 a = _mm256_load_ps(spl);
+            __m256 b = _mm256_load_ps(sph);
+            __m256 c = _mm256_unpacklo_ps(a, b);
+            __m256 d = _mm256_unpackhi_ps(a, b);
+            __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+            __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+            _mm256_store_ps(dp, e);
+            _mm256_store_ps(dp + 8, f);
+          }
+          for (; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
+          {
+            __m128 a = _mm_load_ps(spl);
+            __m128 b = _mm_load_ps(sph);
+            __m128 c = _mm_unpacklo_ps(a, b);
+            __m128 d = _mm_unpackhi_ps(a, b);
+            _mm_store_ps(dp, c);
+            _mm_store_ps(dp + 4, d);
+          }
         }
-
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict and combine
-        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[4]);
-        dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 7) >> 3; i > 0; --i, spl+=8, sph+=8)
+        else
         {
-          __m256 s1 = _mm256_loadu_ps(spl);
-          __m256 s2 = _mm256_loadu_ps(spl + 1);
-          __m256 d = _mm256_load_ps(sph);
-          s2 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
-          d = _mm256_add_ps(d, s2);
-
-          __m128 a0 = _mm256_extractf128_ps(s1, 0);
-          __m128 a1 = _mm256_extractf128_ps(s1, 1);
-          __m128 a2 = _mm256_extractf128_ps(d, 0);
-          __m128 a3 = _mm256_extractf128_ps(d, 1);
-          _mm_storeu_ps(dp, _mm_unpacklo_ps(a0, a2)); dp += 4;
-          _mm_storeu_ps(dp, _mm_unpackhi_ps(a0, a2)); dp += 4;
-          _mm_storeu_ps(dp, _mm_unpacklo_ps(a1, a3)); dp += 4;
-          _mm_storeu_ps(dp, _mm_unpackhi_ps(a1, a3)); dp += 4;
-
-//          s2 = _mm256_unpackhi_ps(s1, d);
-//          s1 = _mm256_unpacklo_ps(s1, d);
-//          d = _mm256_permute2f128_ps(s1, s2, (2 << 4) | 0);
-//          _mm256_storeu_ps(dp, d);
-//          d = _mm256_permute2f128_ps(s1, s2, (3 << 4) | 1);
-//          _mm256_storeu_ps(dp + 1, d);
+          float* sph = hsrc->f32;
+          float* spl = lsrc->f32;
+          float* dp = dst->f32;
+          int i = width;
+          for (; i >= 8; i -= 16, dp += 16, spl += 8, sph += 8)
+          { // i>=8 because we can exceed the aligned buffer by up to 7
+            __m256 a = _mm256_load_ps(spl);
+            __m256 b = _mm256_load_ps(sph);
+            __m256 c = _mm256_unpacklo_ps(b, a);
+            __m256 d = _mm256_unpackhi_ps(b, a);
+            __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+            __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+            _mm256_store_ps(dp, e);
+            _mm256_store_ps(dp + 8, f);
+          }
+          for (; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
+          {
+            __m128 a = _mm_load_ps(spl);
+            __m128 b = _mm_load_ps(sph);
+            __m128 c = _mm_unpacklo_ps(b, a);
+            __m128 d = _mm_unpackhi_ps(b, a);
+            _mm_store_ps(dp, c);
+            _mm_store_ps(dp + 4, d);
+          }
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_dst->f32[0] = line_lsrc->f32[0];
+          dst->f32[0] = lsrc->f32[0];
         else
-          line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
+          dst->f32[0] = hsrc->f32[0] * 0.5f;
       }
     }
-  }
-}
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    {
+      __m256 factor = _mm256_set1_ps(K);
+      float* dst = aug->f32;
+      repeat = (repeat + 7) >> 3;
+      for (ui32 i = repeat; i > 0; --i, dst += 8 )
+      {
+        __m256 s = _mm256_load_ps(dst);
+        _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
+      }
+    }
+
+
+  } // !local
+} // !ojph
diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h
index c484d279..816e9e8b 100644
--- a/src/core/transform/ojph_transform_local.h
+++ b/src/core/transform/ojph_transform_local.h
@@ -46,14 +46,6 @@ namespace ojph {
   namespace local {
     struct param_atk;
 
-    //////////////////////////////////////////////////////////////////////////
-    struct LIFTING_FACTORS
-    {
-      static const float steps[8];
-      static const float K;
-      static const float K_inv;
-    };
-
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -66,38 +58,6 @@ namespace ojph {
     // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_fwd_predict(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_fwd_update(const line_buf* src1,
-                                      const line_buf* src2,
-                                      line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                  line_buf *hdst, ui32 width, bool even);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_bwd_predict(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_wvlt_bwd_update(const line_buf* src1,
-                                      const line_buf* src2,
-                                      line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_wvlt_bwd_tx(line_buf* dst, line_buf *lsrc,
-                                  line_buf *hsrc, ui32 width, bool even);
-
-
-
-
-
     /////////////////////////////////////////////////////////////////////////
     void gen_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
                                const line_buf* other, const line_buf* aug, 
@@ -118,33 +78,10 @@ namespace ojph {
                           const line_buf* lsrc, const line_buf* hsrc, 
                           ui32 width, bool even);
 
-
-
-
-
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_vert_wvlt_step(const line_buf* src1, const line_buf* src2,
-                                  line_buf *dst, int step_num, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irrev_horz_wvlt_bwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
-
-
-
-
     /////////////////////////////////////////////////////////////////////////
     void gen_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
                                const line_buf* other, const line_buf* aug, 
@@ -168,10 +105,6 @@ namespace ojph {
     /////////////////////////////////////////////////////////////////////////
     void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
-
-
-
-
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -184,21 +117,28 @@ namespace ojph {
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_vert_wvlt_step(const line_buf* src1, const line_buf* src2,
-                                  line_buf *dst, int step_num, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                               const line_buf* other, const line_buf* aug, 
+                               ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                               const line_buf* sig, const line_buf* other,
+                               ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_horz_wvlt_bwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                          const line_buf *lsrc, const line_buf *hsrc, 
+                          ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -212,33 +152,25 @@ namespace ojph {
     // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_fwd_predict(const line_buf* src1,
-                                        const line_buf* src2,
-                                        line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_fwd_update(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                   line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                                const line_buf* other, const line_buf* aug, 
+                                ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_bwd_predict(const line_buf* src1,
-                                        const line_buf* src2,
-                                        line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_bwd_update(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                                const line_buf* sig, const line_buf* other, 
+                                ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_wvlt_bwd_tx(line_buf* dst, line_buf *lsrc,
-                                   line_buf *hsrc, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even);
 
 
     //////////////////////////////////////////////////////////////////////////
@@ -253,21 +185,28 @@ namespace ojph {
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_vert_wvlt_step(const line_buf* src1, const line_buf* src2,
-                                  line_buf *dst, int step_num, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                               const line_buf* other, const line_buf* aug, 
+                               ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                               const line_buf* sig, const line_buf* other,
+                               ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irrev_horz_wvlt_bwd_tx(line_buf* src, line_buf *ldst,
-                                    line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                          const line_buf *lsrc, const line_buf *hsrc, 
+                          ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -281,33 +220,85 @@ namespace ojph {
     // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_fwd_predict(const line_buf* src1,
-                                        const line_buf* src2,
-                                        line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                                const line_buf* other, const line_buf* aug, 
+                                ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_fwd_update(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_wvlt_fwd_tx(line_buf* src, line_buf *ldst,
-                                   line_buf *hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                                const line_buf* sig, const line_buf* other, 
+                                ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_bwd_predict(const line_buf* src1,
-                                        const line_buf* src2,
-                                        line_buf *dst, ui32 repeat);
+    //
+    //
+    //                        AVX512 Functions
+    //
+    //
+    //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_bwd_update(const line_buf* src1,
-                                       const line_buf* src2,
-                                       line_buf *dst, ui32 repeat);
+    // Irreversible functions
+    //////////////////////////////////////////////////////////////////////////
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                                  const line_buf* other, const line_buf* aug, 
+                                  ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                                  const line_buf* sig, const line_buf* other,
+                                  ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                             const line_buf *lsrc, const line_buf *hsrc, 
+                             ui32 width, bool even);
 
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // Reversible functions
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_wvlt_bwd_tx(line_buf* dst, line_buf *lsrc,
-                                   line_buf *hsrc, ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_vert_ana_step(const lifting_step* s, const line_buf* sig,
+                                  const line_buf* other, const line_buf* aug, 
+                                  ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                                  const line_buf* sig, const line_buf* other, 
+                                  ui32 repeat);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -321,57 +312,52 @@ namespace ojph {
     // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_fwd_predict(const line_buf *line_src1, 
-                                        const line_buf *line_src2,
-                                        line_buf *line_dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_fwd_update(const line_buf *line_src1, 
-                                       const line_buf *line_src2,
-                                       line_buf *line_dst, ui32 repeat);
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                   line_buf *line_hdst, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_vert_ana_step(const lifting_step* s, const line_buf* sig,
+                                const line_buf* other, const line_buf* aug, 
+                                ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_bwd_predict(const line_buf *line_src1, 
-                                        const line_buf *line_src2,
-                                        line_buf *line_dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_bwd_update(const line_buf *line_src1, 
-                                       const line_buf *line_src2,
-                                       line_buf *line_dst, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                                const line_buf* sig, const line_buf* other, 
+                                ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, 
-                                   line_buf *line_hsrc, ui32 width, bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_vert_wvlt_step(const line_buf* line_src1, 
-                                   const line_buf* line_src2,
-                                   line_buf *line_dst, int step_num, 
-                                   ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                                const line_buf* other, const line_buf* aug, 
+                                ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst,
-                                bool L_analysis_or_H_synthesis, ui32 repeat);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                     line_buf *line_hdst, ui32 width, 
-                                     bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
+                                const line_buf* sig, const line_buf* other,
+                                ui32 repeat);
 
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_horz_wvlt_bwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                     line_buf *line_hdst, ui32 width, 
-                                     bool even);
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                           const line_buf *lsrc, const line_buf *hsrc, 
+                           ui32 width, bool even);
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
   }
 }
 
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index c299bc8d..281ff4a6 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -36,6 +36,7 @@
 //***************************************************************************/
 
 #include <cstdio>
+#include <xmmintrin.h>
 
 #include "ojph_defs.h"
 #include "ojph_arch.h"
@@ -43,273 +44,265 @@
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
-#include <immintrin.h>
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
 
 namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void sse_irrev_vert_wvlt_step(const line_buf* line_src1,
-                                  const line_buf* line_src2,
-                                  line_buf *line_dst,
-                                  int step_num, ui32 repeat)
+    void sse_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
+                               const line_buf* other, const line_buf* aug, 
+                               ui32 repeat)
     {
-      float *dst = line_dst->f32;
-      const float *src1 = line_src1->f32, *src2 = line_src2->f32;
+      __m128 factor = _mm_set1_ps(s->irv.Aatk);
 
-      __m128 factor = _mm_set1_ps(LIFTING_FACTORS::steps[step_num]);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
+      repeat = (repeat + 3) >> 2;
+      for (ui32 i = repeat; i > 0; --i, dst += 4, src1 += 4, src2 += 4)
       {
         __m128 s1 = _mm_load_ps(src1);
         __m128 s2 = _mm_load_ps(src2);
-        __m128 d = _mm_load_ps(dst);
+        __m128 d  = _mm_load_ps(dst);
         d = _mm_add_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
         _mm_store_ps(dst, d);
       }
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void sse_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
-                               bool L_analysis_or_H_synthesis, ui32 repeat)
-    {
-      float *dst = line_dst->f32;
-      const float *src = line_src->f32;
-
-      float f = LIFTING_FACTORS::K_inv;
-      f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
-      __m128 factor = _mm_set1_ps(f);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src+=4)
-      {
-        __m128 s = _mm_load_ps(src);
-        _mm_store_ps(dst, _mm_mul_ps(factor, s));
-      }
-    }
-
-    /////////////////////////////////////////////////////////////////////////
-    void sse_irrev_horz_wvlt_fwd_tx(line_buf* line_src, line_buf *line_ldst,
-                                    line_buf *line_hdst, ui32 width,
-                                    bool even)
+    void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *src = line_src->f32;
-        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
+        // split src into ldst and hdst
+        if (even)
+        {
+          float* dph = hdst->f32;
+          float* dpl = ldst->f32;
+          float* sp = src->f32;
 
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+          for (int i = width; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
+          {
+            __m128 a = _mm_load_ps(sp);
+            __m128 b = _mm_load_ps(sp + 4);
 
-        //extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const float* sp = src + (even ? 1 : 0);
-        float *dph = hdst;
-        __m128 factor = _mm_set1_ps(LIFTING_FACTORS::steps[0]);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          __m128 s1 = _mm_loadu_ps(sp - 1);
-          __m128 s2 = _mm_loadu_ps(sp + 1);
-          __m128 d = _mm_loadu_ps(sp);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          __m128 d1 = _mm_add_ps(d, s1);
-          sp += 4;
-          s1 = _mm_loadu_ps(sp - 1);
-          s2 = _mm_loadu_ps(sp + 1);
-          d = _mm_loadu_ps(sp);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          __m128 d2 = _mm_add_ps(d, s1);
-          sp += 4;
-          d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
-          _mm_store_ps(dph, d);
-        }
+            __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+            __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
-        sp = src + (even ? 0 : 1);
-        const float* sph = hdst + (even ? 0 : 1);
-        float *dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
-        {
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          __m128 d1 = _mm_loadu_ps(sp);
-          __m128 d2 = _mm_loadu_ps(sp + 4);
-          __m128 d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
+            _mm_store_ps(dpl, c);
+            _mm_store_ps(dph, d);
+          }
         }
-
-        //extension
-        ldst[-1] = ldst[0];
-        ldst[L_width] = ldst[L_width-1];
-        //predict
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[2]);
-        const float* spl = ldst + (even ? 1 : 0);
-        dph = hdst;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4)
+        else
         {
-          __m128 s1 = _mm_loadu_ps(spl - 1);
-          __m128 s2 = _mm_loadu_ps(spl);
-          __m128 d = _mm_loadu_ps(dph);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dph, d);
-        }
+          float* dph = hdst->f32;
+          float* dpl = ldst->f32;
+          float* sp = src->f32;
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[3]);
-        sph = hdst + (even ? 0 : 1);
-        dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4)
-        {
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          __m128 d = _mm_loadu_ps(dpl);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
+          for (int i = width; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
+          {
+            __m128 a = _mm_load_ps(sp);
+            __m128 b = _mm_load_ps(sp + 4);
+
+            __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+            __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+
+            _mm_store_ps(dpl, d);
+            _mm_store_ps(dph, c);
+          }
         }
 
-        //multipliers
-        float *dp = ldst;
-        factor = _mm_set1_ps(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
+        // the actual horizontal transform
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          __m128 d = _mm_load_ps(dp);
-          _mm_store_ps(dp, _mm_mul_ps(factor, d));
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const float* sp = lp + (even ? 1 : 0);
+          float* dp = hp;
+          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+            *dp += a * (sp[-1] + sp[0]);
+
+          // swap buffers
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
-        dp = hdst;
-        factor = _mm_set1_ps(LIFTING_FACTORS::K);
-        for (int i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          __m128 d = _mm_load_ps(dp);
-          _mm_store_ps(dp, _mm_mul_ps(factor, d));
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          float* dp;
+          __m128 factor;
+
+          factor = _mm_set1_ps(K_inv);
+          dp = lp;
+          for (ui32 i = (l_width + 3) >> 2; i > 0; --i, dp += 4)
+          {
+            __m128 s = _mm_load_ps(dp);
+            _mm_store_ps(dp, _mm_mul_ps(factor, s));
+          }
+
+          factor = _mm_set1_ps(K);
+          dp = hp;
+          for (ui32 i = (h_width + 3) >> 2; i > 0; --i, dp += 4)
+          {
+            __m128 s = _mm_load_ps(dp);
+            _mm_store_ps(dp, _mm_mul_ps(factor, s));
+          }
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_ldst->f32[0] = line_src->f32[0];
+          ldst->f32[0] = src->f32[0];
         else
-          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
+          hdst->f32[0] = src->f32[0] * 2.0f;
       }
     }
+    
+    //////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_syn_step(const lifting_step* s, const line_buf* aug, 
+                               const line_buf* sig, const line_buf* other, 
+                               ui32 repeat)
+    {
+      __m128 factor = _mm_set1_ps(s->irv.Aatk);
 
-    /////////////////////////////////////////////////////////////////////////
-    void sse_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                    line_buf *line_hsrc, ui32 width,
-                                    bool even)
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
+      repeat = (repeat + 3) >> 2;
+      for (ui32 i = repeat; i > 0; --i, dst += 4, src1 += 4, src2 += 4)
+      {
+        __m128 s1 = _mm_load_ps(src1);
+        __m128 s2 = _mm_load_ps(src2);
+        __m128 d  = _mm_load_ps(dst);
+        d = _mm_sub_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
+        _mm_store_ps(dst, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
+                          ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
-        float *dst = line_dst->f32;
+        bool ev = even;
+        float* oth = hsrc->f32, * aug = lsrc->f32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
 
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          float* dp;
+          __m128 factor;
 
-        //multipliers
-        float *dp = lsrc;
-        __m128 factor = _mm_set1_ps(LIFTING_FACTORS::K);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          __m128 d = _mm_load_ps(dp);
-          _mm_store_ps(dp, _mm_mul_ps(factor, d));
-        }
-        dp = hsrc;
-        factor = _mm_set1_ps(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          __m128 d = _mm_load_ps(dp);
-          _mm_store_ps(dp, _mm_mul_ps(factor, d));
-        }
+          factor = _mm_set1_ps(K);
+          dp = aug;
+          for (ui32 i = (aug_width + 3) >> 2; i > 0; --i, dp += 4)
+          {
+            __m128 s = _mm_load_ps(dp);
+            _mm_store_ps(dp, _mm_mul_ps(factor, s));
+          }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[7]);
-        const float *sph = hsrc + (even ? 0 : 1);
-        float *dpl = lsrc;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
-        {
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          __m128 d = _mm_loadu_ps(dpl);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
+          factor = _mm_set1_ps(K_inv);
+          dp = oth;
+          for (ui32 i = (oth_width + 3) >> 2; i > 0; --i, dp += 4)
+          {
+            __m128 s = _mm_load_ps(dp);
+            _mm_store_ps(dp, _mm_mul_ps(factor, s));
+          }
         }
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[6]);
-        const float *spl = lsrc + (even ? 0 : -1);
-        float *dph = hsrc;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4)
+        // the actual horizontal transform
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          __m128 s1 = _mm_loadu_ps(spl);
-          __m128 s2 = _mm_loadu_ps(spl + 1);
-          __m128 d = _mm_loadu_ps(dph);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dph, d);
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const float* sp = oth + (ev ? 0 : 1);
+          float* dp = aug;
+          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+            *dp -= a * (sp[-1] + sp[0]);
+
+          // swap buffers
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[5]);
-        sph = hsrc + (even ? 0 : 1);
-        dpl = lsrc;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
+        // combine both lsrc and hsrc into dst
+        if (even)
         {
-          __m128 s1 = _mm_loadu_ps(sph - 1);
-          __m128 s2 = _mm_loadu_ps(sph);
-          __m128 d = _mm_loadu_ps(dpl);
-          s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s1);
-          _mm_store_ps(dpl, d);
+          float* sph = hsrc->f32;
+          float* spl = lsrc->f32;
+          float* dp = dst->f32;
+          int i = width;
+          for (; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
+          {
+            __m128 a = _mm_load_ps(spl);
+            __m128 b = _mm_load_ps(sph);
+            __m128 c = _mm_unpacklo_ps(a, b);
+            __m128 d = _mm_unpackhi_ps(a, b);
+            _mm_store_ps(dp, c);
+            _mm_store_ps(dp + 4, d);
+          }
         }
-
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict and combine
-        factor = _mm_set1_ps(LIFTING_FACTORS::steps[4]);
-        dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8)
+        else
         {
-          __m128 s1 = _mm_loadu_ps(spl);
-          __m128 s2 = _mm_loadu_ps(spl + 1);
-          __m128 d = _mm_load_ps(sph);
-          s2 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
-          d = _mm_add_ps(d, s2);
-          _mm_storeu_ps(dp, _mm_unpacklo_ps(s1, d));
-          _mm_storeu_ps(dp + 4, _mm_unpackhi_ps(s1, d));
+          float* sph = hsrc->f32;
+          float* spl = lsrc->f32;
+          float* dp = dst->f32;
+          int i = width;
+          for (; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
+          {
+            __m128 a = _mm_load_ps(spl);
+            __m128 b = _mm_load_ps(sph);
+            __m128 c = _mm_unpacklo_ps(b, a);
+            __m128 d = _mm_unpackhi_ps(b, a);
+            _mm_store_ps(dp, c);
+            _mm_store_ps(dp + 4, d);
+          }
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_dst->f32[0] = line_lsrc->f32[0];
+          dst->f32[0] = lsrc->f32[0];
         else
-          line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
+          dst->f32[0] = hsrc->f32[0] * 0.5f;
       }
     }
-  }
-}
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    {
+      __m128 factor = _mm_set1_ps(K);
+      float* dst = aug->f32;
+      repeat = (repeat + 3) >> 2;
+      for (ui32 i = repeat; i > 0; --i, dst += 4)
+      {
+        __m128 s = _mm_load_ps(dst);
+        _mm_store_ps(dst, _mm_mul_ps(factor, s));
+      }
+    }
+
+  } // !local
+} // !ojph
diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp
index a607441a..5f3de49d 100644
--- a/src/core/transform/ojph_transform_sse2.cpp
+++ b/src/core/transform/ojph_transform_sse2.cpp
@@ -43,7 +43,7 @@
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
-#include <immintrin.h>
+#include <emmintrin.h>
 
 namespace ojph {
   namespace local {

From fe24e552cbec80c1fe0b990134fa7f87bda97579 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Thu, 11 Apr 2024 10:34:50 +1000
Subject: [PATCH 23/37] Editorial + compilation fix + a potential bug fix

---
 src/core/transform/ojph_transform_avx.cpp | 58 ++++++++++++++++-------
 src/core/transform/ojph_transform_sse.cpp | 42 ++++++++--------
 2 files changed, 62 insertions(+), 38 deletions(-)

diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp
index 743ceee6..81fc6c43 100644
--- a/src/core/transform/ojph_transform_avx.cpp
+++ b/src/core/transform/ojph_transform_avx.cpp
@@ -59,8 +59,8 @@ namespace ojph {
 
       float* dst = aug->f32;
       const float* src1 = sig->f32, * src2 = other->f32;
-      repeat = (repeat + 7) >> 3;
-      for (ui32 i = repeat; i > 0; --i, dst += 8, src1 += 8, src2 += 8)
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
       {
         __m256 s1 = _mm256_load_ps(src1);
         __m256 s2 = _mm256_load_ps(src2);
@@ -83,8 +83,8 @@ namespace ojph {
           float* dph = hdst->f32;
           float* dpl = ldst->f32;
           float* sp = src->f32;
-
-          for (int i = width; i > 0; i -= 16, sp += 16, dpl += 8, dph += 8)
+          int i = (int)width;
+          for ( ; i > 8; i -= 16, sp += 16, dpl += 8, dph += 8)
           {
              __m256 a = _mm256_load_ps(sp);
              __m256 b = _mm256_load_ps(sp + 8);
@@ -95,14 +95,23 @@ namespace ojph {
              _mm256_store_ps(dpl, e);
              _mm256_store_ps(dph, f);
           }
+          for (; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
+          {
+            __m128 a = _mm_load_ps(sp);
+            __m128 b = _mm_load_ps(sp + 4);
+            __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+            __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+            _mm_store_ps(dpl, c);
+            _mm_store_ps(dph, d);
+          }
         }
         else
         {
           float* dph = hdst->f32;
           float* dpl = ldst->f32;
           float* sp = src->f32;
-
-          for (int i = width; i > 0; i -= 16, sp += 16, dpl += 8, dph += 8)
+          int i = (int)width;
+          for ( ; i > 8; i -= 16, sp += 16, dpl += 8, dph += 8)
           {
             __m256 a = _mm256_load_ps(sp);
             __m256 b = _mm256_load_ps(sp + 8);
@@ -113,6 +122,15 @@ namespace ojph {
             _mm256_store_ps(dpl, f);
             _mm256_store_ps(dph, e);
           }
+          for (; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
+          {
+            __m128 a = _mm_load_ps(sp);
+            __m128 b = _mm_load_ps(sp + 4);
+            __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+            __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+            _mm_store_ps(dpl, d);
+            _mm_store_ps(dph, c);
+          }
         }
 
         // the actual horizontal transform
@@ -149,7 +167,8 @@ namespace ojph {
 
           factor = _mm256_set1_ps(K_inv);
           dp = lp;
-          for (ui32 i = (l_width + 7) >> 3; i > 0; --i, dp += 8)
+          int i = (int)l_width;
+          for ( ; i > 0; i -= 8, dp += 8)
           {
             __m256 s = _mm256_load_ps(dp);
             _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
@@ -157,7 +176,8 @@ namespace ojph {
 
           factor = _mm256_set1_ps(K);
           dp = hp;
-          for (ui32 i = (h_width + 7) >> 3; i > 0; --i, dp += 8)
+          int i = (int)h_width;
+          for ( ; i > 0; i -= 8, dp += 8)
           {
             __m256 s = _mm256_load_ps(dp);
             _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
@@ -181,8 +201,8 @@ namespace ojph {
 
       float* dst = aug->f32;
       const float* src1 = sig->f32, * src2 = other->f32;
-      repeat = (repeat + 7) >> 3;
-      for (ui32 i = repeat; i > 0; --i, dst += 8, src1 += 8, src2 += 8)
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
       {
         __m256 s1 = _mm256_load_ps(src1);
         __m256 s2 = _mm256_load_ps(src2);
@@ -212,7 +232,8 @@ namespace ojph {
 
           factor = _mm256_set1_ps(K);
           dp = aug;
-          for (ui32 i = (aug_width + 7) >> 3; i > 0; --i, dp += 8)
+          int i = (int)aug_width;
+          for ( ; i > 0; i -= 8, dp += 8)
           {
             __m256 s = _mm256_load_ps(dp);
             _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
@@ -220,7 +241,8 @@ namespace ojph {
 
           factor = _mm256_set1_ps(K_inv);
           dp = oth;
-          for (ui32 i = (oth_width + 7) >> 3; i > 0; --i, dp += 8)
+          int i = (int)oth_width;
+          for ( ; i > 0; i -= 8, dp += 8)
           {
             __m256 s = _mm256_load_ps(dp);
             _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
@@ -255,8 +277,8 @@ namespace ojph {
           float* sph = hsrc->f32;
           float* spl = lsrc->f32;
           float* dp = dst->f32;
-          int i = width;
-          for ( ; i >= 8; i -= 16, dp += 16, spl += 8, sph += 8)
+          int i = (int)width;
+          for ( ; i > 8; i -= 16, dp += 16, spl += 8, sph += 8)
           {
             __m256 a = _mm256_load_ps(spl);
             __m256 b = _mm256_load_ps(sph);
@@ -282,8 +304,8 @@ namespace ojph {
           float* sph = hsrc->f32;
           float* spl = lsrc->f32;
           float* dp = dst->f32;
-          int i = width;
-          for (; i >= 8; i -= 16, dp += 16, spl += 8, sph += 8)
+          int i = (int)width;
+          for (; i > 8; i -= 16, dp += 16, spl += 8, sph += 8)
           { // i>=8 because we can exceed the aligned buffer by up to 7
             __m256 a = _mm256_load_ps(spl);
             __m256 b = _mm256_load_ps(sph);
@@ -318,8 +340,8 @@ namespace ojph {
     {
       __m256 factor = _mm256_set1_ps(K);
       float* dst = aug->f32;
-      repeat = (repeat + 7) >> 3;
-      for (ui32 i = repeat; i > 0; --i, dst += 8 )
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 8, dst += 8 )
       {
         __m256 s = _mm256_load_ps(dst);
         _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index 281ff4a6..3a4d39c8 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -59,8 +59,8 @@ namespace ojph {
 
       float* dst = aug->f32;
       const float* src1 = sig->f32, * src2 = other->f32;
-      repeat = (repeat + 3) >> 2;
-      for (ui32 i = repeat; i > 0; --i, dst += 4, src1 += 4, src2 += 4)
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
       {
         __m128 s1 = _mm_load_ps(src1);
         __m128 s2 = _mm_load_ps(src2);
@@ -84,14 +84,13 @@ namespace ojph {
           float* dpl = ldst->f32;
           float* sp = src->f32;
 
-          for (int i = width; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
+          int i = (int)width;
+          for ( ; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
           {
             __m128 a = _mm_load_ps(sp);
             __m128 b = _mm_load_ps(sp + 4);
-
             __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
             __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
-
             _mm_store_ps(dpl, c);
             _mm_store_ps(dph, d);
           }
@@ -102,14 +101,13 @@ namespace ojph {
           float* dpl = ldst->f32;
           float* sp = src->f32;
 
-          for (int i = width; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
+          int i = (int)width;
+          for ( ; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
           {
             __m128 a = _mm_load_ps(sp);
             __m128 b = _mm_load_ps(sp + 4);
-
             __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
             __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
-
             _mm_store_ps(dpl, d);
             _mm_store_ps(dph, c);
           }
@@ -149,7 +147,8 @@ namespace ojph {
 
           factor = _mm_set1_ps(K_inv);
           dp = lp;
-          for (ui32 i = (l_width + 3) >> 2; i > 0; --i, dp += 4)
+          int i = (int)l_width;
+          for ( ; i > 0; i -= 4, dp += 4)
           {
             __m128 s = _mm_load_ps(dp);
             _mm_store_ps(dp, _mm_mul_ps(factor, s));
@@ -157,7 +156,8 @@ namespace ojph {
 
           factor = _mm_set1_ps(K);
           dp = hp;
-          for (ui32 i = (h_width + 3) >> 2; i > 0; --i, dp += 4)
+          int i = (int)h_width;
+          for ( ; i > 0; i -= 4, dp += 4)
           {
             __m128 s = _mm_load_ps(dp);
             _mm_store_ps(dp, _mm_mul_ps(factor, s));
@@ -181,8 +181,8 @@ namespace ojph {
 
       float* dst = aug->f32;
       const float* src1 = sig->f32, * src2 = other->f32;
-      repeat = (repeat + 3) >> 2;
-      for (ui32 i = repeat; i > 0; --i, dst += 4, src1 += 4, src2 += 4)
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
       {
         __m128 s1 = _mm_load_ps(src1);
         __m128 s2 = _mm_load_ps(src2);
@@ -212,7 +212,8 @@ namespace ojph {
 
           factor = _mm_set1_ps(K);
           dp = aug;
-          for (ui32 i = (aug_width + 3) >> 2; i > 0; --i, dp += 4)
+          int i = (int)aug_width;
+          for ( ; i > 0; i -= 4, dp += 4)
           {
             __m128 s = _mm_load_ps(dp);
             _mm_store_ps(dp, _mm_mul_ps(factor, s));
@@ -220,7 +221,8 @@ namespace ojph {
 
           factor = _mm_set1_ps(K_inv);
           dp = oth;
-          for (ui32 i = (oth_width + 3) >> 2; i > 0; --i, dp += 4)
+          int i = (int)oth_width;
+          for ( ; i > 0; i -= 4, dp += 4)
           {
             __m128 s = _mm_load_ps(dp);
             _mm_store_ps(dp, _mm_mul_ps(factor, s));
@@ -255,8 +257,8 @@ namespace ojph {
           float* sph = hsrc->f32;
           float* spl = lsrc->f32;
           float* dp = dst->f32;
-          int i = width;
-          for (; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
+          int i = (int)width;
+          for ( ; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
           {
             __m128 a = _mm_load_ps(spl);
             __m128 b = _mm_load_ps(sph);
@@ -271,8 +273,8 @@ namespace ojph {
           float* sph = hsrc->f32;
           float* spl = lsrc->f32;
           float* dp = dst->f32;
-          int i = width;
-          for (; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
+          int i = (int)width;
+          for ( ; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
           {
             __m128 a = _mm_load_ps(spl);
             __m128 b = _mm_load_ps(sph);
@@ -296,8 +298,8 @@ namespace ojph {
     {
       __m128 factor = _mm_set1_ps(K);
       float* dst = aug->f32;
-      repeat = (repeat + 3) >> 2;
-      for (ui32 i = repeat; i > 0; --i, dst += 4)
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 4, dst += 4)
       {
         __m128 s = _mm_load_ps(dst);
         _mm_store_ps(dst, _mm_mul_ps(factor, s));

From 5e4b627771abd338caecdf9d3088401633b118e7 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Thu, 11 Apr 2024 10:39:32 +1000
Subject: [PATCH 24/37] Syntax fix.

---
 src/core/transform/ojph_transform_avx.cpp | 10 ++++++----
 src/core/transform/ojph_transform_sse.cpp | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp
index 81fc6c43..66e3ec81 100644
--- a/src/core/transform/ojph_transform_avx.cpp
+++ b/src/core/transform/ojph_transform_avx.cpp
@@ -163,11 +163,12 @@ namespace ojph {
           float K = atk->get_K();
           float K_inv = 1.0f / K;
           float* dp;
+          int i;
           __m256 factor;
 
           factor = _mm256_set1_ps(K_inv);
           dp = lp;
-          int i = (int)l_width;
+          i = (int)l_width;
           for ( ; i > 0; i -= 8, dp += 8)
           {
             __m256 s = _mm256_load_ps(dp);
@@ -176,7 +177,7 @@ namespace ojph {
 
           factor = _mm256_set1_ps(K);
           dp = hp;
-          int i = (int)h_width;
+          i = (int)h_width;
           for ( ; i > 0; i -= 8, dp += 8)
           {
             __m256 s = _mm256_load_ps(dp);
@@ -228,11 +229,12 @@ namespace ojph {
           float K = atk->get_K();
           float K_inv = 1.0f / K;
           float* dp;
+          int i;
           __m256 factor;
 
           factor = _mm256_set1_ps(K);
           dp = aug;
-          int i = (int)aug_width;
+          i = (int)aug_width;
           for ( ; i > 0; i -= 8, dp += 8)
           {
             __m256 s = _mm256_load_ps(dp);
@@ -241,7 +243,7 @@ namespace ojph {
 
           factor = _mm256_set1_ps(K_inv);
           dp = oth;
-          int i = (int)oth_width;
+          i = (int)oth_width;
           for ( ; i > 0; i -= 8, dp += 8)
           {
             __m256 s = _mm256_load_ps(dp);
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index 3a4d39c8..39776717 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -143,11 +143,12 @@ namespace ojph {
           float K = atk->get_K();
           float K_inv = 1.0f / K;
           float* dp;
+          int i;
           __m128 factor;
 
           factor = _mm_set1_ps(K_inv);
           dp = lp;
-          int i = (int)l_width;
+          i = (int)l_width;
           for ( ; i > 0; i -= 4, dp += 4)
           {
             __m128 s = _mm_load_ps(dp);
@@ -156,7 +157,7 @@ namespace ojph {
 
           factor = _mm_set1_ps(K);
           dp = hp;
-          int i = (int)h_width;
+          i = (int)h_width;
           for ( ; i > 0; i -= 4, dp += 4)
           {
             __m128 s = _mm_load_ps(dp);
@@ -208,11 +209,12 @@ namespace ojph {
           float K = atk->get_K();
           float K_inv = 1.0f / K;
           float* dp;
+          int i;
           __m128 factor;
 
           factor = _mm_set1_ps(K);
           dp = aug;
-          int i = (int)aug_width;
+          i = (int)aug_width;
           for ( ; i > 0; i -= 4, dp += 4)
           {
             __m128 s = _mm_load_ps(dp);
@@ -221,7 +223,7 @@ namespace ojph {
 
           factor = _mm_set1_ps(K_inv);
           dp = oth;
-          int i = (int)oth_width;
+          i = (int)oth_width;
           for ( ; i > 0; i -= 4, dp += 4)
           {
             __m128 s = _mm_load_ps(dp);

From 4b72faa72d1a4192115f3f77006c1eee5b036c7d Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Thu, 11 Apr 2024 12:52:21 +1000
Subject: [PATCH 25/37] A bug fix.

---
 src/core/codestream/ojph_resolution.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 14743249..3b25009f 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -496,8 +496,6 @@ namespace ojph {
       if (res_num == 0)
       {
         assert(child_res == NULL);
-        assert(bands[0].exists() && !bands[1].exists() 
-          && !bands[2].exists() && !bands[3].exists());
         bands[0].exchange_buf(vert_even ? sig->line : aug->line);
         bands[0].push_line();
         return;

From 1e9bc418b707d5dba6717d4ad92caee54967e5bf Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Thu, 11 Apr 2024 13:03:50 +1000
Subject: [PATCH 26/37] A bug fix

---
 src/core/codestream/ojph_resolution.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 3b25009f..6d6c500f 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -652,8 +652,6 @@ namespace ojph {
       if (res_num == 0)
       {
         assert(child_res == NULL);
-        assert(bands[0].exists() && !bands[1].exists() 
-          && !bands[2].exists() && !bands[3].exists());
         return bands[0].pull_line();
       }
 

From 7faf4576a7940b286917865e1002ae0201665ec9 Mon Sep 17 00:00:00 2001
From: aous72 <aous72@yahoo.com>
Date: Thu, 11 Apr 2024 15:50:58 +1000
Subject: [PATCH 27/37] completed sse and avx.

---
 src/core/transform/ojph_transform_avx.cpp | 56 ++++++++++++++++++++---
 src/core/transform/ojph_transform_sse.cpp | 56 ++++++++++++++++++++---
 2 files changed, 100 insertions(+), 12 deletions(-)

diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp
index 66e3ec81..8499bf19 100644
--- a/src/core/transform/ojph_transform_avx.cpp
+++ b/src/core/transform/ojph_transform_avx.cpp
@@ -148,10 +148,32 @@ namespace ojph {
           lp[-1] = lp[0];
           lp[l_width] = lp[l_width - 1];
           // lifting step
-          const float* sp = lp + (even ? 1 : 0);
+          const float* sp = lp;
           float* dp = hp;
-          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
-            *dp += a * (sp[-1] + sp[0]);
+          int i = (int)h_width;
+          __m256 f = _mm256_set1_ps(a);
+          if (even)
+          {
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m256 m = _mm256_load_ps(sp);
+              __m256 n = _mm256_loadu_ps(sp + 1);
+              __m256 p = _mm256_load_ps(dp);
+              p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
+              _mm256_store_ps(dp, p);
+            }
+          }
+          else
+          {
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m256 m = _mm256_load_ps(sp);
+              __m256 n = _mm256_loadu_ps(sp - 1);
+              __m256 p = _mm256_load_ps(dp);
+              p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
+              _mm256_store_ps(dp, p);
+            }
+          }
 
           // swap buffers
           float* t = lp; lp = hp; hp = t;
@@ -262,10 +284,32 @@ namespace ojph {
           oth[-1] = oth[0];
           oth[oth_width] = oth[oth_width - 1];
           // lifting step
-          const float* sp = oth + (ev ? 0 : 1);
+          const float* sp = oth;
           float* dp = aug;
-          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
-            *dp -= a * (sp[-1] + sp[0]);
+          int i = (int)aug_width;
+          __m256 f = _mm256_set1_ps(a);
+          if (ev)
+          {
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m256 m = _mm256_load_ps(sp);
+              __m256 n = _mm256_loadu_ps(sp - 1);
+              __m256 p = _mm256_load_ps(dp);
+              p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
+              _mm256_store_ps(dp, p);
+            }
+          }
+          else
+          {
+            for (; i > 0; i -= 8, sp += 8, dp += 8)
+            {
+              __m256 m = _mm256_load_ps(sp);
+              __m256 n = _mm256_loadu_ps(sp + 1);
+              __m256 p = _mm256_load_ps(dp);
+              p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
+              _mm256_store_ps(dp, p);
+            }
+          }
 
           // swap buffers
           float* t = aug; aug = oth; oth = t;
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index 39776717..69907841 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -128,10 +128,32 @@ namespace ojph {
           lp[-1] = lp[0];
           lp[l_width] = lp[l_width - 1];
           // lifting step
-          const float* sp = lp + (even ? 1 : 0);
+          const float* sp = lp;
           float* dp = hp;
-          for (ui32 i = h_width; i > 0; --i, sp++, dp++)
-            *dp += a * (sp[-1] + sp[0]);
+          int i = (int)h_width;
+          __m128 f = _mm_set1_ps(a);
+          if (even)
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              __m128 m = _mm_load_ps(sp);
+              __m128 n = _mm_loadu_ps(sp + 1);
+              __m128 p = _mm_load_ps(dp);
+              p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
+              _mm_store_ps(dp, p);
+            }
+          }
+          else
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              __m128 m = _mm_load_ps(sp);
+              __m128 n = _mm_loadu_ps(sp - 1);
+              __m128 p = _mm_load_ps(dp);
+              p = _mm_add_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
+              _mm_store_ps(dp, p);
+            }
+          }
 
           // swap buffers
           float* t = lp; lp = hp; hp = t;
@@ -242,10 +264,32 @@ namespace ojph {
           oth[-1] = oth[0];
           oth[oth_width] = oth[oth_width - 1];
           // lifting step
-          const float* sp = oth + (ev ? 0 : 1);
+          const float* sp = oth;
           float* dp = aug;
-          for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
-            *dp -= a * (sp[-1] + sp[0]);
+          int i = (int)aug_width;
+          __m128 f = _mm_set1_ps(a);
+          if (ev)
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              __m128 m = _mm_load_ps(sp);
+              __m128 n = _mm_loadu_ps(sp - 1);
+              __m128 p = _mm_load_ps(dp);
+              p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
+              _mm_store_ps(dp, p);
+            }
+          }
+          else
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              __m128 m = _mm_load_ps(sp);
+              __m128 n = _mm_loadu_ps(sp + 1);
+              __m128 p = _mm_load_ps(dp);
+              p = _mm_sub_ps(p, _mm_mul_ps(f, _mm_add_ps(m, n)));
+              _mm_store_ps(dp, p);
+            }
+          }
 
           // swap buffers
           float* t = aug; aug = oth; oth = t;

From 2a7ff07f00f2313ec3c7b2956817e13c2ee92958 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Fri, 12 Apr 2024 13:32:25 +1000
Subject: [PATCH 28/37] Corrected code for reversible path. Simplified DWT
 code.

---
 src/core/codestream/ojph_params.cpp       |   2 +-
 src/core/codestream/ojph_resolution.cpp   |   8 +-
 src/core/transform/ojph_transform.cpp     | 168 ++++++++++-----------
 src/core/transform/ojph_transform.h       |  29 +---
 src/core/transform/ojph_transform_avx.cpp |  59 +++-----
 src/core/transform/ojph_transform_local.h | 172 ++++++++--------------
 src/core/transform/ojph_transform_sse.cpp |  58 +++-----
 7 files changed, 193 insertions(+), 303 deletions(-)

diff --git a/src/core/codestream/ojph_params.cpp b/src/core/codestream/ojph_params.cpp
index 268135c4..b6ada178 100644
--- a/src/core/codestream/ojph_params.cpp
+++ b/src/core/codestream/ojph_params.cpp
@@ -1724,7 +1724,7 @@ namespace ojph {
       d[0].rev.Batk = 2;
       d[0].rev.Eatk = 2;
       d[1].rev.Aatk = -1;
-      d[1].rev.Batk = 0;
+      d[1].rev.Batk = 1;
       d[1].rev.Eatk = 1;
     }
 
diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 6d6c500f..b82a810a 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -524,7 +524,7 @@ namespace ojph {
                 line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
                 line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
                 const lifting_step* s = atk->get_step(num_steps - i - 1);
-                rev_vert_ana_step(s, sp1, sp2, dp, width);
+                rev_vert_step(s, sp1, sp2, dp, width, false);
               }
               lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
             }
@@ -591,7 +591,7 @@ namespace ojph {
                 line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
                 line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
                 const lifting_step* s = atk->get_step(num_steps - i - 1);
-                irv_vert_ana_step(s, sp1, sp2, dp, width);
+                irv_vert_step(s, sp1, sp2, dp, width, false);
               }
               lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
             }
@@ -711,7 +711,7 @@ namespace ojph {
                   line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
                   line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
                   const lifting_step* s = atk->get_step(i);
-                  rev_vert_syn_step(s, dp, sp1, sp2, width);
+                  rev_vert_step(s, sp1, sp2, dp, width, true);
                 }
                 lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
               }
@@ -805,7 +805,7 @@ namespace ojph {
                   line_buf* sp1 = sig->active ? sig->line : ssp[i].line;
                   line_buf* sp2 = ssp[i].active ? ssp[i].line : sig->line;
                   const lifting_step* s = atk->get_step(i);
-                  irv_vert_syn_step(s, dp, sp1, sp2, width);
+                  irv_vert_step(s, sp1, sp2, dp, width, true);
                 }
                 lifting_buf t = *aug; *aug = ssp[i]; ssp[i] = *sig; *sig = t;
               }
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index eba4f006..2a219bca 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -54,20 +54,15 @@ namespace ojph {
     /////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_ana_step)
+    void (*rev_vert_step)
       (const lifting_step* s, const line_buf* sig, const line_buf* other,
-        const line_buf* aug, ui32 repeat) = NULL;
+        const line_buf* aug, ui32 repeat, bool synthesis) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
     void (*rev_horz_ana)
       (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
         const line_buf* src, ui32 width, bool even) = NULL;
 
-    /////////////////////////////////////////////////////////////////////////
-    void (*rev_vert_syn_step)
-      (const lifting_step* s, const line_buf* aug, const line_buf* sig,
-        const line_buf* other, ui32 repeat) = NULL;
-
     /////////////////////////////////////////////////////////////////////////
     void (*rev_horz_syn)
       (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
@@ -78,29 +73,24 @@ namespace ojph {
     /////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void (*irv_vert_ana_step)
+    void (*irv_vert_step)
       (const lifting_step* s, const line_buf* sig, const line_buf* other,
-        const line_buf* aug, ui32 repeat) = NULL;
+        const line_buf* aug, ui32 repeat, bool synthesis) = NULL;
+
+    /////////////////////////////////////////////////////////////////////////
+    void (*irv_vert_times_K)
+      (float K, const line_buf* aug, ui32 repeat) = NULL;
 
     /////////////////////////////////////////////////////////////////////////
     void (*irv_horz_ana)
       (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
         const line_buf* src, ui32 width, bool even) = NULL;
 
-    /////////////////////////////////////////////////////////////////////////
-    void (*irv_vert_syn_step)
-      (const lifting_step* s, const line_buf* aug, const line_buf* sig,
-        const line_buf* other, ui32 repeat) = NULL;
-
     /////////////////////////////////////////////////////////////////////////
     void (*irv_horz_syn)
       (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
         const line_buf* hsrc, ui32 width, bool even) = NULL;
 
-    /////////////////////////////////////////////////////////////////////////
-    void (*irv_vert_times_K)
-      (float K, const line_buf* aug, ui32 repeat) = NULL;
-
     ////////////////////////////////////////////////////////////////////////////
     static bool wavelet_transform_functions_initialized = false;
 
@@ -112,27 +102,24 @@ namespace ojph {
 
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
-      rev_vert_ana_step         = gen_rev_vert_ana_step;
+      rev_vert_step             = gen_rev_vert_step;
       rev_horz_ana              = gen_rev_horz_ana;
-      rev_vert_syn_step         = gen_rev_vert_syn_step;
       rev_horz_syn              = gen_rev_horz_syn;
 
-      irv_vert_ana_step         = gen_irv_vert_ana_step;
-      irv_horz_ana              = gen_irv_horz_ana;      
-      irv_vert_syn_step         = gen_irv_vert_syn_step;
-      irv_horz_syn              = gen_irv_horz_syn;
+      irv_vert_step             = gen_irv_vert_step;
       irv_vert_times_K          = gen_irv_vert_times_K;
+      irv_horz_ana              = gen_irv_horz_ana;
+      irv_horz_syn              = gen_irv_horz_syn;
 
 #ifndef OJPH_DISABLE_INTEL_SIMD
       int level = get_cpu_ext_level();
 
       if (level >= X86_CPU_EXT_LEVEL_SSE)
       {
-        irv_vert_ana_step         = sse_irv_vert_ana_step;
+        irv_vert_step             = sse_irv_vert_step;
+        irv_vert_times_K          = sse_irv_vert_times_K;
         irv_horz_ana              = sse_irv_horz_ana;
-        irv_vert_syn_step         = sse_irv_vert_syn_step;
         irv_horz_syn              = sse_irv_horz_syn;
-        irv_vert_times_K          = sse_irv_vert_times_K;
       }
 
       //if (level >= X86_CPU_EXT_LEVEL_SSE2)
@@ -145,11 +132,10 @@ namespace ojph {
 
       if (level >= X86_CPU_EXT_LEVEL_AVX)
       {
-        irv_vert_ana_step         = avx_irv_vert_ana_step;
+        irv_vert_step             = avx_irv_vert_step;
+        irv_vert_times_K          = avx_irv_vert_times_K;
         irv_horz_ana              = avx_irv_horz_ana;      
-        irv_vert_syn_step         = avx_irv_vert_syn_step;
         irv_horz_syn              = avx_irv_horz_syn;
-        irv_vert_times_K          = avx_irv_vert_times_K;
       }
 
       //if (level >= X86_CPU_EXT_LEVEL_AVX2)
@@ -197,9 +183,9 @@ namespace ojph {
 #if !defined(OJPH_ENABLE_WASM_SIMD) || !defined(OJPH_EMSCRIPTEN)
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                               const line_buf* other, const line_buf* aug, 
-                               ui32 repeat)
+    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis)
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
@@ -207,12 +193,35 @@ namespace ojph {
 
       si32* dst = aug->i32;
       const si32* src1 = sig->i32, * src2 = other->i32;
-      if (a >= 0)
-        for (ui32 i = repeat; i > 0; --i)
-          *dst++ += (b + a * (*src1++ + *src2++)) >> e;
-      else
-        for (ui32 i = repeat; i > 0; --i)
-          *dst++ -= (- b - a * (*src1++ + *src2++)) >> e;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1 && b == 2 && e == 2)
+      { // normal update
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + (*src1++ + *src2++)) >> e;
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // normal predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (*src1++ + *src2++) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (*src1++ + *src2++) >> e;
+      }
+      else { // general case
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+      }
     }
 
     /////////////////////////////////////////////////////////////////////////
@@ -258,12 +267,15 @@ namespace ojph {
           // lifting step
           const si32* sp = lp + (even ? 1 : 0);
           si32* dp = hp;
-          if (a >= 0)
+          if (a == 1 && b == 2 && e == 2)        // normal update
             for (ui32 i = h_width; i > 0; --i, sp++, dp++)
-              *dp += (b + a * (sp[-1] + sp[0])) >> e;
-          else
+              *dp += (b + (sp[-1] + sp[0])) >> e;
+          else if (a == -1 && b == 1 && e == 1)  // normal predict
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp -= (sp[-1] + sp[0]) >> e;
+          else                                   // general case
             for (ui32 i = h_width; i > 0; --i, sp++, dp++)
-              *dp -= (- b - a * (sp[-1] + sp[0])) >> e;
+              *dp += (b + a * (sp[-1] + sp[0])) >> e;
 
           // swap buffers
           si32* t = lp; lp = hp; hp = t;
@@ -279,25 +291,6 @@ namespace ojph {
       }
     }
     
-    //////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_syn_step(const lifting_step* s, const line_buf* aug, 
-                               const line_buf* sig, const line_buf* other, 
-                               ui32 repeat)
-    {
-      const si32 a = s->rev.Aatk;
-      const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
-
-      si32* dst = aug->i32;
-      const si32* src1 = sig->i32, * src2 = other->i32;
-      if (a >= 0)
-        for (ui32 i = repeat; i > 0; --i)
-          *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
-      else
-        for (ui32 i = repeat; i > 0; --i)
-          *dst++ += (- b - a * (*src1++ + *src2++)) >> e;
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
                           const line_buf* lsrc, const line_buf* hsrc, 
@@ -323,12 +316,15 @@ namespace ojph {
           // lifting step
           const si32* sp = oth + (ev ? 0 : 1);
           si32* dp = aug;
-          if (a >= 0)
+          if (a == 1 && b == 2 && e == 2)        // normal update
             for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
-              *dp -= (b + a * (sp[-1] + sp[0])) >> e;
-          else
+              *dp -= (b + (sp[-1] + sp[0])) >> e;
+          else if (a == -1 && b == 1 && e == 1)  // normal predict
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp += (sp[-1] + sp[0]) >> e;
+          else                                   // general case
             for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
-              *dp += (- b - a * (sp[-1] + sp[0])) >> e;
+              *dp -= (b + a * (sp[-1] + sp[0])) >> e;
 
           // swap buffers
           si32* t = aug; aug = oth; oth = t;
@@ -363,18 +359,29 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                               const line_buf* other, const line_buf* aug, 
-                               ui32 repeat)
+    void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis)
     {
       float a = s->irv.Aatk;
 
+      if (synthesis)
+        a = -a;
+
       float* dst = aug->f32;
       const float* src1 = sig->f32, * src2 = other->f32;
       for (ui32 i = repeat; i > 0; --i)
         *dst++ += a * (*src1++ + *src2++);
     }
-    
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    {
+      float* dst = aug->f32;
+      for (ui32 i = repeat; i > 0; --i)
+        *dst++ *= K;
+    }
+
     /////////////////////////////////////////////////////////////////////////
     void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
                           const line_buf* hdst, const line_buf* src, 
@@ -447,19 +454,6 @@ namespace ojph {
       }
     }
     
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_syn_step(const lifting_step* s, const line_buf* aug, 
-                               const line_buf* sig, const line_buf* other, 
-                               ui32 repeat)
-    {
-      float a = s->irv.Aatk;
-
-      float* dst = aug->f32;
-      const float* src1 = sig->f32, * src2 = other->f32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ -= a * (*src1++ + *src2++);
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void gen_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
                           const line_buf* lsrc, const line_buf* hsrc, 
@@ -527,14 +521,6 @@ namespace ojph {
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
-    {
-      float* dst = aug->f32;
-      for (ui32 i = repeat; i > 0; --i)
-        *dst++ *= K;
-    }
-
 #endif // !OJPH_ENABLE_WASM_SIMD
 
   }
diff --git a/src/core/transform/ojph_transform.h b/src/core/transform/ojph_transform.h
index 1aae8b82..0e59632e 100644
--- a/src/core/transform/ojph_transform.h
+++ b/src/core/transform/ojph_transform.h
@@ -55,56 +55,43 @@ namespace ojph {
     /////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_ana_step)
+    extern void (*rev_vert_step)
       (const lifting_step* s, const line_buf* sig, const line_buf* other,
-        const line_buf* aug, ui32 repeat);
+        const line_buf* aug, ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     extern void (*rev_horz_ana)
       (const param_atk* atk, const line_buf* ldst, const line_buf* hdst,
         const line_buf* src, ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*rev_vert_syn_step)
-      (const lifting_step* s, const line_buf* aug, const line_buf* sig,
-        const line_buf* other, ui32 repeat);
-
     /////////////////////////////////////////////////////////////////////////
     extern void (*rev_horz_syn)
       (const param_atk* atk, const line_buf* dst, const line_buf* lsrc,
         const line_buf* hsrc, ui32 width, bool even);
 
-
-
     /////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     /////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    extern void (*irv_vert_ana_step)
+    extern void (*irv_vert_step)
       (const lifting_step* s, const line_buf* sig, const line_buf* other, 
-        const line_buf* aug, ui32 repeat);
+        const line_buf* aug, ui32 repeat, bool synthesis);
+
+    /////////////////////////////////////////////////////////////////////////
+    extern void (*irv_vert_times_K)
+      (float K, const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
     extern void (*irv_horz_ana)
       (const param_atk* atk, const line_buf* ldst, const line_buf* hdst, 
         const line_buf* src, ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*irv_vert_syn_step)
-      (const lifting_step* s, const line_buf* aug, const line_buf* sig, 
-        const line_buf* other, ui32 repeat);
-
     /////////////////////////////////////////////////////////////////////////
     extern void (*irv_horz_syn)
       (const param_atk* atk, const line_buf* dst, const line_buf* lsrc, 
         const line_buf* hsrc, ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    extern void (*irv_vert_times_K)
-      (float K, const line_buf* aug, ui32 repeat);
-
-
   }
 }
 
diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp
index 8499bf19..74f361ad 100644
--- a/src/core/transform/ojph_transform_avx.cpp
+++ b/src/core/transform/ojph_transform_avx.cpp
@@ -51,11 +51,15 @@ namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void avx_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                               const line_buf* other, const line_buf* aug, 
-                               ui32 repeat)
+    void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis)
     {
-      __m256 factor = _mm256_set1_ps(s->irv.Aatk);
+      float a = s->irv.Aatk;
+      if (synthesis)
+        a = -a;
+
+      __m256 factor = _mm256_set1_ps(a);
 
       float* dst = aug->f32;
       const float* src1 = sig->f32, * src2 = other->f32;
@@ -70,6 +74,19 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    {
+      __m256 factor = _mm256_set1_ps(K);
+      float* dst = aug->f32;
+      int i = (int)repeat;
+      for (; i > 0; i -= 8, dst += 8)
+      {
+        __m256 s = _mm256_load_ps(dst);
+        _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
+      }
+    }
+
     /////////////////////////////////////////////////////////////////////////
     void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
                           const line_buf* hdst, const line_buf* src, 
@@ -215,26 +232,6 @@ namespace ojph {
       }
     }
     
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irv_vert_syn_step(const lifting_step* s, const line_buf* aug, 
-                               const line_buf* sig, const line_buf* other, 
-                               ui32 repeat)
-    {
-      __m256 factor = _mm256_set1_ps(s->irv.Aatk);
-
-      float* dst = aug->f32;
-      const float* src1 = sig->f32, * src2 = other->f32;
-      int i = (int)repeat;
-      for ( ; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
-      {
-        __m256 s1 = _mm256_load_ps(src1);
-        __m256 s2 = _mm256_load_ps(src2);
-        __m256 d  = _mm256_load_ps(dst);
-        d = _mm256_sub_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
-        _mm256_store_ps(dst, d);
-      }
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void avx_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
                           const line_buf* lsrc, const line_buf* hsrc, 
@@ -381,19 +378,5 @@ namespace ojph {
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
-    {
-      __m256 factor = _mm256_set1_ps(K);
-      float* dst = aug->f32;
-      int i = (int)repeat;
-      for ( ; i > 0; i -= 8, dst += 8 )
-      {
-        __m256 s = _mm256_load_ps(dst);
-        _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
-      }
-    }
-
-
   } // !local
 } // !ojph
diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h
index 816e9e8b..fe7d1f27 100644
--- a/src/core/transform/ojph_transform_local.h
+++ b/src/core/transform/ojph_transform_local.h
@@ -55,56 +55,46 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    // Reversible functions
+    // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                               const line_buf* other, const line_buf* aug, 
-                               ui32 repeat);
+    void gen_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
-                          const line_buf* hdst, const line_buf* src, 
-                          ui32 width, bool even);
+    void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_vert_syn_step(const lifting_step* s, const line_buf* aug, 
-                               const line_buf* sig, const line_buf* other, 
-                               ui32 repeat);
+    void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                          const line_buf* hdst, const line_buf* src, 
+                          ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
-                          const line_buf* lsrc, const line_buf* hsrc, 
+    void gen_irv_horz_syn(const param_atk *atk, const line_buf* dst, 
+                          const line_buf *lsrc, const line_buf *hsrc, 
                           ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
-    // Irreversible functions
+    // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                               const line_buf* other, const line_buf* aug, 
-                               ui32 repeat);
+    void gen_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+    void gen_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
                           const line_buf* hdst, const line_buf* src, 
                           ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                               const line_buf* sig, const line_buf* other,
-                               ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    void gen_irv_horz_syn(const param_atk *atk, const line_buf* dst, 
-                          const line_buf *lsrc, const line_buf *hsrc, 
+    void gen_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                          const line_buf* lsrc, const line_buf* hsrc, 
                           ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void gen_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
-
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -118,28 +108,23 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void sse_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                               const line_buf* other, const line_buf* aug, 
-                               ui32 repeat);
+    void sse_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis);
+
+    /////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
     void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
                           const line_buf* hdst, const line_buf* src, 
                           ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void sse_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                               const line_buf* sig, const line_buf* other,
-                               ui32 repeat);
-
     /////////////////////////////////////////////////////////////////////////
     void sse_irv_horz_syn(const param_atk *atk, const line_buf* dst,
                           const line_buf *lsrc, const line_buf *hsrc, 
                           ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
-
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -153,20 +138,15 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                                const line_buf* other, const line_buf* aug, 
-                                ui32 repeat);
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                            const line_buf* hdst, const line_buf* src, 
                            ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                                const line_buf* sig, const line_buf* other, 
-                                ui32 repeat);
-
     /////////////////////////////////////////////////////////////////////////
     void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
                            const line_buf* lsrc, const line_buf* hsrc, 
@@ -186,28 +166,23 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void avx_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                               const line_buf* other, const line_buf* aug, 
-                               ui32 repeat);
+    void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
     void avx_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
                           const line_buf* hdst, const line_buf* src, 
                           ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void avx_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                               const line_buf* sig, const line_buf* other,
-                               ui32 repeat);
-
     /////////////////////////////////////////////////////////////////////////
     void avx_irv_horz_syn(const param_atk *atk, const line_buf* dst,
                           const line_buf *lsrc, const line_buf *hsrc, 
                           ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
-
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -221,20 +196,15 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                                const line_buf* other, const line_buf* aug, 
-                                ui32 repeat);
+    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                            const line_buf* hdst, const line_buf* src, 
                            ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                                const line_buf* sig, const line_buf* other, 
-                                ui32 repeat);
-
     /////////////////////////////////////////////////////////////////////////
     void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst,
                            const line_buf* lsrc, const line_buf* hsrc, 
@@ -253,48 +223,38 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                                  const line_buf* other, const line_buf* aug, 
-                                  ui32 repeat);
+    void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis);
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
     void avx512_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
                              const line_buf* hdst, const line_buf* src, 
                              ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void avx512_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                                  const line_buf* sig, const line_buf* other,
-                                  ui32 repeat);
-
     /////////////////////////////////////////////////////////////////////////
     void avx512_irv_horz_syn(const param_atk *atk, const line_buf* dst,
                              const line_buf *lsrc, const line_buf *hsrc, 
                              ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void avx512_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
-
 
     //////////////////////////////////////////////////////////////////////////
     // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_vert_ana_step(const lifting_step* s, const line_buf* sig,
-                                  const line_buf* other, const line_buf* aug, 
-                                  ui32 repeat);
+    void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
     void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                              const line_buf* hdst, const line_buf* src, 
                              ui32 width, bool even);
 
-    /////////////////////////////////////////////////////////////////////////
-    void avx512_rev_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                                  const line_buf* sig, const line_buf* other, 
-                                  ui32 repeat);
-
     /////////////////////////////////////////////////////////////////////////
     void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst,
                              const line_buf* lsrc, const line_buf* hsrc, 
@@ -309,55 +269,45 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    // Reversible functions
+    // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_ana_step(const lifting_step* s, const line_buf* sig,
-                                const line_buf* other, const line_buf* aug, 
-                                ui32 repeat);
+    void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
-                           const line_buf* hdst, const line_buf* src, 
-                           ui32 width, bool even);
+    void wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                                const line_buf* sig, const line_buf* other, 
-                                ui32 repeat);
+    void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
-                           const line_buf* lsrc, const line_buf* hsrc, 
+    void wasm_irv_horz_syn(const param_atk *atk, const line_buf* dst,
+                           const line_buf *lsrc, const line_buf *hsrc, 
                            ui32 width, bool even);
 
     //////////////////////////////////////////////////////////////////////////
-    // Irreversible functions
+    // Reversible functions
     //////////////////////////////////////////////////////////////////////////
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                                const line_buf* other, const line_buf* aug, 
-                                ui32 repeat);
+    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig,
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis);
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst,
+    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst,
                            const line_buf* hdst, const line_buf* src, 
                            ui32 width, bool even);
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irv_vert_syn_step(const lifting_step* s, const line_buf* aug,
-                                const line_buf* sig, const line_buf* other,
-                                ui32 repeat);
-
-    /////////////////////////////////////////////////////////////////////////
-    void wasm_irv_horz_syn(const param_atk *atk, const line_buf* dst,
-                           const line_buf *lsrc, const line_buf *hsrc, 
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst,
+                           const line_buf* lsrc, const line_buf* hsrc, 
                            ui32 width, bool even);
-
-    /////////////////////////////////////////////////////////////////////////
-    void wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat);
   }
 }
 
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index 69907841..b61ea5e9 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -51,11 +51,15 @@ namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void sse_irv_vert_ana_step(const lifting_step* s, const line_buf* sig, 
-                               const line_buf* other, const line_buf* aug, 
-                               ui32 repeat)
+    void sse_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                           const line_buf* other, const line_buf* aug, 
+                           ui32 repeat, bool synthesis)
     {
-      __m128 factor = _mm_set1_ps(s->irv.Aatk);
+      float a = s->irv.Aatk;
+      if (synthesis)
+        a = -a;
+
+      __m128 factor = _mm_set1_ps(a);
 
       float* dst = aug->f32;
       const float* src1 = sig->f32, * src2 = other->f32;
@@ -70,6 +74,19 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    {
+      __m128 factor = _mm_set1_ps(K);
+      float* dst = aug->f32;
+      int i = (int)repeat;
+      for (; i > 0; i -= 4, dst += 4)
+      {
+        __m128 s = _mm_load_ps(dst);
+        _mm_store_ps(dst, _mm_mul_ps(factor, s));
+      }
+    }
+
     /////////////////////////////////////////////////////////////////////////
     void sse_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
                           const line_buf* hdst, const line_buf* src, 
@@ -195,26 +212,6 @@ namespace ojph {
       }
     }
     
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irv_vert_syn_step(const lifting_step* s, const line_buf* aug, 
-                               const line_buf* sig, const line_buf* other, 
-                               ui32 repeat)
-    {
-      __m128 factor = _mm_set1_ps(s->irv.Aatk);
-
-      float* dst = aug->f32;
-      const float* src1 = sig->f32, * src2 = other->f32;
-      int i = (int)repeat;
-      for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
-      {
-        __m128 s1 = _mm_load_ps(src1);
-        __m128 s2 = _mm_load_ps(src2);
-        __m128 d  = _mm_load_ps(dst);
-        d = _mm_sub_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
-        _mm_store_ps(dst, d);
-      }
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void sse_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
                           const line_buf* lsrc, const line_buf* hsrc, 
@@ -339,18 +336,5 @@ namespace ojph {
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
-    {
-      __m128 factor = _mm_set1_ps(K);
-      float* dst = aug->f32;
-      int i = (int)repeat;
-      for ( ; i > 0; i -= 4, dst += 4)
-      {
-        __m128 s = _mm_load_ps(dst);
-        _mm_store_ps(dst, _mm_mul_ps(factor, s));
-      }
-    }
-
   } // !local
 } // !ojph

From 03ef77acbcc04da174b03d9312987b45b4c92e8c Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Fri, 12 Apr 2024 21:31:14 +1000
Subject: [PATCH 29/37] Completed sse, sse2, avx, avx2.  Still need to do wasm and avx512.

---
 src/core/transform/ojph_transform.cpp      |  89 +--
 src/core/transform/ojph_transform_avx.cpp  | 168 +-----
 src/core/transform/ojph_transform_avx2.cpp | 617 +++++++++++++++------
 src/core/transform/ojph_transform_local.h  | 165 ++++++
 src/core/transform/ojph_transform_sse.cpp  | 135 +----
 src/core/transform/ojph_transform_sse2.cpp | 554 ++++++++++++------
 6 files changed, 1086 insertions(+), 642 deletions(-)

diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index 2a219bca..95ab686c 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -112,6 +112,7 @@ namespace ojph {
       irv_horz_syn              = gen_irv_horz_syn;
 
 #ifndef OJPH_DISABLE_INTEL_SIMD
+
       int level = get_cpu_ext_level();
 
       if (level >= X86_CPU_EXT_LEVEL_SSE)
@@ -122,13 +123,12 @@ namespace ojph {
         irv_horz_syn              = sse_irv_horz_syn;
       }
 
-      //if (level >= X86_CPU_EXT_LEVEL_SSE2)
-      //{
-      //  rev_vert_ana_step         = sse2_rev_vert_ana_step;
-      //  rev_horz_ana              = sse2_rev_horz_ana;
-      //  rev_vert_syn_step         = sse2_rev_vert_syn_step;
-      //  rev_horz_syn              = sse2_rev_horz_syn;
-      //}
+      if (level >= X86_CPU_EXT_LEVEL_SSE2)
+      {
+        rev_vert_step             = sse2_rev_vert_step;
+        rev_horz_ana              = sse2_rev_horz_ana;
+        rev_horz_syn              = sse2_rev_horz_syn;
+      }
 
       if (level >= X86_CPU_EXT_LEVEL_AVX)
       {
@@ -138,26 +138,23 @@ namespace ojph {
         irv_horz_syn              = avx_irv_horz_syn;
       }
 
-      //if (level >= X86_CPU_EXT_LEVEL_AVX2)
-      //{
-      //  rev_vert_ana_step         = avx2_rev_vert_ana_step;
-      //  rev_horz_ana              = avx2_rev_horz_ana;
-      //  rev_vert_syn_step         = avx2_rev_vert_syn_step;
-      //  rev_horz_syn              = avx2_rev_horz_syn;
-      //}
+      if (level >= X86_CPU_EXT_LEVEL_AVX2)
+      {
+        rev_vert_step             = avx2_rev_vert_step;
+        rev_horz_ana              = avx2_rev_horz_ana;
+        rev_horz_syn              = avx2_rev_horz_syn;
+      }
 
       //if (level >= X86_CPU_EXT_LEVEL_AVX512)
       //{
-      //  rev_vert_ana_step         = avx512_rev_vert_ana_step;
+      //  rev_vert_step             = avx512_rev_vert_ana_step;
       //  rev_horz_ana              = avx512_rev_horz_ana;
-      //  rev_vert_syn_step         = avx512_rev_vert_syn_step;
       //  rev_horz_syn              = avx512_rev_horz_syn;
 
-      //  irv_vert_ana_step         = avx512_irv_vert_ana_step;
-      //  irv_horz_ana              = avx512_irv_horz_ana;      
+      //  irv_vert_step             = avx512_irv_vert_step;
+      //  irv_vert_times_K          = avx512_irv_vert_times_K;
       //  irv_vert_syn_step         = avx512_irv_vert_syn_step;
       //  irv_horz_syn              = avx512_irv_horz_syn;
-      //  irv_vert_times_K          = avx512_irv_vert_times_K;
       //}
 
 #endif // !OJPH_DISABLE_INTEL_SIMD
@@ -196,17 +193,17 @@ namespace ojph {
       // The general definition of the wavelet in Part 2 is slightly 
       // different to part 2, although they are mathematically equivalent
       // here, we identify the simpler form from Part 1 and employ them
-      if (a == 1 && b == 2 && e == 2)
-      { // normal update
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
         if (synthesis)
           for (ui32 i = repeat; i > 0; --i)
-            *dst++ -= (b + (*src1++ + *src2++)) >> e;
+            *dst++ -= (b + *src1++ + *src2++) >> e;
         else
           for (ui32 i = repeat; i > 0; --i)
-            *dst++ += (b + (*src1++ + *src2++)) >> e;
+            *dst++ += (b + *src1++ + *src2++) >> e;
       }
       else if (a == -1 && b == 1 && e == 1)
-      { // normal predict
+      { // 5/3 predict
         if (synthesis)
           for (ui32 i = repeat; i > 0; --i)
             *dst++ += (*src1++ + *src2++) >> e;
@@ -214,6 +211,15 @@ namespace ojph {
           for (ui32 i = repeat; i > 0; --i)
             *dst++ -= (*src1++ + *src2++) >> e;
       }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b - (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b - (*src1++ + *src2++)) >> e;
+      }
       else { // general case
         if (synthesis)
           for (ui32 i = repeat; i > 0; --i)
@@ -267,15 +273,26 @@ namespace ojph {
           // lifting step
           const si32* sp = lp + (even ? 1 : 0);
           si32* dp = hp;
-          if (a == 1 && b == 2 && e == 2)        // normal update
+          if (a == 1) 
+          { // 5/3 update and any case with a == 1
             for (ui32 i = h_width; i > 0; --i, sp++, dp++)
               *dp += (b + (sp[-1] + sp[0])) >> e;
-          else if (a == -1 && b == 1 && e == 1)  // normal predict
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
             for (ui32 i = h_width; i > 0; --i, sp++, dp++)
               *dp -= (sp[-1] + sp[0]) >> e;
-          else                                   // general case
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+              *dp += (b - (sp[-1] + sp[0])) >> e;
+          }
+          else {
+            // general case
             for (ui32 i = h_width; i > 0; --i, sp++, dp++)
               *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
 
           // swap buffers
           si32* t = lp; lp = hp; hp = t;
@@ -316,15 +333,26 @@ namespace ojph {
           // lifting step
           const si32* sp = oth + (ev ? 0 : 1);
           si32* dp = aug;
-          if (a == 1 && b == 2 && e == 2)        // normal update
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
             for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
               *dp -= (b + (sp[-1] + sp[0])) >> e;
-          else if (a == -1 && b == 1 && e == 1)  // normal predict
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
             for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
               *dp += (sp[-1] + sp[0]) >> e;
-          else                                   // general case
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+              *dp -= (b - (sp[-1] + sp[0])) >> e;
+          }
+          else {
+            // general case
             for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
               *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+          }
 
           // swap buffers
           si32* t = aug; aug = oth; oth = t;
@@ -413,7 +441,6 @@ namespace ojph {
         ui32 num_steps = atk->get_num_steps();
         for (ui32 j = num_steps; j > 0; --j)
         {
-          // first lifting step
           const lifting_step* s = atk->get_step(j - 1);
           const float a = s->irv.Aatk;
 
diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp
index 74f361ad..e7933ff1 100644
--- a/src/core/transform/ojph_transform_avx.cpp
+++ b/src/core/transform/ojph_transform_avx.cpp
@@ -41,15 +41,26 @@
 #include "ojph_defs.h"
 #include "ojph_arch.h"
 #include "ojph_mem.h"
-#include "ojph_transform.h"
-#include "ojph_transform_local.h"
-
 #include "ojph_params.h"
 #include "../codestream/ojph_params_local.h"
 
+#include "ojph_transform.h"
+#include "ojph_transform_local.h"
+
 namespace ojph {
   namespace local {
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline void avx_multiply_const(float* p, float f, int width)
+    {
+      __m256 factor = _mm256_set1_ps(f);
+      for (; width > 0; width -= 8, p += 8)
+      {
+        __m256 s = _mm256_load_ps(p);
+        _mm256_store_ps(p, _mm256_mul_ps(factor, s));
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void avx_irv_vert_step(const lifting_step* s, const line_buf* sig, 
                            const line_buf* other, const line_buf* aug, 
@@ -95,59 +106,12 @@ namespace ojph {
       if (width > 1)
       {
         // split src into ldst and hdst
-        if (even)
         {
-          float* dph = hdst->f32;
           float* dpl = ldst->f32;
-          float* sp = src->f32;
-          int i = (int)width;
-          for ( ; i > 8; i -= 16, sp += 16, dpl += 8, dph += 8)
-          {
-             __m256 a = _mm256_load_ps(sp);
-             __m256 b = _mm256_load_ps(sp + 8);
-             __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
-             __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
-             __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
-             __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
-             _mm256_store_ps(dpl, e);
-             _mm256_store_ps(dph, f);
-          }
-          for (; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
-          {
-            __m128 a = _mm_load_ps(sp);
-            __m128 b = _mm_load_ps(sp + 4);
-            __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
-            __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
-            _mm_store_ps(dpl, c);
-            _mm_store_ps(dph, d);
-          }
-        }
-        else
-        {
           float* dph = hdst->f32;
-          float* dpl = ldst->f32;
           float* sp = src->f32;
-          int i = (int)width;
-          for ( ; i > 8; i -= 16, sp += 16, dpl += 8, dph += 8)
-          {
-            __m256 a = _mm256_load_ps(sp);
-            __m256 b = _mm256_load_ps(sp + 8);
-            __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
-            __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
-            __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
-            __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
-            _mm256_store_ps(dpl, f);
-            _mm256_store_ps(dph, e);
-          }
-          for (; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
-          {
-            __m128 a = _mm_load_ps(sp);
-            __m128 b = _mm_load_ps(sp + 4);
-            __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
-            __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
-            _mm_store_ps(dpl, d);
-            _mm_store_ps(dph, c);
-          }
+          int w = (int)width;
+          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
         }
 
         // the actual horizontal transform
@@ -157,7 +121,6 @@ namespace ojph {
         ui32 num_steps = atk->get_num_steps();
         for (ui32 j = num_steps; j > 0; --j)
         {
-          // first lifting step
           const lifting_step* s = atk->get_step(j - 1);
           const float a = s->irv.Aatk;
 
@@ -201,27 +164,8 @@ namespace ojph {
         { // multiply by K or 1/K
           float K = atk->get_K();
           float K_inv = 1.0f / K;
-          float* dp;
-          int i;
-          __m256 factor;
-
-          factor = _mm256_set1_ps(K_inv);
-          dp = lp;
-          i = (int)l_width;
-          for ( ; i > 0; i -= 8, dp += 8)
-          {
-            __m256 s = _mm256_load_ps(dp);
-            _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
-          }
-
-          factor = _mm256_set1_ps(K);
-          dp = hp;
-          i = (int)h_width;
-          for ( ; i > 0; i -= 8, dp += 8)
-          {
-            __m256 s = _mm256_load_ps(dp);
-            _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
-          }
+          avx_multiply_const(lp, K_inv, (int)l_width);
+          avx_multiply_const(hp, K, (int)h_width);
         }
       }
       else {
@@ -247,27 +191,8 @@ namespace ojph {
         { // multiply by K or 1/K
           float K = atk->get_K();
           float K_inv = 1.0f / K;
-          float* dp;
-          int i;
-          __m256 factor;
-
-          factor = _mm256_set1_ps(K);
-          dp = aug;
-          i = (int)aug_width;
-          for ( ; i > 0; i -= 8, dp += 8)
-          {
-            __m256 s = _mm256_load_ps(dp);
-            _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
-          }
-
-          factor = _mm256_set1_ps(K_inv);
-          dp = oth;
-          i = (int)oth_width;
-          for ( ; i > 0; i -= 8, dp += 8)
-          {
-            __m256 s = _mm256_load_ps(dp);
-            _mm256_store_ps(dp, _mm256_mul_ps(factor, s));
-          }
+          avx_multiply_const(aug, K, (int)aug_width);
+          avx_multiply_const(oth, K_inv, (int)oth_width);
         }
 
         // the actual horizontal transform
@@ -315,59 +240,12 @@ namespace ojph {
         }
 
         // combine both lsrc and hsrc into dst
-        if (even)
         {
-          float* sph = hsrc->f32;
-          float* spl = lsrc->f32;
           float* dp = dst->f32;
-          int i = (int)width;
-          for ( ; i > 8; i -= 16, dp += 16, spl += 8, sph += 8)
-          {
-            __m256 a = _mm256_load_ps(spl);
-            __m256 b = _mm256_load_ps(sph);
-            __m256 c = _mm256_unpacklo_ps(a, b);
-            __m256 d = _mm256_unpackhi_ps(a, b);
-            __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
-            __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
-            _mm256_store_ps(dp, e);
-            _mm256_store_ps(dp + 8, f);
-          }
-          for (; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
-          {
-            __m128 a = _mm_load_ps(spl);
-            __m128 b = _mm_load_ps(sph);
-            __m128 c = _mm_unpacklo_ps(a, b);
-            __m128 d = _mm_unpackhi_ps(a, b);
-            _mm_store_ps(dp, c);
-            _mm_store_ps(dp + 4, d);
-          }
-        }
-        else
-        {
-          float* sph = hsrc->f32;
           float* spl = lsrc->f32;
-          float* dp = dst->f32;
-          int i = (int)width;
-          for (; i > 8; i -= 16, dp += 16, spl += 8, sph += 8)
-          { // i>=8 because we can exceed the aligned buffer by up to 7
-            __m256 a = _mm256_load_ps(spl);
-            __m256 b = _mm256_load_ps(sph);
-            __m256 c = _mm256_unpacklo_ps(b, a);
-            __m256 d = _mm256_unpackhi_ps(b, a);
-            __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
-            __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
-            _mm256_store_ps(dp, e);
-            _mm256_store_ps(dp + 8, f);
-          }
-          for (; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
-          {
-            __m128 a = _mm_load_ps(spl);
-            __m128 b = _mm_load_ps(sph);
-            __m128 c = _mm_unpacklo_ps(b, a);
-            __m128 d = _mm_unpackhi_ps(b, a);
-            _mm_store_ps(dp, c);
-            _mm_store_ps(dp + 4, d);
-          }
+          float* sph = hsrc->f32;
+          int w = (int)width;
+          AVX_INTERLEAVE(dp, spl, sph, w, even);
         }
       }
       else {
diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp
index 915e246c..a7b16ddb 100644
--- a/src/core/transform/ojph_transform_avx2.cpp
+++ b/src/core/transform/ojph_transform_avx2.cpp
@@ -40,6 +40,9 @@
 #include "ojph_defs.h"
 #include "ojph_arch.h"
 #include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
@@ -48,218 +51,470 @@
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_fwd_predict(const line_buf* line_src1,
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui32 e = s->rev.Eatk;
+      __m256i va = _mm256_set1_epi32(a);
+      __m256i vb = _mm256_set1_epi32(b);
 
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
-      {
-        __m256i s1 = _mm256_load_si256((__m256i*)src1);
-        __m256i s2 = _mm256_load_si256((__m256i*)src2);
-        __m256i d = _mm256_load_si256((__m256i*)dst);
-        s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-        d = _mm256_sub_epi32(d, s1);
-        _mm256_store_si256((__m256i*)dst, d);
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly 
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i v = _mm256_add_epi32(vb, t);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_sub_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i v = _mm256_add_epi32(vb, t);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_add_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
       }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_fwd_update(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-
-      __m256i offset = _mm256_set1_epi32(2);
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
-      {
-        __m256i s1 = _mm256_load_si256((__m256i*)src1);
-        s1 = _mm256_add_epi32(s1, offset);
-        __m256i s2 = _mm256_load_si256((__m256i*)src2);
-        s2 = _mm256_add_epi32(s2, s1);
-        __m256i d = _mm256_load_si256((__m256i*)dst);
-        d = _mm256_add_epi32(d, _mm256_srai_epi32(s2, 2));
-        _mm256_store_si256((__m256i*)dst, d);
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i w = _mm256_srai_epi32(t, e);
+            d = _mm256_add_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i w = _mm256_srai_epi32(t, e);
+            d = _mm256_sub_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i v = _mm256_sub_epi32(vb, t);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_sub_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i v = _mm256_sub_epi32(vb, t);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_add_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+      }
+      else { // general case
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i u = _mm256_mullo_epi32(va, t);
+            __m256i v = _mm256_add_epi32(vb, u);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_sub_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
+          {
+            __m256i s1 = _mm256_load_si256((__m256i*)src1);
+            __m256i s2 = _mm256_load_si256((__m256i*)src2);
+            __m256i d = _mm256_load_si256((__m256i*)dst);
+            __m256i t = _mm256_add_epi32(s1, s2);
+            __m256i u = _mm256_mullo_epi32(va, t);
+            __m256i v = _mm256_add_epi32(vb, u);
+            __m256i w = _mm256_srai_epi32(v, e);
+            d = _mm256_add_epi32(d, w);
+            _mm256_store_si256((__m256i*)dst, d);
+          }
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_wvlt_fwd_tx(line_buf* line_src, line_buf *line_ldst,
-                                   line_buf *line_hdst,ui32 width, bool even)
+    /////////////////////////////////////////////////////////////////////////
+    void avx2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *src = line_src->i32;
-        si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const si32* sp = src + (even ? 1 : 0);
-        si32 *dph = hdst;
-        const __m256i mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
-        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          __m256i s1 = _mm256_loadu_si256((__m256i*)(sp-1));
-          __m256i s2 = _mm256_loadu_si256((__m256i*)(sp+1));
-          __m256i d = _mm256_loadu_si256((__m256i*)sp);
-          s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-          __m256i d1 = _mm256_sub_epi32(d, s1);
-          sp += 8;
-          s1 = _mm256_loadu_si256((__m256i*)(sp-1));
-          s2 = _mm256_loadu_si256((__m256i*)(sp+1));
-          d = _mm256_loadu_si256((__m256i*)sp);
-          s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-          __m256i d2 = _mm256_sub_epi32(d, s1);
-          sp += 8;
-          d1 = _mm256_permutevar8x32_epi32(d1, mask);
-          d2 = _mm256_permutevar8x32_epi32(d2, mask);
-          d = _mm256_permute2x128_si256(d1, d2, (2 << 4) | 0);
-          _mm256_store_si256((__m256i*)dph, d);
+        // split src into ldst and hdst
+        {
+          float* dpl = ldst->f32;
+          float* dph = hdst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
         }
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        sp = src + (even ? 0 : 1);
-        const si32* sph = hdst + (even ? 0 : 1);
-        si32 *dpl = ldst;
-        __m256i offset = _mm256_set1_epi32(2);
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sp+=16, sph+=8, dpl+=8)
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          __m256i s1 = _mm256_loadu_si256((__m256i*)(sph-1));
-          s1 = _mm256_add_epi32(s1, offset);
-          __m256i s2 = _mm256_loadu_si256((__m256i*)sph);
-          s2 = _mm256_add_epi32(s2, s1);
-          __m256i d1 = _mm256_loadu_si256((__m256i*)sp);
-          __m256i d2 = _mm256_loadu_si256((__m256i*)sp + 1);
-          d1 = _mm256_permutevar8x32_epi32(d1, mask);
-          d2 = _mm256_permutevar8x32_epi32(d2, mask);
-          __m256i d = _mm256_permute2x128_si256(d1, d2, (2 << 4) | 0);
-          d = _mm256_add_epi32(d, _mm256_srai_epi32(s2, 2));
-          _mm256_store_si256((__m256i*)dpl, d);
+          // fetch the current lifting step and its coefficients
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui32 e = s->rev.Eatk;
+          __m256i va = _mm256_set1_epi32(a);
+          __m256i vb = _mm256_set1_epi32(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_add_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_add_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i w = _mm256_srai_epi32(t, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i w = _mm256_srai_epi32(t, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_sub_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_sub_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i u = _mm256_mullo_epi32(va, t);
+                __m256i v = _mm256_add_epi32(vb, u);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i u = _mm256_mullo_epi32(va, t);
+                __m256i v = _mm256_add_epi32(vb, u);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_ldst->i32[0] = line_src->i32[0];
+          ldst->i32[0] = src->i32[0];
         else
-          line_hdst->i32[0] = line_src->i32[0] << 1;
+          hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_bwd_predict(const line_buf* line_src1,
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
     
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
-      {
-        __m256i s1 = _mm256_load_si256((__m256i*)src1);
-        __m256i s2 = _mm256_load_si256((__m256i*)src2);
-        __m256i d = _mm256_load_si256((__m256i*)dst);
-        s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-        d = _mm256_add_epi32(d, s1);
-        _mm256_store_si256((__m256i*)dst, d);
-      }
-    }
-
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_vert_wvlt_bwd_update(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      __m256i offset = _mm256_set1_epi32(2);
-      for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
-      {
-        __m256i s1 = _mm256_load_si256((__m256i*)src1);
-        s1 = _mm256_add_epi32(s1, offset);
-        __m256i s2 = _mm256_load_si256((__m256i*)src2);
-        s2 = _mm256_add_epi32(s2, s1);
-        __m256i d = _mm256_load_si256((__m256i*)dst);
-        d = _mm256_sub_epi32(d, _mm256_srai_epi32(s2, 2));
-        _mm256_store_si256((__m256i*)dst, d);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
-                                   line_buf *line_hsrc, ui32 width, bool even)
+    void avx2_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
-        si32 *dst = line_dst->i32;
+        bool ev = even;  // toggled after every lifting step
+        si32* oth = hsrc->i32, * aug = lsrc->i32; // aug is modified using oth
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui32 e = s->rev.Eatk;
+          __m256i va = _mm256_set1_epi32(a);
+          __m256i vb = _mm256_set1_epi32(b);
 
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+          // extension: replicate the edge samples of oth past both ends
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: aug[i] -= (b + a * (oth[i] + oth[i -/+ 1])) >> e
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1; no multiply needed
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_add_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_add_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict; -((1 - t) >> 1) equals t >> 1, hence the add
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i w = _mm256_srai_epi32(t, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i w = _mm256_srai_epi32(t, e);
+                d = _mm256_add_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_sub_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i v = _mm256_sub_epi32(vb, t);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
+          else {
+            // general case: the only one that multiplies by a
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp - 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i u = _mm256_mullo_epi32(va, t);
+                __m256i v = _mm256_add_epi32(vb, u);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 8, sp += 8, dp += 8)
+              {
+                __m256i s1 = _mm256_load_si256((__m256i*)sp);
+                __m256i s2 = _mm256_loadu_si256((__m256i*)(sp + 1));
+                __m256i d = _mm256_load_si256((__m256i*)dp);
+                __m256i t = _mm256_add_epi32(s1, s2);
+                __m256i u = _mm256_mullo_epi32(va, t);
+                __m256i v = _mm256_add_epi32(vb, u);
+                __m256i w = _mm256_srai_epi32(v, e);
+                d = _mm256_sub_epi32(d, w);
+                _mm256_store_si256((__m256i*)dp, d);
+              }
+          }
 
-        // extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        const si32 *sph = hsrc + (even ? 0 : 1);
-        si32 *spl = lsrc;
-        __m256i offset = _mm256_set1_epi32(2);
-        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, spl+=8)
-        {
-          __m256i s1 = _mm256_loadu_si256((__m256i*)(sph-1));
-          s1 = _mm256_add_epi32(s1, offset);
-          __m256i s2 = _mm256_loadu_si256((__m256i*)sph);
-          s2 = _mm256_add_epi32(s2, s1);
-          __m256i d = _mm256_load_si256((__m256i*)spl);
-          d = _mm256_sub_epi32(d, _mm256_srai_epi32(s2, 2));
-          _mm256_store_si256((__m256i*)spl, d);
+          // swap buffers, so the next step modifies the other band
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        // extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width - 1];
-        // inverse predict and combine
-        si32 *dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 7) >> 3; i > 0; --i, sph+=8, spl+=8, dp+=16)
+        // interleave the samples of lsrc and hsrc into dst
         {
-          __m256i s1 = _mm256_loadu_si256((__m256i*)spl);
-          __m256i s2 = _mm256_loadu_si256((__m256i*)(spl+1));
-          __m256i d = _mm256_load_si256((__m256i*)sph);
-          s2 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
-          d = _mm256_add_epi32(d, s2);
-          s2 = _mm256_unpackhi_epi32(s1, d);
-          s1 = _mm256_unpacklo_epi32(s1, d);
-          d = _mm256_permute2x128_si256(s1, s2, (2 << 4) | 0);
-          _mm256_storeu_si256((__m256i*)dp, d);
-          d = _mm256_permute2x128_si256(s1, s2, (3 << 4) | 1);
-          _mm256_storeu_si256((__m256i*)dp + 1, d);
+          float* dp = dst->f32;
+          float* spl = lsrc->f32;
+          float* sph = hsrc->f32;
+          int w = (int)width;
+          AVX_INTERLEAVE(dp, spl, sph, w, even);
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_dst->i32[0] = line_lsrc->i32[0];
+          dst->i32[0] = lsrc->i32[0];
         else
-          line_dst->i32[0] = line_hsrc->i32[0] >> 1;
+          dst->i32[0] = hsrc->i32[0] >> 1;
       }
     }
-  }
-}
+
+
+
+  } // !local
+} // !ojph
diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h
index fe7d1f27..3ba9e6d0 100644
--- a/src/core/transform/ojph_transform_local.h
+++ b/src/core/transform/ojph_transform_local.h
@@ -45,6 +45,7 @@ namespace ojph {
   struct line_buf;
   namespace local {
     struct param_atk;
+    union lifting_step;
 
     //////////////////////////////////////////////////////////////////////////
     //
@@ -103,6 +104,60 @@ namespace ojph {
     //
     //////////////////////////////////////////////////////////////////////////
 
+    //////////////////////////////////////////////////////////////////////////
+    // Supporting macros
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    #define SSE_DEINTERLEAVE(dpl, dph, sp, width, even)                      \
+    { /* modifies dpl, dph, sp and width; 8 input samples per pass */        \
+      if (even) /* dpl gets even-indexed samples, dph odd-indexed */         \
+        for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)           \
+        {                                                                    \
+          __m128 a = _mm_load_ps(sp);                                        \
+          __m128 b = _mm_load_ps(sp + 4);                                    \
+          __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));          \
+          __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));          \
+          _mm_store_ps(dpl, c);                                              \
+          _mm_store_ps(dph, d);                                              \
+        }                                                                    \
+      else /* dpl gets odd-indexed samples, dph even-indexed */              \
+        for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)           \
+        {                                                                    \
+          __m128 a = _mm_load_ps(sp);                                        \
+          __m128 b = _mm_load_ps(sp + 4);                                    \
+          __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));          \
+          __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));          \
+          _mm_store_ps(dpl, d);                                              \
+          _mm_store_ps(dph, c);                                              \
+        }                                                                    \
+    }                                                                        
+
+    //////////////////////////////////////////////////////////////////////////
+    #define SSE_INTERLEAVE(dp, spl, sph, width, even)                        \
+    { /* modifies dp, spl, sph and width; 8 output samples per pass */       \
+      if (even) /* dp gets spl[0], sph[0], spl[1], sph[1], ... */            \
+        for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)           \
+        {                                                                    \
+          __m128 a = _mm_load_ps(spl);                                       \
+          __m128 b = _mm_load_ps(sph);                                       \
+          __m128 c = _mm_unpacklo_ps(a, b);                                  \
+          __m128 d = _mm_unpackhi_ps(a, b);                                  \
+          _mm_store_ps(dp, c);                                               \
+          _mm_store_ps(dp + 4, d);                                           \
+        }                                                                    \
+      else /* dp gets sph[0], spl[0], sph[1], spl[1], ... */                 \
+        for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)           \
+        {                                                                    \
+          __m128 a = _mm_load_ps(spl);                                       \
+          __m128 b = _mm_load_ps(sph);                                       \
+          __m128 c = _mm_unpacklo_ps(b, a);                                  \
+          __m128 d = _mm_unpackhi_ps(b, a);                                  \
+          _mm_store_ps(dp, c);                                               \
+          _mm_store_ps(dp + 4, d);                                           \
+        }                                                                    \
+    }
+
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
@@ -161,6 +216,116 @@ namespace ojph {
     //
     //////////////////////////////////////////////////////////////////////////
 
+    //////////////////////////////////////////////////////////////////////////
+    // Supporting macros
+    //////////////////////////////////////////////////////////////////////////
+
+    //////////////////////////////////////////////////////////////////////////
+    // 16 samples per pass (256-bit) while width > 8, then at most one pass
+    // of 8 (128-bit), because we assume byte_alignment == 32
+    #define AVX_DEINTERLEAVE(dpl, dph, sp, width, even)                      \
+    { /* modifies dpl, dph, sp and width */                                  \
+      if (even) /* dpl gets even-indexed samples, dph odd-indexed */         \
+      {                                                                      \
+        for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)         \
+        {                                                                    \
+          __m256 a = _mm256_load_ps(sp);                                     \
+          __m256 b = _mm256_load_ps(sp + 8);                                 \
+          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));           \
+          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));           \
+          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));       \
+          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));       \
+          _mm256_store_ps(dpl, e);                                           \
+          _mm256_store_ps(dph, f);                                           \
+        }                                                                    \
+        for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)           \
+        {                                                                    \
+          __m128 a = _mm_load_ps(sp);                                        \
+          __m128 b = _mm_load_ps(sp + 4);                                    \
+          __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));          \
+          __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));          \
+          _mm_store_ps(dpl, c);                                              \
+          _mm_store_ps(dph, d);                                              \
+        }                                                                    \
+      }                                                                      \
+      else /* dpl gets odd-indexed samples, dph even-indexed */              \
+      {                                                                      \
+        for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)         \
+        {                                                                    \
+          __m256 a = _mm256_load_ps(sp);                                     \
+          __m256 b = _mm256_load_ps(sp + 8);                                 \
+          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));           \
+          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));           \
+          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));       \
+          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));       \
+          _mm256_store_ps(dpl, f);                                           \
+          _mm256_store_ps(dph, e);                                           \
+        }                                                                    \
+        for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)           \
+        {                                                                    \
+          __m128 a = _mm_load_ps(sp);                                        \
+          __m128 b = _mm_load_ps(sp + 4);                                    \
+          __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));          \
+          __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));          \
+          _mm_store_ps(dpl, d);                                              \
+          _mm_store_ps(dph, c);                                              \
+        }                                                                    \
+      }                                                                      \
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // 16 samples per pass (256-bit) while width > 8, then at most one pass
+    // of 8 (128-bit), because we assume byte_alignment == 32
+    #define AVX_INTERLEAVE(dp, spl, sph, width, even)                        \
+    { /* modifies dp, spl, sph and width */                                  \
+      if (even) /* dp gets spl[0], sph[0], spl[1], sph[1], ... */            \
+      {                                                                      \
+        for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)         \
+        {                                                                    \
+          __m256 a = _mm256_load_ps(spl);                                    \
+          __m256 b = _mm256_load_ps(sph);                                    \
+          __m256 c = _mm256_unpacklo_ps(a, b);                               \
+          __m256 d = _mm256_unpackhi_ps(a, b);                               \
+          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));           \
+          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));           \
+          _mm256_store_ps(dp, e);                                            \
+          _mm256_store_ps(dp + 8, f);                                        \
+        }                                                                    \
+        for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)           \
+        {                                                                    \
+          __m128 a = _mm_load_ps(spl);                                       \
+          __m128 b = _mm_load_ps(sph);                                       \
+          __m128 c = _mm_unpacklo_ps(a, b);                                  \
+          __m128 d = _mm_unpackhi_ps(a, b);                                  \
+          _mm_store_ps(dp, c);                                               \
+          _mm_store_ps(dp + 4, d);                                           \
+        }                                                                    \
+      }                                                                      \
+      else /* dp gets sph[0], spl[0], sph[1], spl[1], ... */                 \
+      {                                                                      \
+        for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)         \
+        {                                                                    \
+          __m256 a = _mm256_load_ps(spl);                                    \
+          __m256 b = _mm256_load_ps(sph);                                    \
+          __m256 c = _mm256_unpacklo_ps(b, a);                               \
+          __m256 d = _mm256_unpackhi_ps(b, a);                               \
+          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));           \
+          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));           \
+          _mm256_store_ps(dp, e);                                            \
+          _mm256_store_ps(dp + 8, f);                                        \
+        }                                                                    \
+        for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)           \
+        {                                                                    \
+          __m128 a = _mm_load_ps(spl);                                       \
+          __m128 b = _mm_load_ps(sph);                                       \
+          __m128 c = _mm_unpacklo_ps(b, a);                                  \
+          __m128 d = _mm_unpackhi_ps(b, a);                                  \
+          _mm_store_ps(dp, c);                                               \
+          _mm_store_ps(dp + 4, d);                                           \
+        }                                                                    \
+      }                                                                      \
+    }
+
     //////////////////////////////////////////////////////////////////////////
     // Irreversible functions
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_transform_sse.cpp b/src/core/transform/ojph_transform_sse.cpp
index b61ea5e9..897a1939 100644
--- a/src/core/transform/ojph_transform_sse.cpp
+++ b/src/core/transform/ojph_transform_sse.cpp
@@ -41,15 +41,26 @@
 #include "ojph_defs.h"
 #include "ojph_arch.h"
 #include "ojph_mem.h"
-#include "ojph_transform.h"
-#include "ojph_transform_local.h"
-
 #include "ojph_params.h"
 #include "../codestream/ojph_params_local.h"
 
+#include "ojph_transform.h"
+#include "ojph_transform_local.h"
+
 namespace ojph {
   namespace local {
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline void sse_multiply_const(float* p, float f, int width)
+    { // multiplies p[] by f in place, 4 floats per pass
+      __m128 factor = _mm_set1_ps(f);        // f in all 4 lanes
+      for (; width > 0; width -= 4, p += 4)  // may touch 3 floats past width
+      {
+        __m128 s = _mm_load_ps(p);           // aligned load
+        _mm_store_ps(p, _mm_mul_ps(factor, s));
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void sse_irv_vert_step(const lifting_step* s, const line_buf* sig, 
                            const line_buf* other, const line_buf* aug, 
@@ -77,14 +88,7 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void sse_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
     {
-      __m128 factor = _mm_set1_ps(K);
-      float* dst = aug->f32;
-      int i = (int)repeat;
-      for (; i > 0; i -= 4, dst += 4)
-      {
-        __m128 s = _mm_load_ps(dst);
-        _mm_store_ps(dst, _mm_mul_ps(factor, s));
-      }
+      sse_multiply_const(aug->f32, K, (int)repeat); // aug *= K, in place
     }
 
     /////////////////////////////////////////////////////////////////////////
@@ -95,39 +99,12 @@ namespace ojph {
       if (width > 1)
       {
         // split src into ldst and hdst
-        if (even)
         {
-          float* dph = hdst->f32;
           float* dpl = ldst->f32;
-          float* sp = src->f32;
-
-          int i = (int)width;
-          for ( ; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
-          {
-            __m128 a = _mm_load_ps(sp);
-            __m128 b = _mm_load_ps(sp + 4);
-            __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
-            __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
-            _mm_store_ps(dpl, c);
-            _mm_store_ps(dph, d);
-          }
-        }
-        else
-        {
           float* dph = hdst->f32;
-          float* dpl = ldst->f32;
           float* sp = src->f32;
-
-          int i = (int)width;
-          for ( ; i > 0; i -= 8, sp += 8, dpl += 4, dph += 4)
-          {
-            __m128 a = _mm_load_ps(sp);
-            __m128 b = _mm_load_ps(sp + 4);
-            __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
-            __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
-            _mm_store_ps(dpl, d);
-            _mm_store_ps(dph, c);
-          }
+          int w = (int)width;
+          SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
         }
 
         // the actual horizontal transform
@@ -137,7 +114,6 @@ namespace ojph {
         ui32 num_steps = atk->get_num_steps();
         for (ui32 j = num_steps; j > 0; --j)
         {
-          // first lifting step
           const lifting_step* s = atk->get_step(j - 1);
           const float a = s->irv.Aatk;
 
@@ -181,27 +157,8 @@ namespace ojph {
         { // multiply by K or 1/K
           float K = atk->get_K();
           float K_inv = 1.0f / K;
-          float* dp;
-          int i;
-          __m128 factor;
-
-          factor = _mm_set1_ps(K_inv);
-          dp = lp;
-          i = (int)l_width;
-          for ( ; i > 0; i -= 4, dp += 4)
-          {
-            __m128 s = _mm_load_ps(dp);
-            _mm_store_ps(dp, _mm_mul_ps(factor, s));
-          }
-
-          factor = _mm_set1_ps(K);
-          dp = hp;
-          i = (int)h_width;
-          for ( ; i > 0; i -= 4, dp += 4)
-          {
-            __m128 s = _mm_load_ps(dp);
-            _mm_store_ps(dp, _mm_mul_ps(factor, s));
-          }
+          sse_multiply_const(lp, K_inv, (int)l_width);
+          sse_multiply_const(hp, K, (int)h_width);
         }
       }
       else {
@@ -227,27 +184,8 @@ namespace ojph {
         { // multiply by K or 1/K
           float K = atk->get_K();
           float K_inv = 1.0f / K;
-          float* dp;
-          int i;
-          __m128 factor;
-
-          factor = _mm_set1_ps(K);
-          dp = aug;
-          i = (int)aug_width;
-          for ( ; i > 0; i -= 4, dp += 4)
-          {
-            __m128 s = _mm_load_ps(dp);
-            _mm_store_ps(dp, _mm_mul_ps(factor, s));
-          }
-
-          factor = _mm_set1_ps(K_inv);
-          dp = oth;
-          i = (int)oth_width;
-          for ( ; i > 0; i -= 4, dp += 4)
-          {
-            __m128 s = _mm_load_ps(dp);
-            _mm_store_ps(dp, _mm_mul_ps(factor, s));
-          }
+          sse_multiply_const(aug, K, (int)aug_width);
+          sse_multiply_const(oth, K_inv, (int)oth_width);
         }
 
         // the actual horizontal transform
@@ -295,37 +233,12 @@ namespace ojph {
         }
 
         // combine both lsrc and hsrc into dst
-        if (even)
         {
-          float* sph = hsrc->f32;
-          float* spl = lsrc->f32;
           float* dp = dst->f32;
-          int i = (int)width;
-          for ( ; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
-          {
-            __m128 a = _mm_load_ps(spl);
-            __m128 b = _mm_load_ps(sph);
-            __m128 c = _mm_unpacklo_ps(a, b);
-            __m128 d = _mm_unpackhi_ps(a, b);
-            _mm_store_ps(dp, c);
-            _mm_store_ps(dp + 4, d);
-          }
-        }
-        else
-        {
-          float* sph = hsrc->f32;
           float* spl = lsrc->f32;
-          float* dp = dst->f32;
-          int i = (int)width;
-          for ( ; i > 0; i -= 8, dp += 8, spl += 4, sph += 4)
-          {
-            __m128 a = _mm_load_ps(spl);
-            __m128 b = _mm_load_ps(sph);
-            __m128 c = _mm_unpacklo_ps(b, a);
-            __m128 d = _mm_unpackhi_ps(b, a);
-            _mm_store_ps(dp, c);
-            _mm_store_ps(dp + 4, d);
-          }
+          float* sph = hsrc->f32;
+          int w = (int)width;
+          SSE_INTERLEAVE(dp, spl, sph, w, even);
         }
       }
       else {
diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp
index 5f3de49d..4939a219 100644
--- a/src/core/transform/ojph_transform_sse2.cpp
+++ b/src/core/transform/ojph_transform_sse2.cpp
@@ -40,6 +40,9 @@
 #include "ojph_defs.h"
 #include "ojph_arch.h"
 #include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
@@ -48,211 +51,414 @@
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_fwd_predict(const line_buf* line_src1,
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui32 e = s->rev.Eatk;
+      __m128i va = _mm_set1_epi32(a);
+      __m128i vb = _mm_set1_epi32(b);
 
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
-      {
-        __m128i s1 = _mm_load_si128((__m128i*)src1);
-        __m128i s2 = _mm_load_si128((__m128i*)src2);
-        __m128i d = _mm_load_si128((__m128i*)dst);
-        s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-        d = _mm_sub_epi32(d, s1);
-        _mm_store_si128((__m128i*)dst, d);
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different from that in Part 1, although they are mathematically
+      // equivalent; here, we identify the simpler Part 1 forms and use them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i v = _mm_add_epi32(vb, t);
+            __m128i w = _mm_srai_epi32(v, e);
+            d = _mm_sub_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i v = _mm_add_epi32(vb, t);
+            __m128i w = _mm_srai_epi32(v, e);
+            d = _mm_add_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
       }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_fwd_update(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      __m128i offset = _mm_set1_epi32(2);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
-      {
-        __m128i s1 = _mm_load_si128((__m128i*)src1);
-        s1 = _mm_add_epi32(s1, offset);
-        __m128i s2 = _mm_load_si128((__m128i*)src2);
-        s2 = _mm_add_epi32(s2, s1);
-        __m128i d = _mm_load_si128((__m128i*)dst);
-        d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
-        _mm_store_si128((__m128i*)dst, d);
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i w = _mm_srai_epi32(t, e);
+            d = _mm_add_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i w = _mm_srai_epi32(t, e);
+            d = _mm_sub_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i v = _mm_sub_epi32(vb, t);
+            __m128i w = _mm_srai_epi32(v, e);
+            d = _mm_sub_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            __m128i s1 = _mm_load_si128((__m128i*)src1);
+            __m128i s2 = _mm_load_si128((__m128i*)src2);
+            __m128i d = _mm_load_si128((__m128i*)dst);
+            __m128i t = _mm_add_epi32(s1, s2);
+            __m128i v = _mm_sub_epi32(vb, t);
+            __m128i w = _mm_srai_epi32(v, e);
+            d = _mm_add_epi32(d, w);
+            _mm_store_si128((__m128i*)dst, d);
+          }
+      }
+      else { // general case
+        // 32bit multiplication is not supported in sse2; we need sse4.1,
+        // where we can use _mm_mullo_epi32, which multiplies 32bit x 32bit,
+        // keeping the LSBs
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
-                                   line_buf *line_hdst, ui32 width, bool even)
+    /////////////////////////////////////////////////////////////////////////
+    void sse2_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *src = line_src->i32;
-        si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
-
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const si32* sp = src + (even ? 1 : 0);
-        si32 *dph = hdst;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          __m128i s1 = _mm_loadu_si128((__m128i*)(sp-1));
-          __m128i s2 = _mm_loadu_si128((__m128i*)(sp+1));
-          __m128i d = _mm_loadu_si128((__m128i*)sp);
-          s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-          __m128i d1 = _mm_sub_epi32(d, s1);
-          sp += 4;
-          s1 = _mm_loadu_si128((__m128i*)(sp-1));
-          s2 = _mm_loadu_si128((__m128i*)(sp+1));
-          d = _mm_loadu_si128((__m128i*)sp);
-          s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-          __m128i d2 = _mm_sub_epi32(d, s1);
-          sp += 4;
-          d = _mm_castps_si128(_mm_shuffle_ps(
-              _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
-          _mm_store_si128((__m128i*)dph, d);
+        // de-interleave src into ldst and hdst
+        {
+          float* dpl = ldst->f32;
+          float* dph = hdst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          SSE_DEINTERLEAVE(dpl, dph, sp, w, even);
         }
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        sp = src + (even ? 0 : 1);
-        const si32* sph = hdst + (even ? 0 : 1);
-        si32 *dpl = ldst;
-        __m128i offset = _mm_set1_epi32(2);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
-          s1 = _mm_add_epi32(s1, offset);
-          __m128i s2 = _mm_loadu_si128((__m128i*)sph);
-          s2 = _mm_add_epi32(s2, s1);
-          __m128i d1 = _mm_loadu_si128((__m128i*)sp);
-          __m128i d2 = _mm_loadu_si128((__m128i*)sp + 1);
-          __m128i d = _mm_castps_si128(_mm_shuffle_ps(
-              _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
-          d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
-          _mm_store_si128((__m128i*)dpl, d);
+          // first lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui32 e = s->rev.Eatk;
+          __m128i va = _mm_set1_epi32(a);
+          __m128i vb = _mm_set1_epi32(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_add_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_add_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i w = _mm_srai_epi32(t, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i w = _mm_srai_epi32(t, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_sub_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_sub_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            // 32bit multiplication is not supported in sse2; we need sse4.1,
+            // where we can use _mm_mullo_epi32, which multiplies
+            // 32bit x 32bit, keeping the LSBs
+            if (even)
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[0] + sp[1])) >> e;
+            else
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
+
+          // swap buffers
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_ldst->i32[0] = line_src->i32[0];
+          ldst->i32[0] = src->i32[0];
         else
-          line_hdst->i32[0] = line_src->i32[0] << 1;
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_bwd_predict(const line_buf* line_src1,
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
-      {
-        __m128i s1 = _mm_load_si128((__m128i*)src1);
-        __m128i s2 = _mm_load_si128((__m128i*)src2);
-        __m128i d = _mm_load_si128((__m128i*)dst);
-        s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-        d = _mm_add_epi32(d, s1);
-        _mm_store_si128((__m128i*)dst, d);
+          hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_vert_wvlt_bwd_update(const line_buf* line_src1,
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
     
-      __m128i offset = _mm_set1_epi32(2);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
-      {
-        __m128i s1 = _mm_load_si128((__m128i*)src1);
-        s1 = _mm_add_epi32(s1, offset);
-        __m128i s2 = _mm_load_si128((__m128i*)src2);
-        s2 = _mm_add_epi32(s2, s1);
-        __m128i d = _mm_load_si128((__m128i*)dst);
-        d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
-        _mm_store_si128((__m128i*)dst, d);
-      }
-    }
-
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc,
-                                   line_buf *line_hsrc, ui32 width, bool even)
+    void sse2_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even)
     {
       if (width > 1)
       {
-        si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
-        si32 *dst = line_dst->i32;
+        bool ev = even;
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui32 e = s->rev.Eatk;
+          __m128i va = _mm_set1_epi32(a);
+          __m128i vb = _mm_set1_epi32(b);
 
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_add_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_add_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i w = _mm_srai_epi32(t, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i w = _mm_srai_epi32(t, e);
+                d = _mm_add_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp - 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_sub_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                __m128i s1 = _mm_load_si128((__m128i*)sp);
+                __m128i s2 = _mm_loadu_si128((__m128i*)(sp + 1));
+                __m128i d = _mm_load_si128((__m128i*)dp);
+                __m128i t = _mm_add_epi32(s1, s2);
+                __m128i v = _mm_sub_epi32(vb, t);
+                __m128i w = _mm_srai_epi32(v, e);
+                d = _mm_sub_epi32(d, w);
+                _mm_store_si128((__m128i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            // 32bit multiplication is not supported in sse2; we need sse4.1,
+            // where we can use _mm_mullo_epi32, which multiplies
+            // 32bit x 32bit, keeping the LSBs
+            if (ev)
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+            else
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[0] + sp[1])) >> e;
+          }
 
-        // extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        const si32 *sph = hsrc + (even ? 0 : 1);
-        si32 *spl = lsrc;
-        __m128i offset = _mm_set1_epi32(2);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4)
-        {
-          __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
-          s1 = _mm_add_epi32(s1, offset);
-          __m128i s2 = _mm_loadu_si128((__m128i*)sph);
-          s2 = _mm_add_epi32(s2, s1);
-          __m128i d = _mm_load_si128((__m128i*)spl);
-          d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
-          _mm_store_si128((__m128i*)spl, d);
+          // swap buffers
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        // extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width - 1];
-        // inverse predict and combine
-        si32 *dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8)
+        // combine both lsrc and hsrc into dst
         {
-          __m128i s1 = _mm_loadu_si128((__m128i*)spl);
-          __m128i s2 = _mm_loadu_si128((__m128i*)(spl+1));
-          __m128i d = _mm_load_si128((__m128i*)sph);
-          s2 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
-          d = _mm_add_epi32(d, s2);
-          _mm_storeu_si128((__m128i*)dp, _mm_unpacklo_epi32(s1, d));
-          _mm_storeu_si128((__m128i*)dp + 1, _mm_unpackhi_epi32(s1, d));
+          float* dp = dst->f32;
+          float* spl = lsrc->f32;
+          float* sph = hsrc->f32;
+          int w = (int)width;
+          SSE_INTERLEAVE(dp, spl, sph, w, even);
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_dst->i32[0] = line_lsrc->i32[0];
+          dst->i32[0] = lsrc->i32[0];
         else
-          line_dst->i32[0] = line_hsrc->i32[0] >> 1;
+          dst->i32[0] = hsrc->i32[0] >> 1;
       }
     }
-  }
-}
+
+  } // !local
+} // !ojph

From d1f505f2869c600c31532a0ae48aacb377336296 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Fri, 12 Apr 2024 21:42:34 +1000
Subject: [PATCH 30/37] Addresses compilation warnings.

---
 src/core/transform/ojph_transform_avx2.cpp | 6 +++---
 src/core/transform/ojph_transform_sse2.cpp | 9 +++------
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp
index a7b16ddb..243fe87f 100644
--- a/src/core/transform/ojph_transform_avx2.cpp
+++ b/src/core/transform/ojph_transform_avx2.cpp
@@ -58,7 +58,7 @@ namespace ojph {
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
+      const si32 e = s->rev.Eatk;
       __m256i va = _mm256_set1_epi32(a);
       __m256i vb = _mm256_set1_epi32(b);
 
@@ -206,7 +206,7 @@ namespace ojph {
           const lifting_step* s = atk->get_step(j - 1);
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
-          const ui32 e = s->rev.Eatk;
+          const si32 e = s->rev.Eatk;
           __m256i va = _mm256_set1_epi32(a);
           __m256i vb = _mm256_set1_epi32(b);
 
@@ -364,7 +364,7 @@ namespace ojph {
           const lifting_step* s = atk->get_step(j);
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
-          const ui32 e = s->rev.Eatk;
+          const si32 e = s->rev.Eatk;
           __m256i va = _mm256_set1_epi32(a);
           __m256i vb = _mm256_set1_epi32(b);
 
diff --git a/src/core/transform/ojph_transform_sse2.cpp b/src/core/transform/ojph_transform_sse2.cpp
index 4939a219..8328842a 100644
--- a/src/core/transform/ojph_transform_sse2.cpp
+++ b/src/core/transform/ojph_transform_sse2.cpp
@@ -58,8 +58,7 @@ namespace ojph {
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const ui32 e = s->rev.Eatk;
-      __m128i va = _mm_set1_epi32(a);
+      const si32 e = s->rev.Eatk;
       __m128i vb = _mm_set1_epi32(b);
 
       si32* dst = aug->i32;
@@ -188,8 +187,7 @@ namespace ojph {
           const lifting_step* s = atk->get_step(j - 1);
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
-          const ui32 e = s->rev.Eatk;
-          __m128i va = _mm_set1_epi32(a);
+          const si32 e = s->rev.Eatk;
           __m128i vb = _mm_set1_epi32(b);
 
           // extension
@@ -328,8 +326,7 @@ namespace ojph {
           const lifting_step* s = atk->get_step(j);
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
-          const ui32 e = s->rev.Eatk;
-          __m128i va = _mm_set1_epi32(a);
+          const si32 e = s->rev.Eatk;
           __m128i vb = _mm_set1_epi32(b);
 
           // extension

From 1c4a14ce94a3fcd2073318eb86027106033a396b Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sat, 13 Apr 2024 09:45:20 +1000
Subject: [PATCH 31/37] avx512 dwt implemented

---
 src/core/CMakeLists.txt                      |   9 +-
 src/core/common/ojph_arch.h                  |   6 +-
 src/core/transform/ojph_transform.cpp        |  24 +-
 src/core/transform/ojph_transform_avx.cpp    |   9 +-
 src/core/transform/ojph_transform_avx2.cpp   |   2 -
 src/core/transform/ojph_transform_avx512.cpp | 830 +++++++++++++++++++
 src/core/transform/ojph_transform_local.h    |  48 +-
 7 files changed, 855 insertions(+), 73 deletions(-)
 create mode 100644 src/core/transform/ojph_transform_avx512.cpp

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 40b9649b..19123a2e 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -18,11 +18,12 @@ file(GLOB TRANSFORM_SSE    "transform/*_sse.cpp")
 file(GLOB TRANSFORM_SSE2   "transform/*_sse2.cpp")
 file(GLOB TRANSFORM_AVX    "transform/*_avx.cpp")
 file(GLOB TRANSFORM_AVX2   "transform/*_avx2.cpp")
+file(GLOB TRANSFORM_AVX512 "transform/*_avx512.cpp")
 file(GLOB TRANSFORM_WASM   "transform/*_wasm.cpp")
 
 list(REMOVE_ITEM CODESTREAM ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODESTREAM_WASM})
 list(REMOVE_ITEM CODING ${CODING_SSSE3} ${CODING_WASM} ${CODING_AVX512})
-list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_WASM})
+list(REMOVE_ITEM TRANSFORM ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512} ${TRANSFORM_WASM})
 list(APPEND SOURCES ${CODESTREAM} ${CODING} ${COMMON} ${OTHERS} ${TRANSFORM})
 
 source_group("codestream"        FILES ${CODESTREAM})
@@ -42,10 +43,10 @@ if(EMSCRIPTEN)
   source_group("coding" FILES ${CODING_WASM})
   source_group("transform" FILES ${TRANSFORM_WASM})
 elseif(NOT OJPH_DISABLE_INTEL_SIMD)
-  add_library(openjph ${SOURCES} ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODING_SSSE3} ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2})
+  add_library(openjph ${SOURCES} ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2} ${CODING_SSSE3} ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512})
   source_group("codestream" FILES ${CODESTREAM_SSE} ${CODESTREAM_SSE2} ${CODESTREAM_AVX} ${CODESTREAM_AVX2})
   source_group("coding" FILES ${CODING_SSSE3})
-  source_group("transform" FILES ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2})
+  source_group("transform" FILES ${TRANSFORM_SSE} ${TRANSFORM_SSE2} ${TRANSFORM_AVX} ${TRANSFORM_AVX2} ${TRANSFORM_AVX512})
   if (OJPH_ENABLE_INTEL_AVX512)
     target_sources(openjph PRIVATE ${CODING_AVX512})
     source_group("coding" FILES ${CODING_AVX512})
@@ -71,6 +72,7 @@ if (MSVC)
   set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
   set_source_files_properties(transform/ojph_transform_avx.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX")
   set_source_files_properties(transform/ojph_transform_avx2.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX2")
+  set_source_files_properties(transform/ojph_transform_avx512.cpp PROPERTIES COMPILE_FLAGS "/arch:AVX512")
 else()
   set_source_files_properties(codestream/ojph_codestream_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
   set_source_files_properties(codestream/ojph_codestream_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
@@ -80,6 +82,7 @@ else()
   set_source_files_properties(transform/ojph_colour_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
   set_source_files_properties(transform/ojph_transform_avx.cpp PROPERTIES COMPILE_FLAGS -mavx)
   set_source_files_properties(transform/ojph_transform_avx2.cpp PROPERTIES COMPILE_FLAGS -mavx2)
+  set_source_files_properties(transform/ojph_transform_avx512.cpp PROPERTIES COMPILE_FLAGS -mavx512f)
 endif()
 
 if (MSVC)
diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h
index 62b630bb..fa9d077d 100644
--- a/src/core/common/ojph_arch.h
+++ b/src/core/common/ojph_arch.h
@@ -194,11 +194,7 @@ namespace ojph {
   ////////////////////////////////////////////////////////////////////////////
   // constants
   ////////////////////////////////////////////////////////////////////////////
-#ifdef OJPH_ENABLE_INTEL_AVX512
-  const ui32 byte_alignment = 64; //64 bytes == 512 bits
-#else
-  const ui32 byte_alignment = 32; //32 bytes == 256 bits
-#endif
+  const ui32 byte_alignment = 64; // 64 bytes == 512 bits
   const ui32 log_byte_alignment = 31 - count_leading_zeros(byte_alignment);
   const ui32 object_alignment = 8;
 
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index 95ab686c..83eed644 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -145,17 +145,19 @@ namespace ojph {
         rev_horz_syn              = avx2_rev_horz_syn;
       }
 
-      //if (level >= X86_CPU_EXT_LEVEL_AVX512)
-      //{
-      //  rev_vert_step             = avx512_rev_vert_ana_step;
-      //  rev_horz_ana              = avx512_rev_horz_ana;
-      //  rev_horz_syn              = avx512_rev_horz_syn;
-
-      //  irv_vert_step             = avx512_irv_vert_step;
-      //  irv_vert_times_K          = avx512_irv_vert_times_K;
-      //  irv_vert_syn_step         = avx512_irv_vert_syn_step;
-      //  irv_horz_syn              = avx512_irv_horz_syn;
-      //}
+#ifdef OJPH_ENABLE_INTEL_AVX512
+      if (level >= X86_CPU_EXT_LEVEL_AVX512)
+      {
+        rev_vert_step             = avx512_rev_vert_step;
+        rev_horz_ana              = avx512_rev_horz_ana;
+        rev_horz_syn              = avx512_rev_horz_syn;
+
+        irv_vert_step             = avx512_irv_vert_step;
+        irv_vert_times_K          = avx512_irv_vert_times_K;
+        irv_horz_ana              = avx512_irv_horz_ana;
+        irv_horz_syn              = avx512_irv_horz_syn;
+      }
+#endif // !OJPH_ENABLE_INTEL_AVX512
 
 #endif // !OJPH_DISABLE_INTEL_SIMD
 
diff --git a/src/core/transform/ojph_transform_avx.cpp b/src/core/transform/ojph_transform_avx.cpp
index e7933ff1..08566624 100644
--- a/src/core/transform/ojph_transform_avx.cpp
+++ b/src/core/transform/ojph_transform_avx.cpp
@@ -88,14 +88,7 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void avx_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
     {
-      __m256 factor = _mm256_set1_ps(K);
-      float* dst = aug->f32;
-      int i = (int)repeat;
-      for (; i > 0; i -= 8, dst += 8)
-      {
-        __m256 s = _mm256_load_ps(dst);
-        _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
-      }
+      avx_multiply_const(aug->f32, K, (int)repeat);
     }
 
     /////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_transform_avx2.cpp b/src/core/transform/ojph_transform_avx2.cpp
index 243fe87f..847cd4c4 100644
--- a/src/core/transform/ojph_transform_avx2.cpp
+++ b/src/core/transform/ojph_transform_avx2.cpp
@@ -514,7 +514,5 @@ namespace ojph {
       }
     }
 
-
-
   } // !local
 } // !ojph
diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp
new file mode 100644
index 00000000..efb7655a
--- /dev/null
+++ b/src/core/transform/ojph_transform_avx512.cpp
@@ -0,0 +1,830 @@
+//***************************************************************************/
+// This software is released under the 2-Clause BSD license, included
+// below.
+//
+// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019, The University of New South Wales, Australia
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// 
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// 
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+// 
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+// IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+// TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+// PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+// TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//***************************************************************************/
+// This file is part of the OpenJPH software implementation.
+// File: ojph_transform_avx512.cpp
+// Author: Aous Naman
+// Date: 28 August 2019
+//***************************************************************************/
+
+#include <cstdio>
+
+#include "ojph_defs.h"
+#include "ojph_arch.h"
+#include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
+#include "ojph_transform.h"
+#include "ojph_transform_local.h"
+
+#include <immintrin.h>
+
+namespace ojph {
+  namespace local {
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 32 followed by multiples of 16, because
+    // we assume byte_alignment == 64
+    static void avx512_deinterleave(float* dpl, float* dph, float* sp, 
+                                    int width, bool even)
+    {
+      __m512i idx1 = _mm512_set_epi32(
+        0x1E, 0x1C, 0x1A, 0x18, 0x16, 0x14, 0x12, 0x10,
+        0x0E, 0x0C, 0x0A, 0x08, 0x06, 0x04, 0x02, 0x00
+      );
+      __m512i idx2 = _mm512_set_epi32(
+        0x1F, 0x1D, 0x1B, 0x19, 0x17, 0x15, 0x13, 0x11,
+        0x0F, 0x0D, 0x0B, 0x09, 0x07, 0x05, 0x03, 0x01
+      );
+      if (even)
+      {
+        for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16)
+        {
+          __m512 a = _mm512_load_ps(sp);
+          __m512 b = _mm512_load_ps(sp + 16);
+          __m512 c = _mm512_permutex2var_ps(a, idx1, b);
+          __m512 d = _mm512_permutex2var_ps(a, idx2, b);
+          _mm512_store_ps(dpl, c);
+          _mm512_store_ps(dph, d);
+        }
+        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
+        {
+          __m256 a = _mm256_load_ps(sp);
+          __m256 b = _mm256_load_ps(sp + 8);
+          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+          _mm256_store_ps(dpl, e);
+          _mm256_store_ps(dph, f);
+        }
+      }
+      else
+      {
+        for (; width > 16; width -= 32, sp += 32, dpl += 16, dph += 16)
+        {
+          __m512 a = _mm512_load_ps(sp);
+          __m512 b = _mm512_load_ps(sp + 16);
+          __m512 c = _mm512_permutex2var_ps(a, idx2, b);
+          __m512 d = _mm512_permutex2var_ps(a, idx1, b);
+          _mm512_store_ps(dpl, c);
+          _mm512_store_ps(dph, d);
+        }
+        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
+        {
+          __m256 a = _mm256_load_ps(sp);
+          __m256 b = _mm256_load_ps(sp + 8);
+          __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
+          __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
+          __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
+          __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
+          _mm256_store_ps(dpl, f);
+          _mm256_store_ps(dph, e);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // We split multiples of 32 followed by multiples of 16, because
+    // we assume byte_alignment == 64
+    static void avx512_interleave(float* dp, float* spl, float* sph,
+                                  int width, bool even)
+    {
+      __m512i idx1 = _mm512_set_epi32(
+        0x17, 0x7, 0x16, 0x6, 0x15, 0x5, 0x14, 0x4,
+        0x13, 0x3, 0x12, 0x2, 0x11, 0x1, 0x10, 0x0
+      );
+      __m512i idx2 = _mm512_set_epi32(
+        0x1F, 0xF, 0x1E, 0xE, 0x1D, 0xD, 0x1C, 0xC,
+        0x1B, 0xB, 0x1A, 0xA, 0x19, 0x9, 0x18, 0x8
+      );
+      if (even)
+      {
+        for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16)
+        {
+          __m512 a = _mm512_load_ps(spl);
+          __m512 b = _mm512_load_ps(sph);
+          __m512 c = _mm512_permutex2var_ps(a, idx1, b);
+          __m512 d = _mm512_permutex2var_ps(a, idx2, b);
+          _mm512_store_ps(dp, c);
+          _mm512_store_ps(dp + 16, d);
+        }
+        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
+        {
+          __m256 a = _mm256_load_ps(spl);
+          __m256 b = _mm256_load_ps(sph);
+          __m256 c = _mm256_unpacklo_ps(a, b);
+          __m256 d = _mm256_unpackhi_ps(a, b);
+          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+          _mm256_store_ps(dp, e);
+          _mm256_store_ps(dp + 8, f);
+        }
+      }
+      else
+      {
+        for (; width > 16; width -= 32, dp += 32, spl += 16, sph += 16)
+        {
+          __m512 a = _mm512_load_ps(spl);
+          __m512 b = _mm512_load_ps(sph);
+          __m512 c = _mm512_permutex2var_ps(b, idx1, a);
+          __m512 d = _mm512_permutex2var_ps(b, idx2, a);
+          _mm512_store_ps(dp, c);
+          _mm512_store_ps(dp + 16, d);
+        }
+        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
+        {
+          __m256 a = _mm256_load_ps(spl);
+          __m256 b = _mm256_load_ps(sph);
+          __m256 c = _mm256_unpacklo_ps(b, a);
+          __m256 d = _mm256_unpackhi_ps(b, a);
+          __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
+          __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
+          _mm256_store_ps(dp, e);
+          _mm256_store_ps(dp + 8, f);
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline void avx512_multiply_const(float* p, float f, int width)
+    {
+      __m512 factor = _mm512_set1_ps(f);
+      for (; width > 0; width -= 16, p += 16)
+      {
+        __m512 s = _mm512_load_ps(p);
+        _mm512_store_ps(p, _mm512_mul_ps(factor, s));
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    {
+      float a = s->irv.Aatk;
+      if (synthesis)
+        a = -a;
+
+      __m512 factor = _mm512_set1_ps(a);
+
+      float* dst = aug->f32;
+      const float* src1 = sig->f32, * src2 = other->f32;
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+      {
+        __m512 s1 = _mm512_load_ps(src1);
+        __m512 s2 = _mm512_load_ps(src2);
+        __m512 d = _mm512_load_ps(dst);
+        d = _mm512_add_ps(d, _mm512_mul_ps(factor, _mm512_add_ps(s1, s2)));
+        _mm512_store_ps(dst, d);
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx512_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat)
+    {
+      avx512_multiply_const(aug->f32, K, (int)repeat);
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          float* dpl = ldst->f32;
+          float* dph = hdst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
+        }
+
+        // the actual horizontal transform
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk;
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const float* sp = lp;
+          float* dp = hp;
+          int i = (int)h_width;
+          __m512 f = _mm512_set1_ps(a);
+          if (even)
+          {
+            for (; i > 0; i -= 16, sp += 16, dp += 16)
+            {
+              __m512 m = _mm512_load_ps(sp);
+              __m512 n = _mm512_loadu_ps(sp + 1);
+              __m512 p = _mm512_load_ps(dp);
+              p = _mm512_add_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
+              _mm512_store_ps(dp, p);
+            }
+          }
+          else
+          {
+            for (; i > 0; i -= 16, sp += 16, dp += 16)
+            {
+              __m512 m = _mm512_load_ps(sp);
+              __m512 n = _mm512_loadu_ps(sp - 1);
+              __m512 p = _mm512_load_ps(dp);
+              p = _mm512_add_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
+              _mm512_store_ps(dp, p);
+            }
+          }
+
+          // swap buffers
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          avx512_multiply_const(lp, K_inv, (int)l_width);
+          avx512_multiply_const(hp, K, (int)h_width);
+        }
+      }
+      else {
+        if (even)
+          ldst->f32[0] = src->f32[0];
+        else
+          hdst->f32[0] = src->f32[0] * 2.0f;
+      }
+    }
+    
+    //////////////////////////////////////////////////////////////////////////
+    void avx512_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        bool ev = even;
+        float* oth = hsrc->f32, * aug = lsrc->f32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+
+        { // multiply by K or 1/K
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          avx512_multiply_const(aug, K, (int)aug_width);
+          avx512_multiply_const(oth, K_inv, (int)oth_width);
+        }
+
+        // the actual horizontal transform
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk;
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const float* sp = oth;
+          float* dp = aug;
+          int i = (int)aug_width;
+          __m512 f = _mm512_set1_ps(a);
+          if (ev)
+          {
+            for (; i > 0; i -= 16, sp += 16, dp += 16)
+            {
+              __m512 m = _mm512_load_ps(sp);
+              __m512 n = _mm512_loadu_ps(sp - 1);
+              __m512 p = _mm512_load_ps(dp);
+              p = _mm512_sub_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
+              _mm512_store_ps(dp, p);
+            }
+          }
+          else
+          {
+            for (; i > 0; i -= 16, sp += 16, dp += 16)
+            {
+              __m512 m = _mm512_load_ps(sp);
+              __m512 n = _mm512_loadu_ps(sp + 1);
+              __m512 p = _mm512_load_ps(dp);
+              p = _mm512_sub_ps(p, _mm512_mul_ps(f, _mm512_add_ps(m, n)));
+              _mm512_store_ps(dp, p);
+            }
+          }
+
+          // swap buffers
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even);
+      }
+      else {
+        if (even)
+          dst->f32[0] = lsrc->f32[0];
+        else
+          dst->f32[0] = hsrc->f32[0] * 0.5f;
+      }
+    }
+
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                              const line_buf* other, const line_buf* aug, 
+                              ui32 repeat, bool synthesis)
+    {
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const si32 e = s->rev.Eatk;
+      __m512i va = _mm512_set1_epi32(a);
+      __m512i vb = _mm512_set1_epi32(b);
+
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly
+      // different to that in Part 1, although they are mathematically
+      // equivalent; here, we identify the simpler Part 1 forms and use them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i v = _mm512_add_epi32(vb, t);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_sub_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i v = _mm512_add_epi32(vb, t);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_add_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i w = _mm512_srai_epi32(t, e);
+            d = _mm512_add_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i w = _mm512_srai_epi32(t, e);
+            d = _mm512_sub_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i v = _mm512_sub_epi32(vb, t);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_sub_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i v = _mm512_sub_epi32(vb, t);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_add_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+      else { // general case
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i u = _mm512_mullo_epi32(va, t);
+            __m512i v = _mm512_add_epi32(vb, u);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_sub_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 16, dst += 16, src1 += 16, src2 += 16)
+          {
+            __m512i s1 = _mm512_load_si512((__m512i*)src1);
+            __m512i s2 = _mm512_load_si512((__m512i*)src2);
+            __m512i d = _mm512_load_si512((__m512i*)dst);
+            __m512i t = _mm512_add_epi32(s1, s2);
+            __m512i u = _mm512_mullo_epi32(va, t);
+            __m512i v = _mm512_add_epi32(vb, u);
+            __m512i w = _mm512_srai_epi32(v, e);
+            d = _mm512_add_epi32(d, w);
+            _mm512_store_si512((__m512i*)dst, d);
+          }
+      }
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                             const line_buf* hdst, const line_buf* src, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        // split src into ldst and hdst
+        {
+          float* dpl = ldst->f32;
+          float* dph = hdst->f32;
+          float* sp = src->f32;
+          int w = (int)width;
+          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
+        }
+
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
+        {
+          // parameters of the current lifting step
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const si32 e = s->rev.Eatk;
+          __m512i va = _mm512_set1_epi32(a);
+          __m512i vb = _mm512_set1_epi32(b);
+
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_add_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_add_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i w = _mm512_srai_epi32(t, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i w = _mm512_srai_epi32(t, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_sub_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_sub_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i u = _mm512_mullo_epi32(va, t);
+                __m512i v = _mm512_add_epi32(vb, u);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i u = _mm512_mullo_epi32(va, t);
+                __m512i v = _mm512_add_epi32(vb, u);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
+        }
+      }
+      else {
+        if (even)
+          ldst->i32[0] = src->i32[0];
+        else
+          hdst->i32[0] = src->i32[0] << 1;
+      }
+    }
+    
+    //////////////////////////////////////////////////////////////////////////
+    void avx512_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                             const line_buf* lsrc, const line_buf* hsrc, 
+                             ui32 width, bool even)
+    {
+      if (width > 1)
+      {
+        bool ev = even;
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
+        {
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const si32 e = s->rev.Eatk;
+          __m512i va = _mm512_set1_epi32(a);
+          __m512i vb = _mm512_set1_epi32(b);
+
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_add_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_add_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i w = _mm512_srai_epi32(t, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i w = _mm512_srai_epi32(t, e);
+                d = _mm512_add_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_sub_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i v = _mm512_sub_epi32(vb, t);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+          else {
+            // general case
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp - 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i u = _mm512_mullo_epi32(va, t);
+                __m512i v = _mm512_add_epi32(vb, u);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 16, sp += 16, dp += 16)
+              {
+                __m512i s1 = _mm512_load_si512((__m512i*)sp);
+                __m512i s2 = _mm512_loadu_si512((__m512i*)(sp + 1));
+                __m512i d = _mm512_load_si512((__m512i*)dp);
+                __m512i t = _mm512_add_epi32(s1, s2);
+                __m512i u = _mm512_mullo_epi32(va, t);
+                __m512i v = _mm512_add_epi32(vb, u);
+                __m512i w = _mm512_srai_epi32(v, e);
+                d = _mm512_sub_epi32(d, w);
+                _mm512_store_si512((__m512i*)dp, d);
+              }
+          }
+
+          // swap buffers
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst
+        avx512_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even);
+      }
+      else {
+        if (even)
+          dst->i32[0] = lsrc->i32[0];
+        else
+          dst->i32[0] = hsrc->i32[0] >> 1;
+      }
+    }
+
+  } // !local
+} // !ojph
diff --git a/src/core/transform/ojph_transform_local.h b/src/core/transform/ojph_transform_local.h
index 3ba9e6d0..ec2a2e12 100644
--- a/src/core/transform/ojph_transform_local.h
+++ b/src/core/transform/ojph_transform_local.h
@@ -221,13 +221,11 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    // We split multiples of 16 followed by multiples of 8, because
-    // we assume byte_alignment == 32
     #define AVX_DEINTERLEAVE(dpl, dph, sp, width, even)                      \
     {                                                                        \
       if (even)                                                              \
       {                                                                      \
-        for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)         \
+        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)         \
         {                                                                    \
           __m256 a = _mm256_load_ps(sp);                                     \
           __m256 b = _mm256_load_ps(sp + 8);                                 \
@@ -238,19 +236,10 @@ namespace ojph {
           _mm256_store_ps(dpl, e);                                           \
           _mm256_store_ps(dph, f);                                           \
         }                                                                    \
-        for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)           \
-        {                                                                    \
-          __m128 a = _mm_load_ps(sp);                                        \
-          __m128 b = _mm_load_ps(sp + 4);                                    \
-          __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));          \
-          __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));          \
-          _mm_store_ps(dpl, c);                                              \
-          _mm_store_ps(dph, d);                                              \
-        }                                                                    \
       }                                                                      \
       else                                                                   \
       {                                                                      \
-        for (; width > 8; width -= 16, sp += 16, dpl += 8, dph += 8)         \
+        for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)         \
         {                                                                    \
           __m256 a = _mm256_load_ps(sp);                                     \
           __m256 b = _mm256_load_ps(sp + 8);                                 \
@@ -261,26 +250,15 @@ namespace ojph {
           _mm256_store_ps(dpl, f);                                           \
           _mm256_store_ps(dph, e);                                           \
         }                                                                    \
-        for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4)           \
-        {                                                                    \
-          __m128 a = _mm_load_ps(sp);                                        \
-          __m128 b = _mm_load_ps(sp + 4);                                    \
-          __m128 c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));          \
-          __m128 d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));          \
-          _mm_store_ps(dpl, d);                                              \
-          _mm_store_ps(dph, c);                                              \
-        }                                                                    \
       }                                                                      \
     }
 
     //////////////////////////////////////////////////////////////////////////
-    // We split multiples of 16 followed by multiples of 8, because
-    // we assume byte_alignment == 32
     #define AVX_INTERLEAVE(dp, spl, sph, width, even)                        \
     {                                                                        \
       if (even)                                                              \
       {                                                                      \
-        for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)         \
+        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)         \
         {                                                                    \
           __m256 a = _mm256_load_ps(spl);                                    \
           __m256 b = _mm256_load_ps(sph);                                    \
@@ -291,19 +269,10 @@ namespace ojph {
           _mm256_store_ps(dp, e);                                            \
           _mm256_store_ps(dp + 8, f);                                        \
         }                                                                    \
-        for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)           \
-        {                                                                    \
-          __m128 a = _mm_load_ps(spl);                                       \
-          __m128 b = _mm_load_ps(sph);                                       \
-          __m128 c = _mm_unpacklo_ps(a, b);                                  \
-          __m128 d = _mm_unpackhi_ps(a, b);                                  \
-          _mm_store_ps(dp, c);                                               \
-          _mm_store_ps(dp + 4, d);                                           \
-        }                                                                    \
       }                                                                      \
       else                                                                   \
       {                                                                      \
-        for (; width > 8; width -= 16, dp += 16, spl += 8, sph += 8)         \
+        for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)         \
         {                                                                    \
           __m256 a = _mm256_load_ps(spl);                                    \
           __m256 b = _mm256_load_ps(sph);                                    \
@@ -314,15 +283,6 @@ namespace ojph {
           _mm256_store_ps(dp, e);                                            \
           _mm256_store_ps(dp + 8, f);                                        \
         }                                                                    \
-        for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4)           \
-        {                                                                    \
-          __m128 a = _mm_load_ps(spl);                                       \
-          __m128 b = _mm_load_ps(sph);                                       \
-          __m128 c = _mm_unpacklo_ps(b, a);                                  \
-          __m128 d = _mm_unpackhi_ps(b, a);                                  \
-          _mm_store_ps(dp, c);                                               \
-          _mm_store_ps(dp + 4, d);                                           \
-        }                                                                    \
       }                                                                      \
     }
 

From 30b32cc67f61f3aa63d9fb99b3d87cbd04c72bfa Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sat, 13 Apr 2024 09:51:18 +1000
Subject: [PATCH 32/37] Fix compilation, and a missing optimization.

---
 src/core/transform/ojph_transform_avx512.cpp | 22 +++++---------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp
index efb7655a..02edca60 100644
--- a/src/core/transform/ojph_transform_avx512.cpp
+++ b/src/core/transform/ojph_transform_avx512.cpp
@@ -224,13 +224,7 @@ namespace ojph {
       if (width > 1)
       {
         // split src into ldst and hdst
-        {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
-          float* sp = src->f32;
-          int w = (int)width;
-          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
-        }
+        avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even);
 
         // the actual horizontal transform
         float* hp = hdst->f32, * lp = ldst->f32;
@@ -376,7 +370,7 @@ namespace ojph {
     {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
-      const si32 e = s->rev.Eatk;
+      const ui32 e = s->rev.Eatk;
       __m512i va = _mm512_set1_epi32(a);
       __m512i vb = _mm512_set1_epi32(b);
 
@@ -506,13 +500,7 @@ namespace ojph {
       if (width > 1)
       {
         // combine both lsrc and hsrc into dst
-        {
-          float* dpl = ldst->f32;
-          float* dph = hdst->f32;
-          float* sp = src->f32;
-          int w = (int)width;
-          AVX_DEINTERLEAVE(dpl, dph, sp, w, even);
-        }
+        avx512_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even);
 
         si32* hp = hdst->i32, * lp = ldst->i32;
         ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
@@ -524,7 +512,7 @@ namespace ojph {
           const lifting_step* s = atk->get_step(j - 1);
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
-          const si32 e = s->rev.Eatk;
+          const ui32 e = s->rev.Eatk;
           __m512i va = _mm512_set1_epi32(a);
           __m512i vb = _mm512_set1_epi32(b);
 
@@ -682,7 +670,7 @@ namespace ojph {
           const lifting_step* s = atk->get_step(j);
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
-          const si32 e = s->rev.Eatk;
+          const ui32 e = s->rev.Eatk;
           __m512i va = _mm512_set1_epi32(a);
           __m512i vb = _mm512_set1_epi32(b);
 

From f28a90fce49edf94d44865add3299650058ebe6d Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 13 Apr 2024 17:57:45 +1000
Subject: [PATCH 33/37] Wasm completed -- not tested yet.

---
 src/apps/ojph_compress/CMakeLists.txt        |   2 +-
 src/apps/ojph_expand/CMakeLists.txt          |   2 +-
 src/core/CMakeLists.txt                      |   2 +-
 src/core/transform/ojph_transform.cpp        |  18 +-
 src/core/transform/ojph_transform_avx512.cpp |  10 +-
 src/core/transform/ojph_transform_wasm.cpp   | 957 +++++++++++--------
 tests/CMakeLists.txt                         |  16 +-
 7 files changed, 596 insertions(+), 411 deletions(-)

diff --git a/src/apps/ojph_compress/CMakeLists.txt b/src/apps/ojph_compress/CMakeLists.txt
index bbb77abc..dadcca9b 100644
--- a/src/apps/ojph_compress/CMakeLists.txt
+++ b/src/apps/ojph_compress/CMakeLists.txt
@@ -17,7 +17,7 @@ source_group("others"      FILES ${OJPH_IMG_IO})
 source_group("common"      FILES ${OJPH_IMG_IO_H})
 
 if(EMSCRIPTEN)
-  add_compile_options(-std=c++11 -O3 -fexceptions -DOJPH_DISABLE_INTEL_SIMD)
+  add_compile_options(-std=c++11 -O3 -fexceptions)
   add_executable(ojph_compress ${SOURCES})
   add_executable(ojph_compress_simd ${SOURCES} ${OJPH_IMG_IO_SSE4})
   target_compile_options(ojph_compress_simd PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128 -msse4.1)
diff --git a/src/apps/ojph_expand/CMakeLists.txt b/src/apps/ojph_expand/CMakeLists.txt
index c0ac185e..d4b65523 100644
--- a/src/apps/ojph_expand/CMakeLists.txt
+++ b/src/apps/ojph_expand/CMakeLists.txt
@@ -17,7 +17,7 @@ source_group("others"      FILES ${OJPH_IMG_IO})
 source_group("common"      FILES ${OJPH_IMG_IO_H})
 
 if(EMSCRIPTEN)
-  add_compile_options(-std=c++11 -O3 -fexceptions -DOJPH_DISABLE_INTEL_SIMD)
+  add_compile_options(-std=c++11 -O3 -fexceptions)
   add_executable(ojph_expand ${SOURCES})
   add_executable(ojph_expand_simd ${SOURCES} ${OJPH_IMG_IO_SSE4})
   target_compile_options(ojph_expand_simd PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128 -msse4.1)
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 19123a2e..40fffa48 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -33,7 +33,7 @@ source_group("others"            FILES ${OTHERS})
 source_group("transform"         FILES ${TRANSFORM})
 
 if(EMSCRIPTEN)
-  add_compile_options(-std=c++11 -O3 -fexceptions -DOJPH_DISABLE_INTEL_SIMD)
+  add_compile_options(-std=c++11 -O3 -fexceptions)
   add_library(openjph ${SOURCES})
   add_library(openjphsimd ${SOURCES} ${CODESTREAM_WASM} ${CODING_WASM} ${TRANSFORM_WASM})
   target_include_directories(openjph PUBLIC common)
diff --git a/src/core/transform/ojph_transform.cpp b/src/core/transform/ojph_transform.cpp
index 83eed644..0dc5f95c 100644
--- a/src/core/transform/ojph_transform.cpp
+++ b/src/core/transform/ojph_transform.cpp
@@ -162,16 +162,14 @@ namespace ojph {
 #endif // !OJPH_DISABLE_INTEL_SIMD
 
 #else // OJPH_ENABLE_WASM_SIMD
-      rev_vert_ana_step         = wasm_rev_vert_ana_step;
-      rev_horz_ana              = wasm_rev_horz_ana;
-      rev_vert_syn_step         = wasm_rev_vert_syn_step;
-      rev_horz_syn              = wasm_rev_horz_syn;
-
-      irv_vert_ana_step         = wasm_irv_vert_ana_step;
-      irv_horz_ana              = wasm_irv_horz_ana;      
-      irv_vert_syn_step         = wasm_irv_vert_syn_step;
-      irv_horz_syn              = wasm_irv_horz_syn;
-      irv_vert_times_K          = wasm_irv_vert_times_K;
+        rev_vert_step             = wasm_rev_vert_step;
+        rev_horz_ana              = wasm_rev_horz_ana;
+        rev_horz_syn              = wasm_rev_horz_syn;
+        
+        irv_vert_step             = wasm_irv_vert_step;
+        irv_vert_times_K          = wasm_irv_vert_times_K;
+        irv_horz_ana              = wasm_irv_horz_ana;
+        irv_horz_syn              = wasm_irv_horz_syn;
 #endif // !OJPH_ENABLE_WASM_SIMD
 
       wavelet_transform_functions_initialized = true;
diff --git a/src/core/transform/ojph_transform_avx512.cpp b/src/core/transform/ojph_transform_avx512.cpp
index 02edca60..504aa870 100644
--- a/src/core/transform/ojph_transform_avx512.cpp
+++ b/src/core/transform/ojph_transform_avx512.cpp
@@ -2,9 +2,9 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
-// Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
-// Copyright (c) 2019, The University of New South Wales, Australia
+// Copyright (c) 2019-2024, Aous Naman 
+// Copyright (c) 2019-2024, Kakadu Software Pty Ltd, Australia
+// Copyright (c) 2019-2024, The University of New South Wales, Australia
 // 
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
@@ -30,9 +30,9 @@
 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //***************************************************************************/
 // This file is part of the OpenJPH software implementation.
-// File: ojph_transform_avx2.cpp
+// File: ojph_transform_avx512.cpp
 // Author: Aous Naman
-// Date: 28 August 2019
+// Date: 13 April 2024
 //***************************************************************************/
 
 #include <cstdio>
diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp
index 8f48e352..7b9ffb10 100644
--- a/src/core/transform/ojph_transform_wasm.cpp
+++ b/src/core/transform/ojph_transform_wasm.cpp
@@ -41,6 +41,9 @@
 #include "ojph_defs.h"
 #include "ojph_arch.h"
 #include "ojph_mem.h"
+#include "ojph_params.h"
+#include "../codestream/ojph_params_local.h"
+
 #include "ojph_transform.h"
 #include "ojph_transform_local.h"
 
@@ -48,473 +51,645 @@ namespace ojph {
   namespace local {
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_fwd_predict(const line_buf* line_src1, 
-                                        const line_buf* line_src2,
-                                        line_buf *line_dst, ui32 repeat)
-    {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
-      {
-        v128_t s1 = wasm_v128_load(src1);
-        v128_t s2 = wasm_v128_load(src2);
-        v128_t d = wasm_v128_load(dst);
-        s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-        d = wasm_i32x4_sub(d, s1);
-        wasm_v128_store(dst, d);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_fwd_update(const line_buf* line_src1, 
-                                       const line_buf* line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    void wasm_deinterleave(float* dpl, float* dph, float* sp, 
+                           int width, bool even) // de-interleaves sp: alternate samples go to dpl and dph
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-
-      v128_t offset = wasm_i32x4_splat(2);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
-      {
-        v128_t s1 = wasm_v128_load(src1);
-        s1 = wasm_i32x4_add(s1, offset);
-        v128_t s2 = wasm_v128_load(src2);
-        s2 = wasm_i32x4_add(s2, s1);
-        v128_t d = wasm_v128_load(dst);
-        d = wasm_i32x4_add(d, wasm_i32x4_shr(s2, 2));
-        wasm_v128_store(dst, d);
-      }
+      if (even) // even: sp[0], sp[2], ... -> dpl and sp[1], sp[3], ... -> dph
+        for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) // 8 floats in, 4 + 4 out per pass; last pass may run past width (NOTE(review): assumes padded buffers -- confirm)
+        {
+          v128_t a = wasm_v128_load(sp);
+          v128_t b = wasm_v128_load(sp + 4);
+          v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); // c = { a0, a2, b0, b2 }
+          v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); // d = { a1, a3, b1, b3 }
+          // SSE equivalent: c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+          // SSE equivalent: d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+          wasm_v128_store(dpl, c);
+          wasm_v128_store(dph, d);
+        }
+      else // odd: sp[0], sp[2], ... -> dph and sp[1], sp[3], ... -> dpl
+        for (; width > 0; width -= 8, sp += 8, dpl += 4, dph += 4) // same 8-in, 4 + 4 out stride as the even case
+        {
+          v128_t a = wasm_v128_load(sp);
+          v128_t b = wasm_v128_load(sp + 4);
+          v128_t c = wasm_i32x4_shuffle(a, b, 0, 2, 4 + 0, 4 + 2); // c = { a0, a2, b0, b2 }
+          v128_t d = wasm_i32x4_shuffle(a, b, 1, 3, 4 + 1, 4 + 3); // d = { a1, a3, b1, b3 }
+          // SSE equivalent: c = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2, 0, 2, 0));
+          // SSE equivalent: d = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 1, 3, 1));
+          wasm_v128_store(dpl, d); // destinations swapped relative to the even case
+          wasm_v128_store(dph, c);
+        }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                   line_buf *line_hdst, ui32 width, bool even)
+    void wasm_interleave(float* dp, float* spl, float* sph, 
+                         int width, bool even) // merges spl and sph into dp as alternating samples
     {
-      if (width > 1)
-      {
-        si32 *src = line_src->i32;
-        si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        // extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const si32* sp = src + (even ? 1 : 0);
-        si32 *dph = hdst;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          v128_t s1 = wasm_v128_load(sp - 1);
-          v128_t s2 = wasm_v128_load(sp + 1);
-          v128_t d = wasm_v128_load(sp);
-          s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-          v128_t d1 = wasm_i32x4_sub(d, s1);
-          sp += 4;
-          s1 = wasm_v128_load(sp - 1);
-          s2 = wasm_v128_load(sp + 1);
-          d = wasm_v128_load(sp);
-          s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-          v128_t d2 = wasm_i32x4_sub(d, s1);
-          sp += 4;
-          d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
-          wasm_v128_store(dph, d);
-        }
-
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        sp = src + (even ? 0 : 1);
-        const si32* sph = hdst + (even ? 0 : 1);
-        si32 *dpl = ldst;
-        v128_t offset = wasm_i32x4_splat(2);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
+      if (even) // even: dp = spl[0], sph[0], spl[1], sph[1], ...
+        for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) // 4 + 4 floats in, 8 out per pass; last pass may run past width (NOTE(review): assumes padded buffers -- confirm)
         {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          s1 = wasm_i32x4_add(s1, offset);
-          v128_t s2 = wasm_v128_load(sph);
-          s2 = wasm_i32x4_add(s2, s1);
-          v128_t d1 = wasm_v128_load(sp);
-          v128_t d2 = wasm_v128_load(sp + 4);
-          v128_t d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
-          d = wasm_i32x4_add(d, wasm_i32x4_shr(s2, 2));
-          wasm_v128_store(dpl, d);
+          v128_t a = wasm_v128_load(spl);
+          v128_t b = wasm_v128_load(sph);
+          v128_t c = wasm_i32x4_shuffle(a, b, 0, 4 + 0, 1, 4 + 1); // c = { a0, b0, a1, b1 }
+          v128_t d = wasm_i32x4_shuffle(a, b, 2, 4 + 2, 3, 4 + 3); // d = { a2, b2, a3, b3 }
+          // SSE equivalent: c = _mm_unpacklo_ps(a, b);
+          // SSE equivalent: d = _mm_unpackhi_ps(a, b);
+          wasm_v128_store(dp, c);
+          wasm_v128_store(dp + 4, d);
         }
-      }
       else
-      {
-        if (even)
-          line_ldst->i32[0] = line_src->i32[0];
-        else
-          line_hdst->i32[0] = line_src->i32[0] << 1;
-      }
+        for (; width > 0; width -= 8, dp += 8, spl += 4, sph += 4) // odd: dp = sph[0], spl[0], sph[1], spl[1], ...
+        {
+          v128_t a = wasm_v128_load(spl);
+          v128_t b = wasm_v128_load(sph);
+          v128_t c = wasm_i32x4_shuffle(b, a, 0, 4 + 0, 1, 4 + 1); // c = { b0, a0, b1, a1 }
+          v128_t d = wasm_i32x4_shuffle(b, a, 2, 4 + 2, 3, 4 + 3); // d = { b2, a2, b3, a3 }
+          // SSE equivalent: c = _mm_unpacklo_ps(b, a);
+          // SSE equivalent: d = _mm_unpackhi_ps(b, a);
+          wasm_v128_store(dp, c);
+          wasm_v128_store(dp + 4, d);
+        }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_bwd_predict(const line_buf *line_src1, 
-                                        const line_buf *line_src2,
-                                        line_buf *line_dst, ui32 repeat)
+    static inline void wasm_multiply_const(float* p, float f, int width) // scales p[] in place by f
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      v128_t factor = wasm_f32x4_splat(f); // f replicated into all four lanes
+      for (; width > 0; width -= 4, p += 4) // 4 floats per pass; a width that is not a multiple of 4 is rounded up
       {
-        v128_t s1 = wasm_v128_load(src1);
-        v128_t s2 = wasm_v128_load(src2);
-        v128_t d = wasm_v128_load(dst);
-        s1 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-        d = wasm_i32x4_add(d, s1);
-        wasm_v128_store(dst, d);
+        v128_t s = wasm_v128_load(p);
+        wasm_v128_store(p, wasm_f32x4_mul(factor, s)); // p[k] = f * p[k]
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_vert_wvlt_bwd_update(const line_buf *line_src1, 
-                                       const line_buf *line_src2,
-                                       line_buf *line_dst, ui32 repeat)
+    void wasm_irv_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis) // aug[k] += a * (sig[k] + other[k]) over repeat floats
     {
-      si32 *dst = line_dst->i32;
-      const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
-    
-      v128_t offset = wasm_i32x4_splat(2);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      float a = s->irv.Aatk; // lifting coefficient of this step
+      if (synthesis)
+        a = -a; // synthesis applies the step with the opposite sign
+
+      v128_t factor = wasm_f32x4_splat(a);
+
+      float* dst = aug->f32; // aug is the line that gets modified (written through its f32 pointer)
+      const float* src1 = sig->f32, * src2 = other->f32; // the two lines that are only read
+      int i = (int)repeat;
+      for ( ; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4) // 4 floats per pass; repeat is effectively rounded up to a multiple of 4
       {
         v128_t s1 = wasm_v128_load(src1);
-        s1 = wasm_i32x4_add(s1, offset);
         v128_t s2 = wasm_v128_load(src2);
-        s2 = wasm_i32x4_add(s2, s1);
-        v128_t d = wasm_v128_load(dst);
-        d = wasm_i32x4_sub(d, wasm_i32x4_shr(s2, 2));
+        v128_t d  = wasm_v128_load(dst);
+        d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2))); // d += a * (s1 + s2)
         wasm_v128_store(dst, d);
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, 
-                                   line_buf *line_hsrc, ui32 width, bool even)
+    void wasm_irv_vert_times_K(float K, const line_buf* aug, ui32 repeat) // scales the float samples of aug by K
+    {
+      wasm_multiply_const(aug->f32, K, (int)repeat); // in place; processes 4 floats at a time
+    }
+
+    /////////////////////////////////////////////////////////////////////////
+    void wasm_irv_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even) // horizontal analysis on floats: src -> ldst (low) and hdst (high)
     {
       if (width > 1)
       {
-        si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
-        si32 *dst = line_dst->i32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
+        // split src into ldst and hdst (alternate samples, order chosen by even)
+        wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even);
 
-        // extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        const si32 *sph = hsrc + (even ? 0 : 1);
-        si32 *spl = lsrc;
-        v128_t offset = wasm_i32x4_splat(2);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4)
+        // the actual horizontal transform, done in place on the two split buffers
+        float* hp = hdst->f32, * lp = ldst->f32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j) // steps are visited from the last (num_steps - 1) down to 0
         {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          s1 = wasm_i32x4_add(s1, offset);
-          v128_t s2 = wasm_v128_load(sph);
-          s2 = wasm_i32x4_add(s2, s1);
-          v128_t d = wasm_v128_load(spl);
-          d = wasm_i32x4_sub(d, wasm_i32x4_shr(s2, 2));
-          wasm_v128_store(spl, d);
+          const lifting_step* s = atk->get_step(j - 1);
+          const float a = s->irv.Aatk; // coefficient of this lifting step
+
+          // extension: copy each edge sample of lp one slot past that edge
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step: hp[k] += a * (lp[k] + neighbouring lp sample)
+          const float* sp = lp;
+          float* dp = hp;
+          int i = (int)h_width;
+          v128_t f = wasm_f32x4_splat(a);
+          if (even) // neighbour is the next sample, lp[k + 1]
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = wasm_v128_load(sp);
+              v128_t n = wasm_v128_load(sp + 1);
+              v128_t p = wasm_v128_load(dp);
+              p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
+              wasm_v128_store(dp, p);
+            }
+          }
+          else // neighbour is the previous sample, lp[k - 1]
+          {
+            for (; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = wasm_v128_load(sp);
+              v128_t n = wasm_v128_load(sp - 1);
+              v128_t p = wasm_v128_load(dp);
+              p = wasm_f32x4_add(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
+              wasm_v128_store(dp, p);
+            }
+          }
+
+          // swap buffers, parity and widths, so the next step updates the other buffer
+          float* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
 
-        // extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width - 1];
-        // inverse predict and combine
-        si32 *dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8)
-        {
-          v128_t s1 = wasm_v128_load(spl);
-          v128_t s2 = wasm_v128_load(spl + 1);
-          v128_t d = wasm_v128_load(sph);
-          s2 = wasm_i32x4_shr(wasm_i32x4_add(s1, s2), 1);
-          d = wasm_i32x4_add(d, s2);
-          wasm_v128_store(dp, wasm_i32x4_shuffle(s1, d, 0, 4, 1, 5));
-          wasm_v128_store(dp + 4, wasm_i32x4_shuffle(s1, d, 2, 6, 3, 7));
+        { // final scaling: lp by 1/K and hp by K (lp/hp as they stand after the swaps)
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          wasm_multiply_const(lp, K_inv, (int)l_width);
+          wasm_multiply_const(hp, K, (int)h_width);
         }
       }
-      else
-      {
+      else { // width <= 1: no lifting, the sample is just routed
         if (even)
-          line_dst->i32[0] = line_lsrc->i32[0];
+          ldst->f32[0] = src->f32[0]; // even position: copied unchanged to the low-pass output
         else
-          line_dst->i32[0] = line_hsrc->i32[0] >> 1;
+          hdst->f32[0] = src->f32[0] * 2.0f; // odd position: goes to the high-pass output, doubled
       }
     }
     
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_vert_wvlt_step(const line_buf *line_src1, 
-                                   const line_buf *line_src2,
-                                   line_buf *line_dst, int step_num, 
-                                   ui32 repeat)
+    void wasm_irv_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even) // horizontal synthesis on floats: lsrc (low) + hsrc (high) -> dst
     {
-      float *dst = line_dst->f32;
-      const float *src1 = line_src1->f32, *src2 = line_src2->f32;
-    
-      v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[step_num]);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
+      if (width > 1)
       {
-        v128_t s1 = wasm_v128_load(src1);
-        v128_t s2 = wasm_v128_load(src2);
-        v128_t d = wasm_v128_load(dst);
-        d = wasm_f32x4_add(d, wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2)));
-        wasm_v128_store(dst, d);
+        bool ev = even; // working parity, toggled per step; even itself is kept for the final interleave
+        float* oth = hsrc->f32, * aug = lsrc->f32; // aug is updated by a step, oth is only read by it
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+
+        { // initial scaling: low pass by K, high pass by 1/K (modifies lsrc/hsrc in place)
+          float K = atk->get_K();
+          float K_inv = 1.0f / K;
+          wasm_multiply_const(aug, K, (int)aug_width);
+          wasm_multiply_const(oth, K_inv, (int)oth_width);
+        }
+
+        // the actual horizontal transform, done in place on lsrc/hsrc
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j) // steps are visited from 0 up to num_steps - 1
+        {
+          const lifting_step* s = atk->get_step(j);
+          const float a = s->irv.Aatk; // coefficient of this lifting step
+
+          // extension: copy each edge sample of oth one slot past that edge
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step: aug[k] -= a * (oth[k] + neighbouring oth sample)
+          const float* sp = oth;
+          float* dp = aug;
+          int i = (int)aug_width;
+          v128_t f = wasm_f32x4_splat(a);
+          if (ev) // neighbour is the previous sample, oth[k - 1]
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = wasm_v128_load(sp);
+              v128_t n = wasm_v128_load(sp - 1);
+              v128_t p = wasm_v128_load(dp);
+              p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
+              wasm_v128_store(dp, p);
+            }
+          }
+          else // neighbour is the next sample, oth[k + 1]
+          {
+            for ( ; i > 0; i -= 4, sp += 4, dp += 4)
+            {
+              v128_t m = wasm_v128_load(sp);
+              v128_t n = wasm_v128_load(sp + 1);
+              v128_t p = wasm_v128_load(dp);
+              p = wasm_f32x4_sub(p, wasm_f32x4_mul(f, wasm_f32x4_add(m, n)));
+              wasm_v128_store(dp, p);
+            }
+          }
+
+          // swap buffers, parity and widths, so the next step updates the other buffer
+          float* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
+        }
+
+        // combine both lsrc and hsrc into dst, using the caller's original parity
+        wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even);
+      }
+      else { // width <= 1: no lifting, the single sample is copied across
+        if (even)
+          dst->f32[0] = lsrc->f32[0]; // even position: taken unchanged from the low-pass input
+        else
+          dst->f32[0] = hsrc->f32[0] * 0.5f; // odd position: taken from the high-pass input, halved
       }
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst,
-                                bool L_analysis_or_H_synthesis, ui32 repeat)
+    void wasm_rev_vert_step(const lifting_step* s, const line_buf* sig, 
+                            const line_buf* other, const line_buf* aug, 
+                            ui32 repeat, bool synthesis)
     {
-      float *dst = line_dst->f32;
-      const float *src = line_src->f32;
+      const si32 a = s->rev.Aatk;
+      const si32 b = s->rev.Batk;
+      const ui32 e = s->rev.Eatk;
+      v128_t vb = wasm_i32x4_splat(b);
 
-      float f = LIFTING_FACTORS::K_inv;
-      f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
-      v128_t factor = wasm_f32x4_splat(f);
-      for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src+=4)
-      {
-        v128_t s = wasm_v128_load(src);
-        wasm_v128_store(dst, wasm_f32x4_mul(factor, s));
+      si32* dst = aug->i32;
+      const si32* src1 = sig->i32, * src2 = other->i32;
+      // The general definition of the wavelet in Part 2 is slightly 
+      // different to Part 1, although they are mathematically equivalent;
+      // here, we identify the simpler forms from Part 1 and employ them
+      if (a == 1)
+      { // 5/3 update and any case with a == 1
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t v = wasm_i32x4_add(vb, t);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t v = wasm_i32x4_add(vb, t);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1 && b == 1 && e == 1)
+      { // 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t w = wasm_i32x4_shr(t, e);
+            d = wasm_i32x4_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t w = wasm_i32x4_shr(t, e);
+            d = wasm_i32x4_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else if (a == -1)
+      { // any case with a == -1, which is not 5/3 predict
+        int i = (int)repeat;
+        if (synthesis)
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t v = wasm_i32x4_sub(vb, t);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+        else
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t v = wasm_i32x4_sub(vb, t);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
+      }
+      else { // general case
+        // 32bit multiplication is not supported in sse2; we need sse4.1,
+        // where we can use _mm_mullo_epi32, which multiplies 32bit x 32bit,
+        // keeping the LSBs
+        if (synthesis)
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+        else
+          for (ui32 i = repeat; i > 0; --i)
+            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
       }
     }
 
     /////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, 
-                                     line_buf *line_hdst, ui32 width, 
-                                     bool even)
+    void wasm_rev_horz_ana(const param_atk* atk, const line_buf* ldst, 
+                           const line_buf* hdst, const line_buf* src, 
+                           ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *src = line_src->f32;
-        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //extension
-        src[-1] = src[1];
-        src[width] = src[width-2];
-        // predict
-        const float* sp = src + (even ? 1 : 0);
-        float *dph = hdst;
-        v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[0]);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
-        { //this is doing twice the work it needs to do
-          //it can be definitely written better
-          v128_t s1 = wasm_v128_load(sp - 1);
-          v128_t s2 = wasm_v128_load(sp + 1);
-          v128_t d = wasm_v128_load(sp);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          v128_t d1 = wasm_f32x4_add(d, s1);
-          sp += 4;
-          s1 = wasm_v128_load(sp - 1);
-          s2 = wasm_v128_load(sp + 1);
-          d = wasm_v128_load(sp);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          v128_t d2 = wasm_f32x4_add(d, s1);
-          sp += 4;
-          d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
-          wasm_v128_store(dph, d);
-        }
+        // deinterleave src into ldst and hdst
+        wasm_deinterleave(ldst->f32, hdst->f32, src->f32, (int)width, even);          
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[1]);
-        sp = src + (even ? 0 : 1);
-        const float* sph = hdst + (even ? 0 : 1);
-        float *dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
+        si32* hp = hdst->i32, * lp = ldst->i32;
+        ui32 l_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 h_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = num_steps; j > 0; --j)
         {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          v128_t s2 = wasm_v128_load(sph);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          v128_t d1 = wasm_v128_load(sp);
-          v128_t d2 = wasm_v128_load(sp + 4);
-          v128_t d = wasm_i32x4_shuffle(d1, d2, 0, 2, 4, 6);
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dpl, d);
-        }
+          // lifting step j - 1 (steps are visited from last to first)
+          const lifting_step* s = atk->get_step(j - 1);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui32 e = s->rev.Eatk;
+          v128_t vb = wasm_i32x4_splat(b);
 
-        //extension
-        ldst[-1] = ldst[0];
-        ldst[L_width] = ldst[L_width-1];
-        //predict
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[2]);
-        const float* spl = ldst + (even ? 1 : 0);
-        dph = hdst;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4)
-        {
-          v128_t s1 = wasm_v128_load(spl - 1);
-          v128_t s2 = wasm_v128_load(spl);
-          v128_t d = wasm_v128_load(dph);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dph, d);
-        }
+          // extension
+          lp[-1] = lp[0];
+          lp[l_width] = lp[l_width - 1];
+          // lifting step
+          const si32* sp = lp;
+          si32* dp = hp;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)h_width;
+            if (even)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_add(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_add(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t w = wasm_i32x4_shr(t, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t w = wasm_i32x4_shr(t, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)h_width;
+            if (even)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_sub(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_sub(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else {
+            // general case
+            // 32bit multiplication is not supported in sse2; we need sse4.1,
+            // where we can use _mm_mullo_epi32, which multiplies
+            // 32bit x 32bit, keeping the LSBs
+            if (even)
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[0] + sp[1])) >> e;
+            else
+              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
+                *dp += (b + a * (sp[-1] + sp[0])) >> e;
+          }
 
-        // extension
-        hdst[-1] = hdst[0];
-        hdst[H_width] = hdst[H_width-1];
-        // update
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[3]);
-        sph = hdst + (even ? 0 : 1);
-        dpl = ldst;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4)
-        {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          v128_t s2 = wasm_v128_load(sph);
-          v128_t d = wasm_v128_load(dpl);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dpl, d);
-        }
-
-        //multipliers
-        float *dp = ldst;
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          v128_t d = wasm_v128_load(dp);
-          wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
-        }
-        dp = hdst;
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::K);
-        for (int i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          v128_t d = wasm_v128_load(dp);
-          wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
+          // swap buffers
+          si32* t = lp; lp = hp; hp = t;
+          even = !even;
+          ui32 w = l_width; l_width = h_width; h_width = w;
         }
       }
-      else
-      {
+      else {
         if (even)
-          line_ldst->f32[0] = line_src->f32[0];
+          ldst->i32[0] = src->i32[0];
         else
-          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
+          hdst->i32[0] = src->i32[0] << 1;
       }
     }
-
-    /////////////////////////////////////////////////////////////////////////
-    void wasm_irrev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, 
-                                     line_buf *line_hsrc, ui32 width, 
-                                     bool even)
+    
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_rev_horz_syn(const param_atk* atk, const line_buf* dst, 
+                           const line_buf* lsrc, const line_buf* hsrc, 
+                           ui32 width, bool even)
     {
       if (width > 1)
       {
-        float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
-        float *dst = line_dst->f32;
-      
-        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
-        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
-
-        //multipliers
-        float *dp = lsrc;
-        v128_t factor = wasm_f32x4_splat(LIFTING_FACTORS::K);
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          v128_t d = wasm_v128_load(dp);
-          wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
-        }
-        dp = hsrc;
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::K_inv);
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
-        {
-          v128_t d = wasm_v128_load(dp);
-          wasm_v128_store(dp, wasm_f32x4_mul(factor, d));
-        }
-
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[7]);
-        const float *sph = hsrc + (even ? 0 : 1);
-        float *dpl = lsrc;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
+        bool ev = even;
+        si32* oth = hsrc->i32, * aug = lsrc->i32;
+        ui32 aug_width = (width + (even ? 1 : 0)) >> 1;  // low pass
+        ui32 oth_width = (width + (even ? 0 : 1)) >> 1;  // high pass
+        ui32 num_steps = atk->get_num_steps();
+        for (ui32 j = 0; j < num_steps; ++j)
         {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          v128_t s2 = wasm_v128_load(sph);
-          v128_t d = wasm_v128_load(dpl);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dpl, d);
-        }
+          const lifting_step* s = atk->get_step(j);
+          const si32 a = s->rev.Aatk;
+          const si32 b = s->rev.Batk;
+          const ui32 e = s->rev.Eatk;
+          v128_t vb = wasm_i32x4_splat(b);
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[6]);
-        const float *spl = lsrc + (even ? 0 : -1);
-        float *dph = hsrc;
-        for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4)
-        {
-          v128_t s1 = wasm_v128_load(spl);
-          v128_t s2 = wasm_v128_load(spl + 1);
-          v128_t d = wasm_v128_load(dph);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dph, d);
-        }
+          // extension
+          oth[-1] = oth[0];
+          oth[oth_width] = oth[oth_width - 1];
+          // lifting step
+          const si32* sp = oth;
+          si32* dp = aug;
+          if (a == 1)
+          { // 5/3 update and any case with a == 1
+            int i = (int)aug_width;
+            if (ev)
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_add(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+            else
+            {
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_add(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            }
+          }
+          else if (a == -1 && b == 1 && e == 1)
+          {  // 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t w = wasm_i32x4_shr(t, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t w = wasm_i32x4_shr(t, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else if (a == -1)
+          { // any case with a == -1, which is not 5/3 predict
+            int i = (int)aug_width;
+            if (ev)
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_sub(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+            else
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t v = wasm_i32x4_sub(vb, t);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
+          }
+          else {
+            // general case
+            // 32bit multiplication is not supported in sse2; we need sse4.1,
+            // where we can use _mm_mullo_epi32, which multiplies
+            // 32bit x 32bit, keeping the LSBs
+            if (ev)
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+            else
+              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
+                *dp -= (b + a * (sp[0] + sp[1])) >> e;
+          }
 
-        //extension
-        hsrc[-1] = hsrc[0];
-        hsrc[H_width] = hsrc[H_width-1];
-        //inverse update
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[5]);
-        sph = hsrc + (even ? 0 : 1);
-        dpl = lsrc;
-        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
-        {
-          v128_t s1 = wasm_v128_load(sph - 1);
-          v128_t s2 = wasm_v128_load(sph);
-          v128_t d = wasm_v128_load(dpl);
-          s1 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s1);
-          wasm_v128_store(dpl, d);
+          // swap buffers
+          si32* t = aug; aug = oth; oth = t;
+          ev = !ev;
+          ui32 w = aug_width; aug_width = oth_width; oth_width = w;
         }
 
-        //extension
-        lsrc[-1] = lsrc[0];
-        lsrc[L_width] = lsrc[L_width-1];
-        //inverse perdict and combine
-        factor = wasm_f32x4_splat(LIFTING_FACTORS::steps[4]);
-        dp = dst + (even ? 0 : -1);
-        spl = lsrc + (even ? 0 : -1);
-        sph = hsrc;
-        ui32 width = L_width + (even ? 0 : 1);
-        for (ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8)
-        {
-          v128_t s1 = wasm_v128_load(spl);
-          v128_t s2 = wasm_v128_load(spl + 1);
-          v128_t d = wasm_v128_load(sph);
-          s2 = wasm_f32x4_mul(factor, wasm_f32x4_add(s1, s2));
-          d = wasm_f32x4_add(d, s2);
-          wasm_v128_store(dp, wasm_i32x4_shuffle(s1, d, 0, 4, 1, 5));
-          wasm_v128_store(dp + 4, wasm_i32x4_shuffle(s1, d, 2, 6, 3, 7));
-        }
+        // combine both lsrc and hsrc into dst
+        wasm_interleave(dst->f32, lsrc->f32, hsrc->f32, (int)width, even);
       }
-      else
-      {
+      else {
         if (even)
-          line_dst->f32[0] = line_lsrc->f32[0];
+          dst->i32[0] = lsrc->i32[0];
         else
-          line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
+          dst->i32[0] = hsrc->i32[0] >> 1;
       }
     }
-
-  }
-}
+    
+  } // !local
+} // !ojph
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 48c8f67d..000409ff 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -69,10 +69,22 @@ else()
     COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:ojph_expand>" "./"
     COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:ojph_compress>" "./"
   )
+  if(EMSCRIPTEN)
+    add_custom_command(TARGET test_executables POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:ojph_expand_simd>" "./"
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE:ojph_compress_simd>" "./"
+    )
+    add_custom_command(TARGET test_executables POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE_DIR:ojph_expand>/ojph_expand.wasm" "./"
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE_DIR:ojph_compress>/ojph_compress.wasm" "./"
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE_DIR:ojph_expand_simd>/ojph_expand_simd.wasm" "./"
+      COMMAND ${CMAKE_COMMAND} -E copy "$<TARGET_FILE_DIR:ojph_compress_simd>/ojph_compress_simd.wasm" "./"
+    )
+  endif(EMSCRIPTEN)
   if(MSYS)
     add_custom_command(TARGET test_executables POST_BUILD
       COMMAND ${CMAKE_COMMAND} -E copy "../bin/msys-gtest.dll" "./"
       COMMAND ${CMAKE_COMMAND} -E copy "../bin/msys-gtest_main.dll" "./"
     )
-  endif()
-endif()
+  endif(MSYS)
+endif(MSVC)

From 21bc405c991dbcad3aae7876f157245ca41cbe3d Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 13 Apr 2024 20:08:27 +1000
Subject: [PATCH 34/37] Wasm simd is buggy.

---
 src/apps/ojph_compress/CMakeLists.txt      |   1 +
 src/apps/ojph_expand/CMakeLists.txt        |   1 +
 src/core/transform/ojph_transform_wasm.cpp | 110 ++++++++++++++++-----
 3 files changed, 86 insertions(+), 26 deletions(-)

diff --git a/src/apps/ojph_compress/CMakeLists.txt b/src/apps/ojph_compress/CMakeLists.txt
index dadcca9b..27723789 100644
--- a/src/apps/ojph_compress/CMakeLists.txt
+++ b/src/apps/ojph_compress/CMakeLists.txt
@@ -18,6 +18,7 @@ source_group("common"      FILES ${OJPH_IMG_IO_H})
 
 if(EMSCRIPTEN)
   add_compile_options(-std=c++11 -O3 -fexceptions)
+  add_link_options(-sWASM=1 -sASSERTIONS=1 -sALLOW_MEMORY_GROWTH=1 -sNODERAWFS=1 -sENVIRONMENT=node -sEXIT_RUNTIME=1 -sEXCEPTION_CATCHING_ALLOWED=['fake'])
   add_executable(ojph_compress ${SOURCES})
   add_executable(ojph_compress_simd ${SOURCES} ${OJPH_IMG_IO_SSE4})
   target_compile_options(ojph_compress_simd PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128 -msse4.1)
diff --git a/src/apps/ojph_expand/CMakeLists.txt b/src/apps/ojph_expand/CMakeLists.txt
index d4b65523..ac650c38 100644
--- a/src/apps/ojph_expand/CMakeLists.txt
+++ b/src/apps/ojph_expand/CMakeLists.txt
@@ -18,6 +18,7 @@ source_group("common"      FILES ${OJPH_IMG_IO_H})
 
 if(EMSCRIPTEN)
   add_compile_options(-std=c++11 -O3 -fexceptions)
+  add_link_options(-sWASM=1 -sASSERTIONS=1 -sALLOW_MEMORY_GROWTH=1 -sNODERAWFS=1 -sENVIRONMENT=node -sEXIT_RUNTIME=1 -sEXCEPTION_CATCHING_ALLOWED=['fake'])
   add_executable(ojph_expand ${SOURCES})
   add_executable(ojph_expand_simd ${SOURCES} ${OJPH_IMG_IO_SSE4})
   target_compile_options(ojph_expand_simd PRIVATE -DOJPH_ENABLE_WASM_SIMD -msimd128 -msse4.1)
diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp
index 7b9ffb10..83cee30c 100644
--- a/src/core/transform/ojph_transform_wasm.cpp
+++ b/src/core/transform/ojph_transform_wasm.cpp
@@ -305,6 +305,7 @@ namespace ojph {
       const si32 a = s->rev.Aatk;
       const si32 b = s->rev.Batk;
       const ui32 e = s->rev.Eatk;
+      v128_t va = wasm_i32x4_splat(a);
       v128_t vb = wasm_i32x4_splat(b);
 
       si32* dst = aug->i32;
@@ -394,16 +395,35 @@ namespace ojph {
             wasm_v128_store((v128_t*)dst, d);
           }
       }
-      else { // general case
-        // 32bit multiplication is not supported in sse2; we need sse4.1,
-        // where we can use _mm_mullo_epi32, which multiplies 32bit x 32bit,
-        // keeping the LSBs
+      else 
+      { // general case
+        int i = (int)repeat;
         if (synthesis)
-          for (ui32 i = repeat; i > 0; --i)
-            *dst++ -= (b + a * (*src1++ + *src2++)) >> e;
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t u = wasm_i32x4_mul(va, t);
+            v128_t v = wasm_i32x4_add(vb, u);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_sub(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
         else
-          for (ui32 i = repeat; i > 0; --i)
-            *dst++ += (b + a * (*src1++ + *src2++)) >> e;
+          for (; i > 0; i -= 4, dst += 4, src1 += 4, src2 += 4)
+          {
+            v128_t s1 = wasm_v128_load((v128_t*)src1);
+            v128_t s2 = wasm_v128_load((v128_t*)src2);
+            v128_t d = wasm_v128_load((v128_t*)dst);
+            v128_t t = wasm_i32x4_add(s1, s2);
+            v128_t u = wasm_i32x4_mul(va, t);
+            v128_t v = wasm_i32x4_add(vb, u);
+            v128_t w = wasm_i32x4_shr(v, e);
+            d = wasm_i32x4_add(d, w);
+            wasm_v128_store((v128_t*)dst, d);
+          }
       }
     }
 
@@ -428,6 +448,7 @@ namespace ojph {
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
           const ui32 e = s->rev.Eatk;
+          v128_t va = wasm_i32x4_splat(a);
           v128_t vb = wasm_i32x4_splat(b);
 
           // extension
@@ -522,17 +543,35 @@ namespace ojph {
                 wasm_v128_store((v128_t*)dp, d);
               }
           }
-          else {
-            // general case
-            // 32bit multiplication is not supported in sse2; we need sse4.1,
-            // where we can use _mm_mullo_epi32, which multiplies
-            // 32bit x 32bit, keeping the LSBs
+          else 
+          { // general case; even pairs sp[k] with sp[k+1], odd with sp[k-1]
+            int i = (int)h_width; // signed: i -= 4 may step below zero
             if (even)
-              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
-                *dp += (b + a * (sp[0] + sp[1])) >> e;
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t u = wasm_i32x4_mul(va, t);
+                v128_t v = wasm_i32x4_add(vb, u);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
             else
-              for (ui32 i = h_width; i > 0; --i, sp++, dp++)
-                *dp += (b + a * (sp[-1] + sp[0])) >> e;
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t u = wasm_i32x4_mul(va, t);                
+                v128_t v = wasm_i32x4_add(vb, u);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_add(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
           }
 
           // swap buffers
@@ -567,6 +606,7 @@ namespace ojph {
           const si32 a = s->rev.Aatk;
           const si32 b = s->rev.Batk;
           const ui32 e = s->rev.Eatk;
+          v128_t va = wasm_i32x4_splat(a);
           v128_t vb = wasm_i32x4_splat(b);
 
           // extension
@@ -661,17 +701,35 @@ namespace ojph {
                 wasm_v128_store((v128_t*)dp, d);
               }
           }
-          else {
-            // general case
-            // 32bit multiplication is not supported in sse2; we need sse4.1,
-            // where we can use _mm_mullo_epi32, which multiplies
-            // 32bit x 32bit, keeping the LSBs
+          else 
+          { // general case
+            int i = (int)aug_width;
             if (ev)
-              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
-                *dp -= (b + a * (sp[-1] + sp[0])) >> e;
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t u = wasm_i32x4_mul(va, t);
+                v128_t v = wasm_i32x4_add(vb, u);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
             else
-              for (ui32 i = aug_width; i > 0; --i, sp++, dp++)
-                *dp -= (b + a * (sp[0] + sp[1])) >> e;
+              for (; i > 0; i -= 4, sp += 4, dp += 4)
+              {
+                v128_t s1 = wasm_v128_load((v128_t*)sp);
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t d = wasm_v128_load((v128_t*)dp);
+                v128_t t = wasm_i32x4_add(s1, s2);
+                v128_t u = wasm_i32x4_mul(va, t);                
+                v128_t v = wasm_i32x4_add(vb, u);
+                v128_t w = wasm_i32x4_shr(v, e);
+                d = wasm_i32x4_sub(d, w);
+                wasm_v128_store((v128_t*)dp, d);
+              }
           }
 
           // swap buffers

From e40fa17ccbd44e49251a880813a1b513fe184d6b Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 13 Apr 2024 20:12:30 +1000
Subject: [PATCH 35/37] A small bug fix.

---
 src/core/transform/ojph_transform_wasm.cpp | 4 ++--
 tests/test_executables.cpp                 | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/core/transform/ojph_transform_wasm.cpp b/src/core/transform/ojph_transform_wasm.cpp
index 83cee30c..bd652dfa 100644
--- a/src/core/transform/ojph_transform_wasm.cpp
+++ b/src/core/transform/ojph_transform_wasm.cpp
@@ -550,7 +550,7 @@ namespace ojph {
               for (; i > 0; i -= 4, sp += 4, dp += 4)
               {
                 v128_t s1 = wasm_v128_load((v128_t*)sp);
-                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
+                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
                 v128_t d = wasm_v128_load((v128_t*)dp);
                 v128_t t = wasm_i32x4_add(s1, s2);
                 v128_t u = wasm_i32x4_mul(va, t);
@@ -563,7 +563,7 @@ namespace ojph {
               for (; i > 0; i -= 4, sp += 4, dp += 4)
               {
                 v128_t s1 = wasm_v128_load((v128_t*)sp);
-                v128_t s2 = wasm_v128_load((v128_t*)(sp + 1));
+                v128_t s2 = wasm_v128_load((v128_t*)(sp - 1));
                 v128_t d = wasm_v128_load((v128_t*)dp);
                 v128_t t = wasm_i32x4_add(s1, s2);
                 v128_t u = wasm_i32x4_mul(va, t);                
diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index f42174f6..99b4f8c0 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -128,7 +128,6 @@ void run_ojph_compress(const std::string& ref_filename,
       + " -i " + REF_FILE_DIR + ref_filename
       + " -o " + OUT_FILE_DIR + base_filename + extended_base_fname +
       "." + out_ext + " " + extra_options;
-    std::cerr << command << std::endl;
     EXPECT_EQ(execute(command, result), 0);
   }
   catch (const std::runtime_error& error) {

From a92f9216bd81e482d62e8995be405cd32d3b8c77 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 13 Apr 2024 21:52:16 +1000
Subject: [PATCH 36/37] Added one test.

---
 tests/test_executables.cpp         | 646 +++++++++++++++--------------
 tests/test_helpers/ht_cmdlines.txt |   1 +
 2 files changed, 332 insertions(+), 315 deletions(-)

diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 99b4f8c0..8660f9d1 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -274,11 +274,11 @@ TEST(TestExecutables, OpenJPHExpandNoArguments) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_64x64.jph -precise -quiet -rate 0.5 -full
 TEST(TestExecutables, SimpleDecIrv9764x64) {
-  double mse[3] = { 39.2812, 36.3819, 47.642 };
-  int pae[3] = { 74, 77, 73 };
+  double mse[3] = { 39.2812, 36.3819, 47.642};
+  int pae[3] = { 74, 77, 73};
   run_ojph_expand("simple_dec_irv97_64x64", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -286,11 +286,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_32x32.jph -precise -quiet -rate 1 Cblk={32,32} -full
 TEST(TestExecutables, SimpleDecIrv9732x32) {
-  double mse[3] = { 18.6979, 17.1208, 22.7539 };
-  int pae[3] = { 51, 48, 46 };
+  double mse[3] = { 18.6979, 17.1208, 22.7539};
+  int pae[3] = { 51, 48, 46};
   run_ojph_expand("simple_dec_irv97_32x32", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_32x32", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -298,11 +298,11 @@ TEST(TestExecutables, SimpleDecIrv9732x32) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_16x16.jph -precise -quiet -rate 1 Cblk={16,16} -full
 TEST(TestExecutables, SimpleDecIrv9716x16) {
-  double mse[3] = { 20.1706, 18.5427, 24.6146 };
-  int pae[3] = { 53, 51, 47 };
+  double mse[3] = { 20.1706, 18.5427, 24.6146};
+  int pae[3] = { 53, 51, 47};
   run_ojph_expand("simple_dec_irv97_16x16", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_16x16", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -310,11 +310,11 @@ TEST(TestExecutables, SimpleDecIrv9716x16) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_4x4.jph -precise -quiet -rate 1 Cblk={4,4} -full
 TEST(TestExecutables, SimpleDecIrv974x4) {
-  double mse[3] = { 40.8623, 37.9308, 49.7276 };
-  int pae[3] = { 75, 77, 80 };
+  double mse[3] = { 40.8623, 37.9308, 49.7276};
+  int pae[3] = { 75, 77, 80};
   run_ojph_expand("simple_dec_irv97_4x4", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_4x4", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -322,11 +322,11 @@ TEST(TestExecutables, SimpleDecIrv974x4) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_1024x4.jph -precise -quiet -rate 1 Cblk={1024,4} -full
 TEST(TestExecutables, SimpleDecIrv971024x4) {
-  double mse[3] = { 19.8275, 18.2511, 24.2832 };
-  int pae[3] = { 53, 52, 50 };
+  double mse[3] = { 19.8275, 18.2511, 24.2832};
+  int pae[3] = { 53, 52, 50};
   run_ojph_expand("simple_dec_irv97_1024x4", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_1024x4", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -334,11 +334,11 @@ TEST(TestExecutables, SimpleDecIrv971024x4) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_4x1024.jph -precise -quiet -rate 1 Cblk={4,1024} -full
 TEST(TestExecutables, SimpleDecIrv974x1024) {
-  double mse[3] = { 19.9635, 18.4063, 24.1719 };
-  int pae[3] = { 51, 48, 51 };
+  double mse[3] = { 19.9635, 18.4063, 24.1719};
+  int pae[3] = { 51, 48, 51};
   run_ojph_expand("simple_dec_irv97_4x1024", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_4x1024", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -346,11 +346,11 @@ TEST(TestExecutables, SimpleDecIrv974x1024) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_512x8.jph -precise -quiet -rate 1 Cblk={512,8} -full
 TEST(TestExecutables, SimpleDecIrv97512x8) {
-  double mse[3] = { 18.7929, 17.2026, 22.9922 };
-  int pae[3] = { 53, 52, 50 };
+  double mse[3] = { 18.7929, 17.2026, 22.9922};
+  int pae[3] = { 53, 52, 50};
   run_ojph_expand("simple_dec_irv97_512x8", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_512x8", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -358,11 +358,11 @@ TEST(TestExecutables, SimpleDecIrv97512x8) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_8x512.jph -precise -quiet -rate 1 Cblk={8,512} -full
 TEST(TestExecutables, SimpleDecIrv978x512) {
-  double mse[3] = { 19.3661, 17.8067, 23.4574 };
-  int pae[3] = { 51, 48, 52 };
+  double mse[3] = { 19.3661, 17.8067, 23.4574};
+  int pae[3] = { 51, 48, 52};
   run_ojph_expand("simple_dec_irv97_8x512", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_8x512", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -370,11 +370,11 @@ TEST(TestExecutables, SimpleDecIrv978x512) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_256x16.jph -precise -quiet -rate 1 Cblk={256,16} -full
 TEST(TestExecutables, SimpleDecIrv97256x16) {
-  double mse[3] = { 18.6355, 17.0963, 22.6076 };
-  int pae[3] = { 54, 51, 48 };
+  double mse[3] = { 18.6355, 17.0963, 22.6076};
+  int pae[3] = { 54, 51, 48};
   run_ojph_expand("simple_dec_irv97_256x16", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_256x16", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -382,11 +382,11 @@ TEST(TestExecutables, SimpleDecIrv97256x16) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_16x256.jph -precise -quiet -rate 1 Cblk={16,256} -full
 TEST(TestExecutables, SimpleDecIrv9716x256) {
-  double mse[3] = { 18.5933, 17.0208, 22.5709 };
-  int pae[3] = { 51, 48, 47 };
+  double mse[3] = { 18.5933, 17.0208, 22.5709};
+  int pae[3] = { 51, 48, 47};
   run_ojph_expand("simple_dec_irv97_16x256", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_16x256", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -394,11 +394,11 @@ TEST(TestExecutables, SimpleDecIrv9716x256) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_128x32.jph -precise -quiet -rate 1 Cblk={128,32} -full
 TEST(TestExecutables, SimpleDecIrv97128x32) {
-  double mse[3] = { 18.4443, 16.9133, 22.4193 };
-  int pae[3] = { 52, 50, 46 };
+  double mse[3] = { 18.4443, 16.9133, 22.4193};
+  int pae[3] = { 52, 50, 46};
   run_ojph_expand("simple_dec_irv97_128x32", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_128x32", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -406,11 +406,11 @@ TEST(TestExecutables, SimpleDecIrv97128x32) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_32x128.jph -precise -quiet -rate 1 Cblk={32,128} -full
 TEST(TestExecutables, SimpleDecIrv9732x128) {
-  double mse[3] = { 18.4874, 16.9379, 22.4855 };
-  int pae[3] = { 51, 48, 45 };
+  double mse[3] = { 18.4874, 16.9379, 22.4855};
+  int pae[3] = { 51, 48, 45};
   run_ojph_expand("simple_dec_irv97_32x128", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_32x128", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -418,11 +418,11 @@ TEST(TestExecutables, SimpleDecIrv9732x128) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_rev53_64x64.jph -precise -quiet Creversible=yes -full
 TEST(TestExecutables, SimpleDecRev5364x64) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_expand("simple_dec_rev53_64x64", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_64x64", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -431,11 +431,11 @@ TEST(TestExecutables, SimpleDecRev5364x64) {
 // -o simple_dec_rev53_32x32.jph -precise -quiet Creversible=yes Cblk={32,32}
 // -full
 TEST(TestExecutables, SimpleDecRev5332x32) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_expand("simple_dec_rev53_32x32", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_32x32", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -444,11 +444,11 @@ TEST(TestExecutables, SimpleDecRev5332x32) {
 // -o simple_dec_rev53_4x4.jph -precise -quiet Creversible=yes Cblk={4,4}
 // -full
 TEST(TestExecutables, SimpleDecRev534x4) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_expand("simple_dec_rev53_4x4", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_4x4", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -457,11 +457,11 @@ TEST(TestExecutables, SimpleDecRev534x4) {
 // -o simple_dec_rev53_1024x4.jph -precise -quiet Creversible=yes
 // Cblk={1024,4} -full
 TEST(TestExecutables, SimpleDecRev531024x4) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_expand("simple_dec_rev53_1024x4", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_1024x4", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -470,11 +470,11 @@ TEST(TestExecutables, SimpleDecRev531024x4) {
 // -o simple_dec_rev53_4x1024.jph -precise -quiet Creversible=yes
 // Cblk={4,1024} -full
 TEST(TestExecutables, SimpleDecRev534x1024) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_expand("simple_dec_rev53_4x1024", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_4x1024", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -485,11 +485,11 @@ TEST(TestExecutables, SimpleDecRev534x1024) {
 // Sdims={288,352},{144,176},{144,176} Ssampling={1,1},{2,2},{2,2}
 // Nprecision={8} Nsigned={no} -full
 TEST(TestExecutables, SimpleDecIrv9764x64Yuv) {
-  double mse[3] = { 20.2778, 6.27912, 4.15937 };
-  int pae[3] = { 52, 22, 31 };
+  double mse[3] = { 20.2778, 6.27912, 4.15937};
+  int pae[3] = { 52, 22, 31};
   run_ojph_expand("simple_dec_irv97_64x64_yuv", "jph", "yuv");
   run_mse_pae("simple_dec_irv97_64x64_yuv", "yuv", "foreman_420.yuv",
-    ":352x288x8x420", 3, mse, pae);
+              ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -500,11 +500,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64Yuv) {
 // Sdims={288,352},{144,176},{144,176} Ssampling={1,1},{2,2},{2,2}
 // Nprecision={8} Nsigned={no} -full
 TEST(TestExecutables, SimpleDecRev5364x64Yuv) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_expand("simple_dec_rev53_64x64_yuv", "jph", "yuv");
   run_mse_pae("simple_dec_rev53_64x64_yuv", "yuv", "foreman_420.yuv",
-    ":352x288x8x420", 3, mse, pae);
+              ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -515,11 +515,11 @@ TEST(TestExecutables, SimpleDecRev5364x64Yuv) {
 // Sdims={288,352},{144,176},{144,176} Ssampling={1,1},{2,2},{2,2}
 // Nprecision={8} Nsigned={no} Stiles={33,257} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesYuv) {
-  double mse[3] = { 34.4972, 10.1112, 7.96331 };
-  int pae[3] = { 67, 30, 39 };
+  double mse[3] = { 34.4972, 10.1112, 7.96331};
+  int pae[3] = { 67, 30, 39};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_yuv", "jph", "yuv");
   run_mse_pae("simple_dec_irv97_64x64_tiles_yuv", "yuv", "foreman_420.yuv",
-    ":352x288x8x420", 3, mse, pae);
+              ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -530,11 +530,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesYuv) {
 // Sdims={288,352},{144,176},{144,176} Ssampling={1,1},{2,2},{2,2}
 // Nprecision={8} Nsigned={no} Stiles={33,257} -full
 TEST(TestExecutables, SimpleDecRev5364x64TilesYuv) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_expand("simple_dec_rev53_64x64_tiles_yuv", "jph", "yuv");
   run_mse_pae("simple_dec_rev53_64x64_tiles_yuv", "yuv", "foreman_420.yuv",
-    ":352x288x8x420", 3, mse, pae);
+              ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -544,11 +544,11 @@ TEST(TestExecutables, SimpleDecRev5364x64TilesYuv) {
 // Clevels=5 Corder=LRCP Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001 };
-  int pae[3] = { 78, 78, 83 };
+  double mse[3] = { 71.8149, 68.7115, 89.4001};
+  int pae[3] = { 78, 78, 83};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_LRCP", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_LRCP", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -558,11 +558,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP) {
 // Clevels=5 Corder=RLCP Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001 };
-  int pae[3] = { 78, 78, 83 };
+  double mse[3] = { 71.8149, 68.7115, 89.4001};
+  int pae[3] = { 78, 78, 83};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RLCP", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RLCP", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -572,11 +572,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP) {
 // Clevels=5 Corder=RPCL Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001 };
-  int pae[3] = { 78, 78, 83 };
+  double mse[3] = { 71.8149, 68.7115, 89.4001};
+  int pae[3] = { 78, 78, 83};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RPCL", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RPCL", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -586,11 +586,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL) {
 // Clevels=5 Corder=PCRL Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001 };
-  int pae[3] = { 78, 78, 83 };
+  double mse[3] = { 71.8149, 68.7115, 89.4001};
+  int pae[3] = { 78, 78, 83};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_PCRL", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_PCRL", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -600,11 +600,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL) {
 // Clevels=5 Corder=CPRL Cprecincts={2,256} Sorigin={374,1717}
 // Stile_origin={374,1717} -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL) {
-  double mse[3] = { 71.8149, 68.7115, 89.4001 };
-  int pae[3] = { 78, 78, 83 };
+  double mse[3] = { 71.8149, 68.7115, 89.4001};
+  int pae[3] = { 78, 78, 83};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_CPRL", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_CPRL", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -614,11 +614,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL) {
 // Clevels=5 Corder=LRCP Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107 };
-  int pae[3] = { 80, 81, 98 };
+  double mse[3] = { 56.2139, 51.4121, 69.0107};
+  int pae[3] = { 80, 81, 98};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_LRCP33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_LRCP33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -628,11 +628,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP33) {
 // Clevels=5 Corder=RLCP Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107 };
-  int pae[3] = { 80, 81, 98 };
+  double mse[3] = { 56.2139, 51.4121, 69.0107};
+  int pae[3] = { 80, 81, 98};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RLCP33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RLCP33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -642,11 +642,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP33) {
 // Clevels=5 Corder=RPCL Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107 };
-  int pae[3] = { 80, 81, 98 };
+  double mse[3] = { 56.2139, 51.4121, 69.0107};
+  int pae[3] = { 80, 81, 98};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RPCL33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RPCL33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -656,11 +656,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL33) {
 // Clevels=5 Corder=PCRL Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107 };
-  int pae[3] = { 80, 81, 98 };
+  double mse[3] = { 56.2139, 51.4121, 69.0107};
+  int pae[3] = { 80, 81, 98};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_PCRL33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_PCRL33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -670,11 +670,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL33) {
 // Clevels=5 Corder=CPRL Sorigin={5,33} Stile_origin={5,10} Stiles={33,257}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL33) {
-  double mse[3] = { 56.2139, 51.4121, 69.0107 };
-  int pae[3] = { 80, 81, 98 };
+  double mse[3] = { 56.2139, 51.4121, 69.0107};
+  int pae[3] = { 80, 81, 98};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_CPRL33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_CPRL33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -684,11 +684,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL33) {
 // Clevels=5 Corder=LRCP Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276 };
-  int pae[3] = { 165, 161, 166 };
+  double mse[3] = { 210.283, 210.214, 257.276};
+  int pae[3] = { 165, 161, 166};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_LRCP33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_LRCP33x33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -698,11 +698,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesLRCP33x33) {
 // Clevels=5 Corder=RLCP Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276 };
-  int pae[3] = { 165, 161, 166 };
+  double mse[3] = { 210.283, 210.214, 257.276};
+  int pae[3] = { 165, 161, 166};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RLCP33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RLCP33x33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -712,11 +712,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRLCP33x33) {
 // Clevels=5 Corder=RPCL Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276 };
-  int pae[3] = { 165, 161, 166 };
+  double mse[3] = { 210.283, 210.214, 257.276};
+  int pae[3] = { 165, 161, 166};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_RPCL33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_RPCL33x33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -726,11 +726,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesRPCL33x33) {
 // Clevels=5 Corder=PCRL Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276 };
-  int pae[3] = { 165, 161, 166 };
+  double mse[3] = { 210.283, 210.214, 257.276};
+  int pae[3] = { 165, 161, 166};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_PCRL33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_PCRL33x33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -740,11 +740,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesPCRL33x33) {
 // Clevels=5 Corder=CPRL Sorigin={5,33} Stile_origin={5,10} Stiles={33,33}
 // -full
 TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL33x33) {
-  double mse[3] = { 210.283, 210.214, 257.276 };
-  int pae[3] = { 165, 161, 166 };
+  double mse[3] = { 210.283, 210.214, 257.276};
+  int pae[3] = { 165, 161, 166};
   run_ojph_expand("simple_dec_irv97_64x64_tiles_CPRL33x33", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_tiles_CPRL33x33", "ppm", "Malamute.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -753,11 +753,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64TilesCPRL33x33) {
 // -o simple_dec_rev53_64x64_gray_tiles.jph -precise -quiet Creversible=yes
 // Clevels=5 Stiles={33,257} -full
 TEST(TestExecutables, SimpleDecRev5364x64GrayTiles) {
-  double mse[1] = { 0 };
-  int pae[1] = { 0 };
+  double mse[1] = { 0};
+  int pae[1] = { 0};
   run_ojph_expand("simple_dec_rev53_64x64_gray_tiles", "jph", "pgm");
   run_mse_pae("simple_dec_rev53_64x64_gray_tiles", "pgm", "monarch.pgm",
-    "", 1, mse, pae);
+              "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -766,11 +766,11 @@ TEST(TestExecutables, SimpleDecRev5364x64GrayTiles) {
 // -o simple_dec_irv97_64x64_gray_tiles.jph -precise -quiet -rate 0.5
 // Clevels=5 Stiles={33,257} -full
 TEST(TestExecutables, SimpleDecIrv9764x64GrayTiles) {
-  double mse[1] = { 18.9601 };
-  int pae[1] = { 56 };
+  double mse[1] = { 18.9601};
+  int pae[1] = { 56};
   run_ojph_expand("simple_dec_irv97_64x64_gray_tiles", "jph", "pgm");
   run_mse_pae("simple_dec_irv97_64x64_gray_tiles", "pgm", "monarch.pgm",
-    "", 1, mse, pae);
+              "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -778,11 +778,11 @@ TEST(TestExecutables, SimpleDecIrv9764x64GrayTiles) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_64x64_16bit.jph -precise -quiet -rate 0.5 -full
 TEST(TestExecutables, SimpleDecIrv9764x6416bit) {
-  double mse[3] = { 60507.2, 36672.5, 64809.8 };
-  int pae[3] = { 2547, 1974, 1922 };
+  double mse[3] = { 60507.2, 36672.5, 64809.8};
+  int pae[3] = { 2547, 1974, 1922};
   run_ojph_expand("simple_dec_irv97_64x64_16bit", "jph", "ppm");
   run_mse_pae("simple_dec_irv97_64x64_16bit", "ppm", "mm.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -790,11 +790,11 @@ TEST(TestExecutables, SimpleDecIrv9764x6416bit) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_irv97_64x64_16bit_gray.jph -precise -quiet -rate 0.5 -full
 TEST(TestExecutables, SimpleDecIrv9764x6416bitGray) {
-  double mse[1] = { 19382.9 };
-  int pae[1] = { 1618 };
+  double mse[1] = { 19382.9};
+  int pae[1] = { 1618};
   run_ojph_expand("simple_dec_irv97_64x64_16bit_gray", "jph", "pgm");
   run_mse_pae("simple_dec_irv97_64x64_16bit_gray", "pgm", "mm.pgm",
-    "", 1, mse, pae);
+              "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -802,11 +802,11 @@ TEST(TestExecutables, SimpleDecIrv9764x6416bitGray) {
 // Command-line options used to obtain this file is:
 // -o simple_dec_rev53_64x64_16bit.jph -precise -quiet Creversible=yes -full
 TEST(TestExecutables, SimpleDecRev5364x6416bit) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_expand("simple_dec_rev53_64x64_16bit", "jph", "ppm");
   run_mse_pae("simple_dec_rev53_64x64_16bit", "ppm", "mm.ppm",
-    "", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -815,11 +815,27 @@ TEST(TestExecutables, SimpleDecRev5364x6416bit) {
 // -o simple_dec_rev53_64x64_16bit_gray.jph -precise -quiet Creversible=yes
 // -full
 TEST(TestExecutables, SimpleDecRev5364x6416bitGray) {
-  double mse[1] = { 0 };
-  int pae[1] = { 0 };
+  double mse[1] = { 0};
+  int pae[1] = { 0};
   run_ojph_expand("simple_dec_rev53_64x64_16bit_gray", "jph", "pgm");
   run_mse_pae("simple_dec_rev53_64x64_16bit_gray", "pgm", "mm.pgm",
-    "", 1, mse, pae);
+              "", 1, mse, pae);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Test ojph_expand with codeblocks when the irv53 wavelet is used.
+// Command-line options used to obtain this file is:
+// -o simple_dec_irv53_bhvhb_low_latency.jph -quiet Corder=PCRL Clevels=5
+// "Cmodes=HT|CAUSAL" -rate 2 Catk=2
+// Kkernels:I2=I5X3 Cprecincts="{16,8192},{8,8192},{4,8192}" Cblk="{8,256}"
+// Cdecomp="B(-:-:-),H(-),V(-),H(-),B(-:-:-)" Qstep=0.0001 -precise -no_weights
+// -tolerance 0
+TEST(TestExecutables, SimpleDecIrv53BhvhbLowLatency) {
+  double mse[3] = { 5.52392, 4.01405, 6.8166};
+  int pae[3] = { 16, 17, 23};
+  run_ojph_expand("simple_dec_irv53_bhvhb_low_latency", "jph", "ppm");
+  run_mse_pae("simple_dec_irv53_bhvhb_low_latency", "ppm", "Malamute.ppm",
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -828,14 +844,14 @@ TEST(TestExecutables, SimpleDecRev5364x6416bitGray) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_64x64.j2c -qstep 0.1
 TEST(TestExecutables, SimpleEncIrv9764x64) {
-  double mse[3] = { 46.2004, 43.622, 56.7452 };
-  int pae[3] = { 48, 46, 52 };
+  double mse[3] = { 46.2004, 43.622, 56.7452};
+  int pae[3] = { 48, 46, 52};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_64x64", "", "j2c",
-    "-qstep 0.1");
+                    "simple_enc_irv97_64x64", "", "j2c",
+                    "-qstep 0.1");
   run_ojph_compress_expand("simple_enc_irv97_64x64", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_64x64", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -844,14 +860,14 @@ TEST(TestExecutables, SimpleEncIrv9764x64) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_32x32.j2c -qstep 0.01 -block_size {32,32}
 TEST(TestExecutables, SimpleEncIrv9732x32) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_32x32", "", "j2c",
-    "-qstep 0.01 -block_size \"{32,32}\"");
+                    "simple_enc_irv97_32x32", "", "j2c",
+                    "-qstep 0.01 -block_size \"{32,32}\"");
   run_ojph_compress_expand("simple_enc_irv97_32x32", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_32x32", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -860,14 +876,14 @@ TEST(TestExecutables, SimpleEncIrv9732x32) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_16x16.j2c -qstep 0.01 -block_size {16,16}
 TEST(TestExecutables, SimpleEncIrv9716x16) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_16x16", "", "j2c",
-    "-qstep 0.01 -block_size \"{16,16}\"");
+                    "simple_enc_irv97_16x16", "", "j2c",
+                    "-qstep 0.01 -block_size \"{16,16}\"");
   run_ojph_compress_expand("simple_enc_irv97_16x16", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_16x16", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -876,14 +892,14 @@ TEST(TestExecutables, SimpleEncIrv9716x16) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_4x4.j2c -qstep 0.01 -block_size {4,4}
 TEST(TestExecutables, SimpleEncIrv974x4) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_4x4", "", "j2c",
-    "-qstep 0.01 -block_size \"{4,4}\"");
+                    "simple_enc_irv97_4x4", "", "j2c",
+                    "-qstep 0.01 -block_size \"{4,4}\"");
   run_ojph_compress_expand("simple_enc_irv97_4x4", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_4x4", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -892,14 +908,14 @@ TEST(TestExecutables, SimpleEncIrv974x4) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_1024x4.j2c -qstep 0.01 -block_size {4,1024}
 TEST(TestExecutables, SimpleEncIrv971024x4) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_1024x4", "", "j2c",
-    "-qstep 0.01 -block_size \"{4,1024}\"");
+                    "simple_enc_irv97_1024x4", "", "j2c",
+                    "-qstep 0.01 -block_size \"{4,1024}\"");
   run_ojph_compress_expand("simple_enc_irv97_1024x4", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_1024x4", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -908,14 +924,14 @@ TEST(TestExecutables, SimpleEncIrv971024x4) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_4x1024.j2c -qstep 0.01 -block_size {1024,4}
 TEST(TestExecutables, SimpleEncIrv974x1024) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_4x1024", "", "j2c",
-    "-qstep 0.01 -block_size \"{1024,4}\"");
+                    "simple_enc_irv97_4x1024", "", "j2c",
+                    "-qstep 0.01 -block_size \"{1024,4}\"");
   run_ojph_compress_expand("simple_enc_irv97_4x1024", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_4x1024", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -924,14 +940,14 @@ TEST(TestExecutables, SimpleEncIrv974x1024) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_512x8.j2c -qstep 0.01 -block_size {8,512}
 TEST(TestExecutables, SimpleEncIrv97512x8) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_512x8", "", "j2c",
-    "-qstep 0.01 -block_size \"{8,512}\"");
+                    "simple_enc_irv97_512x8", "", "j2c",
+                    "-qstep 0.01 -block_size \"{8,512}\"");
   run_ojph_compress_expand("simple_enc_irv97_512x8", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_512x8", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -940,14 +956,14 @@ TEST(TestExecutables, SimpleEncIrv97512x8) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_8x512.j2c -qstep 0.01 -block_size {512,8}
 TEST(TestExecutables, SimpleEncIrv978x512) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_8x512", "", "j2c",
-    "-qstep 0.01 -block_size \"{512,8}\"");
+                    "simple_enc_irv97_8x512", "", "j2c",
+                    "-qstep 0.01 -block_size \"{512,8}\"");
   run_ojph_compress_expand("simple_enc_irv97_8x512", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_8x512", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -956,14 +972,14 @@ TEST(TestExecutables, SimpleEncIrv978x512) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_256x16.j2c -qstep 0.01 -block_size {16,256}
 TEST(TestExecutables, SimpleEncIrv97256x16) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_256x16", "", "j2c",
-    "-qstep 0.01 -block_size \"{16,256}\"");
+                    "simple_enc_irv97_256x16", "", "j2c",
+                    "-qstep 0.01 -block_size \"{16,256}\"");
   run_ojph_compress_expand("simple_enc_irv97_256x16", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_256x16", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -972,14 +988,14 @@ TEST(TestExecutables, SimpleEncIrv97256x16) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_16x256.j2c -qstep 0.01 -block_size {256,16}
 TEST(TestExecutables, SimpleEncIrv9716x256) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_16x256", "", "j2c",
-    "-qstep 0.01 -block_size \"{256,16}\"");
+                    "simple_enc_irv97_16x256", "", "j2c",
+                    "-qstep 0.01 -block_size \"{256,16}\"");
   run_ojph_compress_expand("simple_enc_irv97_16x256", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_16x256", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -988,14 +1004,14 @@ TEST(TestExecutables, SimpleEncIrv9716x256) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_128x32.j2c -qstep 0.01 -block_size {32,128}
 TEST(TestExecutables, SimpleEncIrv97128x32) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_128x32", "", "j2c",
-    "-qstep 0.01 -block_size \"{32,128}\"");
+                    "simple_enc_irv97_128x32", "", "j2c",
+                    "-qstep 0.01 -block_size \"{32,128}\"");
   run_ojph_compress_expand("simple_enc_irv97_128x32", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_128x32", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1004,14 +1020,14 @@ TEST(TestExecutables, SimpleEncIrv97128x32) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_32x128.j2c -qstep 0.01 -block_size {128,32}
 TEST(TestExecutables, SimpleEncIrv9732x128) {
-  double mse[3] = { 1.78779, 1.26001, 2.38395 };
-  int pae[3] = { 7, 6, 9 };
+  double mse[3] = { 1.78779, 1.26001, 2.38395};
+  int pae[3] = { 7, 6, 9};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_32x128", "", "j2c",
-    "-qstep 0.01 -block_size \"{128,32}\"");
+                    "simple_enc_irv97_32x128", "", "j2c",
+                    "-qstep 0.01 -block_size \"{128,32}\"");
   run_ojph_compress_expand("simple_enc_irv97_32x128", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_32x128", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1021,14 +1037,14 @@ TEST(TestExecutables, SimpleEncIrv9732x128) {
 // -o simple_enc_irv97_64x64_tiles_33x33_d5.j2c -qstep 0.01 -tile_size {33,33}
 // -num_decomps 5
 TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D5) {
-  double mse[3] = { 1.88906, 1.30757, 2.5347 };
-  int pae[3] = { 9, 6, 10 };
+  double mse[3] = { 1.88906, 1.30757, 2.5347};
+  int pae[3] = { 9, 6, 10};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_64x64_tiles_33x33_d5", "", "j2c",
-    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 5");
+                    "simple_enc_irv97_64x64_tiles_33x33_d5", "", "j2c",
+                    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 5");
   run_ojph_compress_expand("simple_enc_irv97_64x64_tiles_33x33_d5", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_64x64_tiles_33x33_d5", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1038,14 +1054,14 @@ TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D5) {
 // -o simple_enc_irv97_64x64_tiles_33x33_d6.j2c -qstep 0.01 -tile_size {33,33}
 // -num_decomps 6
 TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D6) {
-  double mse[3] = { 1.88751, 1.30673, 2.53378 };
-  int pae[3] = { 8, 6, 10 };
+  double mse[3] = { 1.88751, 1.30673, 2.53378};
+  int pae[3] = { 8, 6, 10};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_irv97_64x64_tiles_33x33_d6", "", "j2c",
-    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 6");
+                    "simple_enc_irv97_64x64_tiles_33x33_d6", "", "j2c",
+                    "-qstep 0.01 -tile_size \"{33,33}\" -num_decomps 6");
   run_ojph_compress_expand("simple_enc_irv97_64x64_tiles_33x33_d6", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_64x64_tiles_33x33_d6", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1054,14 +1070,14 @@ TEST(TestExecutables, SimpleEncIrv9764x64Tiles33x33D6) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_64x64_16bit.j2c -qstep 0.01
 TEST(TestExecutables, SimpleEncIrv9764x6416bit) {
-  double mse[3] = { 51727.3, 32596.4, 45897.8 };
-  int pae[3] = { 1512, 1481, 1778 };
+  double mse[3] = { 51727.3, 32596.4, 45897.8};
+  int pae[3] = { 1512, 1481, 1778};
   run_ojph_compress("mm.ppm",
-    "simple_enc_irv97_64x64_16bit", "", "j2c",
-    "-qstep 0.01");
+                    "simple_enc_irv97_64x64_16bit", "", "j2c",
+                    "-qstep 0.01");
   run_ojph_compress_expand("simple_enc_irv97_64x64_16bit", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_64x64_16bit", "ppm",
-    "mm.ppm", "", 3, mse, pae);
+              "mm.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1070,14 +1086,14 @@ TEST(TestExecutables, SimpleEncIrv9764x6416bit) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_64x64_16bit_gray.j2c -qstep 0.01
 TEST(TestExecutables, SimpleEncIrv9764x6416bitGray) {
-  double mse[1] = { 25150.6 };
-  int pae[1] = { 1081 };
+  double mse[1] = { 25150.6};
+  int pae[1] = { 1081};
   run_ojph_compress("mm.pgm",
-    "simple_enc_irv97_64x64_16bit_gray", "", "j2c",
-    "-qstep 0.01");
+                    "simple_enc_irv97_64x64_16bit_gray", "", "j2c",
+                    "-qstep 0.01");
   run_ojph_compress_expand("simple_enc_irv97_64x64_16bit_gray", "j2c", "pgm");
   run_mse_pae("simple_enc_irv97_64x64_16bit_gray", "pgm",
-    "mm.pgm", "", 1, mse, pae);
+              "mm.pgm", "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1086,14 +1102,14 @@ TEST(TestExecutables, SimpleEncIrv9764x6416bitGray) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_64x64_16bit.j2c -reversible true
 TEST(TestExecutables, SimpleEncRev5364x6416bit) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("mm.ppm",
-    "simple_enc_rev53_64x64_16bit", "", "j2c",
-    "-reversible true");
+                    "simple_enc_rev53_64x64_16bit", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("simple_enc_rev53_64x64_16bit", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_64x64_16bit", "ppm",
-    "mm.ppm", "", 3, mse, pae);
+              "mm.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1102,14 +1118,14 @@ TEST(TestExecutables, SimpleEncRev5364x6416bit) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_64x64_16bit_gray.j2c -reversible true
 TEST(TestExecutables, SimpleEncRev5364x6416bitGray) {
-  double mse[1] = { 0 };
-  int pae[1] = { 0 };
+  double mse[1] = { 0};
+  int pae[1] = { 0};
   run_ojph_compress("mm.pgm",
-    "simple_enc_rev53_64x64_16bit_gray", "", "j2c",
-    "-reversible true");
+                    "simple_enc_rev53_64x64_16bit_gray", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("simple_enc_rev53_64x64_16bit_gray", "j2c", "pgm");
   run_mse_pae("simple_enc_rev53_64x64_16bit_gray", "pgm",
-    "mm.pgm", "", 1, mse, pae);
+              "mm.pgm", "", 1, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1118,14 +1134,14 @@ TEST(TestExecutables, SimpleEncRev5364x6416bitGray) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_64x64_16bit.j2c -reversible true
 TEST(TestExecutables, SimpleEncRev5364x64) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_rev53_64x64", "", "j2c",
-    "-reversible true");
+                    "simple_enc_rev53_64x64", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("simple_enc_rev53_64x64", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_64x64", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1134,14 +1150,14 @@ TEST(TestExecutables, SimpleEncRev5364x64) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_32x32.j2c -reversible true -block_size {32,32}
 TEST(TestExecutables, SimpleEncRev5332x32) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_rev53_32x32", "", "j2c",
-    "-reversible true -block_size \"{32,32}\"");
+                    "simple_enc_rev53_32x32", "", "j2c",
+                    "-reversible true -block_size \"{32,32}\"");
   run_ojph_compress_expand("simple_enc_rev53_32x32", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_32x32", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1150,14 +1166,14 @@ TEST(TestExecutables, SimpleEncRev5332x32) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_4x4.j2c -reversible true -block_size {4,4}
 TEST(TestExecutables, SimpleEncRev534x4) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_rev53_4x4", "", "j2c",
-    "-reversible true -block_size \"{4,4}\"");
+                    "simple_enc_rev53_4x4", "", "j2c",
+                    "-reversible true -block_size \"{4,4}\"");
   run_ojph_compress_expand("simple_enc_rev53_4x4", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_4x4", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1166,14 +1182,14 @@ TEST(TestExecutables, SimpleEncRev534x4) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_1024x4.j2c -reversible true -block_size {4,1024}
 TEST(TestExecutables, SimpleEncRev531024x4) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_rev53_1024x4", "", "j2c",
-    "-reversible true -block_size \"{4,1024}\"");
+                    "simple_enc_rev53_1024x4", "", "j2c",
+                    "-reversible true -block_size \"{4,1024}\"");
   run_ojph_compress_expand("simple_enc_rev53_1024x4", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_1024x4", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1182,14 +1198,14 @@ TEST(TestExecutables, SimpleEncRev531024x4) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_4x1024.j2c -reversible true -block_size {1024,4}
 TEST(TestExecutables, SimpleEncRev534x1024) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_rev53_4x1024", "", "j2c",
-    "-reversible true -block_size \"{1024,4}\"");
+                    "simple_enc_rev53_4x1024", "", "j2c",
+                    "-reversible true -block_size \"{1024,4}\"");
   run_ojph_compress_expand("simple_enc_rev53_4x1024", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_4x1024", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1199,14 +1215,14 @@ TEST(TestExecutables, SimpleEncRev534x1024) {
 // -o simple_enc_rev53_64x64_tiles_33x33_d5.j2c -reversible true -tile_size
 // {32,32} -num_decomps 5
 TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D5) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_rev53_64x64_tiles_33x33_d5", "", "j2c",
-    "-reversible true -tile_size \"{32,32}\" -num_decomps 5");
+                    "simple_enc_rev53_64x64_tiles_33x33_d5", "", "j2c",
+                    "-reversible true -tile_size \"{32,32}\" -num_decomps 5");
   run_ojph_compress_expand("simple_enc_rev53_64x64_tiles_33x33_d5", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_64x64_tiles_33x33_d5", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1216,14 +1232,14 @@ TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D5) {
 // -o simple_enc_rev53_64x64_tiles_33x33_d6.j2c -reversible true -tile_size
 // {32,32} -num_decomps 6
 TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D6) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("Malamute.ppm",
-    "simple_enc_rev53_64x64_tiles_33x33_d6", "", "j2c",
-    "-reversible true -tile_size \"{32,32}\" -num_decomps 6");
+                    "simple_enc_rev53_64x64_tiles_33x33_d6", "", "j2c",
+                    "-reversible true -tile_size \"{32,32}\" -num_decomps 6");
   run_ojph_compress_expand("simple_enc_rev53_64x64_tiles_33x33_d6", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_64x64_tiles_33x33_d6", "ppm",
-    "Malamute.ppm", "", 3, mse, pae);
+              "Malamute.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1233,16 +1249,16 @@ TEST(TestExecutables, SimpleEncRev5364x64Tiles33x33D6) {
 // -o simple_enc_irv97_64x64_yuv.j2c -qstep 0.1 -dims {352,288} -num_comps 3
 // -downsamp {1,1},{2,2},{2,2} -bit_depth 8,8,8 -signed false,false,false
 TEST(TestExecutables, SimpleEncIrv9764x64Yuv) {
-  double mse[3] = { 30.3548, 7.69602, 5.22246 };
-  int pae[3] = { 49, 27, 26 };
+  double mse[3] = { 30.3548, 7.69602, 5.22246};
+  int pae[3] = { 49, 27, 26};
   run_ojph_compress("foreman_420.yuv",
-    "simple_enc_irv97_64x64_yuv", "", "j2c",
-    "-qstep 0.1 -dims \"{352,288}\" -num_comps 3 -downsamp"
-    " \"{1,1}\",\"{2,2}\",\"{2,2}\" -bit_depth 8,8,8"
-    " -signed false,false,false");
+                    "simple_enc_irv97_64x64_yuv", "", "j2c",
+                    "-qstep 0.1 -dims \"{352,288}\" -num_comps 3 -downsamp"
+                    " \"{1,1}\",\"{2,2}\",\"{2,2}\" -bit_depth 8,8,8"
+                    " -signed false,false,false");
   run_ojph_compress_expand("simple_enc_irv97_64x64_yuv", "j2c", "yuv");
   run_mse_pae("simple_enc_irv97_64x64_yuv", "yuv",
-    "foreman_420.yuv", ":352x288x8x420", 3, mse, pae);
+              "foreman_420.yuv", ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1253,16 +1269,16 @@ TEST(TestExecutables, SimpleEncIrv9764x64Yuv) {
 // {352,288} -num_comps 3 -downsamp {1,1},{2,2},{2,2} -bit_depth 8,8,8 -signed
 // false,false,false
 TEST(TestExecutables, SimpleEncRev5364x64Yuv) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("foreman_420.yuv",
-    "simple_enc_rev53_64x64_yuv", "", "j2c",
-    "-reversible true -qstep 0.1 -dims \"{352,288}\""
-    " -num_comps 3 -downsamp \"{1,1}\",\"{2,2}\",\"{2,2}\""
-    " -bit_depth 8,8,8 -signed false,false,false");
+                    "simple_enc_rev53_64x64_yuv", "", "j2c",
+                    "-reversible true -qstep 0.1 -dims \"{352,288}\""
+                    " -num_comps 3 -downsamp \"{1,1}\",\"{2,2}\",\"{2,2}\""
+                    " -bit_depth 8,8,8 -signed false,false,false");
   run_ojph_compress_expand("simple_enc_rev53_64x64_yuv", "j2c", "yuv");
   run_mse_pae("simple_enc_rev53_64x64_yuv", "yuv",
-    "foreman_420.yuv", ":352x288x8x420", 3, mse, pae);
+              "foreman_420.yuv", ":352x288x8x420", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1271,14 +1287,14 @@ TEST(TestExecutables, SimpleEncRev5364x64Yuv) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_tall_narrow.j2c -qstep 0.1
 TEST(TestExecutables, SimpleEncIrv97TallNarrow) {
-  double mse[3] = { 112.097, 79.2214, 71.1367 };
-  int pae[3] = { 56, 41, 32 };
+  double mse[3] = { 112.097, 79.2214, 71.1367};
+  int pae[3] = { 56, 41, 32};
   run_ojph_compress("tall_narrow.ppm",
-    "simple_enc_irv97_tall_narrow", "", "j2c",
-    "-qstep 0.1");
+                    "simple_enc_irv97_tall_narrow", "", "j2c",
+                    "-qstep 0.1");
   run_ojph_compress_expand("simple_enc_irv97_tall_narrow", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_tall_narrow", "ppm",
-    "tall_narrow.ppm", "", 3, mse, pae);
+              "tall_narrow.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1287,14 +1303,14 @@ TEST(TestExecutables, SimpleEncIrv97TallNarrow) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_irv97_tall_narrow1.j2c -image_offset {1,0} -qstep 0.1
 TEST(TestExecutables, SimpleEncIrv97TallNarrow1) {
-  double mse[3] = { 100.906, 76.113, 72.8347 };
-  int pae[3] = { 39, 35, 34 };
+  double mse[3] = { 100.906, 76.113, 72.8347};
+  int pae[3] = { 39, 35, 34};
   run_ojph_compress("tall_narrow.ppm",
-    "simple_enc_irv97_tall_narrow1", "", "j2c",
-    "-image_offset \"{1,0}\" -qstep 0.1");
+                    "simple_enc_irv97_tall_narrow1", "", "j2c",
+                    "-image_offset \"{1,0}\" -qstep 0.1");
   run_ojph_compress_expand("simple_enc_irv97_tall_narrow1", "j2c", "ppm");
   run_mse_pae("simple_enc_irv97_tall_narrow1", "ppm",
-    "tall_narrow.ppm", "", 3, mse, pae);
+              "tall_narrow.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1303,14 +1319,14 @@ TEST(TestExecutables, SimpleEncIrv97TallNarrow1) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_tall_narrow.j2c -reversible true
 TEST(TestExecutables, SimpleEncRev53TallNarrow) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("tall_narrow.ppm",
-    "simple_enc_rev53_tall_narrow", "", "j2c",
-    "-reversible true");
+                    "simple_enc_rev53_tall_narrow", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("simple_enc_rev53_tall_narrow", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_tall_narrow", "ppm",
-    "tall_narrow.ppm", "", 3, mse, pae);
+              "tall_narrow.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1319,14 +1335,14 @@ TEST(TestExecutables, SimpleEncRev53TallNarrow) {
 // The compressed file is obtained using these command-line options:
 // -o simple_enc_rev53_tall_narrow1.j2c -image_offset {1,0} -reversible true
 TEST(TestExecutables, SimpleEncRev53TallNarrow1) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("tall_narrow.ppm",
-    "simple_enc_rev53_tall_narrow1", "", "j2c",
-    "-image_offset \"{1,0}\" -reversible true");
+                    "simple_enc_rev53_tall_narrow1", "", "j2c",
+                    "-image_offset \"{1,0}\" -reversible true");
   run_ojph_compress_expand("simple_enc_rev53_tall_narrow1", "j2c", "ppm");
   run_mse_pae("simple_enc_rev53_tall_narrow1", "ppm",
-    "tall_narrow.ppm", "", 3, mse, pae);
+              "tall_narrow.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1335,14 +1351,14 @@ TEST(TestExecutables, SimpleEncRev53TallNarrow1) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_10bit_le_nuke11.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72010bitLeNuke11) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("dpx_1280x720_10bit.ppm",
-    "dpx_enc_1280x720_10bit_le_nuke11", "", "j2c",
-    "-reversible true");
+                    "dpx_enc_1280x720_10bit_le_nuke11", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_10bit_le_nuke11", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_10bit_le_nuke11", "ppm",
-    "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
+              "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1351,14 +1367,14 @@ TEST(TestExecutables, DpxEnc1280x72010bitLeNuke11) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_10bit_be_nuke11.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72010bitBeNuke11) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("dpx_1280x720_10bit.ppm",
-    "dpx_enc_1280x720_10bit_be_nuke11", "", "j2c",
-    "-reversible true");
+                    "dpx_enc_1280x720_10bit_be_nuke11", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_10bit_be_nuke11", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_10bit_be_nuke11", "ppm",
-    "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
+              "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1367,14 +1383,14 @@ TEST(TestExecutables, DpxEnc1280x72010bitBeNuke11) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_16bit_le_nuke11.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72016bitLeNuke11) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("dpx_1280x720_16bit.ppm",
-    "dpx_enc_1280x720_16bit_le_nuke11", "", "j2c",
-    "-reversible true");
+                    "dpx_enc_1280x720_16bit_le_nuke11", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_16bit_le_nuke11", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_16bit_le_nuke11", "ppm",
-    "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
+              "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1383,14 +1399,14 @@ TEST(TestExecutables, DpxEnc1280x72016bitLeNuke11) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_16bit_be_nuke11.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72016bitBeNuke11) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("dpx_1280x720_16bit.ppm",
-    "dpx_enc_1280x720_16bit_be_nuke11", "", "j2c",
-    "-reversible true");
+                    "dpx_enc_1280x720_16bit_be_nuke11", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_16bit_be_nuke11", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_16bit_be_nuke11", "ppm",
-    "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
+              "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1399,14 +1415,14 @@ TEST(TestExecutables, DpxEnc1280x72016bitBeNuke11) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_10bit_resolve18.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72010bitResolve18) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("dpx_1280x720_10bit.ppm",
-    "dpx_enc_1280x720_10bit_resolve18", "", "j2c",
-    "-reversible true");
+                    "dpx_enc_1280x720_10bit_resolve18", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_10bit_resolve18", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_10bit_resolve18", "ppm",
-    "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
+              "dpx_1280x720_10bit.ppm", "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -1415,14 +1431,14 @@ TEST(TestExecutables, DpxEnc1280x72010bitResolve18) {
 // The compressed file is obtained using these command-line options:
 // -o dpx_enc_1280x720_16bit_resolve18.j2c -reversible true
 TEST(TestExecutables, DpxEnc1280x72016bitResolve18) {
-  double mse[3] = { 0, 0, 0 };
-  int pae[3] = { 0, 0, 0 };
+  double mse[3] = { 0, 0, 0};
+  int pae[3] = { 0, 0, 0};
   run_ojph_compress("dpx_1280x720_16bit.ppm",
-    "dpx_enc_1280x720_16bit_resolve18", "", "j2c",
-    "-reversible true");
+                    "dpx_enc_1280x720_16bit_resolve18", "", "j2c",
+                    "-reversible true");
   run_ojph_compress_expand("dpx_enc_1280x720_16bit_resolve18", "j2c", "ppm");
   run_mse_pae("dpx_enc_1280x720_16bit_resolve18", "ppm",
-    "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
+              "dpx_1280x720_16bit.ppm", "", 3, mse, pae);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/tests/test_helpers/ht_cmdlines.txt b/tests/test_helpers/ht_cmdlines.txt
index a8c0987d..0542a2d6 100644
--- a/tests/test_helpers/ht_cmdlines.txt
+++ b/tests/test_helpers/ht_cmdlines.txt
@@ -52,6 +52,7 @@ add_test(NAME simple_dec_irv97_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_D
 add_test(NAME simple_dec_rev53_64x64_16bit COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -rdec      "-i ${images_folder}/mm.ppm -o simple_dec_rev53_64x64_16bit.jph      -precise -quiet Creversible=yes -full"  "-i simple_dec_rev53_64x64_16bit.jph      -o test1.ppm -precise -quiet" "-i simple_dec_rev53_64x64_16bit.jph      -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 add_test(NAME simple_dec_rev53_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -rdec "-i ${images_folder}/mm.pgm -o simple_dec_rev53_64x64_16bit_gray.jph -precise -quiet Creversible=yes -full"  "-i simple_dec_rev53_64x64_16bit_gray.jph -o test1.pgm -precise -quiet" "-i simple_dec_rev53_64x64_16bit_gray.jph -o test2.pgm" "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
 
+add_test(NAME simple_dec_irv53_bhvhb_low_latency COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -dec "-i ${images_folder}/mm.ppm -o simple_dec_irv53_bhvhb_low_latency.jph -quiet Corder=PCRL Clevels=5 "Cmodes=HT|CAUSAL" -rate 2 -o simple_dec_irv53_bhvhb_low_latency.jph Catk=2 Kkernels:I2=I5X3 Cprecincts="{16,8192},{8,8192},{4,8192}" Cblk="{8,256}" Cdecomp="B(-:-:-),H(-),V(-),H(-),B(-:-:-)" Qstep=0.0001 -precise -no_weights -tolerance 0"  "-i simple_dec_irv53_bhvhb_low_latency.jph -o test1.ppm -precise -quiet" "-i simple_dec_irv53_bhvhb_low_latency.jph -o test2.ppm" "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
 
 #############################################################
 # Encoding

From 9345152e05e654b795b389ecfa0a3045efa45a5b Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 13 Apr 2024 22:18:15 +1000
Subject: [PATCH 37/37] Fixing tests.

---
 tests/test_executables.cpp                      | 10 +++++-----
 tests/test_helpers/convert_mse_pae_to_tests.cpp |  7 +++++--
 tests/test_helpers/ht_cmdlines.txt              |  2 +-
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/tests/test_executables.cpp b/tests/test_executables.cpp
index 8660f9d1..9f77f75e 100644
--- a/tests/test_executables.cpp
+++ b/tests/test_executables.cpp
@@ -825,17 +825,17 @@ TEST(TestExecutables, SimpleDecRev5364x6416bitGray) {
 ///////////////////////////////////////////////////////////////////////////////
 // Test ojph_expand with codeblocks when the rev53 wavelet is used.
 // Command-line options used to obtain this file is:
-// -o simple_dec_irv53_bhvhb_low_latency.jph -quiet Corder=PCRL Clevels=5 
-// "Cmodes=HT|CAUSAL" -rate 2 -o simple_dec_irv53_bhvhb_low_latency.jph Catk=2 
-// Kkernels:I2=I5X3 Cprecincts="{16,8192},{8,8192},{4,8192}" Cblk="{8,256}" 
-// Cdecomp="B(-:-:-),H(-),V(-),H(-),B(-:-:-)" Qstep=0.0001 -precise -no_weights 
+// -o simple_dec_irv53_bhvhb_low_latency.jph -quiet Corder=PCRL Clevels=5
+// Cmodes=HT|CAUSAL -rate 2 Catk=2 Kkernels:I2=I5X3
+// Cprecincts={16,8192},{8,8192},{4,8192} Cblk={8,256}
+// Cdecomp=B(-:-:-),H(-),V(-),H(-),B(-:-:-) Qstep=0.0001 -precise -no_weights
 // -tolerance 0
 TEST(TestExecutables, SimpleDecIrv53BhvhbLowLatency) {
   double mse[3] = { 5.52392, 4.01405, 6.8166};
   int pae[3] = { 16, 17, 23};
   run_ojph_expand("simple_dec_irv53_bhvhb_low_latency", "jph", "ppm");
   run_mse_pae("simple_dec_irv53_bhvhb_low_latency", "ppm", "Malamute.ppm",
-              ":I2=I5X3 Cprecincts=", 3, mse, pae);
+              "", 3, mse, pae);
 }
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/tests/test_helpers/convert_mse_pae_to_tests.cpp b/tests/test_helpers/convert_mse_pae_to_tests.cpp
index 25bf084c..630b6230 100644
--- a/tests/test_helpers/convert_mse_pae_to_tests.cpp
+++ b/tests/test_helpers/convert_mse_pae_to_tests.cpp
@@ -200,8 +200,16 @@ void process_cmdlines(std::ifstream& file,
 
       start_pos = line.find(":");
       if (start_pos != std::string::npos) {
-        size_t end_pos = line.find("\"", start_pos);
-        yuv_specs = line.substr(start_pos, end_pos - start_pos);
+        // A ':' followed by a digit is taken as the start of the YUV specs;
+        // any other ':' (e.g., the one in "Kkernels:I2=I5X3") is part of a
+        // command-line option and is ignored.  The bounds check covers a
+        // line ending in ':', and the cast keeps std::isdigit well defined.
+        if (start_pos + 1 < line.size() &&
+            std::isdigit(static_cast<unsigned char>(line[start_pos + 1])))
+        {
+          size_t end_pos = line.find("\"", start_pos);
+          yuv_specs = line.substr(start_pos, end_pos - start_pos);
+        }
       }
       break;
     }
diff --git a/tests/test_helpers/ht_cmdlines.txt b/tests/test_helpers/ht_cmdlines.txt
index 0542a2d6..3b94c887 100644
--- a/tests/test_helpers/ht_cmdlines.txt
+++ b/tests/test_helpers/ht_cmdlines.txt
@@ -52,7 +52,7 @@ add_test(NAME simple_dec_irv97_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_D
 add_test(NAME simple_dec_rev53_64x64_16bit COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -rdec      "-i ${images_folder}/mm.ppm -o simple_dec_rev53_64x64_16bit.jph      -precise -quiet Creversible=yes -full"  "-i simple_dec_rev53_64x64_16bit.jph      -o test1.ppm -precise -quiet" "-i simple_dec_rev53_64x64_16bit.jph      -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 add_test(NAME simple_dec_rev53_64x64_16bit_gray COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -rdec "-i ${images_folder}/mm.pgm -o simple_dec_rev53_64x64_16bit_gray.jph -precise -quiet Creversible=yes -full"  "-i simple_dec_rev53_64x64_16bit_gray.jph -o test1.pgm -precise -quiet" "-i simple_dec_rev53_64x64_16bit_gray.jph -o test2.pgm" "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
 
-add_test(NAME simple_dec_irv53_bhvhb_low_latency COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -dec "-i ${images_folder}/mm.ppm -o simple_dec_irv53_bhvhb_low_latency.jph -quiet Corder=PCRL Clevels=5 "Cmodes=HT|CAUSAL" -rate 2 -o simple_dec_irv53_bhvhb_low_latency.jph Catk=2 Kkernels:I2=I5X3 Cprecincts="{16,8192},{8,8192},{4,8192}" Cblk="{8,256}" Cdecomp="B(-:-:-),H(-),V(-),H(-),B(-:-:-)" Qstep=0.0001 -precise -no_weights -tolerance 0"  "-i simple_dec_irv53_bhvhb_low_latency.jph -o test1.ppm -precise -quiet" "-i simple_dec_irv53_bhvhb_low_latency.jph -o test2.ppm" "${images_folder}/mm.pgm" "test1.pgm" "test2.pgm")
+add_test(NAME simple_dec_irv53_bhvhb_low_latency COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/com_decom.sh -dec "-i ${images_folder}/mm.ppm -o simple_dec_irv53_bhvhb_low_latency.jph -quiet Corder=PCRL Clevels=5 Cmodes=HT|CAUSAL -rate 2 Catk=2 Kkernels:I2=I5X3 Cprecincts=\{16,8192\},\{8,8192\},\{4,8192\} Cblk=\{8,256\} Cdecomp=B(-:-:-),H(-),V(-),H(-),B(-:-:-) Qstep=0.0001 -precise -no_weights -tolerance 0"  "-i simple_dec_irv53_bhvhb_low_latency.jph -o test1.ppm -precise -quiet" "-i simple_dec_irv53_bhvhb_low_latency.jph -o test2.ppm" "${images_folder}/mm.ppm" "test1.ppm" "test2.ppm")
 
 #############################################################
 # Encoding