Main: Bitwise - use floatToHalf from meshoptimizer

for correct rounding behaviour
OGRECave · Jul 23, 2024 · 9a8ecdb · 9a8ecdb
1 parent 0f29071
commit 9a8ecdb
Show file tree

Hide file tree

Showing 2 changed files with 43 additions and 99 deletions.
diff --git a/OgreMain/include/OgreBitwise.h b/OgreMain/include/OgreBitwise.h
@@ -303,7 +303,7 @@ namespace Ogre {
         }
 
         /** Convert a float32 to a float16 (NV_half_float)
-            Courtesy of OpenEXR
+            Courtesy of meshoptimizer
         */
         static inline uint16 floatToHalf(float i)
         {
@@ -313,48 +313,29 @@ namespace Ogre {
         }
         /** Converts float in uint32 format to a half in uint16 format
         */
-        static inline uint16 floatToHalfI(uint32 i)
+        static inline uint16 floatToHalfI(uint32 ui) // ui: raw bit pattern of a float32; returns the raw bit pattern of the half
         {
-            int s =  (i >> 16) & 0x00008000;
-            int e = ((i >> 23) & 0x000000ff) - (127 - 15);
-            int m =   i        & 0x007fffff;
-
-            if (e <= 0)
-            {
-                if (e < -10)
-                {
-                    return 0;
-                }
-                m = (m | 0x00800000) >> (1 - e);
-
-                return static_cast<uint16>(s | (m >> 13));
-            }
-            else if (e == 0xff - (127 - 15))
-            {
-                if (m == 0) // Inf
-                {
-                    return static_cast<uint16>(s | 0x7c00);
-                } 
-                else    // NAN
-                {
-                    m >>= 13;
-                    return static_cast<uint16>(s | 0x7c00 | m | (m == 0));
-                }
-            }
-            else
-            {
-                if (e > 30) // Overflow
-                {
-                    return static_cast<uint16>(s | 0x7c00);
-                }
-
-                return static_cast<uint16>(s | (e << 10) | (m >> 13));
-            }
+            int s = (ui >> 16) & 0x8000; // float32 sign bit (bit 31) moved to the half sign position (bit 15)
+            int em = ui & 0x7fffffff; // exponent and mantissa bits with the sign cleared
+
+            // bias exponent and round to nearest; 112 is relative exponent bias (127-15)
+            int h = (em - (112 << 23) + (1 << 12)) >> 13; // 1 << 12 is half of the 13 dropped mantissa bits, so ties round up in magnitude; out-of-range results are overwritten below
+
+            // underflow: flush to zero; 113 encodes exponent -14
+            h = (em < (113 << 23)) ? 0 : h; // everything below the smallest normal half becomes zero; half denormals are never produced
+
+            // overflow: infinity; 143 encodes exponent 16
+            h = (em >= (143 << 23)) ? 0x7c00 : h; // also covers float32 infinity itself (em == 255 << 23)
+
+            // NaN; note that we convert all types of NaN to qNaN
+            h = (em > (255 << 23)) ? 0x7e00 : h; // strictly above the infinity pattern means the mantissa is non-zero
+
+            return (unsigned short)(s | h); // sign is re-applied in every case, including zero, infinity and NaN
         }
 
         /**
          * Convert a float16 (NV_half_float) to a float32
-         * Courtesy of OpenEXR
+         * Courtesy of meshoptimizer
          */
         static inline float halfToFloat(uint16 y)
         {
@@ -365,46 +346,22 @@ namespace Ogre {
         /** Converts a half in uint16 format to a float
             in uint32 format
          */
-        static inline uint32 halfToFloatI(uint16 y)
+        static inline uint32 halfToFloatI(uint16 h) // h: raw bit pattern of a half; returns the raw bit pattern of the float32
         {
-            int s = (y >> 15) & 0x00000001;
-            int e = (y >> 10) & 0x0000001f;
-            int m =  y        & 0x000003ff;
-
-            if (e == 0)
-            {
-                if (m == 0) // Plus or minus zero
-                {
-                    return s << 31;
-                }
-                else // Denormalized number -- renormalize it
-                {
-                    while (!(m & 0x00000400))
-                    {
-                        m <<= 1;
-                        e -=  1;
-                    }
-
-                    e += 1;
-                    m &= ~0x00000400;
-                }
-            }
-            else if (e == 31)
-            {
-                if (m == 0) // Inf
-                {
-                    return (s << 31) | 0x7f800000;
-                }
-                else // NaN
-                {
-                    return (s << 31) | 0x7f800000 | (m << 13);
-                }
-            }
-
-            e = e + (127 - 15);
-            m = m << 13;
-
-            return (s << 31) | (e << 23) | m;
+            unsigned int s = unsigned(h & 0x8000) << 16; // half sign bit (bit 15) moved to the float32 sign position (bit 31)
+            int em = h & 0x7fff; // exponent and mantissa bits with the sign cleared
+
+            // bias exponent and pad mantissa with 0; 112 is relative exponent bias (127-15)
+            int r = (em + (112 << 10)) << 13; // the shift by 13 aligns the 10-bit half mantissa with the 23-bit float32 mantissa
+
+            // denormal: flush to zero
+            r = (em < (1 << 10)) ? 0 : r; // exponent field is 0 here: covers zero and, unlike the removed code above, drops half denormals instead of renormalizing them
+
+            // infinity/NaN; note that we preserve NaN payload as a byproduct of unifying inf/nan cases
+            // 112 is an exponent bias fixup; since we already applied it once, applying it twice converts 31 to 255
+            r += (em >= (31 << 10)) ? (112 << 23) : 0; // exponent field 31 marks infinity or NaN in the half
+
+            return s | r; // sign is re-applied in every case, so negative zero and negative infinity keep their sign
         }
 
 

diff --git a/Tests/OgreMain/src/PixelFormatTests.cpp b/Tests/OgreMain/src/PixelFormatTests.cpp
@@ -78,33 +78,20 @@ TEST_F(PixelFormatTests,IntegerPackUnpack)
 TEST_F(PixelFormatTests,FloatPackUnpack)
 {
     // Float32
-    float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+    ColourValue src{0.99999f, -0.9999f, 1.49999f, 1.99999f}; // each component is slightly smaller in magnitude than the value expected after float16 rounding (see ref)
     float r,g,b,a;
-    PixelUtil::unpackColour(&r, &g, &b, &a, PF_FLOAT32_RGBA, data);
-    EXPECT_EQ(r, 1.0f);
-    EXPECT_EQ(g, 2.0f);
-    EXPECT_EQ(b, 3.0f);
-    EXPECT_EQ(a, 4.0f);
+    PixelUtil::unpackColour(&r, &g, &b, &a, PF_FLOAT32_RGBA, src.ptr());
+    EXPECT_EQ(src, ColourValue(r, g, b, a)); // unpacking PF_FLOAT32_RGBA is expected to return the components unchanged
 
     // Float16
-    setupBoxes(PF_A8B8G8R8, PF_FLOAT16_RGBA);
-    mDst2.format = PF_A8B8G8R8;
-    unsigned int eob = mSrc.getWidth()*4;
+    ColourValue  ref{1.0f, -1.0f, 1.5f, 2.0f}; // conversion to float16 should round to nearest
+    uint16 data2[4]; // receives the PF_FLOAT16_RGBA result: four 16-bit channels
+    ColourValue  dst;
 
-    PixelUtil::bulkPixelConversion(mSrc, mDst1);
-    PixelUtil::bulkPixelConversion(mDst1, mDst2);
-
-    // Locate errors
-    std::stringstream s;
-    unsigned int x;
-    for(x=0; x<eob; x++) {
-        if(mTemp2[x] != mRandomData[x])
-            s << std::hex << std::setw(2) << std::setfill('0') << (unsigned int) mRandomData[x]
-              << "!= " << std::hex << std::setw(2) << std::setfill('0') << (unsigned int) mTemp2[x] << " ";
-    }
+    PixelUtil::bulkPixelConversion(src.ptr(), PF_FLOAT32_RGBA, data2, PF_FLOAT16_RGBA, 1); // float32 -> float16
+    PixelUtil::bulkPixelConversion(data2, PF_FLOAT16_RGBA, dst.ptr(), PF_FLOAT32_RGBA, 1); // float16 -> float32
 
-    // src and dst2 should match
-    EXPECT_TRUE(memcmp(mSrc.data, mDst2.data, eob) == 0) << "PF_FLOAT16_RGBA<->PF_A8B8G8R8 conversion was not lossless "+s.str();
+    EXPECT_EQ(dst, ref); // the round trip must give the rounded reference values; truncating the mantissa would give smaller magnitudes
 }
 //--------------------------------------------------------------------------
 // Pure 32 bit float precision brute force pixel conversion; for comparison