From db3cbb4b74c8a93c9a4dfe8b13e7df190dcec97f Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 16 Jun 2020 02:21:39 +0530 Subject: [PATCH 1/7] Fix bit width error in orc reader Patch bit width is only allowed to be from a fixed set of values. Patch width is to be selected as the smallest value from the set that fits the required patch size (pw + pgw) --- cpp/src/io/orc/stripe_data.cu | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 117b3623a7a..f6e3e38d232 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -642,6 +642,18 @@ static const __device__ __constant__ uint8_t kRLEv2_W[32] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 28, 30, 32, 40, 48, 56, 64}; +/** + * @brief Maps the RLEv2 patch size (pw + pgw) to number of bits + * + * Patch size (in bits) is only allowed to be from the below set. If `pw + pgw == 34` then the size + * of the patch in the file is the smallest size in the set that can fit 34 bits i.e. + * `ClosestFixedBitsMap[34] == 40` + */ +static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { + 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 26, 26, 28, 28, 30, 30, 32, 32, 40, 40, 40, 40, 40, 40, 40, 40, 48, 48, 48, + 48, 48, 48, 48, 48, 56, 56, 56, 56, 56, 56, 56, 56, 64, 64, 64, 64, 64, 64, 64, 64}; + /** * @brief ORC Integer RLEv2 decoding * @@ -834,9 +846,10 @@ static __device__ uint32_t Integer_RLEv2( uint32_t pgw = 1 + ((pw_byte3 >> 5) & 7); // patch gap width, 1 to 8 bits uint32_t pll = pw_byte3 & 0x1f; // patch list length if (pll != 0) { + uint32_t pgw_pw_len = ClosestFixedBitsMap[pw + pgw]; uint64_t patch_pos64 = (tr < pll) - ? bytestream_readbits64(bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw + pw), pgw + pw) + ?
bytestream_readbits64(bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw + pw), pgw_pw_len) : 0; uint32_t patch_pos; T patch = 1; From 1e30d8a2914fc7088b83ea1e9c376d231c915da3 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 16 Jun 2020 02:23:58 +0530 Subject: [PATCH 2/7] Fix issue #5440 Fixes the narrowing conversion in bytestream reading in patched RLE --- cpp/src/io/orc/stripe_data.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index f6e3e38d232..62d2d9a16bb 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -824,7 +824,7 @@ static __device__ uint32_t Integer_RLEv2( bytestream_readbe(bs, pos * 8 + i * w, w, v); vals[base + i] = v; } else if (mode == 2) { - uint32_t ofs = bytestream_readbits64(bs, pos * 8 + i * w, w); + uint64_t ofs = bytestream_readbits64(bs, pos * 8 + i * w, w); vals[base + i] = rle->baseval.u64[r] + ofs; } else { int64_t delta = rle->delta[r], ofs; From c457f3b62a4c557798506fb3a802b8dd88f34505 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 16 Jun 2020 02:38:10 +0530 Subject: [PATCH 3/7] Add pytests for ORC RLEv2 fixes --- .../cudf/tests/data/orc/TestOrcFile.RLEv2.orc | Bin 0 -> 445 bytes python/cudf/cudf/tests/test_orc.py | 1 + 2 files changed, 1 insertion(+) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.RLEv2.orc new file mode 100644 index 0000000000000000000000000000000000000000..26535e095499340229ed4bd86503c5684ba3623d GIT binary patch literal 445 zcmeYdau#G@;9?VE;SdR6um&=vxtJLk7=%PPM1=S_7zFyJyuQOM@n;&43}6Tc%4xjV7(BT&1%gagBoUmQsWjS=%5b;JZ-Yj+&I_T*D*qwJAK44r>j zHtuj(b6-J>zvHX;kw?rM|20Q2`8YHQ9uZ>NIK%sp4l9EX1HE|RM>yUp(XES5ROa}>uHlV_yhe;0wI1?TSJr)pr^f*D_#q3YN_%$!DW?{7TwT|q5isyxF3peYGr6^AvB z9zCcdpgD2ff|ZJkjvPLyeB{Upm7@`gM?&QXoR=jIcB${k@@^{IHq 
kgXaeY4stMX6bmp)G&Be(F){FHxEXLZePL$y4{{a<0ETF|RsaA1 literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index de52c72f0fe..2883fa91d7d 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -67,6 +67,7 @@ def _make_path_or_buf(src): "double1", ], ), + ("TestOrcFile.RLEv2.orc", ["x", "y"]), ("TestOrcFile.testSnappy.orc", None), ("TestOrcFile.demo-12-zlib.orc", ["_col2", "_col3", "_col4", "_col5"]), ], From 255b1d737766d2d8a6c5f5c9b33abcd2338fe646 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 16 Jun 2020 02:44:27 +0530 Subject: [PATCH 4/7] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bf0366daca..5056946dcde 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ - PR #5446 Fix compile error caused by out-of-date PR merge (4990) - PR #5459 Fix str.translate to convert table characters to UTF-8 - PR #5465 Fix benchmark out of memory errors due to multiple initialization +- PR #5473 Fix RLEv2 patched base in ORC reader # cuDF 0.14.0 (Date TBD) From 0f35ee59007c1caf649b754465bf315242fc5904 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Tue, 16 Jun 2020 05:57:59 +0530 Subject: [PATCH 5/7] Add link to apache source for the fix information. --- cpp/src/io/orc/stripe_data.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 62d2d9a16bb..3bdec37c62e 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -648,6 +648,8 @@ static const __device__ __constant__ uint8_t kRLEv2_W[32] = { * Patch size (in bits) is only allowed to be from the below set. If `pw + pgw == 34` then the size * of the patch in the file is the smallest size in the set that can fit 34 bits i.e. 
* `ClosestFixedBitsMap[34] == 40` + * + * @see https://github.com/apache/orc/commit/9faf7f5147a7bc69 */ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, From db3c0072042381f28369fb1b7b98f6057189ca22 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 17 Jun 2020 06:18:13 +0530 Subject: [PATCH 6/7] Additional changes required for the fix to orc rle --- cpp/src/io/orc/stripe_data.cu | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 3bdec37c62e..3b38d1085a0 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -698,15 +698,16 @@ static __device__ uint32_t Integer_RLEv2( l = (l * n + 7) >> 3; } else if (mode == 2) { // 10wwwwwn.nnnnnnnn.xxxxxxxx.yyyyyyyy: patched base encoding - uint32_t byte2 = bytestream_readbyte(bs, pos++); - uint32_t byte3 = bytestream_readbyte(bs, pos++); - uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes - uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits - uint32_t pgw = 1 + (byte3 >> 5); // patch gap width, 1 to 8 bits - uint32_t pll = byte3 & 0x1f; // patch list length - l = (l * n + 7) >> 3; + uint32_t byte2 = bytestream_readbyte(bs, pos++); + uint32_t byte3 = bytestream_readbyte(bs, pos++); + uint32_t bw = 1 + (byte2 >> 5); // base value width, 1 to 8 bytes + uint32_t pw = kRLEv2_W[byte2 & 0x1f]; // patch width, 1 to 64 bits + uint32_t pgw = 1 + (byte3 >> 5); // patch gap width, 1 to 8 bits + uint32_t pgw_pw_len = ClosestFixedBitsMap[min(pw + pgw, 64u)]; // ceiled patch width + uint32_t pll = byte3 & 0x1f; // patch list length + l = (l * n + 7) >> 3; l += bw; - l += (pll * (pgw + pw) + 7) >> 3; + l += (pll * (pgw_pw_len) + 7) >> 3; } else { // 11wwwwwn.nnnnnnnn..: delta encoding uint32_t deltapos = varint_length(bs, pos); @@ -848,10 +849,10 @@ static __device__
uint32_t Integer_RLEv2( uint32_t pgw = 1 + ((pw_byte3 >> 5) & 7); // patch gap width, 1 to 8 bits uint32_t pll = pw_byte3 & 0x1f; // patch list length if (pll != 0) { - uint32_t pgw_pw_len = ClosestFixedBitsMap[pw + pgw]; + uint32_t pgw_pw_len = ClosestFixedBitsMap[min(pw + pgw, 64u)]; uint64_t patch_pos64 = - (tr < pll) - ? bytestream_readbits64(bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw + pw), pgw_pw_len) + (tr < pll) ? bytestream_readbits64( + bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw_pw_len), pgw_pw_len) : 0; uint32_t patch_pos; T patch = 1; From 9fc167ce118f576d7bb62d611e6cfc6234cdc4fc Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Thu, 18 Jun 2020 01:02:25 +0530 Subject: [PATCH 7/7] Missed style changes --- cpp/src/io/orc/stripe_data.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 3b38d1085a0..2617d9df035 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -853,7 +853,7 @@ static __device__ uint32_t Integer_RLEv2( uint64_t patch_pos64 = (tr < pll) ? bytestream_readbits64( bs, pos * 8 + ((n * w + 7) & ~7) + tr * (pgw_pw_len), pgw_pw_len) - : 0; + : 0; uint32_t patch_pos; T patch = 1; patch <<= pw;