From b3a584a29820ac61197ab0198e024dd7ce2f6f9d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 24 Sep 2019 18:33:53 -0500 Subject: [PATCH 1/4] Add vendored base64 C++ implementation and ensure that Thrift KeyValue in Parquet metadata is UTF-8 --- LICENSE.txt | 28 +++++ cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/util/base64.h | 20 ++++ cpp/src/arrow/vendored/base64.cpp | 128 +++++++++++++++++++++++ cpp/src/arrow/vendored/base64.h | 41 ++++++++ cpp/src/parquet/arrow/reader_internal.cc | 4 +- cpp/src/parquet/arrow/writer.cc | 9 +- 7 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 cpp/src/arrow/util/base64.h create mode 100644 cpp/src/arrow/vendored/base64.cpp create mode 100644 cpp/src/arrow/vendored/base64.h diff --git a/LICENSE.txt b/LICENSE.txt index cb359c0572014..edf3379d21e37 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1874,3 +1874,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +cpp/src/arrow/util/base64.h and base64.cc have the following license + +ZLIB License + +Copyright (C) 2004-2017 René Nyffenegger + +This source code is provided 'as-is', without any express or implied +warranty. In no event will the author be held liable for any damages arising +from the use of this software. + +Permission is granted to anyone to use this software for any purpose, including +commercial applications, and to alter it and redistribute it freely, subject to +the following restrictions: + +1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. 
If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + +3. This notice may not be removed or altered from any source distribution. + +René Nyffenegger rene.nyffenegger@adp-gmbh.ch diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f1968af592d22..d575faff822aa 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -145,6 +145,7 @@ set(ARROW_SRCS util/thread_pool.cc util/trie.cc util/utf8.cc + vendored/base64.cpp vendored/datetime/tz.cpp) # Add dependencies for third-party allocators. diff --git a/cpp/src/arrow/util/base64.h b/cpp/src/arrow/util/base64.h new file mode 100644 index 0000000000000..fa60b8b6af1ca --- /dev/null +++ b/cpp/src/arrow/util/base64.h @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "arrow/vendored/base64.h" diff --git a/cpp/src/arrow/vendored/base64.cpp b/cpp/src/arrow/vendored/base64.cpp new file mode 100644 index 0000000000000..060632c73fb4c --- /dev/null +++ b/cpp/src/arrow/vendored/base64.cpp @@ -0,0 +1,128 @@ +/* + base64.cpp and base64.h + + base64 encoding and decoding with C++. + + Version: 1.01.00 + + Copyright (C) 2004-2017 René Nyffenegger + + This source code is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + + 3. This notice may not be removed or altered from any source distribution. 
+ + René Nyffenegger rene.nyffenegger@adp-gmbh.ch + +*/ + +#include "base64.h" +#include <iostream> + +namespace arrow { +namespace util { + +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + + +static inline bool is_base64(unsigned char c) { + return (isalnum(c) || (c == '+') || (c == '/')); +} + +std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) { + std::string ret; + int i = 0; + int j = 0; + unsigned char char_array_3[3]; + unsigned char char_array_4[4]; + + while (in_len--) { + char_array_3[i++] = *(bytes_to_encode++); + if (i == 3) { + char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; + char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4); + char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6); + char_array_4[3] = char_array_3[2] & 0x3f; + + for(i = 0; (i <4) ; i++) + ret += base64_chars[char_array_4[i]]; + i = 0; + } + } + + if (i) + { + for(j = i; j < 3; j++) + char_array_3[j] = '\0'; + + char_array_4[0] = ( char_array_3[0] & 0xfc) >> 2; + char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4); + char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6); + + for (j = 0; (j < i + 1); j++) + ret += base64_chars[char_array_4[j]]; + + while((i++ < 3)) + ret += '='; + + } + + return ret; + +} + +std::string base64_decode(std::string const& encoded_string) { + size_t in_len = encoded_string.size(); + int i = 0; + int j = 0; + int in_ = 0; + unsigned char char_array_4[4], char_array_3[3]; + std::string ret; + + while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i ==4) { + for (i = 0; i <4; i++) + char_array_4[i] = base64_chars.find(char_array_4[i]) & 0xff; + + char_array_3[0] = ( char_array_4[0] << 2 ) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = 
((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) + ret += char_array_3[i]; + i = 0; + } + } + + if (i) { + for (j = 0; j < i; j++) + char_array_4[j] = base64_chars.find(char_array_4[j]) & 0xff; + + char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + + for (j = 0; (j < i - 1); j++) ret += char_array_3[j]; + } + + return ret; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/vendored/base64.h b/cpp/src/arrow/vendored/base64.h new file mode 100644 index 0000000000000..e77449e382ced --- /dev/null +++ b/cpp/src/arrow/vendored/base64.h @@ -0,0 +1,41 @@ + +// Copyright © 2004-2017 by René Nyffenegger +// +// This source code is provided 'as-is', without any express or implied +// warranty. In no event will the author be held liable for any damages +// arising from the use of this software. +// +// Permission is granted to anyone to use this software for any purpose, +// including commercial applications, and to alter it and redistribute it +// freely, subject to the following restrictions: +// +// 1. The origin of this source code must not be misrepresented; you must not +// claim that you wrote the original source code. If you use this source code +// in a product, an acknowledgment in the product documentation would be +// appreciated but is not required. +// +// 2. Altered source versions must be plainly marked as such, and must not be +// misrepresented as being the original source code. +// +// 3. This notice may not be removed or altered from any source distribution. + +// +// base64 encoding and decoding with C++. 
+// Version: 1.01.00 +// + +#ifndef BASE64_H_C0CE2A47_D10E_42C9_A27C_C883944E704A +#define BASE64_H_C0CE2A47_D10E_42C9_A27C_C883944E704A + +#include <string> + +namespace arrow { +namespace util { + +std::string base64_encode(unsigned char const*, unsigned int len); +std::string base64_decode(std::string const& s); + +} // namespace util +} // namespace arrow + +#endif /* BASE64_H_C0CE2A47_D10E_42C9_A27C_C883944E704A */ diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index f8307c54d0d99..7b803fa261f07 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -39,6 +39,7 @@ #include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/base64.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" @@ -576,7 +577,8 @@ Status GetOriginSchema(const std::shared_ptr<const KeyValueMetadata>& metadata, // The original Arrow schema was serialized using the store_schema option. 
We // deserialize it here and use it to inform read options such as // dictionary-encoded fields - auto schema_buf = std::make_shared<Buffer>(metadata->value(schema_index)); + auto decoded = ::arrow::util::base64_decode(metadata->value(schema_index)); + auto schema_buf = std::make_shared<Buffer>(decoded); ::arrow::ipc::DictionaryMemo dict_memo; ::arrow::io::BufferReader input(schema_buf); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index cfd58b2a452db..18c01a527ea1f 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -30,6 +30,7 @@ #include "arrow/ipc/writer.h" #include "arrow/table.h" #include "arrow/type.h" +#include "arrow/util/base64.h" #include "arrow/visitor_inline.h" #include "parquet/arrow/reader_internal.h" @@ -577,7 +578,13 @@ Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* poo ::arrow::ipc::DictionaryMemo dict_memo; std::shared_ptr<Buffer> serialized; RETURN_NOT_OK(::arrow::ipc::SerializeSchema(schema, &dict_memo, pool, &serialized)); - result->Append(kArrowSchemaKey, serialized->ToString()); + + // The serialized schema is not UTF-8, which is required for Thrift + std::string schema_as_string = serialized->ToString(); + std::string schema_base64 = ::arrow::util::base64_encode( + reinterpret_cast<const unsigned char*>(schema_as_string.data()), + static_cast<unsigned int>(schema_as_string.size())); + result->Append(kArrowSchemaKey, schema_base64); *out = result; return Status::OK(); } From eabb121ba1640dbfa35d78b45edf442a0337a1a9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 24 Sep 2019 18:36:52 -0500 Subject: [PATCH 2/4] Fix LICENSE.txt, add iwyu export --- LICENSE.txt | 2 +- cpp/src/arrow/util/base64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE.txt b/LICENSE.txt index edf3379d21e37..7c56259910570 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1877,7 +1877,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
---------------------------------------------------------------------- -cpp/src/arrow/util/base64.h and base64.cc have the following license +cpp/src/arrow/vendored/base64.h and base64.cpp have the following license ZLIB License diff --git a/cpp/src/arrow/util/base64.h b/cpp/src/arrow/util/base64.h index fa60b8b6af1ca..cc46ec90d6c8c 100644 --- a/cpp/src/arrow/util/base64.h +++ b/cpp/src/arrow/util/base64.h @@ -17,4 +17,4 @@ #pragma once -#include "arrow/vendored/base64.h" +#include "arrow/vendored/base64.h" // IWYU pragma: export From 06f75cd5b448f57dbe04f247fb40ec2dddff63e3 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 24 Sep 2019 18:46:28 -0500 Subject: [PATCH 3/4] Fix Python unit test that needs to base64-decode now --- python/pyarrow/tests/test_extension_type.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index dd9208549f9b7..35a40241450d2 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -372,7 +372,10 @@ def test_parquet(tmpdir, registered_period_type): meta = pq.read_metadata(filename) assert meta.schema.column(0).physical_type == "INT64" assert b"ARROW:schema" in meta.metadata - schema = pa.read_schema(pa.BufferReader(meta.metadata[b"ARROW:schema"])) + + import base64 + decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"]) + schema = pa.read_schema(pa.BufferReader(decoded_schema)) assert schema.field("ext").metadata == { b'ARROW:extension:metadata': b'freq=D', b'ARROW:extension:name': b'pandas.period'} From c058e869427efb06d79743af1fe48a56b7f4e874 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 24 Sep 2019 21:47:16 -0500 Subject: [PATCH 4/4] Simplify, add MSVC exports --- LICENSE.txt | 2 +- cpp/src/arrow/util/base64.h | 16 +++++++++++- cpp/src/arrow/vendored/base64.cpp | 2 +- cpp/src/arrow/vendored/base64.h | 41 ------------------------------- 4 files 
changed, 17 insertions(+), 44 deletions(-) delete mode 100644 cpp/src/arrow/vendored/base64.h diff --git a/LICENSE.txt b/LICENSE.txt index 7c56259910570..755db54ae5c79 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1877,7 +1877,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---------------------------------------------------------------------- -cpp/src/arrow/vendored/base64.h and base64.cpp have the following license +cpp/src/arrow/vendored/base64.cpp has the following license ZLIB License diff --git a/cpp/src/arrow/util/base64.h b/cpp/src/arrow/util/base64.h index cc46ec90d6c8c..9ab41412ac3f3 100644 --- a/cpp/src/arrow/util/base64.h +++ b/cpp/src/arrow/util/base64.h @@ -17,4 +17,18 @@ #pragma once -#include "arrow/vendored/base64.h" // IWYU pragma: export +#include <string> + +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +ARROW_EXPORT +std::string base64_encode(unsigned char const*, unsigned int len); + +ARROW_EXPORT +std::string base64_decode(std::string const& s); + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/vendored/base64.cpp b/cpp/src/arrow/vendored/base64.cpp index 060632c73fb4c..50ece19455ed2 100644 --- a/cpp/src/arrow/vendored/base64.cpp +++ b/cpp/src/arrow/vendored/base64.cpp @@ -29,7 +29,7 @@ */ -#include "base64.h" +#include "arrow/util/base64.h" #include <iostream> namespace arrow { diff --git a/cpp/src/arrow/vendored/base64.h b/cpp/src/arrow/vendored/base64.h deleted file mode 100644 index e77449e382ced..0000000000000 --- a/cpp/src/arrow/vendored/base64.h +++ /dev/null @@ -1,41 +0,0 @@ - -// Copyright © 2004-2017 by René Nyffenegger -// -// This source code is provided 'as-is', without any express or implied -// warranty. In no event will the author be held liable for any damages -// arising from the use of this software. 
-// -// Permission is granted to anyone to use this software for any purpose, -// including commercial applications, and to alter it and redistribute it -// freely, subject to the following restrictions: -// -// 1. The origin of this source code must not be misrepresented; you must not -// claim that you wrote the original source code. If you use this source code -// in a product, an acknowledgment in the product documentation would be -// appreciated but is not required. -// -// 2. Altered source versions must be plainly marked as such, and must not be -// misrepresented as being the original source code. -// -// 3. This notice may not be removed or altered from any source distribution. - -// -// base64 encoding and decoding with C++. -// Version: 1.01.00 -// - -#ifndef BASE64_H_C0CE2A47_D10E_42C9_A27C_C883944E704A -#define BASE64_H_C0CE2A47_D10E_42C9_A27C_C883944E704A - -#include <string> - -namespace arrow { -namespace util { - -std::string base64_encode(unsigned char const*, unsigned int len); -std::string base64_decode(std::string const& s); - -} // namespace util -} // namespace arrow - -#endif /* BASE64_H_C0CE2A47_D10E_42C9_A27C_C883944E704A */