-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ARROW-6678: [C++][Parquet] Binary data stored in Parquet metadata must be base64-encoded to be UTF-8 compliant #5493
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1874,3 +1874,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | ||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
|
||
---------------------------------------------------------------------- | ||
|
||
cpp/src/arrow/vendored/base64.cpp has the following license | ||
|
||
ZLIB License | ||
|
||
Copyright (C) 2004-2017 René Nyffenegger | ||
|
||
This source code is provided 'as-is', without any express or implied | ||
warranty. In no event will the author be held liable for any damages arising | ||
from the use of this software. | ||
|
||
Permission is granted to anyone to use this software for any purpose, including | ||
commercial applications, and to alter it and redistribute it freely, subject to | ||
the following restrictions: | ||
|
||
1. The origin of this source code must not be misrepresented; you must not | ||
claim that you wrote the original source code. If you use this source code | ||
in a product, an acknowledgment in the product documentation would be | ||
appreciated but is not required. | ||
|
||
2. Altered source versions must be plainly marked as such, and must not be | ||
misrepresented as being the original source code. | ||
|
||
3. This notice may not be removed or altered from any source distribution. | ||
|
||
René Nyffenegger rene.nyffenegger@adp-gmbh.ch |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include <string> | ||
|
||
#include "arrow/util/visibility.h" | ||
|
||
namespace arrow { | ||
namespace util { | ||
|
||
ARROW_EXPORT | ||
std::string base64_encode(unsigned char const*, unsigned int len); | ||
|
||
ARROW_EXPORT | ||
std::string base64_decode(std::string const& s); | ||
|
||
} // namespace util | ||
} // namespace arrow |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
/* | ||
base64.cpp and base64.h | ||
base64 encoding and decoding with C++. | ||
Version: 1.01.00 | ||
Copyright (C) 2004-2017 René Nyffenegger | ||
This source code is provided 'as-is', without any express or implied | ||
warranty. In no event will the author be held liable for any damages | ||
arising from the use of this software. | ||
Permission is granted to anyone to use this software for any purpose, | ||
including commercial applications, and to alter it and redistribute it | ||
freely, subject to the following restrictions: | ||
1. The origin of this source code must not be misrepresented; you must not | ||
claim that you wrote the original source code. If you use this source code | ||
in a product, an acknowledgment in the product documentation would be | ||
appreciated but is not required. | ||
2. Altered source versions must be plainly marked as such, and must not be | ||
misrepresented as being the original source code. | ||
3. This notice may not be removed or altered from any source distribution. | ||
   René Nyffenegger rene.nyffenegger@adp-gmbh.ch | ||
*/ | ||
|
||
#include "arrow/util/base64.h" | ||
#include <iostream> | ||
|
||
namespace arrow {
namespace util {

// NOTE: This code is ALTERED from the upstream cpp-base64 1.01.00 source (the
// ZLIB license above requires altered versions to be plainly marked).
// Changes relative to upstream:
//   * character classification no longer depends on the C locale (isalnum);
//   * decoding maps characters to values arithmetically instead of searching
//     the alphabet string for every input character;
//   * the final, partial group of a decode is zero-filled before use instead
//     of reading leftover / uninitialized array elements;
//   * positions are tracked as size_t and output capacity is reserved up front.

// The 64 symbols of the base64 alphabet, indexed by their 6-bit value.
// A plain char array is constant-initialized, so it is usable even from
// code that runs during static initialization of other translation units.
static const char base64_chars[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

// Returned by base64_value() for bytes that are not alphabet symbols.
static const unsigned char kInvalidSextet = 0xff;

// Map an alphabet symbol to its 6-bit value (0..63), or kInvalidSextet if
// `c` is not part of the alphabet. Purely arithmetic: no locale involved.
static inline unsigned char base64_value(unsigned char c) {
  if (c >= 'A' && c <= 'Z') return static_cast<unsigned char>(c - 'A');
  if (c >= 'a' && c <= 'z') return static_cast<unsigned char>(c - 'a' + 26);
  if (c >= '0' && c <= '9') return static_cast<unsigned char>(c - '0' + 52);
  if (c == '+') return 62;
  if (c == '/') return 63;
  return kInvalidSextet;
}

// True iff `c` is one of the 64 alphabet symbols. The padding character '='
// is deliberately NOT considered part of the alphabet.
static inline bool is_base64(unsigned char c) {
  return base64_value(c) != kInvalidSextet;
}

// Encode `in_len` bytes starting at `bytes_to_encode` as base64 text.
//
// Every 3 input bytes become 4 output characters. A trailing group of 1 or 2
// bytes is emitted as 2 or 3 characters followed by "==" or "=" so that the
// result length is always a multiple of 4. An empty input yields "".
std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
  const size_t len = in_len;

  std::string ret;
  // Exactly 4 characters per (possibly partial) 3-byte group.
  ret.reserve((len / 3 + (len % 3 != 0 ? 1 : 0)) * 4);

  size_t pos = 0;

  // Complete 3-byte groups.
  for (; pos + 3 <= len; pos += 3) {
    const unsigned char b0 = bytes_to_encode[pos];
    const unsigned char b1 = bytes_to_encode[pos + 1];
    const unsigned char b2 = bytes_to_encode[pos + 2];

    ret += base64_chars[b0 >> 2];
    ret += base64_chars[((b0 & 0x03) << 4) | (b1 >> 4)];
    ret += base64_chars[((b1 & 0x0f) << 2) | (b2 >> 6)];
    ret += base64_chars[b2 & 0x3f];
  }

  // Trailing 1 or 2 bytes: missing bytes are treated as zero and the
  // characters that would carry only those zero bits are replaced by '='.
  const size_t remaining = len - pos;
  if (remaining != 0) {
    const unsigned char b0 = bytes_to_encode[pos];
    const unsigned char b1 = (remaining == 2) ? bytes_to_encode[pos + 1] : 0;

    ret += base64_chars[b0 >> 2];
    ret += base64_chars[((b0 & 0x03) << 4) | (b1 >> 4)];
    if (remaining == 2) {
      ret += base64_chars[(b1 & 0x0f) << 2];
    } else {
      ret += '=';
    }
    ret += '=';
  }

  return ret;
}

// Decode base64 text into the bytes it represents.
//
// Input is consumed until the end of the string or the first byte that is
// not an alphabet symbol (this includes the padding character '='); anything
// after that point is ignored. Every 4 symbols produce 3 bytes. A trailing
// group of 3 symbols produces 2 bytes, of 2 symbols produces 1 byte, and a
// single dangling symbol produces nothing. Malformed input is not reported.
std::string base64_decode(std::string const& encoded_string) {
  const size_t in_len = encoded_string.size();

  std::string ret;
  ret.reserve((in_len / 4) * 3 + 2);

  unsigned char sextets[4] = {0, 0, 0, 0};
  int filled = 0;

  for (size_t pos = 0; pos < in_len; ++pos) {
    const unsigned char c = static_cast<unsigned char>(encoded_string[pos]);
    if (!is_base64(c)) {
      break;  // '=' padding or any foreign byte terminates decoding
    }

    sextets[filled++] = base64_value(c);
    if (filled == 4) {
      ret += static_cast<char>((sextets[0] << 2) | (sextets[1] >> 4));
      ret += static_cast<char>(((sextets[1] & 0x0f) << 4) | (sextets[2] >> 2));
      ret += static_cast<char>(((sextets[2] & 0x03) << 6) | sextets[3]);
      filled = 0;
    }
  }

  // Partial final group. One symbol alone carries fewer than 8 bits, so it
  // yields no output byte.
  if (filled > 1) {
    // Clear the slots that were not filled by this group so that no stale
    // value from a previous group takes part in the computation.
    for (int k = filled; k < 4; ++k) {
      sextets[k] = 0;
    }

    ret += static_cast<char>((sextets[0] << 2) | (sextets[1] >> 4));
    if (filled == 3) {
      ret += static_cast<char>(((sextets[1] & 0x0f) << 4) | (sextets[2] >> 2));
    }
  }

  return ret;
}

}  // namespace util
}  // namespace arrow
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I assume the source code hasn't been modified?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unmodified
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
To be pedantic, the only difference is adding the
arrow::util
namespace.