diff --git a/NOTICE b/NOTICE index 9f53e738b..e5a8308d6 100644 --- a/NOTICE +++ b/NOTICE @@ -74,6 +74,10 @@ The text of each license is also included in licenses/LICENSE-[project].txt cpp/thirdparty/mini-yam/Yaml.hpp cpp/thirdparty/mini-yaml/Yaml.cpp +* simple-uri-parser (https://github.com/jholloc/simple-uri-parser) + Files: + cpp/thirdparty/simple-uri-parser/uri_parser.h + ================================================================ BSD 3-Clause licenses diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3046ffb2f..5a6fc007a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -233,7 +233,9 @@ macro(build_gar) target_compile_features(gar PRIVATE cxx_std_17) target_include_directories(gar PUBLIC $ $ - $ + ) + target_include_directories(gar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/mini-yaml + ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty ) if(BUILD_ARROW_FROM_SOURCE) target_include_directories(gar SYSTEM BEFORE PRIVATE ${GAR_ARROW_INCLUDE_DIR}) diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index bc76e7361..e0a8a4ac3 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -25,6 +25,7 @@ #include "arrow/filesystem/s3fs.h" #include "arrow/ipc/writer.h" #include "parquet/arrow/writer.h" +#include "simple-uri-parser/uri_parser.h" #include "gar/fwd.h" #include "gar/util/expression.h" @@ -78,12 +79,6 @@ static Status CastToLargeOffsetArray( GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(out, arrow::ChunkedArray::Make(chunks)); return Status::OK(); } - -Result ParseFileSystemUri(const std::string& uri_string) { - arrow::internal::Uri uri; - RETURN_NOT_ARROW_OK(uri.Parse(uri_string)); - return std::move(uri); -} } // namespace detail std::shared_ptr FileSystem::GetFileFormat( @@ -291,15 +286,16 @@ Result> FileSystemFromUriOrPath( GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( auto arrow_fs, arrow::fs::FileSystemFromUriOrPath(uri_string)); - GAR_ASSIGN_OR_RAISE(auto uri, detail::ParseFileSystemUri(uri_string)); + auto uri = uri::parse_uri(uri_string); + 
if (uri.error != uri::Error::None) { + return Status::Invalid("Failed to parse URI: ", uri_string); + } if (out_path != nullptr) { - if (uri.scheme() == "file" || uri.scheme() == "hdfs" || - uri.scheme().empty()) { - *out_path = uri.path(); - } else if (uri.scheme() == "s3" || uri.scheme() == "gs") { + if (uri.scheme == "file" || uri.scheme == "hdfs" || uri.scheme.empty()) { + *out_path = uri.path; + } else if (uri.scheme == "s3" || uri.scheme == "gs") { // bucket name is the host, path is the path - // the arrow parser would delete the trailing slash which we don't want to - *out_path = uri.host() + uri.path(); + *out_path = uri.authority.host + uri.path; } else { return Status::Invalid("Unrecognized filesystem type in URI: ", uri_string); diff --git a/cpp/thirdparty/simple-uri-parser/uri_parser.h b/cpp/thirdparty/simple-uri-parser/uri_parser.h new file mode 100644 index 000000000..c90fe84b3 --- /dev/null +++ b/cpp/thirdparty/simple-uri-parser/uri_parser.h @@ -0,0 +1,241 @@ +/** +* MIT License +* +* Copyright (c) 2021 Jonathan Hollocombe +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to deal +* in the Software without restriction, including without limitation the rights +* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: + +* The above copyright notice and this permission notice shall be included in all +* copies or substantial portions of the Software. + +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/*
 * (MIT license notice, continued from the preceding lines.)
 * IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef SIMPLE_URI_PARSER_LIBRARY_H
#define SIMPLE_URI_PARSER_LIBRARY_H

#include <algorithm>
#include <cctype>
#include <limits>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>

#ifndef simple_uri_CPLUSPLUS
# if defined(_MSVC_LANG ) && !defined(__clang__)
#  define simple_uri_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG )
# else
#  define simple_uri_CPLUSPLUS __cplusplus
# endif
#endif

#define simple_uri_CPP17_OR_GREATER ( simple_uri_CPLUSPLUS >= 201703L )

#if simple_uri_CPP17_OR_GREATER
# include <string_view>
#endif

namespace uri {

// Before C++17 there is no std::string_view, so the parser falls back to
// owning std::string values for its intermediate slices.
#if simple_uri_CPP17_OR_GREATER
using string_view_type = std::string_view;
using string_arg_type = std::string_view;
constexpr auto npos = std::string_view::npos;
#else
using string_view_type = std::string;
using string_arg_type = const std::string&;
constexpr auto npos = std::string::npos;
#endif

/// Decoded query arguments: key -> value ("" when the argument has no '=').
using query_type = std::unordered_map<std::string, std::string>;

/// Outcome of parse_uri(); anything other than None means the Uri is unusable.
enum class Error {
    None,
    InvalidScheme,
    InvalidPort,
};

/// The "//userinfo@host:port" component of a URI.
struct Authority {
    std::string authority;  ///< Raw authority text, without the leading "//".
    std::string userinfo;   ///< Text before '@', empty if there is none.
    std::string host;       ///< Host name; an IP literal keeps its brackets.
    long port = 0;          ///< 0 when no port is given.
};

/// Result of parse_uri(). Only `error` is meaningful when it is not None.
struct Uri {
    Error error;
    std::string scheme;
    Authority authority = {};
    std::string path;
    query_type query = {};
    std::string query_string;
    std::string fragment;

    explicit Uri(Error error) : error(error) {}
    Uri(std::string scheme, Authority authority, std::string path, query_type query,
        std::string query_string, std::string fragment)
        : error(Error::None)
        , scheme(std::move(scheme))
        , authority(std::move(authority))
        , path(std::move(path))
        , query(std::move(query))
        , query_string(std::move(query_string))
        , fragment(std::move(fragment))
    {}
};

// Implementation helpers. They are `inline` members of a named namespace
// (not an anonymous one) so that including this header from several
// translation units neither duplicates them nor makes the inline parse_uri()
// refer to a different entity in each unit.
namespace detail {

/// True when `scheme` is non-empty and made only of alphanumerics, '+', '.', '-'.
inline bool valid_scheme(string_arg_type scheme) {
    if (scheme.empty()) {
        return false;
    }
    return std::all_of(scheme.begin(), scheme.end(), [](char c) {
        // <cctype> functions require a value representable as unsigned char.
        return std::isalnum(static_cast<unsigned char>(c)) != 0
            || c == '+' || c == '.' || c == '-';
    });
}

/// Splits off the scheme. Returns {lower-cased scheme, error, text after ':'}.
inline std::tuple<std::string, Error, string_view_type> parse_scheme(string_arg_type uri) {
    const auto colon = uri.find(':');
    if (colon == npos) {
        return std::make_tuple(std::string(), Error::InvalidScheme, string_view_type(uri));
    }

    const auto scheme = uri.substr(0, colon);
    if (!valid_scheme(scheme)) {
        return std::make_tuple(std::string(), Error::InvalidScheme, string_view_type(uri));
    }

    std::string scheme_string(scheme);
    std::transform(scheme_string.begin(), scheme_string.end(), scheme_string.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });

    return std::make_tuple(std::move(scheme_string), Error::None,
                           string_view_type(uri.substr(colon + 1)));
}

/// Parses a run of decimal digits into `port`.
/// Returns false on any non-digit character (sign, whitespace, ...) or on
/// overflow of `long`. An empty string is accepted and yields port 0.
inline bool parse_port(string_arg_type text, long& port) {
    long value = 0;
    for (const char c : text) {
        if (c < '0' || c > '9') {
            return false;
        }
        const long digit = c - '0';
        if (value > (std::numeric_limits<long>::max() - digit) / 10) {
            return false;
        }
        value = value * 10 + digit;
    }
    port = value;
    return true;
}

/// Splits off an optional "//authority". Returns {authority, error, remainder}.
/// The remainder starts at the '/', '?' or '#' that ended the authority, or is
/// empty when the authority runs to the end of the input.
inline std::tuple<Authority, Error, string_view_type> parse_authority(string_arg_type uri) {
    Authority authority;

    const bool has_authority = uri.length() >= 2 && uri[0] == '/' && uri[1] == '/';
    if (!has_authority) {
        return std::make_tuple(std::move(authority), Error::None, string_view_type(uri));
    }

    // Compute the remainder from the terminator position itself; adding an
    // offset to npos would wrap around and re-parse the host as the path.
    const auto end = uri.find_first_of("/?#", 2);
    string_view_type auth_string = uri.substr(2, end == npos ? npos : end - 2);
    string_view_type rem = (end == npos) ? string_view_type()
                                         : string_view_type(uri.substr(end));
    authority.authority = std::string(auth_string);

    const auto at_pos = auth_string.find('@');
    if (at_pos != npos) {
        authority.userinfo = std::string(auth_string.substr(0, at_pos));
        auth_string = auth_string.substr(at_pos + 1);
    }

    // Locate the ':' that introduces the port. Inside a bracketed IP literal
    // colons belong to the host, so only a ':' right after ']' counts.
    auto port_sep = npos;
    if (!auth_string.empty() && auth_string[0] == '[') {
        const auto close = auth_string.find(']');
        if (close != npos && close + 1 < auth_string.length() && auth_string[close + 1] == ':') {
            port_sep = close + 1;
        }
    } else {
        port_sep = auth_string.find(':');
    }

    if (port_sep != npos && !parse_port(auth_string.substr(port_sep + 1), authority.port)) {
        return std::make_tuple(std::move(authority), Error::InvalidPort, auth_string);
    }

    authority.host = std::string(auth_string.substr(0, port_sep));

    return std::make_tuple(std::move(authority), Error::None, rem);
}

/// Splits off the path. Returns {path, error, remainder}; the remainder keeps
/// its leading '?' or '#' so later stages can tell a query from a fragment.
inline std::tuple<std::string, Error, string_view_type> parse_path(string_arg_type uri) {
    const auto pos = uri.find_first_of("#?");
    if (pos == npos) {
        return std::make_tuple(std::string(uri), Error::None, string_view_type());
    }
    return std::make_tuple(std::string(uri.substr(0, pos)), Error::None,
                           string_view_type(uri.substr(pos)));
}

/// Splits off an optional "?query". Returns {args, raw query, error, remainder}.
/// Arguments are separated by '&', ';' or '?'. The remainder is empty or
/// starts with '#'.
inline std::tuple<query_type, std::string, Error, string_view_type> parse_query(string_arg_type uri) {
    query_type query;
    if (uri.empty() || uri[0] != '?') {
        return std::make_tuple(std::move(query), std::string(), Error::None,
                               string_view_type(uri));
    }

    const auto hash_pos = uri.find('#');
    string_view_type pending = uri.substr(1, hash_pos == npos ? npos : hash_pos - 1);
    std::string query_string(pending);

    while (!pending.empty()) {
        const auto delim_pos = pending.find_first_of("&;?");
        const auto arg = pending.substr(0, delim_pos);
        const auto equals_pos = arg.find('=');
        if (equals_pos == npos) {
            query[std::string(arg)] = "";
        } else {
            query[std::string(arg.substr(0, equals_pos))] = std::string(arg.substr(equals_pos + 1));
        }
        if (delim_pos == npos) {
            break;
        }
        pending = pending.substr(delim_pos + 1);
    }

    string_view_type rem = (hash_pos == npos) ? string_view_type()
                                              : string_view_type(uri.substr(hash_pos));
    return std::make_tuple(std::move(query), std::move(query_string), Error::None, rem);
}

/// Extracts an optional "#fragment". Returns {fragment, error, remainder}.
inline std::tuple<std::string, Error, string_view_type> parse_fragment(string_arg_type uri) {
    if (!uri.empty() && uri[0] == '#') {
        return std::make_tuple(std::string(uri.substr(1)), Error::None, string_view_type());
    }
    return std::make_tuple(std::string(), Error::None, string_view_type(uri));
}

}  // namespace detail

/// Parses `uri_in` as "scheme:[//authority]path[?query][#fragment]".
///
/// @param uri_in text to parse.
/// @return a Uri whose `error` is Error::None on success. On failure only
///         `error` is set: InvalidScheme when there is no ':' or the text
///         before it is not a valid scheme, InvalidPort when the port is not
///         a plain decimal number.
///
/// Input without a scheme (for example a bare filesystem path) is rejected
/// with InvalidScheme, so callers that accept plain paths must handle them
/// before calling this function.
inline Uri parse_uri(string_arg_type uri_in) {
    Error error = Error::None;
    string_view_type uri;

    std::string scheme;
    std::tie(scheme, error, uri) = detail::parse_scheme(uri_in);
    if (error != Error::None) {
        return Uri(error);
    }

    Authority authority;
    std::tie(authority, error, uri) = detail::parse_authority(uri);
    if (error != Error::None) {
        return Uri(error);
    }

    std::string path;
    std::tie(path, error, uri) = detail::parse_path(uri);
    if (error != Error::None) {
        return Uri(error);
    }

    query_type query;
    std::string query_string;
    std::tie(query, query_string, error, uri) = detail::parse_query(uri);
    if (error != Error::None) {
        return Uri(error);
    }

    std::string fragment;
    std::tie(fragment, error, uri) = detail::parse_fragment(uri);
    if (error != Error::None) {
        return Uri(error);
    }

    return Uri(std::move(scheme), std::move(authority), std::move(path),
               std::move(query), std::move(query_string), std::move(fragment));
}

}  // namespace uri

#endif  // SIMPLE_URI_PARSER_LIBRARY_H

// Patch metadata for the next file in this diff, kept verbatim:
// diff --git a/licenses/LICENSE-simple-uri-parser b/licenses/LICENSE-simple-uri-parser new file mode 100644 index 000000000..5c6b9f0b2 ---
/dev/null +++ b/licenses/LICENSE-simple-uri-parser @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Jonathan Hollocombe + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.