Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(c++): Use simple-uri-parser as the URI parser and remove the dependency on arrow::internal::Uri #460

Merged
merged 4 commits into from
Apr 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ The text of each license is also included in licenses/LICENSE-[project].txt
cpp/thirdparty/mini-yaml/Yaml.hpp
cpp/thirdparty/mini-yaml/Yaml.cpp

* simple-uri-parser (https://github.com/jholloc/simple-uri-parser)
Files:
cpp/thirdparty/simple-uri-parser/uri_parser.h


================================================================
BSD 3-Clause licenses
Expand Down
4 changes: 3 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,9 @@ macro(build_gar)
target_compile_features(gar PRIVATE cxx_std_17)
target_include_directories(gar PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/mini-yaml>
)
target_include_directories(gar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/thirdparty/mini-yaml
${CMAKE_CURRENT_SOURCE_DIR}/thirdparty
)
if(BUILD_ARROW_FROM_SOURCE)
target_include_directories(gar SYSTEM BEFORE PRIVATE ${GAR_ARROW_INCLUDE_DIR})
Expand Down
22 changes: 9 additions & 13 deletions cpp/src/filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "arrow/filesystem/s3fs.h"
#include "arrow/ipc/writer.h"
#include "parquet/arrow/writer.h"
#include "simple-uri-parser/uri_parser.h"

#include "gar/fwd.h"
#include "gar/util/expression.h"
Expand Down Expand Up @@ -78,12 +79,6 @@ static Status CastToLargeOffsetArray(
GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(out, arrow::ChunkedArray::Make(chunks));
return Status::OK();
}

// Parses `uri_string` into an arrow::internal::Uri.
//
// The status returned by Uri::Parse is passed through RETURN_NOT_ARROW_OK;
// NOTE(review): that macro is defined elsewhere - presumably it returns early
// with an error when the status is not OK; confirm against its definition.
// When execution reaches the end, the parsed Uri is moved into the Result.
Result<arrow::internal::Uri> ParseFileSystemUri(const std::string& uri_string) {
arrow::internal::Uri uri;
RETURN_NOT_ARROW_OK(uri.Parse(uri_string));
// Explicit move: the local is converted into Result<Uri> on return.
return std::move(uri);
}
} // namespace detail

std::shared_ptr<ds::FileFormat> FileSystem::GetFileFormat(
Expand Down Expand Up @@ -291,15 +286,16 @@ Result<std::shared_ptr<FileSystem>> FileSystemFromUriOrPath(

GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(
auto arrow_fs, arrow::fs::FileSystemFromUriOrPath(uri_string));
GAR_ASSIGN_OR_RAISE(auto uri, detail::ParseFileSystemUri(uri_string));
auto uri = uri::parse_uri(uri_string);
if (uri.error != uri::Error::None) {
return Status::Invalid("Failed to parse URI: ", uri_string);
}
if (out_path != nullptr) {
if (uri.scheme() == "file" || uri.scheme() == "hdfs" ||
uri.scheme().empty()) {
*out_path = uri.path();
} else if (uri.scheme() == "s3" || uri.scheme() == "gs") {
if (uri.scheme == "file" || uri.scheme == "hdfs" || uri.scheme.empty()) {
*out_path = uri.path;
} else if (uri.scheme == "s3" || uri.scheme == "gs") {
// bucket name is the host, path is the path
// the arrow parser would delete the trailing slash which we don't want to
*out_path = uri.host() + uri.path();
*out_path = uri.authority.host + uri.path;
} else {
return Status::Invalid("Unrecognized filesystem type in URI: ",
uri_string);
Expand Down
241 changes: 241 additions & 0 deletions cpp/thirdparty/simple-uri-parser/uri_parser.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
/**
* MIT License
*
* Copyright (c) 2021 Jonathan Hollocombe
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:

* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.

* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#pragma once

#ifndef SIMPLE_URI_PARSER_LIBRARY_H
#define SIMPLE_URI_PARSER_LIBRARY_H

#include <algorithm>
#include <cctype>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>

// Language-level detection.
// simple_uri_CPLUSPLUS may be pre-defined by the includer; otherwise it is
// taken from _MSVC_LANG when that macro is defined and the compiler is not
// clang (with _MSC_VER 1900 mapped to 201103L), and from __cplusplus in every
// other case.
#ifndef simple_uri_CPLUSPLUS
# if defined(_MSVC_LANG ) && !defined(__clang__)
# define simple_uri_CPLUSPLUS (_MSC_VER == 1900 ? 201103L : _MSVC_LANG )
# else
# define simple_uri_CPLUSPLUS __cplusplus
# endif
#endif

// Evaluates to true when the detected language level is C++17 (201703L) or
// newer; used below to choose between std::string_view and std::string.
#define simple_uri_CPP17_OR_GREATER ( simple_uri_CPLUSPLUS >= 201703L )

namespace uri {

// String types used by the parsing helpers: views when C++17 is available,
// owning std::string (passed by const reference) otherwise.
#if simple_uri_CPP17_OR_GREATER
using string_view_type = std::string_view;
using string_arg_type = std::string_view;
constexpr auto npos = std::string_view::npos;
#else
using string_view_type = std::string;
using string_arg_type = const std::string&;
constexpr auto npos = std::string::npos;
#endif

/// Query arguments, keyed by argument name.
using query_type = std::unordered_map<std::string, std::string>;

/// Result code stored in Uri::error; `None` means no error.
enum class Error {
    None,
    InvalidScheme,
    InvalidPort,
};

/// The authority component of a URI, split into its parts.
struct Authority {
    std::string authority;  ///< full authority text
    std::string userinfo;   ///< user information part
    std::string host;       ///< host part
    long port = 0;          ///< port number; 0 unless set
};

/// A URI broken into components, together with an error code.
struct Uri {
    Error error;
    std::string scheme;
    Authority authority = {};
    std::string path;
    query_type query = {};
    std::string query_string;
    std::string fragment;

    /// Builds a Uri that only carries `err`; every other member keeps its default value.
    explicit Uri(Error err) : error(err) {}

    /// Builds a Uri from its components; `error` is set to Error::None.
    Uri(std::string scheme_in,
        Authority authority_in,
        std::string path_in,
        query_type query_in,
        std::string query_string_in,
        std::string fragment_in)
        : error(Error::None),
          scheme(std::move(scheme_in)),
          authority(std::move(authority_in)),
          path(std::move(path_in)),
          query(std::move(query_in)),
          query_string(std::move(query_string_in)),
          fragment(std::move(fragment_in)) {}
};

}

namespace {

/**
 * Checks whether `scheme` is acceptable as a URI scheme.
 *
 * A scheme is accepted when it is non-empty and every character is either
 * alphanumeric (per std::isalnum) or one of '+', '.', '-'.
 *
 * @param scheme text that precedes the first ':' of the URI
 * @return true if every character is allowed and `scheme` is not empty
 */
bool valid_scheme(uri::string_arg_type scheme) {
    if (scheme.empty()) {
        return false;
    }
    // The lambda takes unsigned char on purpose: calling std::isalnum with a
    // negative plain-char value (any non-ASCII byte where char is signed) is
    // undefined behaviour.
    return std::all_of(scheme.begin(), scheme.end(), [](unsigned char c) {
        return std::isalnum(c) != 0 || c == '+' || c == '.' || c == '-';
    });
}

/**
 * Splits the leading "<scheme>:" off `uri`.
 *
 * @return on success: the lower-cased scheme, Error::None and the text after
 *         the first ':'; when there is no ':' or the text before it fails
 *         valid_scheme: an empty scheme, Error::InvalidScheme and the
 *         unmodified input.
 */
std::tuple<std::string, uri::Error, uri::string_view_type> parse_scheme(uri::string_arg_type uri) {
    const auto colon = uri.find(':');
    if (colon != uri::npos) {
        const auto candidate = uri.substr(0, colon);
        if (::valid_scheme(candidate)) {
            // Schemes are normalised to lower case.
            std::string lowered{ candidate };
            for (auto& ch : lowered) {
                ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
            }
            return { lowered, uri::Error::None, uri.substr(colon + 1) };
        }
    }
    return { "", uri::Error::InvalidScheme, uri };
}

/**
 * Parses a decimal port number.
 *
 * @param port_text text following the ':' of the authority; may be empty
 * @param port_out  receives the parsed value; left untouched on failure
 * @return false if `port_text` contains a non-digit character or the value
 *         exceeds 65535; true otherwise (an empty port yields 0)
 */
bool parse_port_number(const std::string& port_text, long& port_out) {
    long value = 0;
    for (const char c : port_text) {
        if (c < '0' || c > '9') {
            return false;
        }
        value = value * 10 + (c - '0');
        // Checked on every digit so `value` can never overflow.
        if (value > 65535) {
            return false;
        }
    }
    port_out = value;
    return true;
}

/**
 * Parses the optional "//authority" part that follows the scheme.
 *
 * The authority runs from the "//" up to the next '/', '?' or '#', or to the
 * end of the input (RFC 3986, section 3.2). It is split into userinfo (text
 * before '@'), host and port. A host in square brackets (IP literal) keeps
 * its brackets; a port is recognised after the closing ']'.
 *
 * @param uri the URI text that follows "<scheme>:"
 * @return the parsed authority, an error code and the unconsumed remainder.
 *         When `uri` does not start with "//", an empty authority and the
 *         unmodified input are returned. Error::InvalidPort is reported when
 *         the port is not a decimal number in the range 0-65535.
 */
std::tuple<uri::Authority, uri::Error, uri::string_view_type> parse_authority(uri::string_arg_type uri) {
    uri::Authority authority;

    const bool has_authority = uri.length() >= 2 && uri[0] == '/' && uri[1] == '/';
    if (!has_authority) {
        return { authority, uri::Error::None, uri };
    }

    const auto after_slashes = uri.substr(2);
    const auto auth_end = after_slashes.find_first_of("/?#");
    auto auth_string = after_slashes.substr(0, auth_end);
    // Nothing follows the authority when no terminator was found; computing
    // an offset from npos would wrap around and re-read the authority as path.
    uri::string_view_type rem;
    if (auth_end != uri::npos) {
        rem = after_slashes.substr(auth_end);
    }
    authority.authority = std::string(auth_string);

    const auto at_pos = auth_string.find('@');
    if (at_pos != uri::npos) {
        authority.userinfo = std::string(auth_string.substr(0, at_pos));
        auth_string = auth_string.substr(at_pos + 1);
    }

    // Index of the ':' that separates host and port; npos when there is none.
    auto port_sep = uri::npos;
    if (!auth_string.empty() && auth_string[0] == '[') {
        // Colons inside the brackets belong to the address, so only a ':'
        // directly after the closing bracket introduces a port.
        const auto close = auth_string.find(']');
        if (close != uri::npos && close + 1 < auth_string.size() && auth_string[close + 1] == ':') {
            port_sep = close + 1;
        }
    } else {
        port_sep = auth_string.find(':');
    }

    if (port_sep != uri::npos) {
        // Copy the digits out: the view is not null-terminated, so C parsing
        // functions must not be pointed at it directly.
        const std::string port_text(auth_string.substr(port_sep + 1));
        if (!parse_port_number(port_text, authority.port)) {
            return { authority, uri::Error::InvalidPort, auth_string };
        }
    }

    authority.host = std::string(auth_string.substr(0, port_sep));

    return { authority, uri::Error::None, rem };
}

/**
 * Extracts the path, i.e. everything up to the first '?' or '#'.
 *
 * @param uri the URI text that follows the authority
 * @return the path, Error::None and the remainder. A terminating '?' is
 *         consumed, so the remainder starts with the query text. A
 *         terminating '#' is kept, so that parse_query sees an empty query
 *         followed by the fragment instead of mistaking the fragment for a
 *         query.
 */
std::tuple<std::string, uri::Error, uri::string_view_type> parse_path(uri::string_arg_type uri) {
    const auto pos = uri.find_first_of("#?");
    if (pos == uri::npos) {
        return { std::string(uri), uri::Error::None, "" };
    }

    auto path = std::string(uri.substr(0, pos));
    const auto rest_start = (uri[pos] == '?') ? pos + 1 : pos;
    return { path, uri::Error::None, uri.substr(rest_start) };
}

/**
 * Parses the query component.
 *
 * The query is the text before the first '#'. It is split into arguments at
 * '&', ';' and '?'; each argument is split at its first '=' into key and
 * value, and an argument without '=' is stored with an empty value. When a
 * key occurs more than once, the last occurrence wins.
 *
 * @param uri the remainder produced by parse_path
 * @return the argument map, the raw query text, Error::None and the text
 *         after the first '#' (empty when there is no '#')
 */
std::tuple<uri::query_type, std::string, uri::Error, uri::string_view_type> parse_query(uri::string_arg_type uri) {
    const auto hash_pos = uri.find('#');
    auto query_substring = uri.substr(0, hash_pos);
    auto query_string = std::string(query_substring);

    uri::query_type query;
    while (!query_substring.empty()) {
        const auto delim_pos = query_substring.find_first_of("&;?");
        const auto arg = query_substring.substr(0, delim_pos);
        const auto equals_pos = arg.find('=');
        if (equals_pos == uri::npos) {
            query[std::string(arg)] = "";
        } else {
            query[std::string(arg.substr(0, equals_pos))] = std::string(arg.substr(equals_pos + 1));
        }
        if (delim_pos == uri::npos) {
            break;
        }
        query_substring = query_substring.substr(delim_pos + 1);
    }

    // Without a '#' there is no fragment; `hash_pos + 1` would wrap around to
    // 0 and hand the whole query on as the fragment.
    uri::string_view_type rem;
    if (hash_pos != uri::npos) {
        rem = uri.substr(hash_pos + 1);
    }

    return { query, query_string, uri::Error::None, rem };
}

/**
 * Takes the whole of `uri` as the fragment.
 *
 * @return a copy of `uri` as the fragment, Error::None and `uri` itself as
 *         the remainder
 */
std::tuple<std::string, uri::Error, uri::string_view_type> parse_fragment(uri::string_arg_type uri) {
    using result_type = std::tuple<std::string, uri::Error, uri::string_view_type>;
    result_type result{ std::string(uri), uri::Error::None, uri };
    return result;
}

} // anon namespace

namespace uri {

/**
 * Parses `uri_in` into its components.
 *
 * The input is handed through the component parsers in order: scheme,
 * authority, path, query, fragment. Each one consumes its part and passes
 * the rest on to the next.
 *
 * @param uri_in the URI text to parse
 * @return a Uri holding all components with `error == Error::None`, or a Uri
 *         carrying only the error code of the first component parser that
 *         failed
 */
inline Uri parse_uri(uri::string_arg_type uri_in) {
    Error status;
    string_view_type rest;  // text not yet consumed by a component parser

    std::string scheme;
    std::tie(scheme, status, rest) = ::parse_scheme(uri_in);
    if (status != Error::None) { return Uri(status); }

    Authority authority;
    std::tie(authority, status, rest) = ::parse_authority(rest);
    if (status != Error::None) { return Uri(status); }

    std::string path;
    std::tie(path, status, rest) = ::parse_path(rest);
    if (status != Error::None) { return Uri(status); }

    query_type query;
    std::string query_string;
    std::tie(query, query_string, status, rest) = ::parse_query(rest);
    if (status != Error::None) { return Uri(status); }

    std::string fragment;
    std::tie(fragment, status, rest) = ::parse_fragment(rest);
    if (status != Error::None) { return Uri(status); }

    return Uri(scheme, authority, path, query, query_string, fragment);
}

} // namespace uri

#endif // SIMPLE_URI_PARSER_LIBRARY_H
21 changes: 21 additions & 0 deletions licenses/LICENSE-simple-uri-parser
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 Jonathan Hollocombe

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Loading