From dbbaf92a8d925bd5d7a2277f54f57bcf89b810f6 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 29 Sep 2023 12:45:37 +0100 Subject: [PATCH 01/40] Paste in `AzurePath` and `ObjectInputFile` from #12914 --- cpp/src/arrow/filesystem/azurefs.cc | 250 ++++++++++++++++++++++++++++ 1 file changed, 250 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index fcbae332d2397..f1fc248aa683a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -26,6 +26,103 @@ namespace arrow { namespace fs { +struct AzurePath { + std::string full_path; + std::string container; + std::string path_to_file; + std::vector path_to_file_parts; + + static Result FromString(const std::string& s) { + // https://synapsemladlsgen2.dfs.core.windows.net/synapsemlfs/testdir/testfile.txt + // container = synapsemlfs + // account_name = synapsemladlsgen2 + // path_to_file = testdir/testfile.txt + // path_to_file_parts = [testdir, testfile.txt] + + // Expected input here => s = synapsemlfs/testdir/testfile.txt, + // http://127.0.0.1/accountName/pathToBlob + auto src = internal::RemoveTrailingSlash(s); + if (arrow::internal::StartsWith(src, "https://127.0.0.1") || arrow::internal::StartsWith(src, "http://127.0.0.1")) { + RETURN_NOT_OK(FromLocalHostString(&src)); + } + auto input_path = std::string(src.data()); + if (internal::IsLikelyUri(input_path)) { + RETURN_NOT_OK(ExtractBlobPath(&input_path)); + src = std::string_view(input_path); + } + src = internal::RemoveLeadingSlash(src); + auto first_sep = src.find_first_of(kSep); + if (first_sep == 0) { + return Status::IOError("Path cannot start with a separator ('", input_path, "')"); + } + if (first_sep == std::string::npos) { + return AzurePath{std::string(src), std::string(src), "", {}}; + } + AzurePath path; + path.full_path = std::string(src); + path.container = std::string(src.substr(0, first_sep)); + path.path_to_file = std::string(src.substr(first_sep + 1)); + path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file); + RETURN_NOT_OK(Validate(&path)); + return path; + } + + static Status FromLocalHostString(std::string_view* src) { + // src = http://127.0.0.1:10000/accountName/pathToBlob + Uri uri; + RETURN_NOT_OK(uri.Parse(src->data())); + *src = internal::RemoveLeadingSlash(uri.path()); + if (src->empty()) { + return Status::IOError("Missing account name in Azure Blob Storage URI"); + } + auto first_sep = src->find_first_of(kSep); + if (first_sep != std::string::npos) { + *src = src->substr(first_sep + 1); + } else { + *src = ""; + } + return Status::OK(); + } + + // Removes scheme, host and port from the uri + static Status ExtractBlobPath(std::string* src) { + Uri uri; + RETURN_NOT_OK(uri.Parse(*src)); + *src = uri.path(); + return Status::OK(); + } + + static Status Validate(const AzurePath* path) { + auto status = internal::ValidateAbstractPathParts(path->path_to_file_parts); + if (!status.ok()) { + return Status::Invalid(status.message(), " in path ", path->full_path); + } else { + return status; + } + } + + AzurePath parent() const { + DCHECK(has_parent()); + auto parent = AzurePath{"", container, "", path_to_file_parts}; + parent.path_to_file_parts.pop_back(); + parent.path_to_file = internal::JoinAbstractPath(parent.path_to_file_parts); + if (parent.path_to_file.empty()) { + parent.full_path = parent.container; + } else { + parent.full_path = parent.container + kSep + parent.path_to_file; + } + return parent; + } + + bool has_parent() const { return !path_to_file.empty(); } + + bool empty() const { return container.empty() && path_to_file.empty(); } + + bool operator==(const AzurePath& other) const { + return container == other.container && path_to_file == other.path_to_file; + } +}; + // ----------------------------------------------------------------------- // AzureOptions Implementation @@ -37,6 +134,159 @@ bool AzureOptions::Equals(const AzureOptions& other) const { credentials_kind == other.credentials_kind); } + +class ObjectInputFile final : public io::RandomAccessFile { + public: + ObjectInputFile( + std::shared_ptr& path_client, + std::shared_ptr& file_client, + const io::IOContext& io_context, const AzurePath& path, int64_t size = kNoSize) + : path_client_(std::move(path_client)), + file_client_(std::move(file_client)), + io_context_(io_context), + path_(path), + content_length_(size) {} + + Status Init() { + if (content_length_ != kNoSize) { + DCHECK_GE(content_length_, 0); + return Status::OK(); + } + try { + auto properties = path_client_->GetProperties(); + if (properties.Value.IsDirectory) { + return ::arrow::fs::internal::NotAFile(path_.full_path); + } + content_length_ = properties.Value.FileSize; + metadata_ = GetObjectMetadata(properties.Value.Metadata); + return Status::OK(); + } catch (const Azure::Storage::StorageException& exception) { + return ::arrow::fs::internal::PathNotFound(path_.full_path); + } + } + + Status CheckClosed() const { + if (closed_) { + return Status::Invalid("Operation on closed stream"); + } + return Status::OK(); + } + + Status CheckPosition(int64_t position, const char* action) const { + if (position < 0) { + return Status::Invalid("Cannot ", action, " from negative position"); + } + if (position > content_length_) { + return Status::IOError("Cannot ", action, " past end of file"); + } + return Status::OK(); + } + + // RandomAccessFile APIs + + Result> ReadMetadata() override { + return metadata_; + } + + Future> ReadMetadataAsync( + const io::IOContext& io_context) override { + return metadata_; + } + + Status Close() override { + path_client_ = nullptr; + file_client_ = nullptr; + closed_ = true; + return Status::OK(); + } + + bool closed() const override { return closed_; } + + Result Tell() const override { + RETURN_NOT_OK(CheckClosed()); + return pos_; + } + + Result GetSize() override { + RETURN_NOT_OK(CheckClosed()); + return content_length_; + } + + Status Seek(int64_t position) override { + RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckPosition(position, "seek")); + + pos_ = position; + return Status::OK(); + } + + Result ReadAt(int64_t position, int64_t nbytes, void* out) override { + RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckPosition(position, "read")); + + nbytes = std::min(nbytes, content_length_ - position); + if (nbytes == 0) { + return 0; + } + + // Read the desired range of bytes + Azure::Storage::Blobs::DownloadBlobToOptions download_options; + Azure::Core::Http::HttpRange range; + range.Offset = position; + range.Length = nbytes; + download_options.Range = Azure::Nullable(range); + try { + auto result = + file_client_ + ->DownloadTo(reinterpret_cast(out), nbytes, download_options) + .Value; + return result.ContentRange.Length.Value(); + } catch (const Azure::Storage::StorageException& exception) { + return Status::IOError(exception.RawResponse->GetReasonPhrase()); + } + } + + Result> ReadAt(int64_t position, int64_t nbytes) override { + RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckPosition(position, "read")); + + // No need to allocate more than the remaining number of bytes + nbytes = std::min(nbytes, content_length_ - position); + + ARROW_ASSIGN_OR_RAISE(auto buf, AllocateResizableBuffer(nbytes, io_context_.pool())); + if (nbytes > 0) { + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, + ReadAt(position, nbytes, buf->mutable_data())); + DCHECK_LE(bytes_read, nbytes); + RETURN_NOT_OK(buf->Resize(bytes_read)); + } + return std::move(buf); + } + + Result Read(int64_t nbytes, void* out) override { + ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, ReadAt(pos_, nbytes, out)); + pos_ += bytes_read; + return bytes_read; + } + + Result> Read(int64_t nbytes) override { + ARROW_ASSIGN_OR_RAISE(auto buffer, ReadAt(pos_, nbytes)); + pos_ += buffer->size(); + return std::move(buffer); + } + + protected: + std::shared_ptr path_client_; + std::shared_ptr file_client_; + const io::IOContext io_context_; + AzurePath path_; + + bool closed_ = false; + int64_t pos_ = 0; + int64_t content_length_ = kNoSize; + std::shared_ptr metadata_; +}; + // ----------------------------------------------------------------------- // AzureFilesystem Implementation From 1db80b30c32ceebfb95b85ca7a95fcc7f049d603 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 29 Sep 2023 12:58:06 +0100 Subject: [PATCH 02/40] Paste in TestAzureFileSystem and an example test (TestAzureFileSystem, FromAccountKey) from #12914 --- cpp/src/arrow/filesystem/azurefs_test.cc | 88 ++++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 9bf7cb8e753ec..d0f49d9211b0c 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -154,6 +154,94 @@ TEST(AzureFileSystem, OptionsCompare) { EXPECT_TRUE(options.Equals(options)); } + +class TestAzureFileSystem : public ::testing::Test { + public: + std::shared_ptr fs_; + std::shared_ptr gen2_client_; + AzureOptions options_; + + void MakeFileSystem() { + const std::string& account_name = GetAzuriteEnv()->account_name(); + const std::string& account_key = GetAzuriteEnv()->account_key(); + options_.is_azurite = true; + options_.ConfigureAccountKeyCredentials(account_name, account_key); + gen2_client_ = + std::make_shared( + options_.account_dfs_url, options_.storage_credentials_provider); + ASSERT_OK_AND_ASSIGN(fs_, AzureBlobFileSystem::Make(options_)); + } + + void SetUp() override { + ASSERT_THAT(GetAzuriteEnv(), NotNull()); + ASSERT_OK(GetAzuriteEnv()->status()); + + MakeFileSystem(); + auto file_system_client = gen2_client_->GetFileSystemClient("container"); + file_system_client.CreateIfNotExists(); + file_system_client = gen2_client_->GetFileSystemClient("empty-container"); + file_system_client.CreateIfNotExists(); + auto file_client = + std::make_shared( + options_.account_blob_url + "container/somefile", + options_.storage_credentials_provider); + std::string s = "some data"; + file_client->UploadFrom( + const_cast(reinterpret_cast(s.data())), s.size()); + } + + void TearDown() override { + auto containers = gen2_client_->ListFileSystems(); + for (auto container : containers.FileSystems) { + auto file_system_client = gen2_client_->GetFileSystemClient(container.Name); + file_system_client.DeleteIfExists(); + } + } + + void AssertObjectContents( + Azure::Storage::Files::DataLake::DataLakeServiceClient* client, + const std::string& container, const std::string& path_to_file, + const std::string& expected) { + auto path_client = + std::make_shared( + client->GetUrl() + container + "/" + path_to_file, + options_.storage_credentials_provider); + auto size = path_client->GetProperties().Value.FileSize; + if (size == 0) { + ASSERT_EQ(expected, ""); + return; + } + auto buf = AllocateBuffer(size, fs_->io_context().pool()); + Azure::Storage::Blobs::DownloadBlobToOptions download_options; + Azure::Core::Http::HttpRange range; + range.Offset = 0; + range.Length = size; + download_options.Range = Azure::Nullable(range); + auto file_client = + std::make_shared( + client->GetUrl() + container + "/" + path_to_file, + options_.storage_credentials_provider); + auto result = file_client + ->DownloadTo(reinterpret_cast(buf->get()->mutable_data()), + size, download_options) + .Value; + auto buf_data = std::move(buf->get()); + auto expected_data = std::make_shared( + reinterpret_cast(expected.data()), expected.size()); + AssertBufferEqual(*buf_data, *expected_data); + } +}; + +TEST_F(TestAzureFileSystem, FromAccountKey) { + auto options = AzureOptions::FromAccountKey(GetAzuriteEnv()->account_name(), + GetAzuriteEnv()->account_key()) + .ValueOrDie(); + ASSERT_EQ(options.credentials_kind, + arrow::fs::AzureCredentialsKind::StorageCredentials); + ASSERT_NE(options.storage_credentials_provider, nullptr); +} + + } // namespace } // namespace fs } // namespace arrow From 81d15231b02d0d42dab8dcd8f5cf65e639b0eecc Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 29 Sep 2023 12:59:15 +0100 Subject: [PATCH 03/40] Paste in input file test cases from gscfs_test.cc --- cpp/src/arrow/filesystem/azurefs_test.cc | 148 +++++++++++++++++++++++ 1 file changed, 148 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index d0f49d9211b0c..4c2a5e74f730e 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -241,6 +241,154 @@ TEST_F(TestAzureFileSystem, FromAccountKey) { ASSERT_NE(options.storage_credentials_provider, nullptr); } +TEST_F(GcsIntegrationTest, OpenInputFileMixedReadVsReadAt) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + // Create a file large enough to make the random access tests non-trivial. + auto constexpr kLineWidth = 100; + auto constexpr kLineCount = 4096; + std::vector lines(kLineCount); + int lineno = 0; + std::generate_n(lines.begin(), lines.size(), + [&] { return RandomLine(++lineno, kLineWidth); }); + + const auto path = + PreexistingBucketPath() + "OpenInputFileMixedReadVsReadAt/object-name"; + std::shared_ptr output; + ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); + for (auto const& line : lines) { + ASSERT_OK(output->Write(line.data(), line.size())); + } + ASSERT_OK(output->Close()); + + std::shared_ptr file; + ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); + for (int i = 0; i != 32; ++i) { + SCOPED_TRACE("Iteration " + std::to_string(i)); + // Verify sequential reads work as expected. + std::array buffer{}; + std::int64_t size; + { + ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); + EXPECT_EQ(lines[2 * i], actual->ToString()); + } + { + ASSERT_OK_AND_ASSIGN(size, file->Read(buffer.size(), buffer.data())); + EXPECT_EQ(size, kLineWidth); + auto actual = std::string{buffer.begin(), buffer.end()}; + EXPECT_EQ(lines[2 * i + 1], actual); + } + + // Verify random reads interleave too. + auto const index = RandomIndex(kLineCount); + auto const position = index * kLineWidth; + ASSERT_OK_AND_ASSIGN(size, file->ReadAt(position, buffer.size(), buffer.data())); + EXPECT_EQ(size, kLineWidth); + auto actual = std::string{buffer.begin(), buffer.end()}; + EXPECT_EQ(lines[index], actual); + + // Verify random reads using buffers work. + ASSERT_OK_AND_ASSIGN(auto b, file->ReadAt(position, kLineWidth)); + EXPECT_EQ(lines[index], b->ToString()); + } +} + +TEST_F(GcsIntegrationTest, OpenInputFileRandomSeek) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + // Create a file large enough to make the random access tests non-trivial. + auto constexpr kLineWidth = 100; + auto constexpr kLineCount = 4096; + std::vector lines(kLineCount); + int lineno = 0; + std::generate_n(lines.begin(), lines.size(), + [&] { return RandomLine(++lineno, kLineWidth); }); + + const auto path = PreexistingBucketPath() + "OpenInputFileRandomSeek/object-name"; + std::shared_ptr output; + ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); + for (auto const& line : lines) { + ASSERT_OK(output->Write(line.data(), line.size())); + } + ASSERT_OK(output->Close()); + + std::shared_ptr file; + ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); + for (int i = 0; i != 32; ++i) { + SCOPED_TRACE("Iteration " + std::to_string(i)); + // Verify sequential reads work as expected. + auto const index = RandomIndex(kLineCount); + auto const position = index * kLineWidth; + ASSERT_OK(file->Seek(position)); + ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); + EXPECT_EQ(lines[index], actual->ToString()); + } +} + +TEST_F(GcsIntegrationTest, OpenInputFileIoContext) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + // Create a test file. + const auto path = PreexistingBucketPath() + "OpenInputFileIoContext/object-name"; + std::shared_ptr output; + ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); + const std::string contents = "The quick brown fox jumps over the lazy dog"; + ASSERT_OK(output->Write(contents.data(), contents.size())); + ASSERT_OK(output->Close()); + + std::shared_ptr file; + ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); + EXPECT_EQ(fs->io_context().external_id(), file->io_context().external_id()); +} + +TEST_F(GcsIntegrationTest, OpenInputFileInfo) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + arrow::fs::FileInfo info; + ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingObjectPath())); + + std::shared_ptr file; + ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(info)); + + std::array buffer{}; + std::int64_t size; + auto constexpr kStart = 16; + ASSERT_OK_AND_ASSIGN(size, file->ReadAt(kStart, buffer.size(), buffer.data())); + + auto const expected = std::string(kLoremIpsum).substr(kStart); + EXPECT_EQ(std::string(buffer.data(), size), expected); +} + +TEST_F(GcsIntegrationTest, OpenInputFileNotFound) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + ASSERT_RAISES(IOError, fs->OpenInputFile(NotFoundObjectPath())); +} + +TEST_F(GcsIntegrationTest, OpenInputFileInfoInvalid) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + arrow::fs::FileInfo info; + ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingBucketPath())); + ASSERT_RAISES(IOError, fs->OpenInputFile(info)); + + ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs->OpenInputFile(info)); +} + +TEST_F(GcsIntegrationTest, OpenInputFileClosed) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenInputFile(PreexistingObjectPath())); + ASSERT_OK(stream->Close()); + std::array buffer{}; + ASSERT_RAISES(Invalid, stream->Tell()); + ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); + ASSERT_RAISES(Invalid, stream->Read(buffer.size())); + ASSERT_RAISES(Invalid, stream->ReadAt(1, buffer.size(), buffer.data())); + ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); + ASSERT_RAISES(Invalid, stream->Seek(2)); +} } // namespace } // namespace fs From 9e31d1a6c70f05095772d8f1a079310e5f0b3b86 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 29 Sep 2023 14:44:08 +0100 Subject: [PATCH 04/40] Minimal changes for successful build --- cpp/src/arrow/filesystem/azurefs.cc | 49 ++- cpp/src/arrow/filesystem/azurefs_test.cc | 468 +++++++++++------------ 2 files changed, 270 insertions(+), 247 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index f1fc248aa683a..fb96b90f91f96 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -19,13 +19,37 @@ #include #include +#include +#include "arrow/buffer.h" +#include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/util_internal.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" +#include "arrow/util/future.h" +#include "arrow/util/string.h" + namespace arrow { namespace fs { +static const char kSep = '/'; + +// ----------------------------------------------------------------------- +// AzureOptions Implementation + +AzureOptions::AzureOptions() {} + +bool AzureOptions::Equals(const AzureOptions& other) const { + return (account_dfs_url == other.account_dfs_url && + account_blob_url == other.account_blob_url && + credentials_kind == other.credentials_kind); +} + +namespace { + struct AzurePath { std::string full_path; std::string container; @@ -69,7 +93,7 @@ struct AzurePath { static Status FromLocalHostString(std::string_view* src) { // src = http://127.0.0.1:10000/accountName/pathToBlob - Uri uri; + arrow::internal::Uri uri; RETURN_NOT_OK(uri.Parse(src->data())); *src = internal::RemoveLeadingSlash(uri.path()); if (src->empty()) { @@ -86,7 +110,7 @@ struct AzurePath { // Removes scheme, host and port from the uri static Status ExtractBlobPath(std::string* src) { - Uri uri; + arrow::internal::Uri uri; RETURN_NOT_OK(uri.Parse(*src)); *src = uri.path(); return Status::OK(); @@ -123,18 +147,15 @@ struct AzurePath { } }; -// ----------------------------------------------------------------------- -// AzureOptions Implementation - -AzureOptions::AzureOptions() {} - -bool AzureOptions::Equals(const AzureOptions& other) const { - return (account_dfs_url == other.account_dfs_url && - account_blob_url == other.account_blob_url && - credentials_kind == other.credentials_kind); +template +std::shared_ptr GetObjectMetadata(const ObjectResult& result) { + auto md = std::make_shared(); + for (auto prop : result) { + md->Append(prop.first, prop.second); + } + return md; } - class ObjectInputFile final : public io::RandomAccessFile { public: ObjectInputFile( @@ -189,7 +210,7 @@ class ObjectInputFile final : public io::RandomAccessFile { } Future> ReadMetadataAsync( - const io::IOContext& io_context) override { + const io::IOContext& io_context) override { return metadata_; } @@ -287,6 +308,8 @@ class ObjectInputFile final : public io::RandomAccessFile { std::shared_ptr metadata_; }; +} // namespace + // ----------------------------------------------------------------------- // AzureFilesystem Implementation diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 4c2a5e74f730e..4e06f2f8772ee 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -155,240 +155,240 @@ TEST(AzureFileSystem, OptionsCompare) { } -class TestAzureFileSystem : public ::testing::Test { - public: - std::shared_ptr fs_; - std::shared_ptr gen2_client_; - AzureOptions options_; - - void MakeFileSystem() { - const std::string& account_name = GetAzuriteEnv()->account_name(); - const std::string& account_key = GetAzuriteEnv()->account_key(); - options_.is_azurite = true; - options_.ConfigureAccountKeyCredentials(account_name, account_key); - gen2_client_ = - std::make_shared( - options_.account_dfs_url, options_.storage_credentials_provider); - ASSERT_OK_AND_ASSIGN(fs_, AzureBlobFileSystem::Make(options_)); - } - - void SetUp() override { - ASSERT_THAT(GetAzuriteEnv(), NotNull()); - ASSERT_OK(GetAzuriteEnv()->status()); - - MakeFileSystem(); - auto file_system_client = gen2_client_->GetFileSystemClient("container"); - file_system_client.CreateIfNotExists(); - file_system_client = gen2_client_->GetFileSystemClient("empty-container"); - file_system_client.CreateIfNotExists(); - auto file_client = - std::make_shared( - options_.account_blob_url + "container/somefile", - options_.storage_credentials_provider); - std::string s = "some data"; - file_client->UploadFrom( - const_cast(reinterpret_cast(s.data())), s.size()); - } - - void TearDown() override { - auto containers = gen2_client_->ListFileSystems(); - for (auto container : containers.FileSystems) { - auto file_system_client = gen2_client_->GetFileSystemClient(container.Name); - file_system_client.DeleteIfExists(); - } - } - - void AssertObjectContents( - Azure::Storage::Files::DataLake::DataLakeServiceClient* client, - const std::string& container, const std::string& path_to_file, - const std::string& expected) { - auto path_client = - std::make_shared( - client->GetUrl() + container + "/" + path_to_file, - options_.storage_credentials_provider); - auto size = path_client->GetProperties().Value.FileSize; - if (size == 0) { - ASSERT_EQ(expected, ""); - return; - } - auto buf = AllocateBuffer(size, fs_->io_context().pool()); - Azure::Storage::Blobs::DownloadBlobToOptions download_options; - Azure::Core::Http::HttpRange range; - range.Offset = 0; - range.Length = size; - download_options.Range = Azure::Nullable(range); - auto file_client = - std::make_shared( - client->GetUrl() + container + "/" + path_to_file, - options_.storage_credentials_provider); - auto result = file_client - ->DownloadTo(reinterpret_cast(buf->get()->mutable_data()), - size, download_options) - .Value; - auto buf_data = std::move(buf->get()); - auto expected_data = std::make_shared( - reinterpret_cast(expected.data()), expected.size()); - AssertBufferEqual(*buf_data, *expected_data); - } -}; - -TEST_F(TestAzureFileSystem, FromAccountKey) { - auto options = AzureOptions::FromAccountKey(GetAzuriteEnv()->account_name(), - GetAzuriteEnv()->account_key()) - .ValueOrDie(); - ASSERT_EQ(options.credentials_kind, - arrow::fs::AzureCredentialsKind::StorageCredentials); - ASSERT_NE(options.storage_credentials_provider, nullptr); -} - -TEST_F(GcsIntegrationTest, OpenInputFileMixedReadVsReadAt) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - // Create a file large enough to make the random access tests non-trivial. - auto constexpr kLineWidth = 100; - auto constexpr kLineCount = 4096; - std::vector lines(kLineCount); - int lineno = 0; - std::generate_n(lines.begin(), lines.size(), - [&] { return RandomLine(++lineno, kLineWidth); }); - - const auto path = - PreexistingBucketPath() + "OpenInputFileMixedReadVsReadAt/object-name"; - std::shared_ptr output; - ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); - for (auto const& line : lines) { - ASSERT_OK(output->Write(line.data(), line.size())); - } - ASSERT_OK(output->Close()); - - std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); - for (int i = 0; i != 32; ++i) { - SCOPED_TRACE("Iteration " + std::to_string(i)); - // Verify sequential reads work as expected. - std::array buffer{}; - std::int64_t size; - { - ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); - EXPECT_EQ(lines[2 * i], actual->ToString()); - } - { - ASSERT_OK_AND_ASSIGN(size, file->Read(buffer.size(), buffer.data())); - EXPECT_EQ(size, kLineWidth); - auto actual = std::string{buffer.begin(), buffer.end()}; - EXPECT_EQ(lines[2 * i + 1], actual); - } - - // Verify random reads interleave too. - auto const index = RandomIndex(kLineCount); - auto const position = index * kLineWidth; - ASSERT_OK_AND_ASSIGN(size, file->ReadAt(position, buffer.size(), buffer.data())); - EXPECT_EQ(size, kLineWidth); - auto actual = std::string{buffer.begin(), buffer.end()}; - EXPECT_EQ(lines[index], actual); - - // Verify random reads using buffers work. - ASSERT_OK_AND_ASSIGN(auto b, file->ReadAt(position, kLineWidth)); - EXPECT_EQ(lines[index], b->ToString()); - } -} - -TEST_F(GcsIntegrationTest, OpenInputFileRandomSeek) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - // Create a file large enough to make the random access tests non-trivial. - auto constexpr kLineWidth = 100; - auto constexpr kLineCount = 4096; - std::vector lines(kLineCount); - int lineno = 0; - std::generate_n(lines.begin(), lines.size(), - [&] { return RandomLine(++lineno, kLineWidth); }); - - const auto path = PreexistingBucketPath() + "OpenInputFileRandomSeek/object-name"; - std::shared_ptr output; - ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); - for (auto const& line : lines) { - ASSERT_OK(output->Write(line.data(), line.size())); - } - ASSERT_OK(output->Close()); - - std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); - for (int i = 0; i != 32; ++i) { - SCOPED_TRACE("Iteration " + std::to_string(i)); - // Verify sequential reads work as expected. - auto const index = RandomIndex(kLineCount); - auto const position = index * kLineWidth; - ASSERT_OK(file->Seek(position)); - ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); - EXPECT_EQ(lines[index], actual->ToString()); - } -} - -TEST_F(GcsIntegrationTest, OpenInputFileIoContext) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - // Create a test file. - const auto path = PreexistingBucketPath() + "OpenInputFileIoContext/object-name"; - std::shared_ptr output; - ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); - const std::string contents = "The quick brown fox jumps over the lazy dog"; - ASSERT_OK(output->Write(contents.data(), contents.size())); - ASSERT_OK(output->Close()); - - std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); - EXPECT_EQ(fs->io_context().external_id(), file->io_context().external_id()); -} - -TEST_F(GcsIntegrationTest, OpenInputFileInfo) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - arrow::fs::FileInfo info; - ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingObjectPath())); - - std::shared_ptr file; - ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(info)); - - std::array buffer{}; - std::int64_t size; - auto constexpr kStart = 16; - ASSERT_OK_AND_ASSIGN(size, file->ReadAt(kStart, buffer.size(), buffer.data())); - - auto const expected = std::string(kLoremIpsum).substr(kStart); - EXPECT_EQ(std::string(buffer.data(), size), expected); -} - -TEST_F(GcsIntegrationTest, OpenInputFileNotFound) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - ASSERT_RAISES(IOError, fs->OpenInputFile(NotFoundObjectPath())); -} - -TEST_F(GcsIntegrationTest, OpenInputFileInfoInvalid) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - arrow::fs::FileInfo info; - ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingBucketPath())); - ASSERT_RAISES(IOError, fs->OpenInputFile(info)); - - ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(NotFoundObjectPath())); - ASSERT_RAISES(IOError, fs->OpenInputFile(info)); -} - -TEST_F(GcsIntegrationTest, OpenInputFileClosed) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenInputFile(PreexistingObjectPath())); - ASSERT_OK(stream->Close()); - std::array buffer{}; - ASSERT_RAISES(Invalid, stream->Tell()); - ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); - ASSERT_RAISES(Invalid, stream->Read(buffer.size())); - ASSERT_RAISES(Invalid, stream->ReadAt(1, buffer.size(), buffer.data())); - ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); - ASSERT_RAISES(Invalid, stream->Seek(2)); -} +// class TestAzureFileSystem : public ::testing::Test { +// public: +// std::shared_ptr fs_; +// std::shared_ptr gen2_client_; +// AzureOptions options_; + +// void MakeFileSystem() { +// const std::string& account_name = GetAzuriteEnv()->account_name(); +// const std::string& account_key = GetAzuriteEnv()->account_key(); +// options_.is_azurite = true; +// options_.ConfigureAccountKeyCredentials(account_name, account_key); +// gen2_client_ = +// std::make_shared( +// options_.account_dfs_url, options_.storage_credentials_provider); +// ASSERT_OK_AND_ASSIGN(fs_, AzureBlobFileSystem::Make(options_)); +// } + +// void SetUp() override { +// ASSERT_THAT(GetAzuriteEnv(), NotNull()); +// ASSERT_OK(GetAzuriteEnv()->status()); + +// MakeFileSystem(); +// auto file_system_client = gen2_client_->GetFileSystemClient("container"); +// file_system_client.CreateIfNotExists(); +// file_system_client = gen2_client_->GetFileSystemClient("empty-container"); +// file_system_client.CreateIfNotExists(); +// auto file_client = +// std::make_shared( +// options_.account_blob_url + "container/somefile", +// options_.storage_credentials_provider); +// std::string s = "some data"; +// file_client->UploadFrom( +// const_cast(reinterpret_cast(s.data())), s.size()); +// } + +// void TearDown() override { +// auto containers = gen2_client_->ListFileSystems(); +// for (auto container : containers.FileSystems) { +// auto file_system_client = gen2_client_->GetFileSystemClient(container.Name); +// file_system_client.DeleteIfExists(); +// } +// } + +// void AssertObjectContents( +// Azure::Storage::Files::DataLake::DataLakeServiceClient* client, +// const std::string& container, const std::string& path_to_file, +// const std::string& expected) { +// auto path_client = +// std::make_shared( +// client->GetUrl() + container + "/" + path_to_file, +// options_.storage_credentials_provider); +// auto size = path_client->GetProperties().Value.FileSize; +// if (size == 0) { +// ASSERT_EQ(expected, ""); +// return; +// } +// auto buf = AllocateBuffer(size, fs_->io_context().pool()); +// Azure::Storage::Blobs::DownloadBlobToOptions download_options; +// Azure::Core::Http::HttpRange range; +// range.Offset = 0; +// range.Length = size; +// download_options.Range = Azure::Nullable(range); +// auto file_client = +// std::make_shared( +// client->GetUrl() + container + "/" + path_to_file, +// options_.storage_credentials_provider); +// auto result = file_client +// ->DownloadTo(reinterpret_cast(buf->get()->mutable_data()), +// size, download_options) +// .Value; +// auto buf_data = std::move(buf->get()); +// auto expected_data = std::make_shared( +// reinterpret_cast(expected.data()), expected.size()); +// AssertBufferEqual(*buf_data, *expected_data); +// } +// }; + +// TEST_F(TestAzureFileSystem, FromAccountKey) { +// auto options = AzureOptions::FromAccountKey(GetAzuriteEnv()->account_name(), +// GetAzuriteEnv()->account_key()) +// .ValueOrDie(); +// ASSERT_EQ(options.credentials_kind, +// arrow::fs::AzureCredentialsKind::StorageCredentials); +// ASSERT_NE(options.storage_credentials_provider, nullptr); +// } + +// TEST_F(GcsIntegrationTest, OpenInputFileMixedReadVsReadAt) { +// auto fs = GcsFileSystem::Make(TestGcsOptions()); + +// // Create a file large enough to make the random access tests non-trivial. +// auto constexpr kLineWidth = 100; +// auto constexpr kLineCount = 4096; +// std::vector lines(kLineCount); +// int lineno = 0; +// std::generate_n(lines.begin(), lines.size(), +// [&] { return RandomLine(++lineno, kLineWidth); }); + +// const auto path = +// PreexistingBucketPath() + "OpenInputFileMixedReadVsReadAt/object-name"; +// std::shared_ptr output; +// ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); +// for (auto const& line : lines) { +// ASSERT_OK(output->Write(line.data(), line.size())); +// } +// ASSERT_OK(output->Close()); + +// std::shared_ptr file; +// ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); +// for (int i = 0; i != 32; ++i) { +// SCOPED_TRACE("Iteration " + std::to_string(i)); +// // Verify sequential reads work as expected. +// std::array buffer{}; +// std::int64_t size; +// { +// ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); +// EXPECT_EQ(lines[2 * i], actual->ToString()); +// } +// { +// ASSERT_OK_AND_ASSIGN(size, file->Read(buffer.size(), buffer.data())); +// EXPECT_EQ(size, kLineWidth); +// auto actual = std::string{buffer.begin(), buffer.end()}; +// EXPECT_EQ(lines[2 * i + 1], actual); +// } + +// // Verify random reads interleave too. +// auto const index = RandomIndex(kLineCount); +// auto const position = index * kLineWidth; +// ASSERT_OK_AND_ASSIGN(size, file->ReadAt(position, buffer.size(), buffer.data())); +// EXPECT_EQ(size, kLineWidth); +// auto actual = std::string{buffer.begin(), buffer.end()}; +// EXPECT_EQ(lines[index], actual); + +// // Verify random reads using buffers work. +// ASSERT_OK_AND_ASSIGN(auto b, file->ReadAt(position, kLineWidth)); +// EXPECT_EQ(lines[index], b->ToString()); +// } +// } + +// TEST_F(GcsIntegrationTest, OpenInputFileRandomSeek) { +// auto fs = GcsFileSystem::Make(TestGcsOptions()); + +// // Create a file large enough to make the random access tests non-trivial. +// auto constexpr kLineWidth = 100; +// auto constexpr kLineCount = 4096; +// std::vector lines(kLineCount); +// int lineno = 0; +// std::generate_n(lines.begin(), lines.size(), +// [&] { return RandomLine(++lineno, kLineWidth); }); + +// const auto path = PreexistingBucketPath() + "OpenInputFileRandomSeek/object-name"; +// std::shared_ptr output; +// ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); +// for (auto const& line : lines) { +// ASSERT_OK(output->Write(line.data(), line.size())); +// } +// ASSERT_OK(output->Close()); + +// std::shared_ptr file; +// ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); +// for (int i = 0; i != 32; ++i) { +// SCOPED_TRACE("Iteration " + std::to_string(i)); +// // Verify sequential reads work as expected. +// auto const index = RandomIndex(kLineCount); +// auto const position = index * kLineWidth; +// ASSERT_OK(file->Seek(position)); +// ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); +// EXPECT_EQ(lines[index], actual->ToString()); +// } +// } + +// TEST_F(GcsIntegrationTest, OpenInputFileIoContext) { +// auto fs = GcsFileSystem::Make(TestGcsOptions()); + +// // Create a test file. +// const auto path = PreexistingBucketPath() + "OpenInputFileIoContext/object-name"; +// std::shared_ptr output; +// ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); +// const std::string contents = "The quick brown fox jumps over the lazy dog"; +// ASSERT_OK(output->Write(contents.data(), contents.size())); +// ASSERT_OK(output->Close()); + +// std::shared_ptr file; +// ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); +// EXPECT_EQ(fs->io_context().external_id(), file->io_context().external_id()); +// } + +// TEST_F(GcsIntegrationTest, OpenInputFileInfo) { +// auto fs = GcsFileSystem::Make(TestGcsOptions()); + +// arrow::fs::FileInfo info; +// ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingObjectPath())); + +// std::shared_ptr file; +// ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(info)); + +// std::array buffer{}; +// std::int64_t size; +// auto constexpr kStart = 16; +// ASSERT_OK_AND_ASSIGN(size, file->ReadAt(kStart, buffer.size(), buffer.data())); + +// auto const expected = std::string(kLoremIpsum).substr(kStart); +// EXPECT_EQ(std::string(buffer.data(), size), expected); +// } + +// TEST_F(GcsIntegrationTest, OpenInputFileNotFound) { +// auto fs = GcsFileSystem::Make(TestGcsOptions()); + +// ASSERT_RAISES(IOError, fs->OpenInputFile(NotFoundObjectPath())); +// } + +// TEST_F(GcsIntegrationTest, OpenInputFileInfoInvalid) { +// auto fs = GcsFileSystem::Make(TestGcsOptions()); + +// arrow::fs::FileInfo info; +// ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingBucketPath())); +// ASSERT_RAISES(IOError, fs->OpenInputFile(info)); + +// ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(NotFoundObjectPath())); +// ASSERT_RAISES(IOError, fs->OpenInputFile(info)); +// } + +// TEST_F(GcsIntegrationTest, OpenInputFileClosed) { +// auto fs = GcsFileSystem::Make(TestGcsOptions()); + +// ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenInputFile(PreexistingObjectPath())); +// ASSERT_OK(stream->Close()); +// std::array buffer{}; +// ASSERT_RAISES(Invalid, stream->Tell()); +// ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); +// ASSERT_RAISES(Invalid, stream->Read(buffer.size())); +// ASSERT_RAISES(Invalid, stream->ReadAt(1, buffer.size(), buffer.data())); +// ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); +// ASSERT_RAISES(Invalid, stream->Seek(2)); +// } } // namespace } // namespace fs From 5e4803bcf5c1f2272d7de13a9e1a1027ec9363db Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 29 Sep 2023 14:46:37 +0100 Subject: [PATCH 05/40] Paste in ConfigureAccountKeyCredentials from #12914 --- cpp/src/arrow/filesystem/azurefs.cc | 15 +++++++++++++++ cpp/src/arrow/filesystem/azurefs.h | 3 +++ 2 files changed, 18 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index fb96b90f91f96..46308732a7de5 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -48,6 +48,21 @@ bool AzureOptions::Equals(const AzureOptions& other) const { credentials_kind == other.credentials_kind); } +Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_name, + const std::string& account_key) { + if (this->is_azurite) { + account_blob_url = "http://127.0.0.1:10000/" + account_name + "/"; + account_dfs_url = "http://127.0.0.1:10000/" + account_name + "/"; + } else { + account_dfs_url = "https://" + account_name + ".dfs.core.windows.net/"; + account_blob_url = "https://" + account_name + ".blob.core.windows.net/"; + } + storage_credentials_provider = + std::make_shared(account_name, + account_key); + credentials_kind = AzureCredentialsKind::StorageCredentials; + return Status::OK(); +} namespace { struct AzurePath { diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index e5af4d23aabe5..1f7047ff94c56 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -79,6 +79,9 @@ struct ARROW_EXPORT AzureOptions { AzureOptions(); + Status ConfigureAccountKeyCredentials(const std::string& account_name, + const std::string& account_key); + bool Equals(const AzureOptions& other) const; }; From cc552bfe911e35f15d1f081d7674396b1813233e Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 29 Sep 2023 14:56:21 +0100 Subject: [PATCH 06/40] TestAzureFileSystem builds successfully --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- cpp/src/arrow/filesystem/azurefs_test.cc | 148 +++++++++++------------ 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 46308732a7de5..30aacbf283a6e 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -50,7 +50,7 @@ bool AzureOptions::Equals(const AzureOptions& other) const { Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_name, const std::string& account_key) { - if (this->is_azurite) { + if (this->backend == AzureBackend::Azurite) { account_blob_url = "http://127.0.0.1:10000/" + account_name + "/"; account_dfs_url = "http://127.0.0.1:10000/" + account_name + "/"; } else { diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 4e06f2f8772ee..486e9af730e85 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -50,6 +50,7 @@ #include #include #include +#include namespace arrow { using internal::TemporaryDir; @@ -154,83 +155,82 @@ TEST(AzureFileSystem, OptionsCompare) { EXPECT_TRUE(options.Equals(options)); } +class TestAzureFileSystem : public ::testing::Test { + public: + std::shared_ptr fs_; + std::shared_ptr gen2_client_; + AzureOptions options_; + + void MakeFileSystem() { + const std::string& account_name = GetAzuriteEnv()->account_name(); + const std::string& account_key = GetAzuriteEnv()->account_key(); + options_.backend = AzureBackend::Azurite; + ASSERT_OK(options_.ConfigureAccountKeyCredentials(account_name, account_key)); + gen2_client_ = + std::make_shared( + options_.account_dfs_url, options_.storage_credentials_provider); + ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + } -// class TestAzureFileSystem : public ::testing::Test { -// public: -// std::shared_ptr fs_; -// std::shared_ptr gen2_client_; -// AzureOptions options_; - -// void MakeFileSystem() { -// const std::string& account_name = GetAzuriteEnv()->account_name(); -// const std::string& account_key = GetAzuriteEnv()->account_key(); -// options_.is_azurite = true; -// options_.ConfigureAccountKeyCredentials(account_name, account_key); -// gen2_client_ = -// std::make_shared( -// options_.account_dfs_url, options_.storage_credentials_provider); -// ASSERT_OK_AND_ASSIGN(fs_, AzureBlobFileSystem::Make(options_)); -// } - -// void SetUp() override { -// ASSERT_THAT(GetAzuriteEnv(), NotNull()); -// ASSERT_OK(GetAzuriteEnv()->status()); - -// MakeFileSystem(); -// auto file_system_client = gen2_client_->GetFileSystemClient("container"); -// file_system_client.CreateIfNotExists(); -// file_system_client = gen2_client_->GetFileSystemClient("empty-container"); -// file_system_client.CreateIfNotExists(); -// auto file_client = -// std::make_shared( -// options_.account_blob_url + "container/somefile", -// options_.storage_credentials_provider); -// std::string s = "some data"; -// file_client->UploadFrom( -// const_cast(reinterpret_cast(s.data())), s.size()); -// } + void SetUp() override { + ASSERT_THAT(GetAzuriteEnv(), NotNull()); + ASSERT_OK(GetAzuriteEnv()->status()); + + MakeFileSystem(); + auto file_system_client = gen2_client_->GetFileSystemClient("container"); + file_system_client.CreateIfNotExists(); + file_system_client = gen2_client_->GetFileSystemClient("empty-container"); + file_system_client.CreateIfNotExists(); + auto file_client = + std::make_shared( + options_.account_blob_url + "container/somefile", + options_.storage_credentials_provider); + std::string s = "some data"; + file_client->UploadFrom( + const_cast(reinterpret_cast(s.data())), s.size()); + } -// void TearDown() override { -// auto containers = gen2_client_->ListFileSystems(); -// for (auto container : containers.FileSystems) { -// auto file_system_client = gen2_client_->GetFileSystemClient(container.Name); -// file_system_client.DeleteIfExists(); -// } -// } + void TearDown() override { + auto containers = gen2_client_->ListFileSystems(); + for (auto container : containers.FileSystems) { + auto file_system_client = gen2_client_->GetFileSystemClient(container.Name); + file_system_client.DeleteIfExists(); + } + } -// void AssertObjectContents( -// Azure::Storage::Files::DataLake::DataLakeServiceClient* client, -// const std::string& container, const std::string& path_to_file, -// const std::string& expected) { -// auto path_client = -// std::make_shared( -// client->GetUrl() + container + "/" + path_to_file, -// options_.storage_credentials_provider); -// auto size = path_client->GetProperties().Value.FileSize; -// if (size == 0) { -// ASSERT_EQ(expected, ""); -// return; -// } -// auto buf = AllocateBuffer(size, fs_->io_context().pool()); -// Azure::Storage::Blobs::DownloadBlobToOptions download_options; -// Azure::Core::Http::HttpRange range; -// range.Offset = 0; -// range.Length = size; -// download_options.Range = Azure::Nullable(range); -// auto file_client = -// std::make_shared( -// client->GetUrl() + container + "/" + path_to_file, -// options_.storage_credentials_provider); -// auto result = file_client -// ->DownloadTo(reinterpret_cast(buf->get()->mutable_data()), -// size, download_options) -// .Value; -// auto buf_data = std::move(buf->get()); -// auto expected_data = std::make_shared( -// reinterpret_cast(expected.data()), expected.size()); -// AssertBufferEqual(*buf_data, *expected_data); -// } -// }; + void AssertObjectContents( + Azure::Storage::Files::DataLake::DataLakeServiceClient* client, + const std::string& container, const std::string& path_to_file, + const std::string& expected) { + auto path_client = + std::make_shared( + client->GetUrl() + container + "/" + path_to_file, + options_.storage_credentials_provider); + auto size = path_client->GetProperties().Value.FileSize; + if (size == 0) { + ASSERT_EQ(expected, ""); + return; + } + auto buf = AllocateBuffer(size, fs_->io_context().pool()); + Azure::Storage::Blobs::DownloadBlobToOptions download_options; + Azure::Core::Http::HttpRange range; + range.Offset = 0; + range.Length = size; + download_options.Range = Azure::Nullable(range); + auto file_client = + std::make_shared( + client->GetUrl() + container + "/" + path_to_file, + options_.storage_credentials_provider); + auto result = file_client + ->DownloadTo(reinterpret_cast(buf->get()->mutable_data()), + size, download_options) + .Value; + auto buf_data = std::move(buf->get()); + auto expected_data = std::make_shared( + reinterpret_cast(expected.data()), expected.size()); + AssertBufferEqual(*buf_data, *expected_data); + } +}; // TEST_F(TestAzureFileSystem, FromAccountKey) { // auto options = AzureOptions::FromAccountKey(GetAzuriteEnv()->account_name(), From 8874fbeff2b89ea45ddf5db0a10bec7b4d320820 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Fri, 29 Sep 2023 19:24:07 +0100 Subject: [PATCH 07/40] Paste in OpenInputFile from #12914 --- cpp/src/arrow/filesystem/azurefs.cc | 92 +++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 30aacbf283a6e..0ffc2c17d03a8 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -353,6 +353,86 @@ class AzureFileSystem::Impl { } const AzureOptions& options() const { return options_; } + + Result> OpenInputFile(const std::string& s, + AzureBlobFileSystem* fs) { + ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); + + if (path.empty()) { + return ::arrow::fs::internal::PathNotFound(path.full_path); + } + if (!is_hierarchical_namespace_enabled_) { + if (path.path_to_file_parts.size() > 1) { + return Status::IOError( + "Invalid Azure Blob Storage path provided," + " hierarchical namespace not enabled in storage account"); + } + } + ARROW_ASSIGN_OR_RAISE(auto response, FileExists(dfs_endpoint_url_ + path.full_path)); + if (!response) { + return ::arrow::fs::internal::PathNotFound(path.full_path); + } + std::shared_ptr path_client; + ARROW_ASSIGN_OR_RAISE( + path_client, InitPathClient( + options_, dfs_endpoint_url_ + path.full_path, path.container, + path.path_to_file)); + + std::shared_ptr file_client; + ARROW_ASSIGN_OR_RAISE( + file_client, InitPathClient( + options_, dfs_endpoint_url_ + path.full_path, path.container, + path.path_to_file)); + + auto ptr = std::make_shared(path_client, file_client, + fs->io_context(), path); + RETURN_NOT_OK(ptr->Init()); + return ptr; + } +}; + +Result> OpenInputFile(const FileInfo& info, + AzureBlobFileSystem* fs) { + if (info.type() == FileType::NotFound) { + return ::arrow::fs::internal::PathNotFound(info.path()); + } + if (info.type() != FileType::File && info.type() != FileType::Unknown) { + return ::arrow::fs::internal::NotAFile(info.path()); + } + + ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); + + if (!is_hierarchical_namespace_enabled_) { + if (path.path_to_file_parts.size() > 1) { + return Status::IOError( + "Invalid Azure Blob Storage path provided, hierarchical namespace" + " not enabled in storage account"); + } + } + ARROW_ASSIGN_OR_RAISE(auto response, FileExists(dfs_endpoint_url_ + info.path())); + if (!response) { + return ::arrow::fs::internal::PathNotFound(info.path()); + } + std::shared_ptr path_client; + ARROW_ASSIGN_OR_RAISE( + path_client, + InitPathClient( + options_, dfs_endpoint_url_ + info.path(), path.container, path.path_to_file)); + + std::shared_ptr file_client; + ARROW_ASSIGN_OR_RAISE( + file_client, + InitPathClient( + options_, dfs_endpoint_url_ + info.path(), path.container, path.path_to_file)); + + auto ptr = std::make_shared(path_client, file_client, fs->io_context(), + path, info.size()); + RETURN_NOT_OK(ptr->Init()); + return ptr; +} + +protected: +AzureOptions options_; }; const AzureOptions& AzureFileSystem::options() const { return impl_->options(); } @@ -405,23 +485,23 @@ Status AzureFileSystem::CopyFile(const std::string& src, const std::string& dest } Result> AzureFileSystem::OpenInputStream( - const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + const std::string& s) { + return impl_->OpenInputFile(s, this); } Result> AzureFileSystem::OpenInputStream( const FileInfo& info) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + return impl_->OpenInputFile(info, this); } Result> AzureFileSystem::OpenInputFile( - const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + const std::string& s) { + return impl_->OpenInputFile(s, this); } Result> AzureFileSystem::OpenInputFile( const FileInfo& info) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + return impl_->OpenInputFile(info, this); } Result> AzureFileSystem::OpenOutputStream( From 7402412fb0245d19b865b2f14c8ec637bfeb6367 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 1 Oct 2023 17:18:28 +0100 Subject: [PATCH 08/40] First test builds successfully --- cpp/src/arrow/filesystem/azurefs.cc | 150 ++++++++------------- cpp/src/arrow/filesystem/azurefs_test.cc | 159 +++++++++++++---------- 2 files changed, 146 insertions(+), 163 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0ffc2c17d03a8..12c325415762c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include "arrow/buffer.h" @@ -26,12 +27,11 @@ #include "arrow/filesystem/util_internal.h" #include "arrow/result.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/future.h" #include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" -#include "arrow/util/future.h" #include "arrow/util/string.h" - namespace arrow { namespace fs { @@ -81,7 +81,8 @@ struct AzurePath { // Expected input here => s = synapsemlfs/testdir/testfile.txt, // http://127.0.0.1/accountName/pathToBlob auto src = internal::RemoveTrailingSlash(s); - if (arrow::internal::StartsWith(src, "https://127.0.0.1") || arrow::internal::StartsWith(src, "http://127.0.0.1")) { + if (arrow::internal::StartsWith(src, "https://127.0.0.1") || + arrow::internal::StartsWith(src, "http://127.0.0.1")) { RETURN_NOT_OK(FromLocalHostString(&src)); } auto input_path = std::string(src.data()); @@ -174,11 +175,9 @@ std::shared_ptr GetObjectMetadata(const ObjectResult& re class ObjectInputFile final : public io::RandomAccessFile { public: ObjectInputFile( - std::shared_ptr& path_client, std::shared_ptr& file_client, const io::IOContext& io_context, const AzurePath& path, int64_t size = kNoSize) - : path_client_(std::move(path_client)), - file_client_(std::move(file_client)), + : file_client_(std::move(file_client)), io_context_(io_context), path_(path), content_length_(size) {} @@ -189,7 +188,7 @@ class ObjectInputFile final : public io::RandomAccessFile { return Status::OK(); } try { - auto properties = path_client_->GetProperties(); + auto properties = file_client_->GetProperties(); if (properties.Value.IsDirectory) { return ::arrow::fs::internal::NotAFile(path_.full_path); } @@ -225,12 +224,11 @@ class ObjectInputFile final : public io::RandomAccessFile { } Future> ReadMetadataAsync( - const io::IOContext& io_context) override { + const io::IOContext& io_context) override { return metadata_; } Status Close() override { - path_client_ = nullptr; file_client_ = nullptr; closed_ = true; return Status::OK(); @@ -312,7 +310,6 @@ class ObjectInputFile final : public io::RandomAccessFile { } protected: - std::shared_ptr path_client_; std::shared_ptr file_client_; const io::IOContext io_context_; AzurePath path_; @@ -331,108 +328,73 @@ class ObjectInputFile final : public io::RandomAccessFile { class AzureFileSystem::Impl { public: io::IOContext io_context_; - bool is_hierarchical_namespace_enabled_; + std::shared_ptr service_client_; AzureOptions options_; explicit Impl(AzureOptions options, io::IOContext io_context) : io_context_(io_context), options_(std::move(options)) {} Status Init() { - // TODO: GH-18014 Delete this once we have a proper implementation. This just - // initializes a pointless Azure blob service client with a fake endpoint to ensure - // the build will fail if the Azure SDK build is broken. - auto default_credential = std::make_shared(); - auto service_client = Azure::Storage::Blobs::BlobServiceClient( - "http://fake-blob-storage-endpoint", default_credential); - if (options_.backend == AzureBackend::Azurite) { - // gen1Client_->GetAccountInfo().Value.IsHierarchicalNamespaceEnabled - // throws error in azurite - is_hierarchical_namespace_enabled_ = false; - } + service_client_ = + std::make_shared( + options_.account_dfs_url, options_.storage_credentials_provider); return Status::OK(); } const AzureOptions& options() const { return options_; } Result> OpenInputFile(const std::string& s, - AzureBlobFileSystem* fs) { + AzureFileSystem* fs) { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); if (path.empty()) { return ::arrow::fs::internal::PathNotFound(path.full_path); } - if (!is_hierarchical_namespace_enabled_) { - if (path.path_to_file_parts.size() > 1) { - return Status::IOError( - "Invalid Azure Blob Storage path provided," - " hierarchical namespace not enabled in storage account"); - } - } - ARROW_ASSIGN_OR_RAISE(auto response, FileExists(dfs_endpoint_url_ + path.full_path)); - if (!response) { - return ::arrow::fs::internal::PathNotFound(path.full_path); - } - std::shared_ptr path_client; - ARROW_ASSIGN_OR_RAISE( - path_client, InitPathClient( - options_, dfs_endpoint_url_ + path.full_path, path.container, - path.path_to_file)); - - std::shared_ptr file_client; - ARROW_ASSIGN_OR_RAISE( - file_client, InitPathClient( - options_, dfs_endpoint_url_ + path.full_path, path.container, - path.path_to_file)); - - auto ptr = std::make_shared(path_client, file_client, - fs->io_context(), path); + + // TODO: Wrap this is a try catch and return + // `::arrow::fs::internal::NotAFile(blob_client.GetUrl());` if it fails because the + // file does not exist. + auto file_client = + std::make_shared( + std::move(service_client_->GetFileSystemClient(path.container) + .GetFileClient(path.path_to_file))); + // auto file_client = + // std::make_shared(options_.account_dfs_url, + // path.container, path.path_to_file, options_.storage_credentials_provider); + + auto ptr = std::make_shared(file_client, fs->io_context(), path); RETURN_NOT_OK(ptr->Init()); return ptr; } -}; - -Result> OpenInputFile(const FileInfo& info, - AzureBlobFileSystem* fs) { - if (info.type() == FileType::NotFound) { - return ::arrow::fs::internal::PathNotFound(info.path()); - } - if (info.type() != FileType::File && info.type() != FileType::Unknown) { - return ::arrow::fs::internal::NotAFile(info.path()); - } - - ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); - if (!is_hierarchical_namespace_enabled_) { - if (path.path_to_file_parts.size() > 1) { - return Status::IOError( - "Invalid Azure Blob Storage path provided, hierarchical namespace" - " not enabled in storage account"); - } - } - ARROW_ASSIGN_OR_RAISE(auto response, FileExists(dfs_endpoint_url_ + info.path())); - if (!response) { - return ::arrow::fs::internal::PathNotFound(info.path()); - } - std::shared_ptr path_client; - ARROW_ASSIGN_OR_RAISE( - path_client, - InitPathClient( - options_, dfs_endpoint_url_ + info.path(), path.container, path.path_to_file)); - - std::shared_ptr file_client; - ARROW_ASSIGN_OR_RAISE( - file_client, - InitPathClient( - options_, dfs_endpoint_url_ + info.path(), path.container, path.path_to_file)); - - auto ptr = std::make_shared(path_client, file_client, fs->io_context(), - path, info.size()); - RETURN_NOT_OK(ptr->Init()); - return ptr; -} - -protected: -AzureOptions options_; + // Result> OpenInputFile(const FileInfo& info, + // AzureFileSystem* fs) { + // if (info.type() == FileType::NotFound) { + // return ::arrow::fs::internal::PathNotFound(info.path()); + // } + // if (info.type() != FileType::File && info.type() != FileType::Unknown) { + // return ::arrow::fs::internal::NotAFile(info.path()); + // } + + // ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); + + // std::shared_ptr path_client; + // ARROW_ASSIGN_OR_RAISE( + // path_client, + // InitPathClient( + // options_, dfs_endpoint_url_ + info.path(), path.container, path.path_to_file)); + + // std::shared_ptr file_client; + // ARROW_ASSIGN_OR_RAISE( + // file_client, + // InitPathClient( + // options_, dfs_endpoint_url_ + info.path(), path.container, path.path_to_file)); + + // auto ptr = std::make_shared(path_client, file_client, fs->io_context(), + // path, info.size()); + // RETURN_NOT_OK(ptr->Init()); + // return ptr; + // } }; const AzureOptions& AzureFileSystem::options() const { return impl_->options(); } @@ -491,7 +453,8 @@ Result> AzureFileSystem::OpenInputStream( Result> AzureFileSystem::OpenInputStream( const FileInfo& info) { - return impl_->OpenInputFile(info, this); + // return impl_->OpenInputFile(info, this); + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); } Result> AzureFileSystem::OpenInputFile( @@ -501,7 +464,8 @@ Result> AzureFileSystem::OpenInputFile( Result> AzureFileSystem::OpenInputFile( const FileInfo& info) { - return impl_->OpenInputFile(info, this); + return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + // return impl_->OpenInputFile(info, this); } Result> AzureFileSystem::OpenOutputStream( diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 486e9af730e85..2178381387924 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -40,6 +40,7 @@ #include #include +#include #include #include "arrow/testing/gtest_util.h" @@ -160,6 +161,8 @@ class TestAzureFileSystem : public ::testing::Test { std::shared_ptr fs_; std::shared_ptr gen2_client_; AzureOptions options_; + std::mt19937_64 generator_; + std::string container_name_; void MakeFileSystem() { const std::string& account_name = GetAzuriteEnv()->account_name(); @@ -177,17 +180,10 @@ class TestAzureFileSystem : public ::testing::Test { ASSERT_OK(GetAzuriteEnv()->status()); MakeFileSystem(); - auto file_system_client = gen2_client_->GetFileSystemClient("container"); + generator_ = std::mt19937_64(std::random_device()()); + container_name_ = RandomChars(32); + auto file_system_client = gen2_client_->GetFileSystemClient(container_name_); file_system_client.CreateIfNotExists(); - file_system_client = gen2_client_->GetFileSystemClient("empty-container"); - file_system_client.CreateIfNotExists(); - auto file_client = - std::make_shared( - options_.account_blob_url + "container/somefile", - options_.storage_credentials_provider); - std::string s = "some data"; - file_client->UploadFrom( - const_cast(reinterpret_cast(s.data())), s.size()); } void TearDown() override { @@ -230,68 +226,91 @@ class TestAzureFileSystem : public ::testing::Test { reinterpret_cast(expected.data()), expected.size()); AssertBufferEqual(*buf_data, *expected_data); } -}; -// TEST_F(TestAzureFileSystem, FromAccountKey) { -// auto options = AzureOptions::FromAccountKey(GetAzuriteEnv()->account_name(), -// GetAzuriteEnv()->account_key()) -// .ValueOrDie(); -// ASSERT_EQ(options.credentials_kind, -// arrow::fs::AzureCredentialsKind::StorageCredentials); -// ASSERT_NE(options.storage_credentials_provider, nullptr); -// } + std::string PreexistingContainerName() const { return container_name_; } -// TEST_F(GcsIntegrationTest, OpenInputFileMixedReadVsReadAt) { -// auto fs = GcsFileSystem::Make(TestGcsOptions()); + std::string PreexistingContainerPath() const { + return PreexistingContainerName() + '/'; + } -// // Create a file large enough to make the random access tests non-trivial. -// auto constexpr kLineWidth = 100; -// auto constexpr kLineCount = 4096; -// std::vector lines(kLineCount); -// int lineno = 0; -// std::generate_n(lines.begin(), lines.size(), -// [&] { return RandomLine(++lineno, kLineWidth); }); + std::string RandomLine(int lineno, std::size_t width) { + auto line = std::to_string(lineno) + ": "; + line += RandomChars(width - line.size() - 1); + line += '\n'; + return line; + } -// const auto path = -// PreexistingBucketPath() + "OpenInputFileMixedReadVsReadAt/object-name"; -// std::shared_ptr output; -// ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); -// for (auto const& line : lines) { -// ASSERT_OK(output->Write(line.data(), line.size())); -// } -// ASSERT_OK(output->Close()); + uint8_t RandomInteger() { + return std::uniform_int_distribution()(generator_); + } -// std::shared_ptr file; -// ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); -// for (int i = 0; i != 32; ++i) { -// SCOPED_TRACE("Iteration " + std::to_string(i)); -// // Verify sequential reads work as expected. -// std::array buffer{}; -// std::int64_t size; -// { -// ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); -// EXPECT_EQ(lines[2 * i], actual->ToString()); -// } -// { -// ASSERT_OK_AND_ASSIGN(size, file->Read(buffer.size(), buffer.data())); -// EXPECT_EQ(size, kLineWidth); -// auto actual = std::string{buffer.begin(), buffer.end()}; -// EXPECT_EQ(lines[2 * i + 1], actual); -// } - -// // Verify random reads interleave too. -// auto const index = RandomIndex(kLineCount); -// auto const position = index * kLineWidth; -// ASSERT_OK_AND_ASSIGN(size, file->ReadAt(position, buffer.size(), buffer.data())); -// EXPECT_EQ(size, kLineWidth); -// auto actual = std::string{buffer.begin(), buffer.end()}; -// EXPECT_EQ(lines[index], actual); - -// // Verify random reads using buffers work. -// ASSERT_OK_AND_ASSIGN(auto b, file->ReadAt(position, kLineWidth)); -// EXPECT_EQ(lines[index], b->ToString()); -// } -// } + std::size_t RandomIndex(std::size_t end) { + return std::uniform_int_distribution(0, end - 1)(generator_); + } + + std::string RandomChars(std::size_t count) { + auto const fillers = std::string("abcdefghijlkmnopqrstuvwxyz0123456789"); + std::uniform_int_distribution d(0, fillers.size() - 1); + std::string s; + std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(generator_)]; }); + return s; + } +}; + +TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { + // Create a file large enough to make the random access tests non-trivial. + auto constexpr kLineWidth = 100; + auto constexpr kLineCount = 4096; + std::vector lines(kLineCount); + int lineno = 0; + std::generate_n(lines.begin(), lines.size(), + [&] { return RandomLine(++lineno, kLineWidth); }); + + const auto path_to_file = "OpenInputFileMixedReadVsReadAt/object-name"; + const auto path = PreexistingContainerPath() + path_to_file; + + // TODO: Switch to using Azure filesystem to write once its implemented. + auto file_client = gen2_client_->GetFileSystemClient("container").GetFileClient(path); + int64_t total_size = 0; + for (auto const& line : lines) { + auto bufferStream = Azure::Core::IO::MemoryBodyStream( + reinterpret_cast(line.data()), line.size()); + file_client.Append(bufferStream, 0); + total_size += line.size(); + } + file_client.Flush(total_size); + + std::shared_ptr file; + ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + for (int i = 0; i != 32; ++i) { + SCOPED_TRACE("Iteration " + std::to_string(i)); + // Verify sequential reads work as expected. + std::array buffer{}; + std::int64_t size; + { + ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); + EXPECT_EQ(lines[2 * i], actual->ToString()); + } + { + ASSERT_OK_AND_ASSIGN(size, file->Read(buffer.size(), buffer.data())); + EXPECT_EQ(size, kLineWidth); + auto actual = std::string{buffer.begin(), buffer.end()}; + EXPECT_EQ(lines[2 * i + 1], actual); + } + + // Verify random reads interleave too. + auto const index = RandomIndex(kLineCount); + auto const position = index * kLineWidth; + ASSERT_OK_AND_ASSIGN(size, file->ReadAt(position, buffer.size(), buffer.data())); + EXPECT_EQ(size, kLineWidth); + auto actual = std::string{buffer.begin(), buffer.end()}; + EXPECT_EQ(lines[index], actual); + + // Verify random reads using buffers work. + ASSERT_OK_AND_ASSIGN(auto b, file->ReadAt(position, kLineWidth)); + EXPECT_EQ(lines[index], b->ToString()); + } +} // TEST_F(GcsIntegrationTest, OpenInputFileRandomSeek) { // auto fs = GcsFileSystem::Make(TestGcsOptions()); @@ -304,7 +323,7 @@ class TestAzureFileSystem : public ::testing::Test { // std::generate_n(lines.begin(), lines.size(), // [&] { return RandomLine(++lineno, kLineWidth); }); -// const auto path = PreexistingBucketPath() + "OpenInputFileRandomSeek/object-name"; +// const auto path = PreexistingContainerPath() + "OpenInputFileRandomSeek/object-name"; // std::shared_ptr output; // ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); // for (auto const& line : lines) { @@ -329,7 +348,7 @@ class TestAzureFileSystem : public ::testing::Test { // auto fs = GcsFileSystem::Make(TestGcsOptions()); // // Create a test file. -// const auto path = PreexistingBucketPath() + "OpenInputFileIoContext/object-name"; +// const auto path = PreexistingContainerPath() + "OpenInputFileIoContext/object-name"; // std::shared_ptr output; // ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); // const std::string contents = "The quick brown fox jumps over the lazy dog"; @@ -369,7 +388,7 @@ class TestAzureFileSystem : public ::testing::Test { // auto fs = GcsFileSystem::Make(TestGcsOptions()); // arrow::fs::FileInfo info; -// ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingBucketPath())); +// ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingContainerPath())); // ASSERT_RAISES(IOError, fs->OpenInputFile(info)); // ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(NotFoundObjectPath())); From 8d574dca0e5c6401217f6cffc903445c9dfcb9ce Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 1 Oct 2023 22:43:08 +0100 Subject: [PATCH 09/40] First ReadAt test passes --- cpp/src/arrow/filesystem/azurefs.cc | 44 ++++++++++-------------- cpp/src/arrow/filesystem/azurefs_test.cc | 12 ++----- 2 files changed, 22 insertions(+), 34 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 12c325415762c..24562b5da13ea 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -174,9 +174,9 @@ std::shared_ptr GetObjectMetadata(const ObjectResult& re class ObjectInputFile final : public io::RandomAccessFile { public: - ObjectInputFile( - std::shared_ptr& file_client, - const io::IOContext& io_context, const AzurePath& path, int64_t size = kNoSize) + ObjectInputFile(std::shared_ptr& file_client, + const io::IOContext& io_context, const AzurePath& path, + int64_t size = kNoSize) : file_client_(std::move(file_client)), io_context_(io_context), path_(path), @@ -189,10 +189,7 @@ class ObjectInputFile final : public io::RandomAccessFile { } try { auto properties = file_client_->GetProperties(); - if (properties.Value.IsDirectory) { - return ::arrow::fs::internal::NotAFile(path_.full_path); - } - content_length_ = properties.Value.FileSize; + content_length_ = properties.Value.BlobSize; metadata_ = GetObjectMetadata(properties.Value.Metadata); return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { @@ -276,6 +273,8 @@ class ObjectInputFile final : public io::RandomAccessFile { .Value; return result.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { + // TODO: return `::arrow::fs::internal::NotAFile(blob_client.GetUrl());` if the + // file does not exist. return Status::IOError(exception.RawResponse->GetReasonPhrase()); } } @@ -310,7 +309,7 @@ class ObjectInputFile final : public io::RandomAccessFile { } protected: - std::shared_ptr file_client_; + std::shared_ptr file_client_; const io::IOContext io_context_; AzurePath path_; @@ -328,16 +327,15 @@ class ObjectInputFile final : public io::RandomAccessFile { class AzureFileSystem::Impl { public: io::IOContext io_context_; - std::shared_ptr service_client_; + std::shared_ptr service_client_; AzureOptions options_; explicit Impl(AzureOptions options, io::IOContext io_context) : io_context_(io_context), options_(std::move(options)) {} Status Init() { - service_client_ = - std::make_shared( - options_.account_dfs_url, options_.storage_credentials_provider); + service_client_ = std::make_shared( + options_.account_blob_url, options_.storage_credentials_provider); return Status::OK(); } @@ -351,16 +349,9 @@ class AzureFileSystem::Impl { return ::arrow::fs::internal::PathNotFound(path.full_path); } - // TODO: Wrap this is a try catch and return - // `::arrow::fs::internal::NotAFile(blob_client.GetUrl());` if it fails because the - // file does not exist. - auto file_client = - std::make_shared( - std::move(service_client_->GetFileSystemClient(path.container) - .GetFileClient(path.path_to_file))); - // auto file_client = - // std::make_shared(options_.account_dfs_url, - // path.container, path.path_to_file, options_.storage_credentials_provider); + auto file_client = std::make_shared( + std::move(service_client_->GetBlobContainerClient(path.container) + .GetBlobClient(path.path_to_file))); auto ptr = std::make_shared(file_client, fs->io_context(), path); RETURN_NOT_OK(ptr->Init()); @@ -382,15 +373,18 @@ class AzureFileSystem::Impl { // ARROW_ASSIGN_OR_RAISE( // path_client, // InitPathClient( - // options_, dfs_endpoint_url_ + info.path(), path.container, path.path_to_file)); + // options_, dfs_endpoint_url_ + info.path(), path.container, + // path.path_to_file)); // std::shared_ptr file_client; // ARROW_ASSIGN_OR_RAISE( // file_client, // InitPathClient( - // options_, dfs_endpoint_url_ + info.path(), path.container, path.path_to_file)); + // options_, dfs_endpoint_url_ + info.path(), path.container, + // path.path_to_file)); - // auto ptr = std::make_shared(path_client, file_client, fs->io_context(), + // auto ptr = std::make_shared(path_client, file_client, + // fs->io_context(), // path, info.size()); // RETURN_NOT_OK(ptr->Init()); // return ptr; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 2178381387924..393bfe60eed5d 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -270,15 +270,9 @@ TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { const auto path = PreexistingContainerPath() + path_to_file; // TODO: Switch to using Azure filesystem to write once its implemented. - auto file_client = gen2_client_->GetFileSystemClient("container").GetFileClient(path); - int64_t total_size = 0; - for (auto const& line : lines) { - auto bufferStream = Azure::Core::IO::MemoryBodyStream( - reinterpret_cast(line.data()), line.size()); - file_client.Append(bufferStream, 0); - total_size += line.size(); - } - file_client.Flush(total_size); + auto file_client = gen2_client_->GetFileSystemClient(PreexistingContainerName()).GetFileClient(path_to_file); + std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); + file_client.UploadFrom(reinterpret_cast(all_lines.data()), kLineWidth * kLineCount); std::shared_ptr file; ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); From 07947f3ec16bfa61e46a5fdc091a4d1c68cdb64e Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 3 Oct 2023 09:30:06 +0100 Subject: [PATCH 10/40] Majority of tests working --- cpp/src/arrow/filesystem/azurefs.cc | 1 + cpp/src/arrow/filesystem/azurefs_test.cc | 152 +++++++++++++---------- 2 files changed, 87 insertions(+), 66 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 24562b5da13ea..bf216c6c1ad49 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -193,6 +193,7 @@ class ObjectInputFile final : public io::RandomAccessFile { metadata_ = GetObjectMetadata(properties.Value.Metadata); return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { + // TODO: Only return path not found if Azure gave us path not found. return ::arrow::fs::internal::PathNotFound(path_.full_path); } } diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 393bfe60eed5d..ac0e121838f6f 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -63,6 +63,15 @@ using ::testing::IsEmpty; using ::testing::Not; using ::testing::NotNull; +auto const* kLoremIpsum = R"""( +Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor +incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis +nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. +Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu +fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in +culpa qui officia deserunt mollit anim id est laborum. +)"""; + class AzuriteEnv : public ::testing::Environment { public: AzuriteEnv() { @@ -184,6 +193,10 @@ class TestAzureFileSystem : public ::testing::Test { container_name_ = RandomChars(32); auto file_system_client = gen2_client_->GetFileSystemClient(container_name_); file_system_client.CreateIfNotExists(); + + auto file_client = file_system_client.GetFileClient(PreexistingObjectName()); + file_client.UploadFrom(reinterpret_cast(kLoremIpsum), + strlen(kLoremIpsum)); } void TearDown() override { @@ -233,6 +246,14 @@ class TestAzureFileSystem : public ::testing::Test { return PreexistingContainerName() + '/'; } + static std::string PreexistingObjectName() { return "test-object-name"; } + + std::string PreexistingObjectPath() const { + return PreexistingContainerPath() + PreexistingObjectName(); + } + + std::string NotFoundObjectPath() { return PreexistingContainerPath() + "not-found"; } + std::string RandomLine(int lineno, std::size_t width) { auto line = std::to_string(lineno) + ": "; line += RandomChars(width - line.size() - 1); @@ -255,6 +276,16 @@ class TestAzureFileSystem : public ::testing::Test { std::generate_n(std::back_inserter(s), count, [&] { return fillers[d(generator_)]; }); return s; } + + void UploadLines(std::vector lines, const char* path_to_file, + int total_size) { + // TODO: Switch to using Azure filesystem to write once its implemented. + auto file_client = gen2_client_->GetFileSystemClient(PreexistingContainerName()) + .GetFileClient(path_to_file); + std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); + file_client.UploadFrom(reinterpret_cast(all_lines.data()), + total_size); + } }; TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { @@ -269,10 +300,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { const auto path_to_file = "OpenInputFileMixedReadVsReadAt/object-name"; const auto path = PreexistingContainerPath() + path_to_file; - // TODO: Switch to using Azure filesystem to write once its implemented. - auto file_client = gen2_client_->GetFileSystemClient(PreexistingContainerName()).GetFileClient(path_to_file); - std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); - file_client.UploadFrom(reinterpret_cast(all_lines.data()), kLineWidth * kLineCount); + UploadLines(lines, path_to_file, kLineCount * kLineWidth); std::shared_ptr file; ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); @@ -306,57 +334,52 @@ TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { } } -// TEST_F(GcsIntegrationTest, OpenInputFileRandomSeek) { -// auto fs = GcsFileSystem::Make(TestGcsOptions()); +TEST_F(TestAzureFileSystem, OpenInputFileRandomSeek) { + // Create a file large enough to make the random access tests non-trivial. + auto constexpr kLineWidth = 100; + auto constexpr kLineCount = 4096; + std::vector lines(kLineCount); + int lineno = 0; + std::generate_n(lines.begin(), lines.size(), + [&] { return RandomLine(++lineno, kLineWidth); }); -// // Create a file large enough to make the random access tests non-trivial. -// auto constexpr kLineWidth = 100; -// auto constexpr kLineCount = 4096; -// std::vector lines(kLineCount); -// int lineno = 0; -// std::generate_n(lines.begin(), lines.size(), -// [&] { return RandomLine(++lineno, kLineWidth); }); - -// const auto path = PreexistingContainerPath() + "OpenInputFileRandomSeek/object-name"; -// std::shared_ptr output; -// ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); -// for (auto const& line : lines) { -// ASSERT_OK(output->Write(line.data(), line.size())); -// } -// ASSERT_OK(output->Close()); + const auto path_to_file = "OpenInputFileRandomSeek/object-name"; + const auto path = PreexistingContainerPath() + path_to_file; + std::shared_ptr output; -// std::shared_ptr file; -// ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); -// for (int i = 0; i != 32; ++i) { -// SCOPED_TRACE("Iteration " + std::to_string(i)); -// // Verify sequential reads work as expected. -// auto const index = RandomIndex(kLineCount); -// auto const position = index * kLineWidth; -// ASSERT_OK(file->Seek(position)); -// ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); -// EXPECT_EQ(lines[index], actual->ToString()); -// } -// } + UploadLines(lines, path_to_file, kLineCount * kLineWidth); -// TEST_F(GcsIntegrationTest, OpenInputFileIoContext) { -// auto fs = GcsFileSystem::Make(TestGcsOptions()); + std::shared_ptr file; + ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + for (int i = 0; i != 32; ++i) { + SCOPED_TRACE("Iteration " + std::to_string(i)); + // Verify sequential reads work as expected. + auto const index = RandomIndex(kLineCount); + auto const position = index * kLineWidth; + ASSERT_OK(file->Seek(position)); + ASSERT_OK_AND_ASSIGN(auto actual, file->Read(kLineWidth)); + EXPECT_EQ(lines[index], actual->ToString()); + } +} -// // Create a test file. -// const auto path = PreexistingContainerPath() + "OpenInputFileIoContext/object-name"; -// std::shared_ptr output; -// ASSERT_OK_AND_ASSIGN(output, fs->OpenOutputStream(path, {})); -// const std::string contents = "The quick brown fox jumps over the lazy dog"; -// ASSERT_OK(output->Write(contents.data(), contents.size())); -// ASSERT_OK(output->Close()); +TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { + // Create a test file. + const auto path_to_file = "OpenInputFileIoContext/object-name"; + const auto path = PreexistingContainerPath() + path_to_file; + const std::string contents = "The quick brown fox jumps over the lazy dog"; -// std::shared_ptr file; -// ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(path)); -// EXPECT_EQ(fs->io_context().external_id(), file->io_context().external_id()); -// } + auto file_client = gen2_client_->GetFileSystemClient(PreexistingContainerName()) + .GetFileClient(path_to_file); + file_client.UploadFrom(reinterpret_cast(contents.data()), + contents.length()); -// TEST_F(GcsIntegrationTest, OpenInputFileInfo) { -// auto fs = GcsFileSystem::Make(TestGcsOptions()); + std::shared_ptr file; + ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(path)); + EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); +} +// TODO: Implement this once GetFileInfo is implemented. +// TEST_F(TestAzureFileSystem, OpenInputFileInfo) { // arrow::fs::FileInfo info; // ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingObjectPath())); @@ -372,12 +395,11 @@ TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { // EXPECT_EQ(std::string(buffer.data(), size), expected); // } -// TEST_F(GcsIntegrationTest, OpenInputFileNotFound) { -// auto fs = GcsFileSystem::Make(TestGcsOptions()); - -// ASSERT_RAISES(IOError, fs->OpenInputFile(NotFoundObjectPath())); -// } +TEST_F(TestAzureFileSystem, OpenInputFileNotFound) { + ASSERT_RAISES(IOError, fs_->OpenInputFile(NotFoundObjectPath())); +} +// TODO: Implement this once GetFileInfo is implemented. // TEST_F(GcsIntegrationTest, OpenInputFileInfoInvalid) { // auto fs = GcsFileSystem::Make(TestGcsOptions()); @@ -389,19 +411,17 @@ TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { // ASSERT_RAISES(IOError, fs->OpenInputFile(info)); // } -// TEST_F(GcsIntegrationTest, OpenInputFileClosed) { -// auto fs = GcsFileSystem::Make(TestGcsOptions()); - -// ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenInputFile(PreexistingObjectPath())); -// ASSERT_OK(stream->Close()); -// std::array buffer{}; -// ASSERT_RAISES(Invalid, stream->Tell()); -// ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); -// ASSERT_RAISES(Invalid, stream->Read(buffer.size())); -// ASSERT_RAISES(Invalid, stream->ReadAt(1, buffer.size(), buffer.data())); -// ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); -// ASSERT_RAISES(Invalid, stream->Seek(2)); -// } +TEST_F(TestAzureFileSystem, OpenInputFileClosed) { + ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(PreexistingObjectPath())); + ASSERT_OK(stream->Close()); + std::array buffer{}; + ASSERT_RAISES(Invalid, stream->Tell()); + ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); + ASSERT_RAISES(Invalid, stream->Read(buffer.size())); + ASSERT_RAISES(Invalid, stream->ReadAt(1, buffer.size(), buffer.data())); + ASSERT_RAISES(Invalid, stream->ReadAt(1, 1)); + ASSERT_RAISES(Invalid, stream->Seek(2)); +} } // namespace } // namespace fs From bb8421fdd19c4117729fd51a839e14c4c39fcde9 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 3 Oct 2023 10:02:15 +0100 Subject: [PATCH 11/40] Implement open file from info and enable the relevant tests --- cpp/src/arrow/filesystem/azurefs.cc | 76 ++++++++++-------------- cpp/src/arrow/filesystem/azurefs_test.cc | 46 +++++++------- 2 files changed, 56 insertions(+), 66 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index bf216c6c1ad49..b7c6ff9c41ebd 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -173,6 +173,16 @@ std::shared_ptr GetObjectMetadata(const ObjectResult& re } class ObjectInputFile final : public io::RandomAccessFile { + protected: + std::shared_ptr file_client_; + const io::IOContext io_context_; + AzurePath path_; + + bool closed_ = false; + int64_t pos_ = 0; + int64_t content_length_ = kNoSize; + std::shared_ptr metadata_; + public: ObjectInputFile(std::shared_ptr& file_client, const io::IOContext& io_context, const AzurePath& path, @@ -308,16 +318,6 @@ class ObjectInputFile final : public io::RandomAccessFile { pos_ += buffer->size(); return std::move(buffer); } - - protected: - std::shared_ptr file_client_; - const io::IOContext io_context_; - AzurePath path_; - - bool closed_ = false; - int64_t pos_ = 0; - int64_t content_length_ = kNoSize; - std::shared_ptr metadata_; }; } // namespace @@ -346,6 +346,8 @@ class AzureFileSystem::Impl { AzureFileSystem* fs) { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); + // TODO: Return NotAFile if path.path_to_file is empty. Return PathNotFound if account + // name or container name is empty. if (path.empty()) { return ::arrow::fs::internal::PathNotFound(path.full_path); } @@ -359,37 +361,26 @@ class AzureFileSystem::Impl { return ptr; } - // Result> OpenInputFile(const FileInfo& info, - // AzureFileSystem* fs) { - // if (info.type() == FileType::NotFound) { - // return ::arrow::fs::internal::PathNotFound(info.path()); - // } - // if (info.type() != FileType::File && info.type() != FileType::Unknown) { - // return ::arrow::fs::internal::NotAFile(info.path()); - // } - - // ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); - - // std::shared_ptr path_client; - // ARROW_ASSIGN_OR_RAISE( - // path_client, - // InitPathClient( - // options_, dfs_endpoint_url_ + info.path(), path.container, - // path.path_to_file)); - - // std::shared_ptr file_client; - // ARROW_ASSIGN_OR_RAISE( - // file_client, - // InitPathClient( - // options_, dfs_endpoint_url_ + info.path(), path.container, - // path.path_to_file)); - - // auto ptr = std::make_shared(path_client, file_client, - // fs->io_context(), - // path, info.size()); - // RETURN_NOT_OK(ptr->Init()); - // return ptr; - // } + Result> OpenInputFile(const FileInfo& info, + AzureFileSystem* fs) { + if (info.type() == FileType::NotFound) { + return ::arrow::fs::internal::PathNotFound(info.path()); + } + if (info.type() != FileType::File && info.type() != FileType::Unknown) { + return ::arrow::fs::internal::NotAFile(info.path()); + } + + ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); + + auto file_client = std::make_shared( + std::move(service_client_->GetBlobContainerClient(path.container) + .GetBlobClient(path.path_to_file))); + + auto ptr = std::make_shared(file_client, fs->io_context(), path, + info.size()); + RETURN_NOT_OK(ptr->Init()); + return ptr; + } }; const AzureOptions& AzureFileSystem::options() const { return impl_->options(); } @@ -459,8 +450,7 @@ Result> AzureFileSystem::OpenInputFile( Result> AzureFileSystem::OpenInputFile( const FileInfo& info) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); - // return impl_->OpenInputFile(info, this); + return impl_->OpenInputFile(info, this); } Result> AzureFileSystem::OpenOutputStream( diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index ac0e121838f6f..7b09bf3256c5c 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -378,38 +378,38 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); } -// TODO: Implement this once GetFileInfo is implemented. -// TEST_F(TestAzureFileSystem, OpenInputFileInfo) { -// arrow::fs::FileInfo info; -// ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingObjectPath())); +TEST_F(TestAzureFileSystem, OpenInputFileInfo) { + // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // fs->GetFileInfo(PreexistingObjectPath())); + arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); -// std::shared_ptr file; -// ASSERT_OK_AND_ASSIGN(file, fs->OpenInputFile(info)); + std::shared_ptr file; + ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(info)); -// std::array buffer{}; -// std::int64_t size; -// auto constexpr kStart = 16; -// ASSERT_OK_AND_ASSIGN(size, file->ReadAt(kStart, buffer.size(), buffer.data())); + std::array buffer{}; + std::int64_t size; + auto constexpr kStart = 16; + ASSERT_OK_AND_ASSIGN(size, file->ReadAt(kStart, buffer.size(), buffer.data())); -// auto const expected = std::string(kLoremIpsum).substr(kStart); -// EXPECT_EQ(std::string(buffer.data(), size), expected); -// } + auto const expected = std::string(kLoremIpsum).substr(kStart); + EXPECT_EQ(std::string(buffer.data(), size), expected); +} TEST_F(TestAzureFileSystem, OpenInputFileNotFound) { ASSERT_RAISES(IOError, fs_->OpenInputFile(NotFoundObjectPath())); } -// TODO: Implement this once GetFileInfo is implemented. -// TEST_F(GcsIntegrationTest, OpenInputFileInfoInvalid) { -// auto fs = GcsFileSystem::Make(TestGcsOptions()); +TEST_F(TestAzureFileSystem, OpenInputFileInfoInvalid) { + // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // fs->GetFileInfo(PreexistingContainerPath())); + arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::File); + ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); -// arrow::fs::FileInfo info; -// ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingContainerPath())); -// ASSERT_RAISES(IOError, fs->OpenInputFile(info)); - -// ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(NotFoundObjectPath())); -// ASSERT_RAISES(IOError, fs->OpenInputFile(info)); -// } + // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // fs->GetFileInfo(NotFoundObjectPath())); + arrow::fs::FileInfo info2(NotFoundObjectPath(), FileType::NotFound); + ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); +} TEST_F(TestAzureFileSystem, OpenInputFileClosed) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(PreexistingObjectPath())); From e8cde8fa7053b713fb6de7b880f6cad572bf4590 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Oct 2023 18:10:31 +0100 Subject: [PATCH 12/40] Add OpenInputStream implementation --- cpp/src/arrow/filesystem/azurefs.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index b7c6ff9c41ebd..fbc2a7686ca9a 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -439,8 +439,7 @@ Result> AzureFileSystem::OpenInputStream( Result> AzureFileSystem::OpenInputStream( const FileInfo& info) { - // return impl_->OpenInputFile(info, this); - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + return impl_->OpenInputFile(info, this); } Result> AzureFileSystem::OpenInputFile( From 0a693c84723f1062a451442888ff6f94980d5995 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Oct 2023 18:17:09 +0100 Subject: [PATCH 13/40] Paste in input stream tests from gcsfd_test.cc --- cpp/src/arrow/filesystem/azurefs_test.cc | 154 +++++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 7b09bf3256c5c..6398b7e6f1e68 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -288,6 +288,160 @@ class TestAzureFileSystem : public ::testing::Test { } }; +TEST_F(GcsIntegrationTest, OpenInputStreamString) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + std::shared_ptr stream; + ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(PreexistingObjectPath())); + + std::array buffer{}; + std::int64_t size; + ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); + + EXPECT_EQ(std::string(buffer.data(), size), kLoremIpsum); +} + +TEST_F(GcsIntegrationTest, OpenInputStreamStringBuffers) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + std::shared_ptr stream; + ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(PreexistingObjectPath())); + + std::string contents; + std::shared_ptr buffer; + do { + ASSERT_OK_AND_ASSIGN(buffer, stream->Read(16)); + contents.append(buffer->ToString()); + } while (buffer && buffer->size() != 0); + + EXPECT_EQ(contents, kLoremIpsum); +} + +TEST_F(GcsIntegrationTest, OpenInputStreamInfo) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + arrow::fs::FileInfo info; + ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingObjectPath())); + + std::shared_ptr stream; + ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(info)); + + std::array buffer{}; + std::int64_t size; + ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); + + EXPECT_EQ(std::string(buffer.data(), size), kLoremIpsum); +} + +TEST_F(GcsIntegrationTest, OpenInputStreamEmpty) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + const auto object_path = + internal::ConcatAbstractPath(PreexistingBucketName(), "empty-object.txt"); + CreateFile(fs.get(), object_path, std::string()); + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenInputStream(object_path)); + std::array buffer{}; + std::int64_t size; + ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); + EXPECT_EQ(size, 0); +} + +TEST_F(GcsIntegrationTest, OpenInputStreamNotFound) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + ASSERT_RAISES(IOError, fs->OpenInputStream(NotFoundObjectPath())); +} + +TEST_F(GcsIntegrationTest, OpenInputStreamInfoInvalid) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + arrow::fs::FileInfo info; + ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingBucketPath())); + ASSERT_RAISES(IOError, fs->OpenInputStream(info)); + + ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(NotFoundObjectPath())); + ASSERT_RAISES(IOError, fs->OpenInputStream(info)); +} + +TEST_F(GcsIntegrationTest, OpenInputStreamUri) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + ASSERT_RAISES(Invalid, fs->OpenInputStream("gs://" + PreexistingObjectPath())); +} + +TEST_F(GcsIntegrationTest, OpenInputStreamReadMetadata) { + auto client = GcsClient(); + const auto custom_time = std::chrono::system_clock::now() + std::chrono::hours(1); + const std::string object_name = "OpenInputStreamMetadataTest/simple.txt"; + const gcs::ObjectMetadata expected = + client + .InsertObject(PreexistingBucketName(), object_name, + "The quick brown fox jumps over the lazy dog", + gcs::WithObjectMetadata(gcs::ObjectMetadata() + .set_content_type("text/plain") + .set_custom_time(custom_time) + .set_cache_control("no-cache") + .upsert_metadata("key0", "value0"))) + .value(); + + auto fs = GcsFileSystem::Make(TestGcsOptions()); + std::shared_ptr stream; + ASSERT_OK_AND_ASSIGN(stream, + fs->OpenInputStream(PreexistingBucketPath() + object_name)); + + auto format_time = [](std::chrono::system_clock::time_point tp) { + return absl::FormatTime(absl::RFC3339_full, absl::FromChrono(tp), + absl::UTCTimeZone()); + }; + + std::shared_ptr actual; + ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); + ASSERT_OK_AND_EQ(expected.self_link(), actual->Get("selfLink")); + ASSERT_OK_AND_EQ(expected.name(), actual->Get("name")); + ASSERT_OK_AND_EQ(expected.bucket(), actual->Get("bucket")); + ASSERT_OK_AND_EQ(std::to_string(expected.generation()), actual->Get("generation")); + ASSERT_OK_AND_EQ(expected.content_type(), actual->Get("Content-Type")); + ASSERT_OK_AND_EQ(format_time(expected.time_created()), actual->Get("timeCreated")); + ASSERT_OK_AND_EQ(format_time(expected.updated()), actual->Get("updated")); + ASSERT_FALSE(actual->Contains("timeDeleted")); + ASSERT_OK_AND_EQ(format_time(custom_time), actual->Get("customTime")); + ASSERT_OK_AND_EQ("false", actual->Get("temporaryHold")); + ASSERT_OK_AND_EQ("false", actual->Get("eventBasedHold")); + ASSERT_FALSE(actual->Contains("retentionExpirationTime")); + ASSERT_OK_AND_EQ(expected.storage_class(), actual->Get("storageClass")); + ASSERT_FALSE(actual->Contains("storageClassUpdated")); + ASSERT_OK_AND_EQ(std::to_string(expected.size()), actual->Get("size")); + ASSERT_OK_AND_EQ(expected.md5_hash(), actual->Get("md5Hash")); + ASSERT_OK_AND_EQ(expected.media_link(), actual->Get("mediaLink")); + ASSERT_OK_AND_EQ(expected.content_encoding(), actual->Get("Content-Encoding")); + ASSERT_OK_AND_EQ(expected.content_disposition(), actual->Get("Content-Disposition")); + ASSERT_OK_AND_EQ(expected.content_language(), actual->Get("Content-Language")); + ASSERT_OK_AND_EQ(expected.cache_control(), actual->Get("Cache-Control")); + auto p = expected.metadata().find("key0"); + ASSERT_TRUE(p != expected.metadata().end()); + ASSERT_OK_AND_EQ(p->second, actual->Get("metadata.key0")); + ASSERT_EQ(expected.has_owner(), actual->Contains("owner.entity")); + ASSERT_EQ(expected.has_owner(), actual->Contains("owner.entityId")); + ASSERT_OK_AND_EQ(expected.crc32c(), actual->Get("crc32c")); + ASSERT_OK_AND_EQ(std::to_string(expected.component_count()), + actual->Get("componentCount")); + ASSERT_OK_AND_EQ(expected.etag(), actual->Get("etag")); + ASSERT_FALSE(actual->Contains("customerEncryption.encryptionAlgorithm")); + ASSERT_FALSE(actual->Contains("customerEncryption.keySha256")); + ASSERT_FALSE(actual->Contains("kmsKeyName")); +} + +TEST_F(GcsIntegrationTest, OpenInputStreamClosed) { + auto fs = GcsFileSystem::Make(TestGcsOptions()); + + ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenInputStream(PreexistingObjectPath())); + ASSERT_OK(stream->Close()); + std::array buffer{}; + ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); + ASSERT_RAISES(Invalid, stream->Read(buffer.size())); + ASSERT_RAISES(Invalid, stream->Tell()); +} + TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; From 8ff568415722eb6b4b435bbb9ed989418667ba3e Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sat, 14 Oct 2023 20:31:40 +0100 Subject: [PATCH 14/40] Fix input stream tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 218 ++++++++++++----------- 1 file changed, 117 insertions(+), 101 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 6398b7e6f1e68..901b6b179246e 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -35,6 +35,7 @@ #include "arrow/filesystem/azurefs.h" #include "arrow/util/io_util.h" +#include "arrow/util/key_value_metadata.h" #include #include @@ -288,11 +289,9 @@ class TestAzureFileSystem : public ::testing::Test { } }; -TEST_F(GcsIntegrationTest, OpenInputStreamString) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - +TEST_F(TestAzureFileSystem, OpenInputStreamString) { std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(PreexistingObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); std::array buffer{}; std::int64_t size; @@ -301,11 +300,9 @@ TEST_F(GcsIntegrationTest, OpenInputStreamString) { EXPECT_EQ(std::string(buffer.data(), size), kLoremIpsum); } -TEST_F(GcsIntegrationTest, OpenInputStreamStringBuffers) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - +TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(PreexistingObjectPath())); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); std::string contents; std::shared_ptr buffer; @@ -317,14 +314,13 @@ TEST_F(GcsIntegrationTest, OpenInputStreamStringBuffers) { EXPECT_EQ(contents, kLoremIpsum); } -TEST_F(GcsIntegrationTest, OpenInputStreamInfo) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - arrow::fs::FileInfo info; - ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingObjectPath())); +TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { + // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // fs->GetFileInfo(PreexistingObjectPath())); + arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, fs->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); std::array buffer{}; std::int64_t size; @@ -333,108 +329,128 @@ TEST_F(GcsIntegrationTest, OpenInputStreamInfo) { EXPECT_EQ(std::string(buffer.data(), size), kLoremIpsum); } -TEST_F(GcsIntegrationTest, OpenInputStreamEmpty) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - const auto object_path = - internal::ConcatAbstractPath(PreexistingBucketName(), "empty-object.txt"); - CreateFile(fs.get(), object_path, std::string()); +TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { + const auto path_to_file = "empty-object.txt"; + const auto path = PreexistingContainerPath() + path_to_file; + gen2_client_->GetFileSystemClient(PreexistingContainerName()) + .GetFileClient(path_to_file) + .UploadFrom(nullptr, 0); - ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenInputStream(object_path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(path)); std::array buffer{}; std::int64_t size; ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); EXPECT_EQ(size, 0); } -TEST_F(GcsIntegrationTest, OpenInputStreamNotFound) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - ASSERT_RAISES(IOError, fs->OpenInputStream(NotFoundObjectPath())); +TEST_F(TestAzureFileSystem, OpenInputStreamNotFound) { + ASSERT_RAISES(IOError, fs_->OpenInputStream(NotFoundObjectPath())); } -TEST_F(GcsIntegrationTest, OpenInputStreamInfoInvalid) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - - arrow::fs::FileInfo info; - ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(PreexistingBucketPath())); - ASSERT_RAISES(IOError, fs->OpenInputStream(info)); - - ASSERT_OK_AND_ASSIGN(info, fs->GetFileInfo(NotFoundObjectPath())); - ASSERT_RAISES(IOError, fs->OpenInputStream(info)); -} +TEST_F(TestAzureFileSystem, OpenInputStreamInfoInvalid) { + // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // fs->GetFileInfo(PreexistingBucketPath())); + arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::Directory); + ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); -TEST_F(GcsIntegrationTest, OpenInputStreamUri) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - ASSERT_RAISES(Invalid, fs->OpenInputStream("gs://" + PreexistingObjectPath())); + // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // fs->GetFileInfo(NotFoundObjectPath())); + arrow::fs::FileInfo info2(PreexistingContainerPath(), FileType::NotFound); + ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); } -TEST_F(GcsIntegrationTest, OpenInputStreamReadMetadata) { - auto client = GcsClient(); - const auto custom_time = std::chrono::system_clock::now() + std::chrono::hours(1); - const std::string object_name = "OpenInputStreamMetadataTest/simple.txt"; - const gcs::ObjectMetadata expected = - client - .InsertObject(PreexistingBucketName(), object_name, - "The quick brown fox jumps over the lazy dog", - gcs::WithObjectMetadata(gcs::ObjectMetadata() - .set_content_type("text/plain") - .set_custom_time(custom_time) - .set_cache_control("no-cache") - .upsert_metadata("key0", "value0"))) - .value(); - - auto fs = GcsFileSystem::Make(TestGcsOptions()); - std::shared_ptr stream; - ASSERT_OK_AND_ASSIGN(stream, - fs->OpenInputStream(PreexistingBucketPath() + object_name)); - - auto format_time = [](std::chrono::system_clock::time_point tp) { - return absl::FormatTime(absl::RFC3339_full, absl::FromChrono(tp), - absl::UTCTimeZone()); - }; - - std::shared_ptr actual; - ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); - ASSERT_OK_AND_EQ(expected.self_link(), actual->Get("selfLink")); - ASSERT_OK_AND_EQ(expected.name(), actual->Get("name")); - ASSERT_OK_AND_EQ(expected.bucket(), actual->Get("bucket")); - ASSERT_OK_AND_EQ(std::to_string(expected.generation()), actual->Get("generation")); - ASSERT_OK_AND_EQ(expected.content_type(), actual->Get("Content-Type")); - ASSERT_OK_AND_EQ(format_time(expected.time_created()), actual->Get("timeCreated")); - ASSERT_OK_AND_EQ(format_time(expected.updated()), actual->Get("updated")); - ASSERT_FALSE(actual->Contains("timeDeleted")); - ASSERT_OK_AND_EQ(format_time(custom_time), actual->Get("customTime")); - ASSERT_OK_AND_EQ("false", actual->Get("temporaryHold")); - ASSERT_OK_AND_EQ("false", actual->Get("eventBasedHold")); - ASSERT_FALSE(actual->Contains("retentionExpirationTime")); - ASSERT_OK_AND_EQ(expected.storage_class(), actual->Get("storageClass")); - ASSERT_FALSE(actual->Contains("storageClassUpdated")); - ASSERT_OK_AND_EQ(std::to_string(expected.size()), actual->Get("size")); - ASSERT_OK_AND_EQ(expected.md5_hash(), actual->Get("md5Hash")); - ASSERT_OK_AND_EQ(expected.media_link(), actual->Get("mediaLink")); - ASSERT_OK_AND_EQ(expected.content_encoding(), actual->Get("Content-Encoding")); - ASSERT_OK_AND_EQ(expected.content_disposition(), actual->Get("Content-Disposition")); - ASSERT_OK_AND_EQ(expected.content_language(), actual->Get("Content-Language")); - ASSERT_OK_AND_EQ(expected.cache_control(), actual->Get("Cache-Control")); - auto p = expected.metadata().find("key0"); - ASSERT_TRUE(p != expected.metadata().end()); - ASSERT_OK_AND_EQ(p->second, actual->Get("metadata.key0")); - ASSERT_EQ(expected.has_owner(), actual->Contains("owner.entity")); - ASSERT_EQ(expected.has_owner(), actual->Contains("owner.entityId")); - ASSERT_OK_AND_EQ(expected.crc32c(), actual->Get("crc32c")); - ASSERT_OK_AND_EQ(std::to_string(expected.component_count()), - actual->Get("componentCount")); - ASSERT_OK_AND_EQ(expected.etag(), actual->Get("etag")); - ASSERT_FALSE(actual->Contains("customerEncryption.encryptionAlgorithm")); - ASSERT_FALSE(actual->Contains("customerEncryption.keySha256")); - ASSERT_FALSE(actual->Contains("kmsKeyName")); +TEST_F(TestAzureFileSystem, OpenInputStreamUri) { + ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" + PreexistingObjectPath())); } -TEST_F(GcsIntegrationTest, OpenInputStreamClosed) { - auto fs = GcsFileSystem::Make(TestGcsOptions()); - ASSERT_OK_AND_ASSIGN(auto stream, fs->OpenInputStream(PreexistingObjectPath())); +// TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { +// const auto custom_time = std::chrono::system_clock::now() + std::chrono::hours(1); +// const std::string object_name = "OpenInputStreamMetadataTest/simple.txt"; +// // const gcs::ObjectMetadata expected = +// // client +// // .InsertObject(PreexistingBucketName(), object_name, +// // "The quick brown fox jumps over the lazy dog", +// // gcs::WithObjectMetadata(gcs::ObjectMetadata() +// // .set_content_type("text/plain") +// // .set_custom_time(custom_time) +// // .set_cache_control("no-cache") +// // .upsert_metadata("key0", +// // "value0"))) +// // .value(); + +// const std::string blob_content = "The quick brown fox jumps over the lazy dog"; +// auto file_client = gen2_client_->GetFileSystemClient(PreexistingContainerName()) +// .GetFileClient(object_name); + +// const auto expected = +// file_client +// .UploadFrom(reinterpret_cast(blob_content.data()), +// blob_content.size()) +// .Value; + +// std::shared_ptr stream; +// ASSERT_OK_AND_ASSIGN(stream, +// fs_->OpenInputStream(PreexistingContainerPath() + object_name)); + +// // auto format_time = [](std::chrono::system_clock::time_point tp) { +// // return absl::FormatTime(absl::RFC3339_full, absl::FromChrono(tp), +// // absl::UTCTimeZone()); +// // }; + +// const auto properties = file_client.GetProperties(); + +// std::shared_ptr actual; +// ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); + +// for (int i = 0; i < actual->size(); i++) { +// std::cout << "key=" << actual->key(i) << ", value=" << actual->value(i) << std::endl; +// } + +// std::string actual_last_modified; +// // ASSERT_OK_AND_ASSIGN(actual_last_modified, actual->Get("LastModified")); +// auto expected_last_modified = expected.LastModified; +// std::cout << "actual=" << actual_last_modified << std::endl; + +// // ASSERT_OK_AND_EQ(expected.LastModified, actual->Get("LastModified")); + +// // ASSERT_OK_AND_EQ(expected.self_link(), actual->Get("selfLink")); +// // ASSERT_OK_AND_EQ(expected.name(), actual->Get("name")); +// // ASSERT_OK_AND_EQ(expected.bucket(), actual->Get("bucket")); +// // ASSERT_OK_AND_EQ(std::to_string(expected.generation()), actual->Get("generation")); +// // ASSERT_OK_AND_EQ(expected.content_type(), actual->Get("Content-Type")); +// // ASSERT_OK_AND_EQ(format_time(expected.time_created()), actual->Get("timeCreated")); +// // ASSERT_OK_AND_EQ(format_time(expected.updated()), actual->Get("updated")); +// // ASSERT_FALSE(actual->Contains("timeDeleted")); +// // ASSERT_OK_AND_EQ(format_time(custom_time), actual->Get("customTime")); +// // ASSERT_OK_AND_EQ("false", actual->Get("temporaryHold")); +// // ASSERT_OK_AND_EQ("false", actual->Get("eventBasedHold")); +// // ASSERT_FALSE(actual->Contains("retentionExpirationTime")); +// // ASSERT_OK_AND_EQ(expected.storage_class(), actual->Get("storageClass")); +// // ASSERT_FALSE(actual->Contains("storageClassUpdated")); +// // ASSERT_OK_AND_EQ(std::to_string(expected.size()), actual->Get("size")); +// // ASSERT_OK_AND_EQ(expected.md5_hash(), actual->Get("md5Hash")); +// // ASSERT_OK_AND_EQ(expected.media_link(), actual->Get("mediaLink")); +// // ASSERT_OK_AND_EQ(expected.content_encoding(), actual->Get("Content-Encoding")); +// // ASSERT_OK_AND_EQ(expected.content_disposition(), actual->Get("Content-Disposition")); +// // ASSERT_OK_AND_EQ(expected.content_language(), actual->Get("Content-Language")); +// // ASSERT_OK_AND_EQ(expected.cache_control(), actual->Get("Cache-Control")); +// // auto p = expected.metadata().find("key0"); +// // ASSERT_TRUE(p != expected.metadata().end()); +// // ASSERT_OK_AND_EQ(p->second, actual->Get("metadata.key0")); +// // ASSERT_EQ(expected.has_owner(), actual->Contains("owner.entity")); +// // ASSERT_EQ(expected.has_owner(), actual->Contains("owner.entityId")); +// // ASSERT_OK_AND_EQ(expected.crc32c(), actual->Get("crc32c")); +// // ASSERT_OK_AND_EQ(std::to_string(expected.component_count()), +// // actual->Get("componentCount")); +// // ASSERT_OK_AND_EQ(expected.etag(), actual->Get("etag")); +// // ASSERT_FALSE(actual->Contains("customerEncryption.encryptionAlgorithm")); +// // ASSERT_FALSE(actual->Contains("customerEncryption.keySha256")); +// // ASSERT_FALSE(actual->Contains("kmsKeyName")); +// } + +TEST_F(TestAzureFileSystem, OpenInputStreamClosed) { + ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(PreexistingObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; ASSERT_RAISES(Invalid, stream->Read(buffer.size(), buffer.data())); From bd85d5a26c73492d00bd7ccdb81f40312b1119ff Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 12:50:45 +0100 Subject: [PATCH 15/40] Fix metadata test --- cpp/src/arrow/filesystem/azurefs_test.cc | 97 ++++-------------------- 1 file changed, 13 insertions(+), 84 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 901b6b179246e..df6969ab2a5a7 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -363,91 +363,20 @@ TEST_F(TestAzureFileSystem, OpenInputStreamUri) { ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" + PreexistingObjectPath())); } +TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { + const std::string object_name = "OpenInputStreamMetadataTest/simple.txt"; -// TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { -// const auto custom_time = std::chrono::system_clock::now() + std::chrono::hours(1); -// const std::string object_name = "OpenInputStreamMetadataTest/simple.txt"; -// // const gcs::ObjectMetadata expected = -// // client -// // .InsertObject(PreexistingBucketName(), object_name, -// // "The quick brown fox jumps over the lazy dog", -// // gcs::WithObjectMetadata(gcs::ObjectMetadata() -// // .set_content_type("text/plain") -// // .set_custom_time(custom_time) -// // .set_cache_control("no-cache") -// // .upsert_metadata("key0", -// // "value0"))) -// // .value(); - -// const std::string blob_content = "The quick brown fox jumps over the lazy dog"; -// auto file_client = gen2_client_->GetFileSystemClient(PreexistingContainerName()) -// .GetFileClient(object_name); - -// const auto expected = -// file_client -// .UploadFrom(reinterpret_cast(blob_content.data()), -// blob_content.size()) -// .Value; - -// std::shared_ptr stream; -// ASSERT_OK_AND_ASSIGN(stream, -// fs_->OpenInputStream(PreexistingContainerPath() + object_name)); - -// // auto format_time = [](std::chrono::system_clock::time_point tp) { -// // return absl::FormatTime(absl::RFC3339_full, absl::FromChrono(tp), -// // absl::UTCTimeZone()); -// // }; - -// const auto properties = file_client.GetProperties(); - -// std::shared_ptr actual; -// ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); - -// for (int i = 0; i < actual->size(); i++) { -// std::cout << "key=" << actual->key(i) << ", value=" << actual->value(i) << std::endl; -// } - -// std::string actual_last_modified; -// // ASSERT_OK_AND_ASSIGN(actual_last_modified, actual->Get("LastModified")); -// auto expected_last_modified = expected.LastModified; -// std::cout << "actual=" << actual_last_modified << std::endl; - -// // ASSERT_OK_AND_EQ(expected.LastModified, actual->Get("LastModified")); - -// // ASSERT_OK_AND_EQ(expected.self_link(), actual->Get("selfLink")); -// // ASSERT_OK_AND_EQ(expected.name(), actual->Get("name")); -// // ASSERT_OK_AND_EQ(expected.bucket(), actual->Get("bucket")); -// // ASSERT_OK_AND_EQ(std::to_string(expected.generation()), actual->Get("generation")); -// // ASSERT_OK_AND_EQ(expected.content_type(), actual->Get("Content-Type")); -// // ASSERT_OK_AND_EQ(format_time(expected.time_created()), actual->Get("timeCreated")); -// // ASSERT_OK_AND_EQ(format_time(expected.updated()), actual->Get("updated")); -// // ASSERT_FALSE(actual->Contains("timeDeleted")); -// // ASSERT_OK_AND_EQ(format_time(custom_time), actual->Get("customTime")); -// // ASSERT_OK_AND_EQ("false", actual->Get("temporaryHold")); -// // ASSERT_OK_AND_EQ("false", actual->Get("eventBasedHold")); -// // ASSERT_FALSE(actual->Contains("retentionExpirationTime")); -// // ASSERT_OK_AND_EQ(expected.storage_class(), actual->Get("storageClass")); -// // ASSERT_FALSE(actual->Contains("storageClassUpdated")); -// // ASSERT_OK_AND_EQ(std::to_string(expected.size()), actual->Get("size")); -// // ASSERT_OK_AND_EQ(expected.md5_hash(), actual->Get("md5Hash")); -// // ASSERT_OK_AND_EQ(expected.media_link(), actual->Get("mediaLink")); -// // ASSERT_OK_AND_EQ(expected.content_encoding(), actual->Get("Content-Encoding")); -// // ASSERT_OK_AND_EQ(expected.content_disposition(), actual->Get("Content-Disposition")); -// // ASSERT_OK_AND_EQ(expected.content_language(), actual->Get("Content-Language")); -// // ASSERT_OK_AND_EQ(expected.cache_control(), actual->Get("Cache-Control")); -// // auto p = expected.metadata().find("key0"); -// // ASSERT_TRUE(p != expected.metadata().end()); -// // ASSERT_OK_AND_EQ(p->second, actual->Get("metadata.key0")); -// // ASSERT_EQ(expected.has_owner(), actual->Contains("owner.entity")); -// // ASSERT_EQ(expected.has_owner(), actual->Contains("owner.entityId")); -// // ASSERT_OK_AND_EQ(expected.crc32c(), actual->Get("crc32c")); -// // ASSERT_OK_AND_EQ(std::to_string(expected.component_count()), -// // actual->Get("componentCount")); -// // ASSERT_OK_AND_EQ(expected.etag(), actual->Get("etag")); -// // ASSERT_FALSE(actual->Contains("customerEncryption.encryptionAlgorithm")); -// // ASSERT_FALSE(actual->Contains("customerEncryption.keySha256")); -// // ASSERT_FALSE(actual->Contains("kmsKeyName")); -// } + gen2_client_->GetFileSystemClient(PreexistingContainerName()) + .GetFileClient(PreexistingObjectName()) + .SetMetadata(Azure::Storage::Metadata{{"key0", "value0"}}); + + std::shared_ptr stream; + ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); + + std::shared_ptr actual; + ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); + ASSERT_OK_AND_EQ("value0", actual->Get("key0")); +} TEST_F(TestAzureFileSystem, OpenInputStreamClosed) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(PreexistingObjectPath())); From 6cb904fd84f891d869d592be3a8b5aad636af3c0 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 12:58:58 +0100 Subject: [PATCH 16/40] Use basic blob client for tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 78 +++++++----------------- 1 file changed, 22 insertions(+), 56 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index df6969ab2a5a7..dd3a754c31dfe 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -140,13 +140,13 @@ TEST(AzureFileSystem, UploadThenDownload) { std::string("http://127.0.0.1:10000/") + account_name, credential); auto container_client = service_client.GetBlobContainerClient(container_name); container_client.CreateIfNotExists(); - auto blob_client = container_client.GetBlockBlobClient(blob_name); + auto blob_client_ = container_client.GetBlockBlobClient(blob_name); std::vector buffer(blob_content.begin(), blob_content.end()); - blob_client.UploadFrom(buffer.data(), buffer.size()); + blob_client_.UploadFrom(buffer.data(), buffer.size()); std::vector downloaded_content(blob_content.size()); - blob_client.DownloadTo(downloaded_content.data(), downloaded_content.size()); + blob_client_.DownloadTo(downloaded_content.data(), downloaded_content.size()); EXPECT_EQ(std::string(downloaded_content.begin(), downloaded_content.end()), blob_content); @@ -169,7 +169,7 @@ TEST(AzureFileSystem, OptionsCompare) { class TestAzureFileSystem : public ::testing::Test { public: std::shared_ptr fs_; - std::shared_ptr gen2_client_; + std::shared_ptr blob_client_; AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; @@ -179,9 +179,8 @@ class TestAzureFileSystem : public ::testing::Test { const std::string& account_key = GetAzuriteEnv()->account_key(); options_.backend = AzureBackend::Azurite; ASSERT_OK(options_.ConfigureAccountKeyCredentials(account_name, account_key)); - gen2_client_ = - std::make_shared( - options_.account_dfs_url, options_.storage_credentials_provider); + blob_client_ = std::make_shared( + options_.account_blob_url, options_.storage_credentials_provider); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); } @@ -192,55 +191,22 @@ class TestAzureFileSystem : public ::testing::Test { MakeFileSystem(); generator_ = std::mt19937_64(std::random_device()()); container_name_ = RandomChars(32); - auto file_system_client = gen2_client_->GetFileSystemClient(container_name_); + auto file_system_client = blob_client_->GetBlobContainerClient(container_name_); file_system_client.CreateIfNotExists(); - auto file_client = file_system_client.GetFileClient(PreexistingObjectName()); - file_client.UploadFrom(reinterpret_cast(kLoremIpsum), + auto blob_client = file_system_client.GetBlockBlobClient(PreexistingObjectName()); + blob_client.UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); } void TearDown() override { - auto containers = gen2_client_->ListFileSystems(); - for (auto container : containers.FileSystems) { - auto file_system_client = gen2_client_->GetFileSystemClient(container.Name); + auto containers = blob_client_->ListBlobContainers(); + for (auto container : containers.BlobContainers) { + auto file_system_client = blob_client_->GetBlobContainerClient(container.Name); file_system_client.DeleteIfExists(); } } - void AssertObjectContents( - Azure::Storage::Files::DataLake::DataLakeServiceClient* client, - const std::string& container, const std::string& path_to_file, - const std::string& expected) { - auto path_client = - std::make_shared( - client->GetUrl() + container + "/" + path_to_file, - options_.storage_credentials_provider); - auto size = path_client->GetProperties().Value.FileSize; - if (size == 0) { - ASSERT_EQ(expected, ""); - return; - } - auto buf = AllocateBuffer(size, fs_->io_context().pool()); - Azure::Storage::Blobs::DownloadBlobToOptions download_options; - Azure::Core::Http::HttpRange range; - range.Offset = 0; - range.Length = size; - download_options.Range = Azure::Nullable(range); - auto file_client = - std::make_shared( - client->GetUrl() + container + "/" + path_to_file, - options_.storage_credentials_provider); - auto result = file_client - ->DownloadTo(reinterpret_cast(buf->get()->mutable_data()), - size, download_options) - .Value; - auto buf_data = std::move(buf->get()); - auto expected_data = std::make_shared( - reinterpret_cast(expected.data()), expected.size()); - AssertBufferEqual(*buf_data, *expected_data); - } - std::string PreexistingContainerName() const { return container_name_; } std::string PreexistingContainerPath() const { @@ -281,10 +247,10 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(std::vector lines, const char* path_to_file, int total_size) { // TODO: Switch to using Azure filesystem to write once its implemented. - auto file_client = gen2_client_->GetFileSystemClient(PreexistingContainerName()) - .GetFileClient(path_to_file); + auto blob_client = blob_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file); std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); - file_client.UploadFrom(reinterpret_cast(all_lines.data()), + blob_client.UploadFrom(reinterpret_cast(all_lines.data()), total_size); } }; @@ -332,8 +298,8 @@ TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { const auto path_to_file = "empty-object.txt"; const auto path = PreexistingContainerPath() + path_to_file; - gen2_client_->GetFileSystemClient(PreexistingContainerName()) - .GetFileClient(path_to_file) + blob_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(path)); @@ -366,8 +332,8 @@ TEST_F(TestAzureFileSystem, OpenInputStreamUri) { TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { const std::string object_name = "OpenInputStreamMetadataTest/simple.txt"; - gen2_client_->GetFileSystemClient(PreexistingContainerName()) - .GetFileClient(PreexistingObjectName()) + blob_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlobClient(PreexistingObjectName()) .SetMetadata(Azure::Storage::Metadata{{"key0", "value0"}}); std::shared_ptr stream; @@ -467,9 +433,9 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { const auto path = PreexistingContainerPath() + path_to_file; const std::string contents = "The quick brown fox jumps over the lazy dog"; - auto file_client = gen2_client_->GetFileSystemClient(PreexistingContainerName()) - .GetFileClient(path_to_file); - file_client.UploadFrom(reinterpret_cast(contents.data()), + auto blob_client = blob_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file); + blob_client.UploadFrom(reinterpret_cast(contents.data()), contents.length()); std::shared_ptr file; From 3ff70517cc26663e062538d009e0d023fc6b31cc Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 13:04:56 +0100 Subject: [PATCH 17/40] Rename file_client -> blob_client --- cpp/src/arrow/filesystem/azurefs.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index fbc2a7686ca9a..915de53803be6 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -174,7 +174,7 @@ std::shared_ptr GetObjectMetadata(const ObjectResult& re class ObjectInputFile final : public io::RandomAccessFile { protected: - std::shared_ptr file_client_; + std::shared_ptr blob_client_; const io::IOContext io_context_; AzurePath path_; @@ -184,10 +184,10 @@ class ObjectInputFile final : public io::RandomAccessFile { std::shared_ptr metadata_; public: - ObjectInputFile(std::shared_ptr& file_client, + ObjectInputFile(std::shared_ptr& blob_client, const io::IOContext& io_context, const AzurePath& path, int64_t size = kNoSize) - : file_client_(std::move(file_client)), + : blob_client_(std::move(blob_client)), io_context_(io_context), path_(path), content_length_(size) {} @@ -198,7 +198,7 @@ class ObjectInputFile final : public io::RandomAccessFile { return Status::OK(); } try { - auto properties = file_client_->GetProperties(); + auto properties = blob_client_->GetProperties(); content_length_ = properties.Value.BlobSize; metadata_ = GetObjectMetadata(properties.Value.Metadata); return Status::OK(); @@ -237,7 +237,7 @@ class ObjectInputFile final : public io::RandomAccessFile { } Status Close() override { - file_client_ = nullptr; + blob_client_ = nullptr; closed_ = true; return Status::OK(); } @@ -279,7 +279,7 @@ class ObjectInputFile final : public io::RandomAccessFile { download_options.Range = Azure::Nullable(range); try { auto result = - file_client_ + blob_client_ ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value; return result.ContentRange.Length.Value(); @@ -352,11 +352,11 @@ class AzureFileSystem::Impl { return ::arrow::fs::internal::PathNotFound(path.full_path); } - auto file_client = std::make_shared( + auto blob_client = std::make_shared( std::move(service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file))); - auto ptr = std::make_shared(file_client, fs->io_context(), path); + auto ptr = std::make_shared(blob_client, fs->io_context(), path); RETURN_NOT_OK(ptr->Init()); return ptr; } @@ -372,11 +372,11 @@ class AzureFileSystem::Impl { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); - auto file_client = std::make_shared( + auto blob_client = std::make_shared( std::move(service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file))); - auto ptr = std::make_shared(file_client, fs->io_context(), path, + auto ptr = std::make_shared(blob_client, fs->io_context(), path, info.size()); RETURN_NOT_OK(ptr->Init()); return ptr; From e70d01dd5b8094927c1b3c772f5bd5f6e9d34d9a Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 13:13:51 +0100 Subject: [PATCH 18/40] Fix implementation to pass OpenInputStreamUri test --- cpp/src/arrow/filesystem/azurefs.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 915de53803be6..a6d804babadb5 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -81,6 +81,10 @@ struct AzurePath { // Expected input here => s = synapsemlfs/testdir/testfile.txt, // http://127.0.0.1/accountName/pathToBlob auto src = internal::RemoveTrailingSlash(s); + if (internal::IsLikelyUri(s)) { + return Status::Invalid( + "Expected a Azure object path of the form 'container/key...', got a URI: '", s, "'"); + } if (arrow::internal::StartsWith(src, "https://127.0.0.1") || arrow::internal::StartsWith(src, "http://127.0.0.1")) { RETURN_NOT_OK(FromLocalHostString(&src)); From 00b81391bea82d46b1873399a4ed7c7f231ff3cd Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 14:34:34 +0100 Subject: [PATCH 19/40] Adjust some error handling --- cpp/src/arrow/filesystem/azurefs.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a6d804babadb5..c5ba1e90d244c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -207,8 +207,11 @@ class ObjectInputFile final : public io::RandomAccessFile { metadata_ = GetObjectMetadata(properties.Value.Metadata); return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { - // TODO: Only return path not found if Azure gave us path not found. + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + // Could be either container or blob not found. return ::arrow::fs::internal::PathNotFound(path_.full_path); + } + return Status::IOError(exception.RawResponse->GetReasonPhrase()); } } @@ -288,8 +291,6 @@ class ObjectInputFile final : public io::RandomAccessFile { .Value; return result.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { - // TODO: return `::arrow::fs::internal::NotAFile(blob_client.GetUrl());` if the - // file does not exist. return Status::IOError(exception.RawResponse->GetReasonPhrase()); } } From b14b83e8484cf1c3c739f6ff79122aa0f9ba2a59 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 14:35:03 +0100 Subject: [PATCH 20/40] Tidy tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 28 ++++++++++++------------ 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index dd3a754c31dfe..8311662c0141a 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -169,7 +169,7 @@ TEST(AzureFileSystem, OptionsCompare) { class TestAzureFileSystem : public ::testing::Test { public: std::shared_ptr fs_; - std::shared_ptr blob_client_; + std::shared_ptr service_client_; AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; @@ -179,9 +179,6 @@ class TestAzureFileSystem : public ::testing::Test { const std::string& account_key = GetAzuriteEnv()->account_key(); options_.backend = AzureBackend::Azurite; ASSERT_OK(options_.ConfigureAccountKeyCredentials(account_name, account_key)); - blob_client_ = std::make_shared( - options_.account_blob_url, options_.storage_credentials_provider); - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); } void SetUp() override { @@ -191,19 +188,22 @@ class TestAzureFileSystem : public ::testing::Test { MakeFileSystem(); generator_ = std::mt19937_64(std::random_device()()); container_name_ = RandomChars(32); - auto file_system_client = blob_client_->GetBlobContainerClient(container_name_); - file_system_client.CreateIfNotExists(); + service_client_ = std::make_shared( + options_.account_blob_url, options_.storage_credentials_provider); + ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + auto container_client = service_client_->GetBlobContainerClient(container_name_); + container_client.CreateIfNotExists(); - auto blob_client = file_system_client.GetBlockBlobClient(PreexistingObjectName()); + auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); blob_client.UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); } void TearDown() override { - auto containers = blob_client_->ListBlobContainers(); + auto containers = service_client_->ListBlobContainers(); for (auto container : containers.BlobContainers) { - auto file_system_client = blob_client_->GetBlobContainerClient(container.Name); - file_system_client.DeleteIfExists(); + auto container_client = service_client_->GetBlobContainerClient(container.Name); + container_client.DeleteIfExists(); } } @@ -247,7 +247,7 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(std::vector lines, const char* path_to_file, int total_size) { // TODO: Switch to using Azure filesystem to write once its implemented. - auto blob_client = blob_client_->GetBlobContainerClient(PreexistingContainerName()) + auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(path_to_file); std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); blob_client.UploadFrom(reinterpret_cast(all_lines.data()), @@ -298,7 +298,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { const auto path_to_file = "empty-object.txt"; const auto path = PreexistingContainerPath() + path_to_file; - blob_client_->GetBlobContainerClient(PreexistingContainerName()) + service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); @@ -332,7 +332,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamUri) { TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { const std::string object_name = "OpenInputStreamMetadataTest/simple.txt"; - blob_client_->GetBlobContainerClient(PreexistingContainerName()) + service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlobClient(PreexistingObjectName()) .SetMetadata(Azure::Storage::Metadata{{"key0", "value0"}}); @@ -433,7 +433,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { const auto path = PreexistingContainerPath() + path_to_file; const std::string contents = "The quick brown fox jumps over the lazy dog"; - auto blob_client = blob_client_->GetBlobContainerClient(PreexistingContainerName()) + auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(path_to_file); blob_client.UploadFrom(reinterpret_cast(contents.data()), contents.length()); From 918c68d632a6061ac54919cee6a88a5546f7ffdb Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 15:14:03 +0100 Subject: [PATCH 21/40] Make AzurePath consistent with changes from #11997 --- cpp/src/arrow/filesystem/azurefs.cc | 62 +++++++---------------------- 1 file changed, 14 insertions(+), 48 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index c5ba1e90d244c..5233bcf5a46dc 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -70,34 +70,25 @@ struct AzurePath { std::string container; std::string path_to_file; std::vector path_to_file_parts; + // An AzureFileSystem represents a single Azure storage account. AzurePath describes the + // container within that storage account and path within that container. static Result FromString(const std::string& s) { - // https://synapsemladlsgen2.dfs.core.windows.net/synapsemlfs/testdir/testfile.txt - // container = synapsemlfs - // account_name = synapsemladlsgen2 + // Example expected string format: testcontainer/testdir/testfile.txt + // container = testcontainer // path_to_file = testdir/testfile.txt // path_to_file_parts = [testdir, testfile.txt] - - // Expected input here => s = synapsemlfs/testdir/testfile.txt, - // http://127.0.0.1/accountName/pathToBlob - auto src = internal::RemoveTrailingSlash(s); if (internal::IsLikelyUri(s)) { return Status::Invalid( - "Expected a Azure object path of the form 'container/key...', got a URI: '", s, "'"); - } - if (arrow::internal::StartsWith(src, "https://127.0.0.1") || - arrow::internal::StartsWith(src, "http://127.0.0.1")) { - RETURN_NOT_OK(FromLocalHostString(&src)); + "Expected an Azure object path of the form 'container/path...', got a URI: '", + s, "'"); } + auto src = internal::RemoveTrailingSlash(s); auto input_path = std::string(src.data()); - if (internal::IsLikelyUri(input_path)) { - RETURN_NOT_OK(ExtractBlobPath(&input_path)); - src = std::string_view(input_path); - } src = internal::RemoveLeadingSlash(src); auto first_sep = src.find_first_of(kSep); if (first_sep == 0) { - return Status::IOError("Path cannot start with a separator ('", input_path, "')"); + return Status::Invalid("Path cannot start with a separator ('", input_path, "')"); } if (first_sep == std::string::npos) { return AzurePath{std::string(src), std::string(src), "", {}}; @@ -107,39 +98,14 @@ struct AzurePath { path.container = std::string(src.substr(0, first_sep)); path.path_to_file = std::string(src.substr(first_sep + 1)); path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file); - RETURN_NOT_OK(Validate(&path)); + RETURN_NOT_OK(Validate(path)); return path; } - static Status FromLocalHostString(std::string_view* src) { - // src = http://127.0.0.1:10000/accountName/pathToBlob - arrow::internal::Uri uri; - RETURN_NOT_OK(uri.Parse(src->data())); - *src = internal::RemoveLeadingSlash(uri.path()); - if (src->empty()) { - return Status::IOError("Missing account name in Azure Blob Storage URI"); - } - auto first_sep = src->find_first_of(kSep); - if (first_sep != std::string::npos) { - *src = src->substr(first_sep + 1); - } else { - *src = ""; - } - return Status::OK(); - } - - // Removes scheme, host and port from the uri - static Status ExtractBlobPath(std::string* src) { - arrow::internal::Uri uri; - RETURN_NOT_OK(uri.Parse(*src)); - *src = uri.path(); - return Status::OK(); - } - - static Status Validate(const AzurePath* path) { - auto status = internal::ValidateAbstractPathParts(path->path_to_file_parts); + static Status Validate(const AzurePath& path) { + auto status = internal::ValidateAbstractPathParts(path.path_to_file_parts); if (!status.ok()) { - return Status::Invalid(status.message(), " in path ", path->full_path); + return Status::Invalid(status.message(), " in path ", path.full_path); } else { return status; } @@ -208,8 +174,8 @@ class ObjectInputFile final : public io::RandomAccessFile { return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - // Could be either container or blob not found. - return ::arrow::fs::internal::PathNotFound(path_.full_path); + // Could be either container or blob not found. + return ::arrow::fs::internal::PathNotFound(path_.full_path); } return Status::IOError(exception.RawResponse->GetReasonPhrase()); } From f696aee09810738ffd91dd798c705b9b34aa9b88 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 14:58:38 +0100 Subject: [PATCH 22/40] Tidy path validation --- cpp/src/arrow/filesystem/azurefs.cc | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 5233bcf5a46dc..cdfcae3a6bef5 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -133,6 +133,25 @@ struct AzurePath { } }; +Status PathNotFound(const AzurePath& path) { + return ::arrow::fs::internal::PathNotFound(path.full_path); +} + +Status NotAFile(const AzurePath& path) { + return ::arrow::fs::internal::NotAFile(path.full_path); +} + +Status ValidateFilePath(const AzurePath& path) { + if (path.container.empty()) { + return PathNotFound(path); + } + + if (path.container.empty() || path.path_to_file.empty()) { + return NotAFile(path); + } + return Status::OK(); +} + template std::shared_ptr GetObjectMetadata(const ObjectResult& result) { auto md = std::make_shared(); @@ -316,12 +335,7 @@ class AzureFileSystem::Impl { Result> OpenInputFile(const std::string& s, AzureFileSystem* fs) { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); - - // TODO: Return NotAFile if path.path_to_file is empty. Return PathNotFound if account - // name or container name is empty. - if (path.empty()) { - return ::arrow::fs::internal::PathNotFound(path.full_path); - } + RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( std::move(service_client_->GetBlobContainerClient(path.container) @@ -342,6 +356,7 @@ class AzureFileSystem::Impl { } ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); + RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( std::move(service_client_->GetBlobContainerClient(path.container) From 98e019ca871436a10037d2ef0d4ee7fba43bcbf6 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 15:03:57 +0100 Subject: [PATCH 23/40] Remote un-needed includes for datalake client --- cpp/src/arrow/filesystem/azurefs.cc | 1 - cpp/src/arrow/filesystem/azurefs_test.cc | 1 - 2 files changed, 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index cdfcae3a6bef5..891e249ffd507 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -20,7 +20,6 @@ #include #include #include -#include #include "arrow/buffer.h" #include "arrow/filesystem/path_util.h" diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 8311662c0141a..1c53bb1646f49 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -52,7 +52,6 @@ #include #include #include -#include namespace arrow { using internal::TemporaryDir; From 0c585092fe797792e9aa6ba68e8ea78ba9a5e197 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 15:22:01 +0100 Subject: [PATCH 24/40] Remove one of the placeholder tests --- cpp/src/arrow/filesystem/azurefs_test.cc | 27 ------------------------ 1 file changed, 27 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 1c53bb1646f49..2ce9c4e13af54 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -124,33 +124,6 @@ AzuriteEnv* GetAzuriteEnv() { // Placeholder tests // TODO: GH-18014 Remove once a proper test is added -TEST(AzureFileSystem, UploadThenDownload) { - const std::string container_name = "sample-container"; - const std::string blob_name = "sample-blob.txt"; - const std::string blob_content = "Hello Azure!"; - - const std::string& account_name = GetAzuriteEnv()->account_name(); - const std::string& account_key = GetAzuriteEnv()->account_key(); - - auto credential = std::make_shared( - account_name, account_key); - - auto service_client = Azure::Storage::Blobs::BlobServiceClient( - std::string("http://127.0.0.1:10000/") + account_name, credential); - auto container_client = service_client.GetBlobContainerClient(container_name); - container_client.CreateIfNotExists(); - auto blob_client_ = container_client.GetBlockBlobClient(blob_name); - - std::vector buffer(blob_content.begin(), blob_content.end()); - blob_client_.UploadFrom(buffer.data(), buffer.size()); - - std::vector downloaded_content(blob_content.size()); - blob_client_.DownloadTo(downloaded_content.data(), downloaded_content.size()); - - EXPECT_EQ(std::string(downloaded_content.begin(), downloaded_content.end()), - blob_content); -} - TEST(AzureFileSystem, InitializeCredentials) { auto default_credential = std::make_shared(); auto managed_identity_credential = From 8505e9388461064df18149fc3a5128052a7996d6 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 15:28:22 +0100 Subject: [PATCH 25/40] Tidy --- cpp/src/arrow/filesystem/azurefs.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 891e249ffd507..6e5c2940360cf 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -418,8 +418,8 @@ Status AzureFileSystem::CopyFile(const std::string& src, const std::string& dest } Result> AzureFileSystem::OpenInputStream( - const std::string& s) { - return impl_->OpenInputFile(s, this); + const std::string& path) { + return impl_->OpenInputFile(path, this); } Result> AzureFileSystem::OpenInputStream( @@ -428,8 +428,8 @@ Result> AzureFileSystem::OpenInputStream( } Result> AzureFileSystem::OpenInputFile( - const std::string& s) { - return impl_->OpenInputFile(s, this); + const std::string& path) { + return impl_->OpenInputFile(path, this); } Result> AzureFileSystem::OpenInputFile( From 2b048ad2514fba4a9bcfb972b06fabaea5591215 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 19:25:11 +0100 Subject: [PATCH 26/40] Better error messges --- cpp/src/arrow/filesystem/azurefs.cc | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 6e5c2940360cf..abb0d8ad69048 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -151,6 +151,11 @@ Status ValidateFilePath(const AzurePath& path) { return Status::OK(); } +Status ErrorToStatus(const Azure::Storage::StorageException& exception, + const std::string& prefix = "") { + return Status::IOError(prefix, " Azure Error: ", exception.what()); +} + template std::shared_ptr GetObjectMetadata(const ObjectResult& result) { auto md = std::make_shared(); @@ -193,9 +198,10 @@ class ObjectInputFile final : public io::RandomAccessFile { } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { // Could be either container or blob not found. - return ::arrow::fs::internal::PathNotFound(path_.full_path); + return PathNotFound(path_); } - return Status::IOError(exception.RawResponse->GetReasonPhrase()); + return ErrorToStatus( + exception, "When fetching properties for '" + blob_client_->GetUrl() + "': "); } } @@ -275,7 +281,9 @@ class ObjectInputFile final : public io::RandomAccessFile { .Value; return result.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { - return Status::IOError(exception.RawResponse->GetReasonPhrase()); + return ErrorToStatus(exception, "When reading from '" + blob_client_->GetUrl() + + "' at position " + std::to_string(position) + + " for " + std::to_string(nbytes) + " bytes: "); } } From 7fcf9db5346f332d93a5ec135d3e17620a5392a9 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 19:33:59 +0100 Subject: [PATCH 27/40] Remove unnecessary move --- cpp/src/arrow/filesystem/azurefs.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index abb0d8ad69048..4dfeeebe77ad5 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -345,8 +345,8 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( - std::move(service_client_->GetBlobContainerClient(path.container) - .GetBlobClient(path.path_to_file))); + service_client_->GetBlobContainerClient(path.container) + .GetBlobClient(path.path_to_file)); auto ptr = std::make_shared(blob_client, fs->io_context(), path); RETURN_NOT_OK(ptr->Init()); @@ -366,8 +366,8 @@ class AzureFileSystem::Impl { RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( - std::move(service_client_->GetBlobContainerClient(path.container) - .GetBlobClient(path.path_to_file))); + service_client_->GetBlobContainerClient(path.container) + .GetBlobClient(path.path_to_file)); auto ptr = std::make_shared(blob_client, fs->io_context(), path, info.size()); From cffc9bee18dee1ae9c1244deecb3cb943f6a5e56 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Sun, 15 Oct 2023 23:58:05 +0100 Subject: [PATCH 28/40] Improve compliance with style guide --- cpp/src/arrow/filesystem/azurefs.cc | 54 +++++++++++------------- cpp/src/arrow/filesystem/azurefs_test.cc | 17 ++++---- 2 files changed, 33 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 4dfeeebe77ad5..4065b82c6c390 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -34,8 +34,6 @@ namespace arrow { namespace fs { -static const char kSep = '/'; - // ----------------------------------------------------------------------- // AzureOptions Implementation @@ -64,13 +62,13 @@ Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_n } namespace { +// An AzureFileSystem represents a single Azure storage account. AzurePath describes a +// container and path within that storage account. struct AzurePath { std::string full_path; std::string container; std::string path_to_file; std::vector path_to_file_parts; - // An AzureFileSystem represents a single Azure storage account. AzurePath describes the - // container within that storage account and path within that container. static Result FromString(const std::string& s) { // Example expected string format: testcontainer/testdir/testfile.txt @@ -85,7 +83,7 @@ struct AzurePath { auto src = internal::RemoveTrailingSlash(s); auto input_path = std::string(src.data()); src = internal::RemoveLeadingSlash(src); - auto first_sep = src.find_first_of(kSep); + auto first_sep = src.find_first_of(internal::kSep); if (first_sep == 0) { return Status::Invalid("Path cannot start with a separator ('", input_path, "')"); } @@ -118,7 +116,7 @@ struct AzurePath { if (parent.path_to_file.empty()) { parent.full_path = parent.container; } else { - parent.full_path = parent.container + kSep + parent.path_to_file; + parent.full_path = parent.container + internal::kSep + parent.path_to_file; } return parent; } @@ -151,8 +149,8 @@ Status ValidateFilePath(const AzurePath& path) { return Status::OK(); } -Status ErrorToStatus(const Azure::Storage::StorageException& exception, - const std::string& prefix = "") { +Status ErrorToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception) { return Status::IOError(prefix, " Azure Error: ", exception.what()); } @@ -166,16 +164,6 @@ std::shared_ptr GetObjectMetadata(const ObjectResult& re } class ObjectInputFile final : public io::RandomAccessFile { - protected: - std::shared_ptr blob_client_; - const io::IOContext io_context_; - AzurePath path_; - - bool closed_ = false; - int64_t pos_ = 0; - int64_t content_length_ = kNoSize; - std::shared_ptr metadata_; - public: ObjectInputFile(std::shared_ptr& blob_client, const io::IOContext& io_context, const AzurePath& path, @@ -201,7 +189,7 @@ class ObjectInputFile final : public io::RandomAccessFile { return PathNotFound(path_); } return ErrorToStatus( - exception, "When fetching properties for '" + blob_client_->GetUrl() + "': "); + "When fetching properties for '" + blob_client_->GetUrl() + "': ", exception); } } @@ -281,9 +269,10 @@ class ObjectInputFile final : public io::RandomAccessFile { .Value; return result.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { - return ErrorToStatus(exception, "When reading from '" + blob_client_->GetUrl() + - "' at position " + std::to_string(position) + - " for " + std::to_string(nbytes) + " bytes: "); + return ErrorToStatus("When reading from '" + blob_client_->GetUrl() + + "' at position " + std::to_string(position) + " for " + + std::to_string(nbytes) + " bytes: ", + exception); } } @@ -294,14 +283,14 @@ class ObjectInputFile final : public io::RandomAccessFile { // No need to allocate more than the remaining number of bytes nbytes = std::min(nbytes, content_length_ - position); - ARROW_ASSIGN_OR_RAISE(auto buf, AllocateResizableBuffer(nbytes, io_context_.pool())); + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, io_context_.pool())); if (nbytes > 0) { ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, - ReadAt(position, nbytes, buf->mutable_data())); + ReadAt(position, nbytes, buffer->mutable_data())); DCHECK_LE(bytes_read, nbytes); - RETURN_NOT_OK(buf->Resize(bytes_read)); + RETURN_NOT_OK(buffer->Resize(bytes_read)); } - return std::move(buf); + return std::move(buffer); } Result Read(int64_t nbytes, void* out) override { @@ -315,6 +304,16 @@ class ObjectInputFile final : public io::RandomAccessFile { pos_ += buffer->size(); return std::move(buffer); } + + private: + std::shared_ptr blob_client_; + const io::IOContext io_context_; + AzurePath path_; + + bool closed_ = false; + int64_t pos_ = 0; + int64_t content_length_ = kNoSize; + std::shared_ptr metadata_; }; } // namespace @@ -343,7 +342,6 @@ class AzureFileSystem::Impl { AzureFileSystem* fs) { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); RETURN_NOT_OK(ValidateFilePath(path)); - auto blob_client = std::make_shared( service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); @@ -361,10 +359,8 @@ class AzureFileSystem::Impl { if (info.type() != FileType::File && info.type() != FileType::Unknown) { return ::arrow::fs::internal::NotAFile(info.path()); } - ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); RETURN_NOT_OK(ValidateFilePath(path)); - auto blob_client = std::make_shared( service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 2ce9c4e13af54..355ed6ad24b3a 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -34,25 +34,24 @@ #include #include "arrow/filesystem/azurefs.h" -#include "arrow/util/io_util.h" -#include "arrow/util/key_value_metadata.h" - -#include -#include -#include #include #include -#include "arrow/testing/gtest_util.h" -#include "arrow/testing/util.h" - +#include +#include +#include #include #include #include #include #include +#include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" +#include "arrow/util/io_util.h" +#include "arrow/util/key_value_metadata.h" + namespace arrow { using internal::TemporaryDir; namespace fs { From 08e5206a69ec1b5e8bb13fad0e14c22510cdb877 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 16 Oct 2023 09:35:50 +0100 Subject: [PATCH 29/40] Make open file consistent with #13577 --- cpp/src/arrow/filesystem/azurefs.cc | 2 ++ cpp/src/arrow/filesystem/azurefs_test.cc | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 4065b82c6c390..040e96f6f8e5c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -340,6 +340,7 @@ class AzureFileSystem::Impl { Result> OpenInputFile(const std::string& s, AzureFileSystem* fs) { + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s)); ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( @@ -353,6 +354,7 @@ class AzureFileSystem::Impl { Result> OpenInputFile(const FileInfo& info, AzureFileSystem* fs) { + ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(info.path())); if (info.type() == FileType::NotFound) { return ::arrow::fs::internal::PathNotFound(info.path()); } diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 355ed6ad24b3a..ecf57522c18d9 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -300,6 +300,10 @@ TEST_F(TestAzureFileSystem, OpenInputStreamUri) { ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" + PreexistingObjectPath())); } +TEST_F(TestAzureFileSystem, OpenInputStreamTrailingSlash) { + ASSERT_RAISES(IOError, fs_->OpenInputStream(PreexistingObjectPath() + '/')); +} + TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { const std::string object_name = "OpenInputStreamMetadataTest/simple.txt"; From c3405f5b39f64b838c47092f419d7030e373db39 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Mon, 16 Oct 2023 09:46:52 +0100 Subject: [PATCH 30/40] Tidy --- cpp/src/arrow/filesystem/azurefs.cc | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 040e96f6f8e5c..221ef1dd74e4f 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -17,9 +17,7 @@ #include "arrow/filesystem/azurefs.h" -#include #include -#include #include "arrow/buffer.h" #include "arrow/filesystem/path_util.h" @@ -143,7 +141,7 @@ Status ValidateFilePath(const AzurePath& path) { return PathNotFound(path); } - if (path.container.empty() || path.path_to_file.empty()) { + if (path.path_to_file.empty()) { return NotAFile(path); } return Status::OK(); @@ -257,11 +255,8 @@ class ObjectInputFile final : public io::RandomAccessFile { } // Read the desired range of bytes - Azure::Storage::Blobs::DownloadBlobToOptions download_options; - Azure::Core::Http::HttpRange range; - range.Offset = position; - range.Length = nbytes; - download_options.Range = Azure::Nullable(range); + Azure::Core::Http::HttpRange range{.Offset = position, .Length = nbytes}; + Azure::Storage::Blobs::DownloadBlobToOptions download_options{.Range = range}; try { auto result = blob_client_ @@ -283,7 +278,8 @@ class ObjectInputFile final : public io::RandomAccessFile { // No need to allocate more than the remaining number of bytes nbytes = std::min(nbytes, content_length_ - position); - ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateResizableBuffer(nbytes, io_context_.pool())); + ARROW_ASSIGN_OR_RAISE(auto buffer, + AllocateResizableBuffer(nbytes, io_context_.pool())); if (nbytes > 0) { ARROW_ASSIGN_OR_RAISE(int64_t bytes_read, ReadAt(position, nbytes, buffer->mutable_data())); From cad62dfe7fc1859cf33b6921082e3b19d5cccc27 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 10:29:40 +0100 Subject: [PATCH 31/40] PR comments --- cpp/src/arrow/filesystem/azurefs.cc | 25 ++++++++++++------------ cpp/src/arrow/filesystem/azurefs_test.cc | 24 +++++++++++------------ 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 221ef1dd74e4f..04fc1f823c6c6 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -163,7 +163,7 @@ std::shared_ptr GetObjectMetadata(const ObjectResult& re class ObjectInputFile final : public io::RandomAccessFile { public: - ObjectInputFile(std::shared_ptr& blob_client, + ObjectInputFile(std::shared_ptr blob_client, const io::IOContext& io_context, const AzurePath& path, int64_t size = kNoSize) : blob_client_(std::move(blob_client)), @@ -191,14 +191,15 @@ class ObjectInputFile final : public io::RandomAccessFile { } } - Status CheckClosed() const { + Status CheckClosed(const char* action) const { if (closed_) { - return Status::Invalid("Operation on closed stream"); + return Status::Invalid("Cannot ", action, " on closed file."); } return Status::OK(); } Status CheckPosition(int64_t position, const char* action) const { + DCHECK_GE(content_length_, 0); if (position < 0) { return Status::Invalid("Cannot ", action, " from negative position"); } @@ -228,17 +229,17 @@ class ObjectInputFile final : public io::RandomAccessFile { bool closed() const override { return closed_; } Result Tell() const override { - RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckClosed("tell")); return pos_; } Result GetSize() override { - RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckClosed("size")); return content_length_; } Status Seek(int64_t position) override { - RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckClosed("seek")); RETURN_NOT_OK(CheckPosition(position, "seek")); pos_ = position; @@ -246,7 +247,7 @@ class ObjectInputFile final : public io::RandomAccessFile { } Result ReadAt(int64_t position, int64_t nbytes, void* out) override { - RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckClosed("read")); RETURN_NOT_OK(CheckPosition(position, "read")); nbytes = std::min(nbytes, content_length_ - position); @@ -258,11 +259,9 @@ class ObjectInputFile final : public io::RandomAccessFile { Azure::Core::Http::HttpRange range{.Offset = position, .Length = nbytes}; Azure::Storage::Blobs::DownloadBlobToOptions download_options{.Range = range}; try { - auto result = - blob_client_ - ->DownloadTo(reinterpret_cast(out), nbytes, download_options) - .Value; - return result.ContentRange.Length.Value(); + return blob_client_ + ->DownloadTo(reinterpret_cast(out), nbytes, download_options) + .Value.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { return ErrorToStatus("When reading from '" + blob_client_->GetUrl() + "' at position " + std::to_string(position) + " for " + @@ -272,7 +271,7 @@ class ObjectInputFile final : public io::RandomAccessFile { } Result> ReadAt(int64_t position, int64_t nbytes) override { - RETURN_NOT_OK(CheckClosed()); + RETURN_NOT_OK(CheckClosed("read")); RETURN_NOT_OK(CheckPosition(position, "read")); // No need to allocate more than the remaining number of bytes diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index ecf57522c18d9..7ffe0c4793896 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -141,27 +141,29 @@ class TestAzureFileSystem : public ::testing::Test { public: std::shared_ptr fs_; std::shared_ptr service_client_; - AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; - void MakeFileSystem() { + TestAzureFileSystem() : generator_(std::random_device()()) {} + + AzureOptions MakeOptions() { const std::string& account_name = GetAzuriteEnv()->account_name(); const std::string& account_key = GetAzuriteEnv()->account_key(); - options_.backend = AzureBackend::Azurite; - ASSERT_OK(options_.ConfigureAccountKeyCredentials(account_name, account_key)); + AzureOptions options; + options.backend = AzureBackend::Azurite; + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); + return options; } void SetUp() override { ASSERT_THAT(GetAzuriteEnv(), NotNull()); ASSERT_OK(GetAzuriteEnv()->status()); - MakeFileSystem(); - generator_ = std::mt19937_64(std::random_device()()); container_name_ = RandomChars(32); + auto options = MakeOptions(); service_client_ = std::make_shared( - options_.account_blob_url, options_.storage_credentials_provider); - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + options.account_blob_url, options.storage_credentials_provider); + ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options)); auto container_client = service_client_->GetBlobContainerClient(container_name_); container_client.CreateIfNotExists(); @@ -199,10 +201,6 @@ class TestAzureFileSystem : public ::testing::Test { return line; } - uint8_t RandomInteger() { - return std::uniform_int_distribution()(generator_); - } - std::size_t RandomIndex(std::size_t end) { return std::uniform_int_distribution(0, end - 1)(generator_); } @@ -215,7 +213,7 @@ class TestAzureFileSystem : public ::testing::Test { return s; } - void UploadLines(std::vector lines, const char* path_to_file, + void UploadLines(const std::vector& lines, const char* path_to_file, int total_size) { // TODO: Switch to using Azure filesystem to write once its implemented. auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) From 10c25e2da761fb95231696dc4a8254289f2888a5 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 18:43:16 +0100 Subject: [PATCH 32/40] Reference issue for better metadata response --- cpp/src/arrow/filesystem/azurefs_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 7ffe0c4793896..b080114b5f46f 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -314,6 +314,8 @@ TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { std::shared_ptr actual; ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); + // TODO(GH-38330): This is asserting that the user defined metadata is returned but this is + // probably not the correct behaviour. ASSERT_OK_AND_EQ("value0", actual->Get("key0")); } From b9f1eafc6bc580cbc7a21b6065d2ed84dc0362f5 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 18:49:56 +0100 Subject: [PATCH 33/40] Avoid unnecessary copying of AzurePath --- cpp/src/arrow/filesystem/azurefs.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 04fc1f823c6c6..99d22695b8d29 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -164,11 +164,10 @@ std::shared_ptr GetObjectMetadata(const ObjectResult& re class ObjectInputFile final : public io::RandomAccessFile { public: ObjectInputFile(std::shared_ptr blob_client, - const io::IOContext& io_context, const AzurePath& path, - int64_t size = kNoSize) + const io::IOContext& io_context, AzurePath path, int64_t size = kNoSize) : blob_client_(std::move(blob_client)), io_context_(io_context), - path_(path), + path_(std::move(path)), content_length_(size) {} Status Init() { @@ -342,7 +341,8 @@ class AzureFileSystem::Impl { service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); - auto ptr = std::make_shared(blob_client, fs->io_context(), path); + auto ptr = + std::make_shared(blob_client, fs->io_context(), std::move(path)); RETURN_NOT_OK(ptr->Init()); return ptr; } @@ -362,8 +362,8 @@ class AzureFileSystem::Impl { service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); - auto ptr = std::make_shared(blob_client, fs->io_context(), path, - info.size()); + auto ptr = std::make_shared(blob_client, fs->io_context(), + std::move(path), info.size()); RETURN_NOT_OK(ptr->Init()); return ptr; } From 835e6aba9cc1282fea7aa6bce21fde3ce7d996af Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 20:58:47 +0100 Subject: [PATCH 34/40] Better status message for invalid path --- cpp/src/arrow/filesystem/azurefs.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 99d22695b8d29..ea6dd121b0fdb 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -83,7 +83,7 @@ struct AzurePath { src = internal::RemoveLeadingSlash(src); auto first_sep = src.find_first_of(internal::kSep); if (first_sep == 0) { - return Status::Invalid("Path cannot start with a separator ('", input_path, "')"); + return Status::Invalid("Path cannot start with a separator ('", s, "')"); } if (first_sep == std::string::npos) { return AzurePath{std::string(src), std::string(src), "", {}}; From 8e9a985f0b7ec282b2336465e10334513444a264 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 21:11:10 +0100 Subject: [PATCH 35/40] Remove unnecessary and dangerous path string parsing --- cpp/src/arrow/filesystem/azurefs.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index ea6dd121b0fdb..ec27318c4c07c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -78,9 +78,7 @@ struct AzurePath { "Expected an Azure object path of the form 'container/path...', got a URI: '", s, "'"); } - auto src = internal::RemoveTrailingSlash(s); - auto input_path = std::string(src.data()); - src = internal::RemoveLeadingSlash(src); + const auto src = internal::RemoveTrailingSlash(s); auto first_sep = src.find_first_of(internal::kSep); if (first_sep == 0) { return Status::Invalid("Path cannot start with a separator ('", s, "')"); From 7f329cc260339c4c2fb31c4a17c04f3e461c26be Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 21:22:10 +0100 Subject: [PATCH 36/40] Update cpp/src/arrow/filesystem/azurefs_test.cc Co-authored-by: Benjamin Kietzman --- cpp/src/arrow/filesystem/azurefs_test.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index b080114b5f46f..15898f5e3634f 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -228,11 +228,8 @@ TEST_F(TestAzureFileSystem, OpenInputStreamString) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); - std::array buffer{}; - std::int64_t size; - ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); - - EXPECT_EQ(std::string(buffer.data(), size), kLoremIpsum); + ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); + EXPECT_EQ(buffer->ToString(), kLoremIpsum); } TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { From 13929d8fedb1cb02c11f56a1593c87dcc366aef7 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 21:34:46 +0100 Subject: [PATCH 37/40] Avoid designated initializers --- cpp/src/arrow/filesystem/azurefs.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index ec27318c4c07c..d700df7674c07 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -253,8 +253,8 @@ class ObjectInputFile final : public io::RandomAccessFile { } // Read the desired range of bytes - Azure::Core::Http::HttpRange range{.Offset = position, .Length = nbytes}; - Azure::Storage::Blobs::DownloadBlobToOptions download_options{.Range = range}; + Azure::Core::Http::HttpRange range{position, nbytes}; + Azure::Storage::Blobs::DownloadBlobToOptions download_options{range}; try { return blob_client_ ->DownloadTo(reinterpret_cast(out), nbytes, download_options) From f652a4fc03a708d3e3c88bbf1f12ece7e6a725a5 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 23:13:17 +0100 Subject: [PATCH 38/40] Make another test more concise --- cpp/src/arrow/filesystem/azurefs_test.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 15898f5e3634f..3c343dd51bd6b 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -254,11 +254,8 @@ TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); - std::array buffer{}; - std::int64_t size; - ASSERT_OK_AND_ASSIGN(size, stream->Read(buffer.size(), buffer.data())); - - EXPECT_EQ(std::string(buffer.data(), size), kLoremIpsum); + ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); + EXPECT_EQ(buffer->ToString(), kLoremIpsum); } TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { From 26d425edc9835d3c49da19bf3a85a4968c183f85 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 23:13:33 +0100 Subject: [PATCH 39/40] Fix a clang build warning --- cpp/src/arrow/filesystem/azurefs.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index d700df7674c07..179be069b2acf 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -254,7 +254,8 @@ class ObjectInputFile final : public io::RandomAccessFile { // Read the desired range of bytes Azure::Core::Http::HttpRange range{position, nbytes}; - Azure::Storage::Blobs::DownloadBlobToOptions download_options{range}; + Azure::Storage::Blobs::DownloadBlobToOptions download_options; + download_options.Range = range; try { return blob_client_ ->DownloadTo(reinterpret_cast(out), nbytes, download_options) From 13c0d1c956b655428835bd276041a12b33c17f3d Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 18 Oct 2023 23:35:49 +0100 Subject: [PATCH 40/40] Reference follow up github issues in TODO comments --- cpp/src/arrow/filesystem/azurefs_test.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 3c343dd51bd6b..5d454bdc33f47 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -215,7 +215,7 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const char* path_to_file, int total_size) { - // TODO: Switch to using Azure filesystem to write once its implemented. + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(path_to_file); std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); @@ -247,7 +247,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { } TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { - // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, // fs->GetFileInfo(PreexistingObjectPath())); arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); @@ -277,12 +277,12 @@ TEST_F(TestAzureFileSystem, OpenInputStreamNotFound) { } TEST_F(TestAzureFileSystem, OpenInputStreamInfoInvalid) { - // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, // fs->GetFileInfo(PreexistingBucketPath())); arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::Directory); ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); - // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, // fs->GetFileInfo(NotFoundObjectPath())); arrow::fs::FileInfo info2(PreexistingContainerPath(), FileType::NotFound); ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); @@ -308,8 +308,8 @@ TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { std::shared_ptr actual; ASSERT_OK_AND_ASSIGN(actual, stream->ReadMetadata()); - // TODO(GH-38330): This is asserting that the user defined metadata is returned but this is - // probably not the correct behaviour. + // TODO(GH-38330): This is asserting that the user defined metadata is returned but this + // is probably not the correct behaviour. ASSERT_OK_AND_EQ("value0", actual->Get("key0")); } @@ -413,7 +413,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { } TEST_F(TestAzureFileSystem, OpenInputFileInfo) { - // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, // fs->GetFileInfo(PreexistingObjectPath())); arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); @@ -434,12 +434,12 @@ TEST_F(TestAzureFileSystem, OpenInputFileNotFound) { } TEST_F(TestAzureFileSystem, OpenInputFileInfoInvalid) { - // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, // fs->GetFileInfo(PreexistingContainerPath())); arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::File); ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); - // TODO: When implemented use ASSERT_OK_AND_ASSIGN(info, + // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, // fs->GetFileInfo(NotFoundObjectPath())); arrow::fs::FileInfo info2(NotFoundObjectPath(), FileType::NotFound); ASSERT_RAISES(IOError, fs_->OpenInputFile(info2));