Skip to content

Commit

Permalink
apacheGH-38699: [C++][FS][Azure] Implement CreateDir() (apache#38708)
Browse files Browse the repository at this point in the history
### Rationale for this change

It seems that we can't create a directory explicitly without hierarchical namespace support.

It seems that Azure Blob Storage supports only virtual directory. There is no directory. If a file (blob) name has "/", it's treated that the file (blob) exists under a virtual directory.

It seems that Azure Data Lake Storage Gen2 supports a real directory.

See also:
https://learn.microsoft.com/en-us/azure/storage/blobs/storage-blobs-introduction

### What changes are included in this PR?

This change chooses the following behavior:

* Container can be created with/without hierarchical namespace support.
* Directory can be created with hierarchical namespace support.
* Directory can't be created without hierarchical namespace support. So do nothing without hierachical namespace support. (`arrow::Status::OK()` is just returned.)

### Are these changes tested?

Azurite doesn't support hierarchical namespace yet. So I can't test the implementation for hierarchical namespace yet. Sorry.

### Are there any user-facing changes?

Yes.
* Closes: apache#38699

Lead-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Sutou Kouhei <[email protected]>
Co-authored-by: Thomas Newton <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
3 people authored Nov 18, 2023
1 parent c32d226 commit a394a39
Show file tree
Hide file tree
Showing 2 changed files with 224 additions and 2 deletions.
113 changes: 112 additions & 1 deletion cpp/src/arrow/filesystem/azurefs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,19 @@ Status ValidateFilePath(const AzurePath& path) {
return Status::OK();
}

Status StatusFromErrorResponse(const std::string& url,
Azure::Core::Http::RawResponse* raw_response,
const std::string& context) {
const auto& body = raw_response->GetBody();
// There isn't an Azure specification that response body on error
// doesn't contain any binary data but we assume it. We hope that
// error response body has useful information for the error.
std::string_view body_text(reinterpret_cast<const char*>(body.data()), body.size());
return Status::IOError(context, ": ", url, ": ", raw_response->GetReasonPhrase(), " (",
static_cast<int>(raw_response->GetStatusCode()),
"): ", body_text);
}

template <typename ArrowType>
std::string FormatValue(typename TypeTraits<ArrowType>::CType value) {
struct StringAppender {
Expand Down Expand Up @@ -611,6 +624,99 @@ class AzureFileSystem::Impl {
RETURN_NOT_OK(ptr->Init());
return ptr;
}

Status CreateDir(const AzurePath& path) {
if (path.container.empty()) {
return Status::Invalid("Cannot create an empty container");
}

if (path.path_to_file.empty()) {
auto container_client =
blob_service_client_->GetBlobContainerClient(path.container);
try {
auto response = container_client.Create();
if (response.Value.Created) {
return Status::OK();
} else {
return StatusFromErrorResponse(
container_client.GetUrl(), response.RawResponse.get(),
"Failed to create a container: " + path.container);
}
} catch (const Azure::Storage::StorageException& exception) {
return internal::ExceptionToStatus(
"Failed to create a container: " + path.container + ": " +
container_client.GetUrl(),
exception);
}
}

ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled,
hierarchical_namespace_.Enabled(path.container));
if (!hierarchical_namespace_enabled) {
// Without hierarchical namespace enabled Azure blob storage has no directories.
// Therefore we can't, and don't need to create one. Simply creating a blob with `/`
// in the name implies directories.
return Status::OK();
}

auto directory_client = datalake_service_client_->GetFileSystemClient(path.container)
.GetDirectoryClient(path.path_to_file);
try {
auto response = directory_client.Create();
if (response.Value.Created) {
return Status::OK();
} else {
return StatusFromErrorResponse(
directory_client.GetUrl(), response.RawResponse.get(),
"Failed to create a directory: " + path.path_to_file);
}
} catch (const Azure::Storage::StorageException& exception) {
return internal::ExceptionToStatus(
"Failed to create a directory: " + path.path_to_file + ": " +
directory_client.GetUrl(),
exception);
}
}

Status CreateDirRecursive(const AzurePath& path) {
if (path.container.empty()) {
return Status::Invalid("Cannot create an empty container");
}

auto container_client = blob_service_client_->GetBlobContainerClient(path.container);
try {
container_client.CreateIfNotExists();
} catch (const Azure::Storage::StorageException& exception) {
return internal::ExceptionToStatus(
"Failed to create a container: " + path.container + " (" +
container_client.GetUrl() + ")",
exception);
}

ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled,
hierarchical_namespace_.Enabled(path.container));
if (!hierarchical_namespace_enabled) {
// We can't create a directory without hierarchical namespace
// support. There is only "virtual directory" without
// hierarchical namespace support. And a "virtual directory" is
// (virtually) created a blob with ".../.../blob" blob name
// automatically.
return Status::OK();
}

auto directory_client = datalake_service_client_->GetFileSystemClient(path.container)
.GetDirectoryClient(path.path_to_file);
try {
directory_client.CreateIfNotExists();
} catch (const Azure::Storage::StorageException& exception) {
return internal::ExceptionToStatus(
"Failed to create a directory: " + path.path_to_file + " (" +
directory_client.GetUrl() + ")",
exception);
}

return Status::OK();
}
};

const AzureOptions& AzureFileSystem::options() const { return impl_->options(); }
Expand All @@ -636,7 +742,12 @@ Result<FileInfoVector> AzureFileSystem::GetFileInfo(const FileSelector& select)
}

Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) {
return Status::NotImplemented("The Azure FileSystem is not fully implemented");
ARROW_ASSIGN_OR_RAISE(auto p, AzurePath::FromString(path));
if (recursive) {
return impl_->CreateDirRecursive(p);
} else {
return impl_->CreateDir(p);
}
}

Status AzureFileSystem::DeleteDir(const std::string& path) {
Expand Down
113 changes: 112 additions & 1 deletion cpp/src/arrow/filesystem/azurefs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
#include <azure/storage/common/storage_credential.hpp>
#include <azure/storage/files/datalake.hpp>

#include "arrow/filesystem/path_util.h"
#include "arrow/filesystem/test_util.h"
#include "arrow/result.h"
#include "arrow/testing/gtest_util.h"
Expand Down Expand Up @@ -225,6 +226,10 @@ class AzureFileSystemTest : public ::testing::Test {
return s;
}

std::string RandomContainerName() { return RandomChars(32); }

std::string RandomDirectoryName() { return RandomChars(32); }

void UploadLines(const std::vector<std::string>& lines, const char* path_to_file,
int total_size) {
// TODO(GH-38333): Switch to using Azure filesystem to write once its implemented.
Expand Down Expand Up @@ -267,6 +272,22 @@ class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest {
}
};

// How to enable this test:
//
// You need an Azure account. You should be able to create a free
// account at https://azure.microsoft.com/en-gb/free/ . You should be
// able to create a storage account through the portal Web UI.
//
// See also the official document how to create a storage account:
// https://learn.microsoft.com/en-us/azure/storage/blobs/create-data-lake-storage-account
//
// A few suggestions on configuration:
//
// * Use Standard general-purpose v2 not premium
// * Use LRS redundancy
// * Obviously you need to enable hierarchical namespace.
// * Set the default access tier to hot
// * SFTP, NFS and file shares are not required.
class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest {
Result<AzureOptions> MakeOptions() override {
AzureOptions options;
Expand Down Expand Up @@ -396,6 +417,96 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) {
RunGetFileInfoObjectTest();
}

TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) {
ASSERT_RAISES(Invalid, fs_->CreateDir("", false));
}

TEST_F(AzuriteFileSystemTest, CreateDirSuccessContainerOnly) {
auto container_name = RandomContainerName();
ASSERT_OK(fs_->CreateDir(container_name, false));
arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
}

TEST_F(AzuriteFileSystemTest, CreateDirSuccessContainerAndDirectory) {
const auto path = PreexistingContainerPath() + RandomDirectoryName();
ASSERT_OK(fs_->CreateDir(path, false));
// There is only virtual directory without hierarchical namespace
// support. So the CreateDir() does nothing.
arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
}

TEST_F(AzureHierarchicalNamespaceFileSystemTest, CreateDirSuccessContainerAndDirectory) {
const auto path = PreexistingContainerPath() + RandomDirectoryName();
ASSERT_OK(fs_->CreateDir(path, false));
arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
}

TEST_F(AzuriteFileSystemTest, CreateDirFailureDirectoryWithMissingContainer) {
const auto path = std::string("not-a-container/new-directory");
ASSERT_RAISES(IOError, fs_->CreateDir(path, false));
}

TEST_F(AzuriteFileSystemTest, CreateDirRecursiveFailureNoContainer) {
ASSERT_RAISES(Invalid, fs_->CreateDir("", true));
}

TEST_F(AzureHierarchicalNamespaceFileSystemTest, CreateDirRecursiveSuccessContainerOnly) {
auto container_name = RandomContainerName();
ASSERT_OK(fs_->CreateDir(container_name, true));
arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
}

TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessContainerOnly) {
auto container_name = RandomContainerName();
ASSERT_OK(fs_->CreateDir(container_name, true));
arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
}

TEST_F(AzureHierarchicalNamespaceFileSystemTest, CreateDirRecursiveSuccessDirectoryOnly) {
const auto parent = PreexistingContainerPath() + RandomDirectoryName();
const auto path = internal::ConcatAbstractPath(parent, "new-sub");
ASSERT_OK(fs_->CreateDir(path, true));
arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory);
}

TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessDirectoryOnly) {
const auto parent = PreexistingContainerPath() + RandomDirectoryName();
const auto path = internal::ConcatAbstractPath(parent, "new-sub");
ASSERT_OK(fs_->CreateDir(path, true));
// There is only virtual directory without hierarchical namespace
// support. So the CreateDir() does nothing.
arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound);
}

TEST_F(AzureHierarchicalNamespaceFileSystemTest,
CreateDirRecursiveSuccessContainerAndDirectory) {
auto container_name = RandomContainerName();
const auto parent = internal::ConcatAbstractPath(container_name, RandomDirectoryName());
const auto path = internal::ConcatAbstractPath(parent, "new-sub");
ASSERT_OK(fs_->CreateDir(path, true));
arrow::fs::AssertFileInfo(fs_.get(), path, FileType::Directory);
arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::Directory);
arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
}

TEST_F(AzuriteFileSystemTest, CreateDirRecursiveSuccessContainerAndDirectory) {
auto container_name = RandomContainerName();
const auto parent = internal::ConcatAbstractPath(container_name, RandomDirectoryName());
const auto path = internal::ConcatAbstractPath(parent, "new-sub");
ASSERT_OK(fs_->CreateDir(path, true));
// There is only virtual directory without hierarchical namespace
// support. So the CreateDir() does nothing.
arrow::fs::AssertFileInfo(fs_.get(), path, FileType::NotFound);
arrow::fs::AssertFileInfo(fs_.get(), parent, FileType::NotFound);
arrow::fs::AssertFileInfo(fs_.get(), container_name, FileType::Directory);
}

TEST_F(AzuriteFileSystemTest, CreateDirUri) {
ASSERT_RAISES(Invalid, fs_->CreateDir("abfs://" + RandomContainerName(), true));
}

TEST_F(AzuriteFileSystemTest, OpenInputStreamString) {
std::shared_ptr<io::InputStream> stream;
ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath()));
Expand Down Expand Up @@ -455,7 +566,7 @@ TEST_F(AzuriteFileSystemTest, OpenInputStreamInfoInvalid) {
}

TEST_F(AzuriteFileSystemTest, OpenInputStreamUri) {
ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" + PreexistingObjectPath()));
ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfs://" + PreexistingObjectPath()));
}

TEST_F(AzuriteFileSystemTest, OpenInputStreamTrailingSlash) {
Expand Down

0 comments on commit a394a39

Please sign in to comment.