From 2b34e37a956ac59b79e74da1dde8f037c9c88c5d Mon Sep 17 00:00:00 2001 From: Kevin Gurney Date: Wed, 20 Sep 2023 14:31:12 -0400 Subject: [PATCH] GH-37770: [MATLAB] Add CSV `TableReader` and `TableWriter` MATLAB classes (#37773) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change To enable initial CSV I/O support, this PR adds `arrow.io.csv.TableReader` and `arrow.io.csv.TableWriter` MATLAB classes to the MATLAB interface. ### What changes are included in this PR? 1. Added a new `arrow.io.csv.TableReader` class 2. Added a new `arrow.io.csv.TableWriter` class **Example** ```matlab >> matlabTableWrite = array2table(rand(3)) matlabTableWrite = 3×3 table Var1 Var2 Var3 _______ ________ _______ 0.91131 0.091595 0.24594 0.51315 0.27368 0.62119 0.42942 0.88665 0.49501 >> arrowTableWrite = arrow.table(matlabTableWrite) arrowTableWrite = Var1: double Var2: double Var3: double ---- Var1: [ [ 0.9113083542736461, 0.5131490075412158, 0.42942202968065213 ] ] Var2: [ [ 0.09159480217154525, 0.27367730380496647, 0.8866478145458545 ] ] Var3: [ [ 0.2459443412735529, 0.6211893868708748, 0.49500739584280073 ] ] >> writer = arrow.io.csv.TableWriter("example.csv") writer = TableWriter with properties: Filename: "example.csv" >> writer.write(arrowTableWrite) >> reader = arrow.io.csv.TableReader("example.csv") reader = TableReader with properties: Filename: "example.csv" >> arrowTableRead = reader.read() arrowTableRead = Var1: double Var2: double Var3: double ---- Var1: [ [ 0.9113083542736461, 0.5131490075412158, 0.42942202968065213 ] ] Var2: [ [ 0.09159480217154525, 0.27367730380496647, 0.8866478145458545 ] ] Var3: [ [ 0.2459443412735529, 0.6211893868708748, 0.49500739584280073 ] ] >> matlabTableRead = table(arrowTableRead) matlabTableRead = 3×3 table Var1 Var2 Var3 _______ ________ _______ 0.91131 0.091595 0.24594 0.51315 0.27368 0.62119 0.42942 0.88665 0.49501 >> isequal(arrowTableRead, arrowTableWrite) ans = logical 1 
>> isequal(matlabTableRead, matlabTableWrite) ans = logical 1 ``` ### Are these changes tested? Yes. 1. Added new CSV I/O tests including `test/arrow/io/csv/tRoundTrip.m` and `test/arrow/io/csv/tError.m`. 2. Both of these test classes inherit from a `CSVTest` superclass. ### Are there any user-facing changes? Yes. 1. Users can now read and write CSV files using `arrow.io.csv.TableReader` and `arrow.io.csv.TableWriter`. ### Future Directions 1. Expose [options](https://github.com/apache/arrow/blob/main/cpp/src/arrow/csv/options.h) for controlling CSV reading and writing in MATLAB. 2. Add more read/write tests for null value handling and other datatypes beyond numeric and string values. 3. Add a `RecordBatchReader` and `RecordBatchWriter` for CSV. 4. Add support for more I/O formats like Parquet, JSON, ORC, Arrow IPC, etc. ### Notes 1. Thank you @ sgilmore10 for your help with this pull request! 2. I chose to add both the `TableReader` and `TableWriter` in one pull request because it simplified testing. My apologies for the slightly lengthy pull request. 
* Closes: #37770 Lead-authored-by: Kevin Gurney Co-authored-by: Sarah Gilmore Signed-off-by: Kevin Gurney --- matlab/CMakeLists.txt | 5 +- matlab/src/cpp/arrow/matlab/error/error.h | 3 + .../arrow/matlab/io/csv/proxy/table_reader.cc | 93 ++++++++++++++++ .../arrow/matlab/io/csv/proxy/table_reader.h | 38 +++++++ .../arrow/matlab/io/csv/proxy/table_writer.cc | 86 +++++++++++++++ .../arrow/matlab/io/csv/proxy/table_writer.h | 38 +++++++ matlab/src/cpp/arrow/matlab/proxy/factory.cc | 4 + .../src/matlab/+arrow/+io/+csv/TableReader.m | 51 +++++++++ .../src/matlab/+arrow/+io/+csv/TableWriter.m | 51 +++++++++ matlab/test/arrow/io/csv/CSVTest.m | 102 ++++++++++++++++++ matlab/test/arrow/io/csv/tError.m | 73 +++++++++++++ matlab/test/arrow/io/csv/tRoundTrip.m | 62 +++++++++++ .../cmake/BuildMatlabArrowInterface.cmake | 4 +- 13 files changed, 606 insertions(+), 4 deletions(-) create mode 100644 matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc create mode 100644 matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.h create mode 100644 matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc create mode 100644 matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.h create mode 100644 matlab/src/matlab/+arrow/+io/+csv/TableReader.m create mode 100644 matlab/src/matlab/+arrow/+io/+csv/TableWriter.m create mode 100644 matlab/test/arrow/io/csv/CSVTest.m create mode 100644 matlab/test/arrow/io/csv/tError.m create mode 100644 matlab/test/arrow/io/csv/tRoundTrip.m diff --git a/matlab/CMakeLists.txt b/matlab/CMakeLists.txt index c8100a389ace0..b7af37a278536 100644 --- a/matlab/CMakeLists.txt +++ b/matlab/CMakeLists.txt @@ -34,8 +34,9 @@ function(build_arrow) set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep-prefix") set(ARROW_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep-build") - set(ARROW_CMAKE_ARGS "-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}" - "-DCMAKE_INSTALL_LIBDIR=lib" "-DARROW_BUILD_STATIC=OFF") + set(ARROW_CMAKE_ARGS + "-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}" 
"-DCMAKE_INSTALL_LIBDIR=lib" + "-DARROW_BUILD_STATIC=OFF" "-DARROW_CSV=ON") add_library(arrow_shared SHARED IMPORTED) set(ARROW_LIBRARY_TARGET arrow_shared) diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h index 4ff77da8d8360..ada9954353d9b 100644 --- a/matlab/src/cpp/arrow/matlab/error/error.h +++ b/matlab/src/cpp/arrow/matlab/error/error.h @@ -182,6 +182,9 @@ namespace arrow::matlab::error { static const char* TABLE_INVALID_NUMERIC_COLUMN_INDEX = "arrow:tabular:table:InvalidNumericColumnIndex"; static const char* FAILED_TO_OPEN_FILE_FOR_WRITE = "arrow:io:FailedToOpenFileForWrite"; static const char* FAILED_TO_OPEN_FILE_FOR_READ = "arrow:io:FailedToOpenFileForRead"; + static const char* CSV_FAILED_TO_WRITE_TABLE = "arrow:io:csv:FailedToWriteTable"; + static const char* CSV_FAILED_TO_CREATE_TABLE_READER = "arrow:io:csv:FailedToCreateTableReader"; + static const char* CSV_FAILED_TO_READ_TABLE = "arrow:io:csv:FailedToReadTable"; static const char* FEATHER_FAILED_TO_WRITE_TABLE = "arrow:io:feather:FailedToWriteTable"; static const char* TABLE_FROM_RECORD_BATCH = "arrow:table:FromRecordBatch"; static const char* FEATHER_FAILED_TO_CREATE_READER = "arrow:io:feather:FailedToCreateReader"; diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc new file mode 100644 index 0000000000000..ab9935ce145a8 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc @@ -0,0 +1,93 @@ +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "libmexclass/proxy/ProxyManager.h" + +#include "arrow/matlab/error/error.h" +#include "arrow/matlab/io/csv/proxy/table_reader.h" +#include "arrow/matlab/tabular/proxy/table.h" + +#include "arrow/util/utf8.h" + +#include "arrow/result.h" + +#include "arrow/io/file.h" +#include "arrow/io/interfaces.h" +#include "arrow/csv/reader.h" +#include "arrow/table.h" + +namespace arrow::matlab::io::csv::proxy { + + TableReader::TableReader(const std::string& filename) : filename{filename} { + REGISTER_METHOD(TableReader, read); + REGISTER_METHOD(TableReader, getFilename); + } + + libmexclass::proxy::MakeResult TableReader::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + using TableReaderProxy = arrow::matlab::io::csv::proxy::TableReader; + + mda::StructArray args = constructor_arguments[0]; + const mda::StringArray filename_utf16_mda = args[0]["Filename"]; + const auto filename_utf16 = std::u16string(filename_utf16_mda[0]); + MATLAB_ASSIGN_OR_ERROR(const auto filename, arrow::util::UTF16StringToUTF8(filename_utf16), error::UNICODE_CONVERSION_ERROR_ID); + + return std::make_shared(filename); + } + + void TableReader::read(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + using namespace libmexclass::proxy; + namespace csv = ::arrow::csv; + using TableProxy = arrow::matlab::tabular::proxy::Table; + + mda::ArrayFactory factory; + + // Create a file input stream. 
+ MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto source, arrow::io::ReadableFile::Open(filename, arrow::default_memory_pool()), context, error::FAILED_TO_OPEN_FILE_FOR_READ); + + const ::arrow::io::IOContext io_context; + const auto read_options = csv::ReadOptions::Defaults(); + const auto parse_options = csv::ParseOptions::Defaults(); + const auto convert_options = csv::ConvertOptions::Defaults(); + + // Create a TableReader from the file input stream. + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto table_reader, + csv::TableReader::Make(io_context, source, read_options, parse_options, convert_options), + context, + error::CSV_FAILED_TO_CREATE_TABLE_READER); + + // Read a Table from the file. + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto table, table_reader->Read(), context, error::CSV_FAILED_TO_READ_TABLE); + + auto table_proxy = std::make_shared(table); + const auto table_proxy_id = ProxyManager::manageProxy(table_proxy); + + const auto table_proxy_id_mda = factory.createScalar(table_proxy_id); + + context.outputs[0] = table_proxy_id_mda; + } + + void TableReader::getFilename(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + mda::ArrayFactory factory; + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto filename_utf16, arrow::util::UTF8StringToUTF16(filename), context, error::UNICODE_CONVERSION_ERROR_ID); + auto filename_utf16_mda = factory.createScalar(filename_utf16); + context.outputs[0] = filename_utf16_mda; + } + +} diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.h b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.h new file mode 100644 index 0000000000000..d5dfce50e4096 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_reader.h @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::io::csv::proxy { + + class TableReader : public libmexclass::proxy::Proxy { + public: + TableReader(const std::string& filename); + ~TableReader() {} + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void read(libmexclass::proxy::method::Context& context); + void getFilename(libmexclass::proxy::method::Context& context); + + private: + const std::string filename; + }; + +} diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc new file mode 100644 index 0000000000000..b24bd81b06681 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/matlab/io/csv/proxy/table_writer.h" +#include "arrow/matlab/tabular/proxy/table.h" +#include "arrow/matlab/error/error.h" + +#include "arrow/result.h" +#include "arrow/table.h" +#include "arrow/util/utf8.h" + +#include "arrow/io/file.h" +#include "arrow/csv/writer.h" +#include "arrow/csv/options.h" + +#include "libmexclass/proxy/ProxyManager.h" + +namespace arrow::matlab::io::csv::proxy { + + TableWriter::TableWriter(const std::string& filename) : filename{filename} { + REGISTER_METHOD(TableWriter, getFilename); + REGISTER_METHOD(TableWriter, write); + } + + libmexclass::proxy::MakeResult TableWriter::make(const libmexclass::proxy::FunctionArguments& constructor_arguments) { + namespace mda = ::matlab::data; + mda::StructArray opts = constructor_arguments[0]; + const mda::StringArray filename_mda = opts[0]["Filename"]; + using TableWriterProxy = ::arrow::matlab::io::csv::proxy::TableWriter; + + const auto filename_utf16 = std::u16string(filename_mda[0]); + MATLAB_ASSIGN_OR_ERROR(const auto filename_utf8, + arrow::util::UTF16StringToUTF8(filename_utf16), + error::UNICODE_CONVERSION_ERROR_ID); + + return std::make_shared(filename_utf8); + } + + void TableWriter::getFilename(libmexclass::proxy::method::Context& context) { + namespace mda = ::matlab::data; + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto utf16_filename, + arrow::util::UTF8StringToUTF16(filename), + context, + error::UNICODE_CONVERSION_ERROR_ID); + mda::ArrayFactory factory; + auto str_mda = factory.createScalar(utf16_filename); + context.outputs[0] = str_mda; + } + + 
void TableWriter::write(libmexclass::proxy::method::Context& context) { + namespace csv = ::arrow::csv; + namespace mda = ::matlab::data; + using TableProxy = ::arrow::matlab::tabular::proxy::Table; + + mda::StructArray opts = context.inputs[0]; + const mda::TypedArray table_proxy_id_mda = opts[0]["TableProxyID"]; + const uint64_t table_proxy_id = table_proxy_id_mda[0]; + + auto proxy = libmexclass::proxy::ProxyManager::getProxy(table_proxy_id); + auto table_proxy = std::static_pointer_cast(proxy); + auto table = table_proxy->unwrap(); + + MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(const auto output_stream, + arrow::io::FileOutputStream::Open(filename), + context, + error::FAILED_TO_OPEN_FILE_FOR_WRITE); + const auto options = csv::WriteOptions::Defaults(); + MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(csv::WriteCSV(*table, options, output_stream.get()), + context, + error::CSV_FAILED_TO_WRITE_TABLE); + } +} diff --git a/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.h b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.h new file mode 100644 index 0000000000000..b9916bd9bdc22 --- /dev/null +++ b/matlab/src/cpp/arrow/matlab/io/csv/proxy/table_writer.h @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include "libmexclass/proxy/Proxy.h" + +namespace arrow::matlab::io::csv::proxy { + + class TableWriter : public libmexclass::proxy::Proxy { + public: + TableWriter(const std::string& filename); + ~TableWriter() {} + static libmexclass::proxy::MakeResult make(const libmexclass::proxy::FunctionArguments& constructor_arguments); + + protected: + void getFilename(libmexclass::proxy::method::Context& context); + void write(libmexclass::proxy::method::Context& context); + + private: + const std::string filename; + }; + +} diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc index ebeb020a9e7c7..d1f46c7e2f71f 100644 --- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc +++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc @@ -37,6 +37,8 @@ #include "arrow/matlab/type/proxy/field.h" #include "arrow/matlab/io/feather/proxy/writer.h" #include "arrow/matlab/io/feather/proxy/reader.h" +#include "arrow/matlab/io/csv/proxy/table_writer.h" +#include "arrow/matlab/io/csv/proxy/table_reader.h" #include "factory.h" @@ -85,6 +87,8 @@ libmexclass::proxy::MakeResult Factory::make_proxy(const ClassName& class_name, REGISTER_PROXY(arrow.type.proxy.StructType , arrow::matlab::type::proxy::StructType); REGISTER_PROXY(arrow.io.feather.proxy.Writer , arrow::matlab::io::feather::proxy::Writer); REGISTER_PROXY(arrow.io.feather.proxy.Reader , arrow::matlab::io::feather::proxy::Reader); + REGISTER_PROXY(arrow.io.csv.proxy.TableWriter , arrow::matlab::io::csv::proxy::TableWriter); + REGISTER_PROXY(arrow.io.csv.proxy.TableReader , arrow::matlab::io::csv::proxy::TableReader); return libmexclass::error::Error{error::UNKNOWN_PROXY_ERROR_ID, "Did not find matching C++ proxy for " + class_name}; }; diff --git a/matlab/src/matlab/+arrow/+io/+csv/TableReader.m b/matlab/src/matlab/+arrow/+io/+csv/TableReader.m new file mode 100644 index 0000000000000..1e0308bb8d4fe --- /dev/null +++ b/matlab/src/matlab/+arrow/+io/+csv/TableReader.m @@ -0,0 
+1,51 @@ +%TABLEREADER Reads tabular data from a CSV file into an arrow.tabular.Table. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. + +classdef TableReader + + properties (GetAccess=public, SetAccess=private, Hidden) + Proxy + end + + properties (Dependent, SetAccess=private, GetAccess=public) + Filename + end + + methods + + function obj = TableReader(filename) + arguments + filename (1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText} + end + + args = struct(Filename=filename); + obj.Proxy = arrow.internal.proxy.create("arrow.io.csv.proxy.TableReader", args); + end + + function table = read(obj) + tableProxyID = obj.Proxy.read(); + proxy = libmexclass.proxy.Proxy(Name="arrow.tabular.proxy.Table", ID=tableProxyID); + table = arrow.tabular.Table(proxy); + end + + function filename = get.Filename(obj) + filename = obj.Proxy.getFilename(); + end + + end + +end \ No newline at end of file diff --git a/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m b/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m new file mode 100644 index 0000000000000..eb1aafe08f545 --- /dev/null +++ b/matlab/src/matlab/+arrow/+io/+csv/TableWriter.m @@ -0,0 +1,51 @@ +%TABLEWRITER Writes tabular data in an arrow.tabular.Table to a CSV file. 
+ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef TableWriter < matlab.mixin.Scalar + + properties(Hidden, SetAccess=private, GetAccess=public) + Proxy + end + + properties(Dependent, SetAccess=private, GetAccess=public) + Filename + end + + methods + function obj = TableWriter(filename) + arguments + filename (1, 1) string {mustBeNonmissing, mustBeNonzeroLengthText} + end + + args = struct(Filename=filename); + proxyName = "arrow.io.csv.proxy.TableWriter"; + obj.Proxy = arrow.internal.proxy.create(proxyName, args); + end + + function write(obj, table) + arguments + obj (1, 1) arrow.io.csv.TableWriter + table (1, 1) arrow.tabular.Table + end + args = struct(TableProxyID=table.Proxy.ID); + obj.Proxy.write(args); + end + + function filename = get.Filename(obj) + filename = obj.Proxy.getFilename(); + end + end +end diff --git a/matlab/test/arrow/io/csv/CSVTest.m b/matlab/test/arrow/io/csv/CSVTest.m new file mode 100644 index 0000000000000..49f77eaaa7c63 --- /dev/null +++ b/matlab/test/arrow/io/csv/CSVTest.m @@ -0,0 +1,102 @@ +%CSVTEST Super class for CSV related tests. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. 
See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef CSVTest < matlab.unittest.TestCase + + properties + Filename + end + + methods (TestClassSetup) + + function initializeProperties(~) + % Seed the random number generator. + rng(1); + end + + end + + methods (TestMethodSetup) + + function setupTestFilename(testCase) + import matlab.unittest.fixtures.TemporaryFolderFixture + fixture = testCase.applyFixture(TemporaryFolderFixture); + testCase.Filename = fullfile(fixture.Folder, "filename.csv"); + end + + end + + methods + + function verifyRoundTrip(testCase, arrowTable) + import arrow.io.csv.* + + writer = TableWriter(testCase.Filename); + reader = TableReader(testCase.Filename); + + writer.write(arrowTable); + arrowTableRead = reader.read(); + + testCase.verifyEqual(arrowTableRead, arrowTable); + end + + function arrowTable = makeArrowTable(testCase, opts) + arguments + testCase + opts.Type + opts.ColumnNames + opts.NumRows + opts.WithNulls (1, 1) logical = false + end + + if opts.Type == "numeric" + matlabTable = array2table(rand(opts.NumRows, numel(opts.ColumnNames))); + elseif opts.Type == "string" + matlabTable = array2table("A" + rand(opts.NumRows, numel(opts.ColumnNames)) + "B"); + end + + if opts.WithNulls + matlabTable = testCase.setNullValues(matlabTable, NullPercentage=0.2); + end + + arrays = cell(1, width(matlabTable)); + 
for ii = 1:width(matlabTable) + arrays{ii} = arrow.array(matlabTable.(ii)); + end + arrowTable = arrow.tabular.Table.fromArrays(arrays{:}, ColumnNames=opts.ColumnNames); + end + + function tWithNulls = setNullValues(testCase, t, opts) + arguments + testCase %#ok + t table + opts.NullPercentage (1, 1) double {mustBeGreaterThanOrEqual(opts.NullPercentage, 0)} = 0.5 + end + + tWithNulls = t; + for ii = 1:width(t) + temp = tWithNulls.(ii); + numValues = numel(temp); + numNulls = uint64(opts.NullPercentage * numValues); + nullIndices = randperm(numValues, numNulls); + temp(nullIndices) = missing; + tWithNulls.(ii) = temp; + end + end + + end + +end diff --git a/matlab/test/arrow/io/csv/tError.m b/matlab/test/arrow/io/csv/tError.m new file mode 100644 index 0000000000000..24c420e7ba2dd --- /dev/null +++ b/matlab/test/arrow/io/csv/tError.m @@ -0,0 +1,73 @@ +%TERROR Error tests for CSV. + +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. 
+classdef tError < CSVTest + + methods(Test) + + function EmptyFile(testCase) + import arrow.io.csv.* + + arrowTableWrite = arrow.table(); + + writer = TableWriter(testCase.Filename); + reader = TableReader(testCase.Filename); + + writer.write(arrowTableWrite); + fcn = @() reader.read(); + testCase.verifyError(fcn, "arrow:io:csv:FailedToReadTable"); + end + + function InvalidWriterFilenameType(testCase) + import arrow.io.csv.* + fcn = @() TableWriter(table); + testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); + fcn = @() TableWriter(["a", "b"]); + testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); + end + + function InvalidReaderFilenameType(testCase) + import arrow.io.csv.* + fcn = @() TableReader(table); + testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); + fcn = @() TableReader(["a", "b"]); + testCase.verifyError(fcn, "MATLAB:validation:IncompatibleSize"); + end + + function InvalidWriterWriteType(testCase) + import arrow.io.csv.* + writer = TableWriter(testCase.Filename); + fcn = @() writer.write("text"); + testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert"); + end + + function WriterFilenameNoSetter(testCase) + import arrow.io.csv.* + writer = TableWriter(testCase.Filename); + fcn = @() setfield(writer, "Filename", "filename.csv"); + testCase.verifyError(fcn, "MATLAB:class:SetProhibited"); + end + + function ReaderFilenameNoSetter(testCase) + import arrow.io.csv.* + reader = TableReader(testCase.Filename); + fcn = @() setfield(reader, "Filename", "filename.csv"); + testCase.verifyError(fcn, "MATLAB:class:SetProhibited"); + end + + end + +end \ No newline at end of file diff --git a/matlab/test/arrow/io/csv/tRoundTrip.m b/matlab/test/arrow/io/csv/tRoundTrip.m new file mode 100644 index 0000000000000..cb35822580106 --- /dev/null +++ b/matlab/test/arrow/io/csv/tRoundTrip.m @@ -0,0 +1,62 @@ +%TROUNDTRIP Round trip tests for CSV. 
+ +% Licensed to the Apache Software Foundation (ASF) under one or more +% contributor license agreements. See the NOTICE file distributed with +% this work for additional information regarding copyright ownership. +% The ASF licenses this file to you under the Apache License, Version +% 2.0 (the "License"); you may not use this file except in compliance +% with the License. You may obtain a copy of the License at +% +% http://www.apache.org/licenses/LICENSE-2.0 +% +% Unless required by applicable law or agreed to in writing, software +% distributed under the License is distributed on an "AS IS" BASIS, +% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +% implied. See the License for the specific language governing +% permissions and limitations under the License. +classdef tRoundTrip < CSVTest + + properties (TestParameter) + NumRows = { ... + 2, ... + 10, ... + 100 ... + } + WithNulls = { ... + true, ... + false ... + } + ColumnNames = {... + ["A", "B", "C"], ... + ["😀", "🌲", "🥭", " ", "ABC"], ... + [" ", " ", " "] + } + end + + methods(Test) + + function Numeric(testCase, NumRows, WithNulls, ColumnNames) + arrowTable = testCase.makeArrowTable(... + Type="numeric", ... + NumRows=NumRows, ... + WithNulls=WithNulls, ... + ColumnNames=ColumnNames ... + ); + + testCase.verifyRoundTrip(arrowTable); + end + + function String(testCase, NumRows, ColumnNames) + arrowTable = testCase.makeArrowTable(... + Type="string", ... + NumRows=NumRows, ... + WithNulls=false, ... + ColumnNames=ColumnNames ... 
+ ); + + testCase.verifyRoundTrip(arrowTable); + end + + end + +end \ No newline at end of file diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake index 40c6b5a51d4fe..294612dda370f 100644 --- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake +++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake @@ -70,10 +70,10 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/type/proxy/wrap.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/writer.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/feather/proxy/reader.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_writer.cc" + "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/io/csv/proxy/table_reader.cc" "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/index/validate.cc") - - set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_INCLUDE_DIR "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_FACTORY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/proxy/factory.cc") set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_INCLUDE_DIRS ${MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_LIBRARY_ROOT_INCLUDE_DIR}