From b5e715a0dbad957d4d3309aefe4403db75a8afa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 11 Jul 2022 13:52:32 +0200 Subject: [PATCH 01/12] Add new options to the Delimited format --- .../Table/0.0.0-dev/src/Io/File_Format.enso | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index a1e8cf0938c3..e8865360f624 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -109,7 +109,16 @@ type Delimited - keep_invalid_rows: Specifies whether rows that contain less or more columns than expected should be kept (setting the missing columns to `Nothing` or dropping the excess columns) or dropped. - type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Boolean|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) + - line_separator: Sets the line separator to use. Defaults to `Nothing` + which infers the separator used in a given file in read mode and uses + the system-default for writing. + - comment_character: Sets the character which indicates the start of a + comment within a delimited file. Any content in the current line after + the comment is ignored and lines that only consist of a comment are + skipped. This option is only applicable for read mode and does not + affect writing. It defaults to `Nothing` which means that comments are + disabled. 
+ type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Boolean|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) (line_separator:Text|Nothing=Nothing) (comment_character:Text|Nothing=Nothing) ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior -> Any @@ -124,9 +133,9 @@ type Delimited ## PRIVATE Clone the instance with some properties overridden. Note: This function is internal until such time as Atom cloning with modification is built into Enso. - clone : Text->Text->(Boolean|Infer)->Data_Formatter->Boolean->Delimited - clone (quote_style=self.quote_style) (headers=self.headers) (value_formatter=self.value_formatter) (keep_invalid_rows=self.keep_invalid_rows) = - Delimited self.delimiter self.encoding self.skip_rows self.row_limit quote_style headers value_formatter keep_invalid_rows + clone : Text->Text->(Boolean|Infer)->Data_Formatter->Boolean->(Text|Nothing)->(Text|Nothing)->Delimited + clone (quote_style=self.quote_style) (headers=self.headers) (value_formatter=self.value_formatter) (keep_invalid_rows=self.keep_invalid_rows) (line_separator=self.line_separator) (comment_character=self.comment_character) = + Delimited self.delimiter self.encoding self.skip_rows self.row_limit quote_style headers value_formatter keep_invalid_rows line_separator comment_character ## Create a clone of this with specified quoting settings. with_quotes : Text->Text->Boolean->Delimited @@ -159,6 +168,21 @@ type Delimited without_parsing = self.clone value_formatter=Nothing + ## Creates a clone of this with a changed line separator. + with_line_separator : Text|Nothing -> Delimited + with_line_separator line_separator=System.default_line_separator = + self.clone line_separator=line_separator + + ## Creates a clone of this with comment parsing enabled. 
+ with_comments : Text -> Delimited + with_comments comment_character='#' = + self.clone comment_character=comment_character + + ## Creates a clone of this with comment parsing disabled. + without_comments : Delimited + without_comments = + self.clone comment_character=Nothing + ## A setting to infer the default behaviour of some option. type Infer From ef192088df816d40a8dcd80ca9e96b7ef8f46cff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 11 Jul 2022 18:24:35 +0200 Subject: [PATCH 02/12] Initial implementation of comment_character and line_separator --- .../Standard/Base/0.0.0-dev/src/System.enso | 9 +++++ .../src/Internal/Delimited_Reader.enso | 2 +- .../src/Internal/Delimited_Writer.enso | 3 +- .../Table/0.0.0-dev/src/Io/File_Format.enso | 1 + .../org/enso/table/read/DelimitedReader.java | 37 +++++++++++++------ 5 files changed, 39 insertions(+), 13 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/System.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/System.enso index 7b3bf2e5474a..7edd221d0ee3 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/System.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/System.enso @@ -1,5 +1,9 @@ ## Functionality for interacting with the host system. +from Standard.Base import all + +polyglot java import java.lang.System as Java_System + ## PRIVATE Create a system process, returning the exit code, and the outputs to both @@ -46,6 +50,11 @@ nano_time = @Builtin_Method "System.nano_time" os : Text os = @Builtin_Method "System.os" +## Returns the default line separator for the platform that the program is + currently running on. +default_line_separator : Text +default_line_separator = Java_System.lineSeparator + ## PRIVATE The type representing the result of a subprocess exiting. 
diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso index 1d003f54f653..d527f4bc65f3 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -120,7 +120,7 @@ prepare_delimited_reader java_reader format max_columns on_problems = cell_type_guesser = if format.headers != Infer then Nothing else formatter = format.value_formatter.if_nothing Data_Formatter TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new - DelimitedReader.new java_reader format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows warnings_as_errors + DelimitedReader.new java_reader format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows format.line_separator format.comment_character warnings_as_errors translate_reader_problem problem = invalid_row = [InvalidRow, (java_problem-> Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row))] diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso index b9943cb9d3a6..912a1011dbde 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso @@ -2,6 +2,7 @@ from Standard.Base import all import Standard.Table import Standard.Base.Error.Common as Errors +import Standard.Base.System from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior import 
Standard.Base.System.File.Existing_File_Behavior from Standard.Table.Errors as Table_Errors import Duplicate_Output_Column_Names, Invalid_Output_Column_Names, Invalid_Row, Mismatched_Quote, Parser_Error, Additional_Invalid_Rows, Column_Count_Mismatch, Column_Name_Mismatch @@ -129,7 +130,7 @@ write_to_writer table format java_writer = Quote_Style.No_Quotes -> Pair Nothing Nothing Quote_Style.With_Quotes _ quote quote_escape -> Pair quote quote_escape write_headers = should_write_headers format.headers - new_line = '\n' + new_line = format.line_separator.if_nothing System.default_line_separator writer = DelimitedWriter.new java_writer column_formatters.to_array format.delimiter new_line quote_characters.first quote_characters.second quote_behavior write_headers writer.write table.java_table diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index e8865360f624..5515399809fb 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -2,6 +2,7 @@ from Standard.Base import all import Standard.Table import Standard.Base.Error.Common as Errors +import Standard.Base.System import Standard.Table.Data.Match_Columns from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 42196b4b2309..b7952f0c2738 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -87,6 +87,8 @@ public DelimitedReader( DatatypeParser valueParser, TypeInferringParser cellTypeGuesser, boolean keepInvalidRows, + String newline, + String 
commentCharacter, boolean warningsAsErrors) { if (delimiter.isEmpty()) { throw new IllegalArgumentException("Empty delimiters are not supported."); @@ -140,23 +142,43 @@ public DelimitedReader( this.valueParser = valueParser; this.cellTypeGuesser = cellTypeGuesser; - parser = setupCsvParser(input); + parser = setupCsvParser(input, newline, commentCharacter); } /** Creates a {@code CsvParser} according to the settings specified at construction. */ - private CsvParser setupCsvParser(Reader input) { + private CsvParser setupCsvParser(Reader input, String newline, String commentCharacter) { CsvParserSettings settings = new CsvParserSettings(); settings.setHeaderExtractionEnabled(false); CsvFormat format = new CsvFormat(); format.setDelimiter(delimiter); format.setQuote(quoteCharacter); format.setQuoteEscape(quoteEscapeCharacter); - settings.setFormat(format); settings.setMaxCharsPerColumn(-1); settings.setMaxColumns(maxColumns); settings.setSkipEmptyLines(false); settings.setKeepQuotes(true); - settings.setLineSeparatorDetectionEnabled(true); + + if (newline == null) { + settings.setLineSeparatorDetectionEnabled(true); + } else { + if (newline.length() > 2 || newline.isEmpty()) { + throw new IllegalArgumentException("The newline sequence should consist of at least 1 and at most 2 characters (codepoints)."); + } + format.setLineSeparator(newline); + } + + if (commentCharacter == null) { + format.setComment('\0'); + } else { + if (commentCharacter.length() != 1) { + throw new IllegalArgumentException("The comment character should be set to Nothing or consist of exactly one character (codepoint)."); + } + + format.setComment(commentCharacter.charAt(0)); + } + + settings.setFormat(format); + settings.setNumberOfRowsToSkip(skipRows); CsvParser parser = new CsvParser(settings); parser.beginParsing(input); return parser; @@ -333,7 +355,6 @@ private void ensureHeadersDetected() { } private void detectHeaders() { - skipFirstRows(); Row firstRow = loadNextRow(); if (firstRow == 
null) { effectiveColumnNames = new String[0]; @@ -387,12 +408,6 @@ private void detectHeaders() { } } - private void skipFirstRows() { - for (long i = 0; i < skipRows; ++i) { - loadNextRow(); - } - } - /** Reads the input stream and returns a Table. */ public WithProblems read() { ensureHeadersDetected(); From 1689e0cbeae9e9df87546c4f837543661ac7b115 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Mon, 11 Jul 2022 22:36:37 +0200 Subject: [PATCH 03/12] WIP tests, autodetection for append --- .../src/Internal/Delimited_Reader.enso | 16 ++++++++--- .../src/Internal/Delimited_Writer.enso | 5 +++- .../org/enso/table/read/DelimitedReader.java | 28 +++++++++++++++---- test/Table_Tests/src/Delimited_Read_Spec.enso | 10 ++++++- .../Table_Tests/src/Delimited_Write_Spec.enso | 25 +++++++++++++++++ 5 files changed, 73 insertions(+), 11 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso index d527f4bc65f3..d9b724ec931d 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -146,11 +146,17 @@ type Detected_Headers ## Indicates that the file exists but no headers have been found, so only positional column matching is possible. type No_Headers (column_count : Integer) +## PRIVATE + An internal type representing metadata describing the format of a specific + Delimited file. +type Detected_File_Metadata + type Detected_File_Metadata (headers : Detected_Headers) (line_separator : Text|Nothing) + ## PRIVATE Reads the beginning of the file to detect the existing headers and column count. 
-detect_headers : File -> File_Format.Delimited -> Detected_Headers -detect_headers file format = +detect_metadata : File -> File_Format.Delimited -> Detected_Headers +detect_metadata file format = on_problems = Ignore result = handle_io_exception file <| handle_illegal_arguments <| handle_parsing_failure <| handle_parsing_exception <| file.with_input_stream [File.Option.Read] stream-> @@ -161,13 +167,15 @@ detect_headers file format = reached and restart parsing with an increased limit. reader = prepare_delimited_reader java_reader format max_columns=default_max_columns on_problems defined_columns = reader.getDefinedColumnNames - case defined_columns of + headers = case defined_columns of Nothing -> column_count = reader.getColumnCount if column_count == 0 then Nothing else No_Headers column_count _ -> Existing_Headers (Vector.Vector defined_columns) - result.catch File.File_Not_Found (_->Nothing) + line_separator = reader.getEffectiveLineSeparator + Detected_File_Metadata headers line_separator + result.catch File.File_Not_Found (_->(Detected_File_Metadata Nothing Nothing)) ## PRIVATE handle_illegal_arguments = diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso index 912a1011dbde..d3ca1fb66519 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso @@ -50,7 +50,10 @@ write_file table format file on_existing_file match_columns on_problems = append_to_file : Table -> File_Format.Delimited -> File -> Match_Columns -> Problem_Behavior -> Any append_to_file table format file match_columns on_problems = Column_Name_Mismatch.handle_java_exception <| Column_Count_Mismatch.handle_java_exception <| - preexisting_headers = Delimited_Reader.detect_headers file format + metadata = Delimited_Reader.detect_metadata file format + 
preexisting_headers = metadata.headers + effective_line_separator = metadata.line_separator + # TODO the separator will switch to ADT so we need to pass it differently reordered_java_table = case preexisting_headers of Nothing -> table.java_table Existing_Headers column_names -> case match_columns of diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index b7952f0c2738..1170a337da2d 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -41,6 +41,7 @@ public class DelimitedReader { private final DatatypeParser valueParser; private final TypeInferringParser cellTypeGuesser; private final boolean keepInvalidRows; + private final String newlineSetting; private final boolean warningsAsErrors; private final NoOpProblemAggregator noOpProblemAggregator = new NoOpProblemAggregator(); private long invalidRowsCount = 0; @@ -71,6 +72,8 @@ public class DelimitedReader { * @param cellTypeGuesser a helper used to guess cell types, used for the purpose of inferring the * headers, it must not be null if {@code headerBehavior} is set to {@code INFER}. 
* @param keepInvalidRows specifies whether to keep rows that had an unexpected number of columns + * @param newline specifies what newline character to assume; if set to null, the newline character is autodetected + * @param commentCharacter specifies what character indicates start of comments; if set to null, comments are disabled * @param warningsAsErrors specifies if the first warning should be immediately raised as an error * (used as a fast-path for the error-reporting mode to avoid computing a value that is going * to be discarded anyway) @@ -142,11 +145,12 @@ public DelimitedReader( this.valueParser = valueParser; this.cellTypeGuesser = cellTypeGuesser; - parser = setupCsvParser(input, newline, commentCharacter); + this.newlineSetting = newline; + parser = setupCsvParser(input, commentCharacter); } /** Creates a {@code CsvParser} according to the settings specified at construction. */ - private CsvParser setupCsvParser(Reader input, String newline, String commentCharacter) { + private CsvParser setupCsvParser(Reader input, String commentCharacter) { CsvParserSettings settings = new CsvParserSettings(); settings.setHeaderExtractionEnabled(false); CsvFormat format = new CsvFormat(); @@ -158,13 +162,13 @@ private CsvParser setupCsvParser(Reader input, String newline, String commentCha settings.setSkipEmptyLines(false); settings.setKeepQuotes(true); - if (newline == null) { + if (newlineSetting == null) { settings.setLineSeparatorDetectionEnabled(true); } else { - if (newline.length() > 2 || newline.isEmpty()) { + if (newlineSetting.length() > 2 || newlineSetting.isEmpty()) { throw new IllegalArgumentException("The newline sequence should consist of at least 1 and at most 2 characters (codepoints)."); } - format.setLineSeparator(newline); + format.setLineSeparator(newlineSetting); } if (commentCharacter == null) { @@ -348,6 +352,20 @@ public int getColumnCount() { return effectiveColumnNames.length; } + /** Returns the line separator used in the file. 
+ * + * If a specific separator is set at construction, it is just returned. If it + * was set to null, the separator inferred from the file contents is returned. + */ + public String getEffectiveLineSeparator() { + if (newlineSetting != null) { + return newlineSetting; + } else { + ensureHeadersDetected(); + return parser.getDetectedFormat().getLineSeparatorString(); + } + } + private void ensureHeadersDetected() { if (effectiveColumnNames == null) { detectHeaders(); diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index bfc7b71e3f61..847d60c157d6 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -129,6 +129,8 @@ spec = 'a,b,c\nd,e,f\r1,2,3'.write (path 'mixed.csv') File.read (path 'mixed.csv') (Delimited "," headers=True value_formatter=Nothing) Problem_Behavior.Report_Error . should_fail_with Invalid_Row + # TODO add a test where mixed is read differently by overriding line separator + ['crlf.csv', 'lf.csv', 'cr.csv', 'mixed.csv'].each (path >> .delete) Test.specify "should work with Windows-1252 encoding" <| @@ -172,7 +174,6 @@ spec = problems_2 = [Encoding_Error "Encoding issues at byte 22."] Problems.test_problem_handling action_2 problems_2 tester_2 - Test.specify "should handle duplicated columns" <| action on_problems = File.read (enso_project.data / "duplicated_columns.csv") (Delimited "," headers=True value_formatter=Nothing) on_problems tester table = @@ -358,4 +359,11 @@ spec = Delimited ',' . with_parsing custom_formatter . should_equal (Delimited ',' value_formatter=custom_formatter) Delimited ',' row_limit=456 . without_parsing . 
should_equal (Delimited ',' value_formatter=Nothing row_limit=456) + Test.specify "should be able to read column names starting with #" <| + Nothing + + Test.specify "should be able to handle comments if enabled" <| + # TODO check # and other comment style; check both ignored whole lines and parts of lines + Nothing + main = Test.Suite.run_main spec diff --git a/test/Table_Tests/src/Delimited_Write_Spec.enso b/test/Table_Tests/src/Delimited_Write_Spec.enso index fc740ceb61da..e809a508f374 100644 --- a/test/Table_Tests/src/Delimited_Write_Spec.enso +++ b/test/Table_Tests/src/Delimited_Write_Spec.enso @@ -40,9 +40,14 @@ spec = 2,1.5,y,2 3,2.2,z,[[[My Type :: 10]]] text = File.read_text file + # TODO check that system default is used text.should_equal expected_text+'\n' file.delete + Test.specify "should allow to specify line ending style" <| + # TODO check manual overrides of all 3 styles + Nothing + Test.specify "should be able to write an empty table" <| table = Table.new [] file = (enso_project.data / "transient" / "empty.csv") @@ -324,4 +329,24 @@ spec = file.delete + Test.specify "should be able to correctly append to a file that contains no actual data but only commented out lines" <| + # TODO + Nothing + + Test.specify "should use the same line ending style as existing data when appending" <| + # TODO check all 3 styles - initial manually set, appending autodetected + Nothing + + Test.specify "should use the system default line ending style when appending to an empty or nonexistent file" <| + # TODO + Nothing + + Test.specify "should use the existing line ending style when appending to a file consisting of only comments" <| + # TODO + Nothing + + Test.specify "should use the explicitly specified line ending style regardless of the detected one" <| + # TODO write in one style then append in other + Nothing + main = Test.Suite.run_main spec From aa5a5349e6cb0e6c2c6217fcf7ddc884410aa484 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: 
Tue, 12 Jul 2022 13:45:15 +0200 Subject: [PATCH 04/12] Move to line separator ADT --- .../Standard/Base/0.0.0-dev/src/System.enso | 3 +- .../src/Internal/Delimited_Reader.enso | 4 ++- .../src/Internal/Delimited_Writer.enso | 24 +++++++++------ .../Table/0.0.0-dev/src/Io/File_Format.enso | 21 +++++++------- .../0.0.0-dev/src/Io/Line_Ending_Style.enso | 29 +++++++++++++++++++ 5 files changed, 60 insertions(+), 21 deletions(-) create mode 100644 distribution/lib/Standard/Table/0.0.0-dev/src/Io/Line_Ending_Style.enso diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/System.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/System.enso index 7edd221d0ee3..ee37ed2d7388 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/System.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/System.enso @@ -50,7 +50,8 @@ nano_time = @Builtin_Method "System.nano_time" os : Text os = @Builtin_Method "System.os" -## Returns the default line separator for the platform that the program is +## PRIVATE + Returns the default line separator for the platform that the program is currently running on. 
default_line_separator : Text default_line_separator = Java_System.lineSeparator diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso index d9b724ec931d..b2a337513812 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Reader.enso @@ -8,6 +8,7 @@ from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encodi from Standard.Table.Io.File_Format import Infer from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter import Standard.Table.Io.Quote_Style +from Standard.Table.Io.Line_Ending_Style import line_separator_sequence polyglot java import org.enso.table.read.DelimitedReader polyglot java import org.enso.table.read.ParsingFailedException @@ -120,7 +121,8 @@ prepare_delimited_reader java_reader format max_columns on_problems = cell_type_guesser = if format.headers != Infer then Nothing else formatter = format.value_formatter.if_nothing Data_Formatter TypeInferringParser.new formatter.get_specific_type_parsers.to_array IdentityParser.new - DelimitedReader.new java_reader format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows format.line_separator format.comment_character warnings_as_errors + newline = line_separator_sequence format.line_endings + DelimitedReader.new java_reader format.delimiter quote_characters.first quote_characters.second java_headers format.skip_rows row_limit max_columns value_parser cell_type_guesser format.keep_invalid_rows newline format.comment_character warnings_as_errors translate_reader_problem problem = invalid_row = [InvalidRow, (java_problem-> Invalid_Row java_problem.source_row java_problem.table_index (Vector.Vector java_problem.row))] diff --git 
a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso index d3ca1fb66519..84aadfcb471f 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso @@ -11,6 +11,7 @@ from Standard.Table.Io.File_Format import Infer from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter import Standard.Table.Data.Storage import Standard.Table.Io.Quote_Style +from Standard.Table.Io.Line_Ending_Style import line_separator_sequence from Standard.Table.Internal.Delimited_Reader import Existing_Headers, No_Headers import Standard.Table.Data.Match_Columns @@ -53,7 +54,6 @@ append_to_file table format file match_columns on_problems = metadata = Delimited_Reader.detect_metadata file format preexisting_headers = metadata.headers effective_line_separator = metadata.line_separator - # TODO the separator will switch to ADT so we need to pass it differently reordered_java_table = case preexisting_headers of Nothing -> table.java_table Existing_Headers column_names -> case match_columns of @@ -73,7 +73,7 @@ append_to_file table format file match_columns on_problems = True -> format.with_headers False -> format.without_headers Existing_File_Behavior.Append.write file stream-> - write_to_stream reordered_table amended_format stream on_problems related_file=file + write_to_stream reordered_table amended_format stream on_problems related_file=file separator_override=effective_line_separator ## PRIVATE Returns a Text value representing the table in the delimited format. @@ -96,14 +96,16 @@ write_text table format = If set to `Ignore`, the operation proceeds without errors or warnings. - related_file: The file related to the provided `java_stream`, if available, or `Nothing`. It is used for more detailed error reporting. 
-write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Any -write_to_stream table format stream on_problems related_file=Nothing = + - separator_override: An optional override for the line separator to use + instead of the one from `format`. +write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Text | Nothing -> Any +write_to_stream table format stream on_problems related_file=Nothing separator_override=Nothing = handle_io_exception ~action = Panic.catch IOException action caught_panic-> Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause) handle_io_exception <| stream.with_stream_encoder format.encoding on_problems reporting_stream_encoder-> - write_to_writer table format reporting_stream_encoder + write_to_writer table format reporting_stream_encoder separator_override=separator_override ## PRIVATE Writes data to the provided `Writer` according to the provided format. @@ -115,8 +117,10 @@ write_to_stream table format stream on_problems related_file=Nothing = - table: The table to serialize. - format: The specification of the delimited file format. - java_writer: A Java `Writer` to which characters will be written. -write_to_writer : Table -> File_Format.Delimited -> Writer -> Any -write_to_writer table format java_writer = + - separator_override: An optional override for the line separator to use + instead of the one from `format`. 
+write_to_writer : Table -> File_Format.Delimited -> Writer -> Text | Nothing -> Any +write_to_writer table format java_writer separator_override=Nothing = column_formatters = Panic.recover Illegal_Argument_Error <| case format.value_formatter of Nothing -> table.columns.map column-> case column.storage_type of Storage.Text -> TextFormatter.new @@ -133,8 +137,10 @@ write_to_writer table format java_writer = Quote_Style.No_Quotes -> Pair Nothing Nothing Quote_Style.With_Quotes _ quote quote_escape -> Pair quote quote_escape write_headers = should_write_headers format.headers - new_line = format.line_separator.if_nothing System.default_line_separator - writer = DelimitedWriter.new java_writer column_formatters.to_array format.delimiter new_line quote_characters.first quote_characters.second quote_behavior write_headers + newline = separator_override.if_nothing <| + separator_from_format = line_separator_sequence format.line_endings + separator_from_format.if_nothing System.default_line_separator + writer = DelimitedWriter.new java_writer column_formatters.to_array format.delimiter newline quote_characters.first quote_characters.second quote_behavior write_headers writer.write table.java_table ## PRIVATE diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index 5515399809fb..7511b9622268 100644 --- a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -110,16 +110,17 @@ type Delimited - keep_invalid_rows: Specifies whether rows that contain less or more columns than expected should be kept (setting the missing columns to `Nothing` or dropping the excess columns) or dropped. - - line_separator: Sets the line separator to use. Defaults to `Nothing` - which infers the separator used in a given file in read mode and uses - the system-default for writing. 
+ - line_endings: Sets the line ending style to use. Defaults to `Infer` - + when reading a file or appending to an existing file, the line endings + are detected from file contents; when writing a new file, the OS + defaults are used. - comment_character: Sets the character which indicates the start of a comment within a delimited file. Any content in the current line after the comment is ignored and lines that only consist of a comment are skipped. This option is only applicable for read mode and does not affect writing. It defaults to `Nothing` which means that comments are disabled. - type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Boolean|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) (line_separator:Text|Nothing=Nothing) (comment_character:Text|Nothing=Nothing) + type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Boolean|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) (line_endings:Line_Ending_Style=Infer) (comment_character:Text|Nothing=Nothing) ## Implements the `File.read` for this `File_Format` read : File -> Problem_Behavior -> Any @@ -135,8 +136,8 @@ type Delimited Clone the instance with some properties overridden. Note: This function is internal until such time as Atom cloning with modification is built into Enso. 
clone : Text->Text->(Boolean|Infer)->Data_Formatter->Boolean->(Text|Nothing)->(Text|Nothing)->Delimited - clone (quote_style=self.quote_style) (headers=self.headers) (value_formatter=self.value_formatter) (keep_invalid_rows=self.keep_invalid_rows) (line_separator=self.line_separator) (comment_character=self.comment_character) = - Delimited self.delimiter self.encoding self.skip_rows self.row_limit quote_style headers value_formatter keep_invalid_rows line_separator comment_character + clone (quote_style=self.quote_style) (headers=self.headers) (value_formatter=self.value_formatter) (keep_invalid_rows=self.keep_invalid_rows) (line_endings=self.line_endings) (comment_character=self.comment_character) = + Delimited self.delimiter self.encoding self.skip_rows self.row_limit quote_style headers value_formatter keep_invalid_rows line_endings comment_character ## Create a clone of this with specified quoting settings. with_quotes : Text->Text->Boolean->Delimited @@ -169,10 +170,10 @@ type Delimited without_parsing = self.clone value_formatter=Nothing - ## Creates a clone of this with a changed line separator. - with_line_separator : Text|Nothing -> Delimited - with_line_separator line_separator=System.default_line_separator = - self.clone line_separator=line_separator + ## Creates a clone of this with a changed line ending style. + with_line_endings : Line_Ending_Style -> Delimited + with_line_endings line_endings=Infer = + self.clone line_endings=line_endings ## Creates a clone of this with comment parsing enabled. 
with_comments : Text -> Delimited diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Line_Ending_Style.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Line_Ending_Style.enso new file mode 100644 index 000000000000..6a81af695f52 --- /dev/null +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/Line_Ending_Style.enso @@ -0,0 +1,29 @@ +from Standard.Base import Nothing +from Standard.Table.Io.File_Format import Infer + +## Specifies what line endings to use in a file format. +type Line_Ending_Style + ## The line ending style is chosen automatically. + + When reading a file or appending to an existing file, the line endings + are detected from file contents. When writing a new file, the OS defaults + are used. + Infer + + ## The UNIX line endings. + type Unix_Line_Endings + + ## The Windows line endings. + type Windows_Line_Endings + + ## The classic Mac OS line endings. Used for legacy applications, as modern + Mac OS uses the UNIX line endings. + type Classic_Mac_Line_Endings + +## PRIVATE +line_separator_sequence : Line_Ending_Style -> Text | Nothing +line_separator_sequence line_endings = case line_endings of + Unix_Line_Endings -> '\n' + Windows_Line_Endings -> '\r\n' + Classic_Mac_Line_Endings -> '\r' + Infer -> Nothing From 7cb9b86583a20b5bb7c9fb2244ad2b89895b36a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 12 Jul 2022 18:23:21 +0200 Subject: [PATCH 05/12] Implement reading tests --- .../Table/0.0.0-dev/src/Io/File_Format.enso | 10 ++-- .../org/enso/table/read/DelimitedReader.java | 1 + test/Table_Tests/data/comments.csv | 4 ++ test/Table_Tests/src/Delimited_Read_Spec.enso | 53 +++++++++++++++---- 4 files changed, 54 insertions(+), 14 deletions(-) create mode 100644 test/Table_Tests/data/comments.csv diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso index 7511b9622268..139cd8b42026 100644 --- 
a/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Io/File_Format.enso @@ -115,11 +115,11 @@ type Delimited are detected from file contents; when writing a new file, the OS defaults are used. - comment_character: Sets the character which indicates the start of a - comment within a delimited file. Any content in the current line after - the comment is ignored and lines that only consist of a comment are - skipped. This option is only applicable for read mode and does not - affect writing. It defaults to `Nothing` which means that comments are - disabled. + comment within a delimited file. Any line that begins with the comment + character is skipped. The comment character is treated as any other + character if it occurs anywhere but at the beginning of the line. This + option is only applicable for read mode and does not affect writing. It + defaults to `Nothing` which means that comments are disabled. type Delimited (delimiter:Text) (encoding:Encoding=Encoding.utf_8) (skip_rows:Integer=0) (row_limit:Integer|Nothing=Nothing) (quote_style:Quote_Style=Quote_Style.With_Quotes) (headers:Boolean|Infer=Infer) (value_formatter:Data_Formatter|Nothing=Data_Formatter) (keep_invalid_rows:Boolean=True) (line_endings:Line_Ending_Style=Infer) (comment_character:Text|Nothing=Nothing) ## Implements the `File.read` for this `File_Format` diff --git a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java index 1170a337da2d..5d9ca5436240 100644 --- a/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java +++ b/std-bits/table/src/main/java/org/enso/table/read/DelimitedReader.java @@ -168,6 +168,7 @@ private CsvParser setupCsvParser(Reader input, String commentCharacter) { if (newlineSetting.length() > 2 || newlineSetting.isEmpty()) { throw new IllegalArgumentException("The newline sequence should consist of at least 1 
and at most 2 characters (codepoints)."); } + settings.setLineSeparatorDetectionEnabled(false); format.setLineSeparator(newlineSetting); } diff --git a/test/Table_Tests/data/comments.csv b/test/Table_Tests/data/comments.csv new file mode 100644 index 000000000000..4d12899da089 --- /dev/null +++ b/test/Table_Tests/data/comments.csv @@ -0,0 +1,4 @@ +#,x,y +a,42,c # comment?? +;1,2,3 +5,6,7;comment? diff --git a/test/Table_Tests/src/Delimited_Read_Spec.enso b/test/Table_Tests/src/Delimited_Read_Spec.enso index 847d60c157d6..83a306898afb 100644 --- a/test/Table_Tests/src/Delimited_Read_Spec.enso +++ b/test/Table_Tests/src/Delimited_Read_Spec.enso @@ -5,11 +5,11 @@ from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encodi import Standard.Table import Standard.Table.Data.Column from Standard.Table.Errors import all - import Standard.Table.Io.File_Read from Standard.Table.Io.File_Format import Delimited from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter import Standard.Table.Io.Quote_Style +from Standard.Table.Io.Line_Ending_Style import all import Standard.Test import Standard.Test.Problems @@ -129,10 +129,35 @@ spec = 'a,b,c\nd,e,f\r1,2,3'.write (path 'mixed.csv') File.read (path 'mixed.csv') (Delimited "," headers=True value_formatter=Nothing) Problem_Behavior.Report_Error . 
should_fail_with Invalid_Row - # TODO add a test where mixed is read differently by overriding line separator - ['crlf.csv', 'lf.csv', 'cr.csv', 'mixed.csv'].each (path >> .delete) + Test.specify "should allow to override line endings style" <| + file = enso_project.data / "transient" / "lf.csv" + lines = ['a,b,c', 'd,e,f', '1,2,3'] + text = lines.join '\n' + text.write file + + format = Delimited ',' headers=False value_formatter=(Data_Formatter trim_values=False) + + reference_table = Table.new [["Column_1", ["a", "d", "1"]], ["Column_2", ["b", "e", "2"]], ["Column_3", ["c", "f", "3"]]] + collapsed_table = Table.new <| + ['a', 'b', 'c\nd', 'e', 'f\n1', 2, 3].map_with_index i-> v-> + ["Column_" + (i+1).to_text, [v]] + File.read file format . should_equal reference_table + File.read file (format.with_line_endings Unix_Line_Endings) . should_equal reference_table + File.read file (format.with_line_endings Classic_Mac_Line_Endings) . should_equal collapsed_table + File.read file (format.with_line_endings Windows_Line_Endings) . should_equal collapsed_table + file.delete + + file_2 = enso_project.data / "transient" / "crlf.csv" + lines.join '\r\n' . write file_2 + File.read file_2 (format.with_line_endings Windows_Line_Endings) . should_equal reference_table + + # For some reason loading the CRLF file in Unix mode trims the CR characters. We may want to revisit this at some point. + table = File.read file_2 (format.with_line_endings Unix_Line_Endings) + table . should_equal reference_table + file_2.delete + Test.specify "should work with Windows-1252 encoding" <| table = File.read (enso_project.data / "windows.csv") (Delimited "," headers=True encoding=Encoding.windows_1252) Problem_Behavior.Report_Error table.columns.map .name . should_equal ['a', 'b', 'c'] @@ -339,6 +364,18 @@ spec = t2.at "a" . to_vector . should_equal [1, 3] t2.at "b" . to_vector . 
should_equal [2, 4] + Test.specify "should be able to read column names starting with #" <| + reference_table = Table.new [["#", ["a", ";1", "5"]], ["x", [42, 2, 6]], ["y", ["c # comment??", "3", "7;comment?"]]] + table = File.read (enso_project.data / "comments.csv") + table.should_equal reference_table + + Test.specify "should be able to handle comments if enabled" <| + table_hash = Table.new [["a", [";1", "5"]], ["42", [2, 6]], ["c # comment??", ["3", "7;comment?"]]] + table_semicolon = Table.new [["#", ["a", "5"]], ["x", [42, 6]], ["y", ["c # comment??", "7;comment?"]]] + + File.read (enso_project.data / "comments.csv") (Delimited ',' . with_comments . with_headers) . should_equal table_hash + File.read (enso_project.data / "comments.csv") (Delimited ',' . with_comments ';' . with_headers) . should_equal table_semicolon + Test.specify "should allow to build the Delimited configuration using builders" <| Delimited "," . clone . should_equal (Delimited ",") Delimited "," encoding=Encoding.ascii skip_rows=123 row_limit=100 headers=False value_formatter=Nothing . clone . should_equal (Delimited "," headers=False value_formatter=Nothing skip_rows=123 row_limit=100 encoding=Encoding.ascii) @@ -359,11 +396,9 @@ spec = Delimited ',' . with_parsing custom_formatter . should_equal (Delimited ',' value_formatter=custom_formatter) Delimited ',' row_limit=456 . without_parsing . should_equal (Delimited ',' value_formatter=Nothing row_limit=456) - Test.specify "should be able to read column names starting with #" <| - Nothing - - Test.specify "should be able to handle comments if enabled" <| - # TODO check # and other comment style; check both ignored whole lines and parts of lines - Nothing + Delimited ',' . with_comments . should_equal (Delimited ',' comment_character='#') + Delimited ',' . with_comments ';' . should_equal (Delimited ',' comment_character=';') + Delimited ',' comment_character='#' . without_comments . 
should_equal (Delimited ',' comment_character=Nothing) + Delimited ',' . with_line_endings Unix_Line_Endings . should_equal (Delimited ',' line_endings=Unix_Line_Endings) main = Test.Suite.run_main spec From a515b2aefca100d17e6ae5b25d158a8643a5cdab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 12 Jul 2022 18:48:03 +0200 Subject: [PATCH 06/12] First Write tests --- .../Base/0.0.0-dev/src/Data/Vector.enso | 2 +- test/Table_Tests/src/Delimited_Write_Spec.enso | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso index 6a3b0a468718..77dd89278c98 100644 --- a/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso +++ b/distribution/lib/Standard/Base/0.0.0-dev/src/Data/Vector.enso @@ -631,7 +631,7 @@ type Vector Join the elements of the vector together as a string. ["foo", "bar", "baz"].join ", " - join : Text -> Text + join : Text -> Text -> Text -> Text join separator="" prefix="" suffix="" = if self.is_empty then prefix+suffix else if self.length == 1 then prefix + self.unsafe_at 0 + suffix else diff --git a/test/Table_Tests/src/Delimited_Write_Spec.enso b/test/Table_Tests/src/Delimited_Write_Spec.enso index e809a508f374..62bd5477668a 100644 --- a/test/Table_Tests/src/Delimited_Write_Spec.enso +++ b/test/Table_Tests/src/Delimited_Write_Spec.enso @@ -4,6 +4,7 @@ import Standard.Base.System.File.Existing_File_Behavior from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error import Standard.Base.Data.Time.Date import Standard.Base.Data.Time.Time_Of_Day +import Standard.Base.System import Standard.Table import Standard.Table.Data.Column @@ -12,6 +13,7 @@ import Standard.Table.Io.File_Read from Standard.Table.Io.File_Format import Delimited from Standard.Table.Data.Data_Formatter as Data_Formatter_Module import Data_Formatter import 
Standard.Table.Io.Quote_Style +from Standard.Table.Io.Line_Ending_Style import all import Standard.Table.Data.Match_Columns import Standard.Table.Data.Column_Name_Mapping from Standard.Table.Errors as Table_Errors import Column_Count_Mismatch, Column_Name_Mismatch @@ -40,13 +42,21 @@ spec = 2,1.5,y,2 3,2.2,z,[[[My Type :: 10]]] text = File.read_text file - # TODO check that system default is used - text.should_equal expected_text+'\n' + separator = if System.os == "windows" then '\r\n' else '\n' + text.should_equal (expected_text.lines.join separator suffix=separator) file.delete Test.specify "should allow to specify line ending style" <| - # TODO check manual overrides of all 3 styles - Nothing + table = Table.new [["a", ["b", "c"]], ["d", ["e", "f"]]] + lines = ["a,d", "b,e", "c,f"] + [[Unix_Line_Endings, '\n'], [Windows_Line_Endings, '\r\n'], [Classic_Mac_Line_Endings, '\r']].each setting-> + style=setting.first + separator=setting.second + file = (enso_project.data / "transient" / "endings.csv") + table.write file (File_Format.Delimited ',' line_endings=style) + text = File.read_text file + text.should_equal (lines.join separator suffix=separator) + file.delete Test.specify "should be able to write an empty table" <| table = Table.new [] From 2c584efef846ffb42872ab095f01fdabe0b2402a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Tue, 12 Jul 2022 19:02:06 +0200 Subject: [PATCH 07/12] more write tests --- .../Table_Tests/src/Delimited_Write_Spec.enso | 42 +++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/test/Table_Tests/src/Delimited_Write_Spec.enso b/test/Table_Tests/src/Delimited_Write_Spec.enso index 62bd5477668a..a486f60bf245 100644 --- a/test/Table_Tests/src/Delimited_Write_Spec.enso +++ b/test/Table_Tests/src/Delimited_Write_Spec.enso @@ -30,6 +30,8 @@ type My_Type to_text = "[[[My Type :: " + self.x.to_text + "]]]" spec = + line_ending_pairs = [[Unix_Line_Endings, '\n'], [Windows_Line_Endings, 
'\r\n'], [Classic_Mac_Line_Endings, '\r']] + system_separator = if System.os == "windows" then '\r\n' else '\n' Test.group "Delimited File Writing" <| Test.specify "should correctly write a simple table" <| table = Table.new [["A", [1,2,3]], ["B", [1.0,1.5,2.2]], ["C", ["x","y","z"]], ["D", ["a", 2, My_Type 10]]] @@ -42,14 +44,13 @@ spec = 2,1.5,y,2 3,2.2,z,[[[My Type :: 10]]] text = File.read_text file - separator = if System.os == "windows" then '\r\n' else '\n' - text.should_equal (expected_text.lines.join separator suffix=separator) + text.should_equal (expected_text.lines.join system_separator suffix=system_separator) file.delete Test.specify "should allow to specify line ending style" <| table = Table.new [["a", ["b", "c"]], ["d", ["e", "f"]]] lines = ["a,d", "b,e", "c,f"] - [[Unix_Line_Endings, '\n'], [Windows_Line_Endings, '\r\n'], [Classic_Mac_Line_Endings, '\r']].each setting-> + line_ending_pairs.each setting-> style=setting.first separator=setting.second file = (enso_project.data / "transient" / "endings.csv") @@ -339,24 +340,41 @@ spec = file.delete - Test.specify "should be able to correctly append to a file that contains no actual data but only commented out lines" <| - # TODO - Nothing - Test.specify "should use the same line ending style as existing data when appending" <| - # TODO check all 3 styles - initial manually set, appending autodetected - Nothing + initial_table = Table.new [["a", [1, 2]], ["d", ["e", "f"]]] + table_to_append = Table.new [["a", ["x", "y"]], ["d", ["z", "w"]]] + expected_lines = ["a,d", "1,e", "2,f", "x,z", "y,w"] + line_ending_pairs.each setting-> + style=setting.first + separator=setting.second + file = (enso_project.data / "transient" / "endings.csv") + initial_table.write file (File_Format.Delimited ',' line_endings=style) + table_to_append.write file on_existing_file=Existing_File_Behavior.Append . 
should_equal Nothing + text = File.read_text file + text.should_equal (expected_lines.join separator suffix=separator) + file.delete Test.specify "should use the system default line ending style when appending to an empty or nonexistent file" <| - # TODO - Nothing + empty_file = (enso_project.data / "transient" / "empty.csv") + "".write empty_file + nonexistent_file = (enso_project.data / "transient" / "nonexistent.csv") + nonexistent_file.delete_if_exists + + table_to_append = Table.new [["a", ["x", "y"]], ["d", ["z", "w"]]] + table_to_append.write nonexistent_file on_existing_file=Existing_File_Behavior.Append + table_to_append.write empty_file on_existing_file=Existing_File_Behavior.Append + + expected_lines = ["a,d", "x,z", "y,w"] + expected_text = (expected_lines.join system_separator suffix=system_separator) + File.read_text empty_file . should_equal expected_text + File.read_text nonexistent_file . should_equal expected_text Test.specify "should use the existing line ending style when appending to a file consisting of only comments" <| # TODO Nothing Test.specify "should use the explicitly specified line ending style regardless of the detected one" <| - # TOOD write in one style than append in other + # TODO write in one style than append in other Nothing main = Test.Suite.run_main spec From a322c16f65c5dd6ea04c6360e6ef280100ea5b19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 13 Jul 2022 13:54:03 +0200 Subject: [PATCH 08/12] More write tests, amend how line ending mismatch on append is handled --- .../src/Internal/Delimited_Writer.enso | 14 +++++++-- .../Table_Tests/src/Delimited_Write_Spec.enso | 29 +++++++++++++++---- 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso index 84aadfcb471f..fb64466b816c 100644 --- 
a/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso +++ b/distribution/lib/Standard/Table/0.0.0-dev/src/Internal/Delimited_Writer.enso @@ -50,9 +50,19 @@ write_file table format file on_existing_file match_columns on_problems = If the file does not exist or is empty, it acts like a regular overwrite. append_to_file : Table -> File_Format.Delimited -> File -> Match_Columns -> Problem_Behavior -> Any append_to_file table format file match_columns on_problems = - Column_Name_Mismatch.handle_java_exception <| Column_Count_Mismatch.handle_java_exception <| - metadata = Delimited_Reader.detect_metadata file format + Column_Name_Mismatch.handle_java_exception <| Column_Count_Mismatch.handle_java_exception <| Panic.recover Illegal_Argument_Error <| + inferring_format = format.with_line_endings Infer + metadata = Delimited_Reader.detect_metadata file inferring_format preexisting_headers = metadata.headers + case format.line_endings of + Infer -> Nothing + other_ending_style -> + selected_separator = line_separator_sequence other_ending_style + existing_separator = metadata.line_separator + if existing_separator.is_nothing.not && (selected_separator != existing_separator) then + Panic.throw <| Illegal_Argument_Error <| + # Ensure that these are properly escaped once `to_text` meaning is changed. + "The explicitly provided line endings (" + selected_separator.to_text + ") do not match the line endings in the file (" + existing_separator.to_text + ")." effective_line_separator = metadata.line_separator reordered_java_table = case preexisting_headers of Nothing -> table.java_table diff --git a/test/Table_Tests/src/Delimited_Write_Spec.enso b/test/Table_Tests/src/Delimited_Write_Spec.enso index a486f60bf245..239ad85aef2d 100644 --- a/test/Table_Tests/src/Delimited_Write_Spec.enso +++ b/test/Table_Tests/src/Delimited_Write_Spec.enso @@ -370,11 +370,30 @@ spec = File.read_text nonexistent_file . 
should_equal expected_text Test.specify "should use the existing line ending style when appending to a file consisting of only comments" <| - # TODO - Nothing + initial_lines = ["# comment 1", "# comment 2"] + table_to_append = Table.new [["a", ["x", "y"]], ["b", ["z", "w"]]] + expected_lines = initial_lines + ["a,b", "x,z", "y,w"] + file = (enso_project.data / "transient" / "endings_comments_only.csv") + line_ending_pairs.each setting-> + separator=setting.second + file.delete_if_exists + (initial_lines.join separator suffix=separator).write file + format = File_Format.Delimited ',' . with_comments + table_to_append.write file format on_existing_file=Existing_File_Behavior.Append . should_equal Nothing + text = File.read_text file + expected_text = expected_lines.join separator suffix=separator + text.should_equal expected_text + file.delete - Test.specify "should use the explicitly specified line ending style regardless of the detected one" <| - # TODO write in one style than append in other - Nothing + Test.specify "should fail if explicitly provided line endings do not match line endings in the file when appending" <| + initial_table = Table.new [["a", [1, 2]]] + table_to_append = Table.new [["a", ["x", "y"]]] + file = (enso_project.data / "transient" / "endings_mismatch.csv") + file.delete_if_exists + initial_table.write file (File_Format.Delimited ',' line_endings=Classic_Mac_Line_Endings) + result = table_to_append.write file (File_Format.Delimited ',' line_endings=Unix_Line_Endings) on_existing_file=Existing_File_Behavior.Append match_columns=Match_Columns.By_Position + result . should_fail_with Illegal_Argument_Error + result.catch.message . should_equal "The explicitly provided line endings ('\n') do not match the line endings in the file ('\r')." 
+ file.delete main = Test.Suite.run_main spec From 11cec0fa451b2325ea58461023c76d7c68ae1d71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 13 Jul 2022 16:12:57 +0200 Subject: [PATCH 09/12] changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5cfec9bbd178..fd6906d5a0bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -157,6 +157,8 @@ - [Allow filtering caught error type in `Error.catch`.][3574] - [Implemented `Append` mode for `File_Format.Delimited`.][3573] - [Added `Vector.write_bytes` function and removed old `File.write_bytes`][3583] +- [Added `line_endings` and `comment_character` options to + `File_Format.Delimited`.][3581] [debug-shortcuts]: https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug @@ -250,6 +252,7 @@ [3574]: https://github.com/enso-org/enso/pull/3574 [3573]: https://github.com/enso-org/enso/pull/3573 [3583]: https://github.com/enso-org/enso/pull/3583 +[3581]: https://github.com/enso-org/enso/pull/3581 #### Enso Compiler From 6c49311ae561f54e4ca8f95c7540dc49b9c1b555 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Wed, 13 Jul 2022 21:44:11 +0200 Subject: [PATCH 10/12] Normalize line endings in tests to platform default --- test/Table_Tests/src/Csv_Spec.enso | 18 ++++---- .../Table_Tests/src/Delimited_Write_Spec.enso | 42 +++++++++---------- test/Table_Tests/src/Util.enso | 6 +++ 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/test/Table_Tests/src/Csv_Spec.enso b/test/Table_Tests/src/Csv_Spec.enso index 6f58e5a8f578..48bd8af5cd1a 100644 --- a/test/Table_Tests/src/Csv_Spec.enso +++ b/test/Table_Tests/src/Csv_Spec.enso @@ -5,7 +5,7 @@ import Standard.Table.Data.Column from Standard.Table.Data.Column_Selector as Column_Selector_Module import By_Index import Standard.Table.Io.File_Format import Standard.Test -import project.Util +from project.Util import all spec = c_1 = ["a", ["1", 
"4", "7", "10"]] @@ -49,7 +49,7 @@ spec = Test.specify 'should properly serialize simple tables' <| varied_column = (enso_project.data / "varied_column.csv") . read res = Text.from varied_column format=(File_Format.Delimited ",") - exp = ''' + exp = normalize_lines <| ''' Column_1,Column_2,Column_3,Column_4,Column_5,Column_6 2005-02-25,2005-02-25,1,1,1.0,1 2005-02-28,2005-02-28,2,2,2.0,2 @@ -58,7 +58,7 @@ spec = ,2005-03-03,5,5,5.0,5 2005-03-04,2005-03-04,,6,6.25,6.25 2005-03-07,2005-03-07,7,7,7.0,7 - 2005-03-08,2005-03-08,8,8,8.0,osiem\n + 2005-03-08,2005-03-08,8,8,8.0,osiem res.should_equal exp Test.specify 'should properly handle quoting of records and allow specifying separators' <| @@ -66,28 +66,28 @@ spec = c2 = ['grade', [10, 20, 'hello;world']] t = Table.new [c1, c2] - expected = """ + expected = normalize_lines <| """ name;grade "Robert"");DROP TABLE Students;--";10 "This;Name;;Is""""Strange";20 Marcin,,;"hello;world" res = Text.from t format=(File_Format.Delimited ";") - res.should_equal expected+'\n' + res.should_equal expected Test.specify 'should allow forced quoting of records' c1 = ['name', ['Robert");DROP TABLE Students;--', 'This;Name;;Is""Strange', 'Marcin,,']] c2 = ['grade', [10, 20, 'hello;world']] t = Table.new [c1, c2] - expected = """ + expected = normalize_lines <| """ "name","grade" "Robert"");DROP TABLE Students;--",10 "This;Name;;Is""""Strange",20 "Marcin,,","hello;world" res = Text.from t format=(File_Format.Delimited "," . 
with_quotes always_quote=True) - res.should_equal expected+'\n' + res.should_equal expected Test.specify 'should write CSV to a file' <| @@ -95,7 +95,7 @@ spec = out = enso_project.data / 'out.csv' out.delete_if_exists varied_column.write out - exp = ''' + exp = normalize_lines <| ''' Column_1,Column_2,Column_3,Column_4,Column_5,Column_6 2005-02-25,2005-02-25,1,1,1.0,1 2005-02-28,2005-02-28,2,2,2.0,2 @@ -104,7 +104,7 @@ spec = ,2005-03-03,5,5,5.0,5 2005-03-04,2005-03-04,,6,6.25,6.25 2005-03-07,2005-03-07,7,7,7.0,7 - 2005-03-08,2005-03-08,8,8,8.0,osiem\n + 2005-03-08,2005-03-08,8,8,8.0,osiem out.read_text.should_equal exp out.delete_if_exists diff --git a/test/Table_Tests/src/Delimited_Write_Spec.enso b/test/Table_Tests/src/Delimited_Write_Spec.enso index 239ad85aef2d..dcc4baea3ec5 100644 --- a/test/Table_Tests/src/Delimited_Write_Spec.enso +++ b/test/Table_Tests/src/Delimited_Write_Spec.enso @@ -21,7 +21,7 @@ from Standard.Table.Errors as Table_Errors import Column_Count_Mismatch, Column_ import Standard.Test import Standard.Test.Problems -import project.Util +from project.Util import all type My_Type type My_Type x @@ -38,13 +38,13 @@ spec = file = (enso_project.data / "transient" / "written.csv") file.delete_if_exists table.write file - expected_text = """ + expected_text = normalize_lines <| """ A,B,C,D 1,1.0,x,a 2,1.5,y,2 3,2.2,z,[[[My Type :: 10]]] text = File.read_text file - text.should_equal (expected_text.lines.join system_separator suffix=system_separator) + text.should_equal expected_text file.delete Test.specify "should allow to specify line ending style" <| @@ -74,14 +74,14 @@ spec = file = (enso_project.data / "transient" / "quotes1.csv") file.delete_if_exists table.write file (File_Format.Delimited "," value_formatter=data_formatter) - expected_text = """ + expected_text = normalize_lines <| """ "The Column ""Name""","Hello, Column?" 
foo,"1,0" 'bar',"1000000,5" """baz""","2,2" "one, two, three","-1,5" text = File.read_text file - text.should_equal expected_text+'\n' + text.should_equal expected_text file.delete Test.specify 'should quote values that contain the delimiter or quotes, in the [;\\\"] variant' <| @@ -90,7 +90,7 @@ spec = file = (enso_project.data / "transient" / "quotes2.csv") file.delete_if_exists table.write file (File_Format.Delimited ";" value_formatter=data_formatter . with_quotes quote='"' quote_escape='\\') - expected_text = """ + expected_text = normalize_lines <| """ "\"A\"";B foo;1'000'000.5 "!\"baz\" ";1'000.0 @@ -98,7 +98,7 @@ spec = "a;b; c ";-1.2 "a\\b"; text = File.read_text file - text.should_equal expected_text+'\n' + text.should_equal expected_text file.delete Test.specify "should quote values that contain the delimiter or quotes, in the [\t''] variant" <| @@ -107,14 +107,14 @@ spec = file = (enso_project.data / "transient" / "quotes3.csv") file.delete_if_exists table.write file (File_Format.Delimited '\t' value_formatter=data_formatter . 
with_quotes quote='\'' quote_escape='\'') - expected_text = ''' + expected_text = normalize_lines <| ''' "A"\tB\\C \t'1''000''000.5' 'The ''thing''.'\t'1''000.0' one, "two", three\t 'a\tb'\t-1.2 text = File.read_text file - text.should_equal expected_text+'\n' + text.should_equal expected_text file.delete Test.specify "should correctly distinguish empty text from a missing value" <| @@ -122,13 +122,13 @@ spec = file = (enso_project.data / "transient" / "empty_vs_null.csv") file.delete_if_exists table.write file - expected_text = """ + expected_text = normalize_lines <| """ A,B 1, ,"" 3,abc text = File.read_text file - text.should_equal expected_text+'\n' + text.should_equal expected_text file.delete Test.specify 'should not quote values if quoting is disabled' <| @@ -137,14 +137,14 @@ spec = file = (enso_project.data / "transient" / "quote_disabled.csv") file.delete_if_exists table.write file format - expected_text = """ + expected_text = normalize_lines <| """ The Column "Name",Hello, Column? 
foo,1,0 'bar',1000000,5 "baz",2,2 one, two, three,-1,5 text = File.read_text file - text.should_equal expected_text+'\n' + text.should_equal expected_text file.delete Test.specify 'should allow to always quote text and custom values, but for non-text primitves only if absolutely necessary' <| @@ -153,14 +153,14 @@ spec = file = (enso_project.data / "transient" / "quote_always.csv") file.delete_if_exists table.write file format - expected_text = """ + expected_text = normalize_lines <| """ "The Column \"Name\"","B","C","D","E" "foo",1.0,"foo",1, "'bar'","1\"000\"000.5","[[[My Type :: 44]]]",2,13:55:00 "\"baz\"",2.2,"Tue, 21 Jun 2022",3, "one, two, three",-1.5,42,"4\"000", text = File.read_text file - text.should_equal expected_text+'\n' + text.should_equal expected_text file.delete Test.specify "should correctly handle alternative encodings" <| @@ -168,11 +168,11 @@ spec = file = (enso_project.data / "transient" / "utf16.csv") file.delete_if_exists table.write file (File_Format.Delimited "," encoding=Encoding.utf_16_be) - expected_text = """ + expected_text = normalize_lines <| """ ąęćś,ß 0,żółw 🐢 text = File.read_text file encoding=Encoding.utf_16_be - text.should_equal expected_text+'\n' + text.should_equal expected_text file.delete Test.specify "should correctly handle encoding errors" <| @@ -180,12 +180,12 @@ spec = file = (enso_project.data / "transient" / "ascii.csv") file.delete_if_exists result = table.write file (File_Format.Delimited "," encoding=Encoding.ascii) - expected_text = """ + expected_text = normalize_lines <| """ A,B 0,s??wka 1,? text = File.read_text file encoding=Encoding.ascii - text.should_equal expected_text+'\n' + text.should_equal expected_text result . should_equal Nothing Warning.get_all result . map .value . 
should_equal [Encoding_Error "Encoding issues at characters 7, 8, 15."] file.delete @@ -196,12 +196,12 @@ spec = file_1 = (enso_project.data / "transient" / "textonly.csv") file_1.delete_if_exists result_1 = table_1.write file_1 format - expected_text = """ + expected_text = normalize_lines <| """ A,B x,z y,w text_1 = File.read_text file_1 - text_1.should_equal expected_text+'\n' + text_1.should_equal expected_text result_1 . should_equal Nothing table_2 = Table.new [["A", [1, 2]], ["B", ["z", "w"]]] diff --git a/test/Table_Tests/src/Util.enso b/test/Table_Tests/src/Util.enso index 3c894f906bcc..78b9fe3b5505 100644 --- a/test/Table_Tests/src/Util.enso +++ b/test/Table_Tests/src/Util.enso @@ -1,5 +1,6 @@ from Standard.Base import all +import Standard.Base.System import Standard.Table import Standard.Test @@ -8,3 +9,8 @@ Table.Table.should_equal expected = that_cols = expected.columns self_cols.map .name . should_equal (that_cols.map .name) frames_to_skip=1 self_cols.map .to_vector . should_equal (that_cols.map .to_vector) frames_to_skip=1 + +normalize_lines string line_separator=System.default_line_separator newline_at_end=True = + case newline_at_end of + True -> string.lines.join line_separator suffix=line_separator + False -> string.lines.join line_separator From e863a0c6059c8b72bd56f2bc5ce4179ad7da40f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 14 Jul 2022 08:51:39 +0200 Subject: [PATCH 11/12] Fix warning offsets due to variable line separator length --- test/Table_Tests/src/Delimited_Write_Spec.enso | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/Table_Tests/src/Delimited_Write_Spec.enso b/test/Table_Tests/src/Delimited_Write_Spec.enso index dcc4baea3ec5..ac3839849aa5 100644 --- a/test/Table_Tests/src/Delimited_Write_Spec.enso +++ b/test/Table_Tests/src/Delimited_Write_Spec.enso @@ -187,7 +187,11 @@ spec = text = File.read_text file encoding=Encoding.ascii text.should_equal expected_text result . 
should_equal Nothing - Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 7, 8, 15."] + sep_length = System.default_line_separator.length + positions = [6 + sep_length, 7 + sep_length, 13 + 2*sep_length] + msg = "Encoding issues at characters " + + positions.map .to_text . join separator=", " suffix="." + Warning.get_all result . map .value . should_equal [Encoding_Error msg] file.delete Test.specify "should allow only text columns if no formatter is specified" <| From b6a9ca9cdfdcb2edab27993d0f16539dbdb0b6e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rados=C5=82aw=20Wa=C5=9Bko?= Date: Thu, 14 Jul 2022 10:14:34 +0200 Subject: [PATCH 12/12] fix codepoint alignment of warning offsets --- .../java/org/enso/base/encoding/ReportingStreamEncoder.java | 4 ++-- test/Table_Tests/src/Delimited_Write_Spec.enso | 4 ++-- test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamEncoder.java b/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamEncoder.java index 7260d40002aa..0c8791d37d7f 100644 --- a/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamEncoder.java +++ b/std-bits/base/src/main/java/org/enso/base/encoding/ReportingStreamEncoder.java @@ -149,13 +149,13 @@ public List getReportedProblems() { return List.of(); } else { if (encodingIssuePositions.size() == 1) { - return List.of("Encoding issues at character " + encodingIssuePositions.get(0) + "."); + return List.of("Encoding issues at codepoint " + encodingIssuePositions.get(0) + "."); } String issues = encodingIssuePositions.stream() .map(String::valueOf) - .collect(Collectors.joining(", ", "Encoding issues at characters ", ".")); + .collect(Collectors.joining(", ", "Encoding issues at codepoints ", ".")); return List.of(issues); } } diff --git a/test/Table_Tests/src/Delimited_Write_Spec.enso 
b/test/Table_Tests/src/Delimited_Write_Spec.enso index ac3839849aa5..c79c5d094263 100644 --- a/test/Table_Tests/src/Delimited_Write_Spec.enso +++ b/test/Table_Tests/src/Delimited_Write_Spec.enso @@ -187,9 +187,9 @@ spec = text = File.read_text file encoding=Encoding.ascii text.should_equal expected_text result . should_equal Nothing - sep_length = System.default_line_separator.length + sep_length = System.default_line_separator.codepoints.length positions = [6 + sep_length, 7 + sep_length, 13 + 2*sep_length] - msg = "Encoding issues at characters " + + msg = "Encoding issues at codepoints " + positions.map .to_text . join separator=", " suffix="." Warning.get_all result . map .value . should_equal [Encoding_Error msg] file.delete diff --git a/test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso b/test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso index b28055fe941f..edf510fbcd3a 100644 --- a/test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso +++ b/test/Tests/src/System/Reporting_Stream_Encoder_Spec.enso @@ -63,7 +63,7 @@ spec = stream.with_stream_encoder encoding Problem_Behavior.Report_Warning reporting_stream_encoder-> reporting_stream_encoder.write contents result . should_equal Nothing - Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at characters 1, 3."] + Warning.get_all result . map .value . should_equal [Encoding_Error "Encoding issues at codepoints 1, 3."] f.read_text encoding . should_equal "S?o?wka!" f.delete_if_exists @@ -76,7 +76,7 @@ spec = reporting_stream_encoder.write "bar" result_2 . should_equal Nothing - Warning.get_all result_2 . map .value . should_equal [Encoding_Error "Encoding issues at characters 3, 9."] + Warning.get_all result_2 . map .value . should_equal [Encoding_Error "Encoding issues at codepoints 3, 9."] f.read_text encoding . should_equal "ABC?foo -?- bar" Test.specify "should work correctly if no data is written to it" <|