Skip to content

Commit

Permalink
Custom Encoding support when writing Delimited files (#3564)
Browse files Browse the repository at this point in the history
  • Loading branch information
radeusgd authored Jul 7, 2022
1 parent d950499 commit 7c94fa6
Show file tree
Hide file tree
Showing 11 changed files with 482 additions and 121 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@
- [Added `File_Format.Delimited` support to `Table.write` for new files.][3528]
- [Adjusted `Database.connect` API to new design.][3542]
- [Added `File_Format.Excel` support to `Table.write` for new files.][3551]
- [Added support for custom encodings in `File_Format.Delimited` writing.][3564]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -232,6 +233,7 @@
[3528]: https://github.com/enso-org/enso/pull/3528
[3542]: https://github.com/enso-org/enso/pull/3542
[3551]: https://github.com/enso-org/enso/pull/3551
[3564]: https://github.com/enso-org/enso/pull/3564
[3552]: https://github.com/enso-org/enso/pull/3552

#### Enso Compiler
Expand Down
13 changes: 5 additions & 8 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,18 @@ import sbt.Keys.{libraryDependencies, scalacOptions}
import sbt.addCompilerPlugin
import sbt.complete.DefaultParsers._
import sbt.complete.Parser
import sbtcrossproject.CrossPlugin.autoImport.{crossProject, CrossType}
import src.main.scala.licenses.{
DistributionDescription,
SBTDistributionComponent
}
import sbtcrossproject.CrossPlugin.autoImport.{CrossType, crossProject}
import src.main.scala.licenses.{DistributionDescription, SBTDistributionComponent}

import java.io.File

// ============================================================================
// === Global Configuration ===================================================
// ============================================================================

val scalacVersion = "2.13.7"
val graalVersion = "21.3.0"
val javaVersion = "11"
val scalacVersion = "2.13.7"
val graalVersion = "21.3.0"
val javaVersion = "11"
val defaultDevEnsoVersion = "0.0.0-dev"
val ensoVersion = sys.env.getOrElse(
"ENSO_VERSION",
Expand Down
29 changes: 28 additions & 1 deletion distribution/lib/Standard/Base/0.0.0-dev/src/System/File.enso
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@ from Standard.Base import all

import Standard.Base.System.File.Option
import Standard.Base.System.File.Existing_File_Behavior
import Standard.Base.Error.Problem_Behavior
import Standard.Base.Data.Text.Matching_Mode
import Standard.Base.Data.Text.Text_Sub_Range
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding
from Standard.Base.Data.Text.Encoding as Encoding_Module import Encoding, Encoding_Error
from Standard.Base.Error.Problem_Behavior as Problem_Behavior_Module import Problem_Behavior, Report_Warning
from Standard.Base.Runtime.Resource import all

export Standard.Base.System.File.Option

polyglot java import org.enso.base.Encoding_Utils
polyglot java import java.io.InputStream as Java_Input_Stream
polyglot java import java.io.OutputStream as Java_Output_Stream
polyglot java import java.io.IOException
Expand Down Expand Up @@ -781,6 +783,22 @@ type Output_Stream
with_java_stream : (Java_Output_Stream -> Any) -> Any
with_java_stream f = self.stream_resource . with f

## PRIVATE
   Executes `action` with a `ReportingStreamEncoder` that encodes data to
   this output stream using the given `encoding`. Encoding problems collected
   by the encoder are attached to the result according to `on_problems`.
with_stream_encoder : Encoding -> Problem_Behavior -> (ReportingStreamEncoder -> Any) -> Any
with_stream_encoder encoding on_problems action = self.with_java_stream java_stream->
    ## Warnings raised by the `bytes` call are deliberately ignored. If the
       Unicode replacement character `�` cannot be represented in the
       target encoding, `bytes` substitutes the plain `?` sign, which
       should be available in all encodings. That is exactly the fallback
       we want: `�` when available and `?` otherwise.
    replacement_bytes = Encoding_Utils.INVALID_CHARACTER.bytes encoding on_problems=Problem_Behavior.Ignore
    charset = encoding.to_java_charset
    outcome = Encoding_Utils.with_stream_encoder java_stream charset replacement_bytes.to_array action
    encoding_problems = Vector.Vector outcome.problems . map Encoding_Error
    on_problems.attach_problems_after outcome.result encoding_problems

## An input stream, allowing for interactive reading of contents from an open
file.
Expand Down Expand Up @@ -906,6 +924,15 @@ type Input_Stream
with_java_stream : (Java_Input_Stream -> Any) -> Any
with_java_stream f = self.stream_resource . with f

## PRIVATE
   Executes `action` with a `ReportingStreamDecoder` that decodes data read
   from this input stream using the given `encoding`. Decoding problems
   collected by the decoder are attached to the result according to
   `on_problems`.
with_stream_decoder : Encoding -> Problem_Behavior -> (ReportingStreamDecoder -> Any) -> Any
with_stream_decoder encoding on_problems action = self.with_java_stream java_stream->
    charset = encoding.to_java_charset
    outcome = Encoding_Utils.with_stream_decoder java_stream charset action
    decoding_problems = Vector.Vector outcome.problems . map Encoding_Error
    on_problems.attach_problems_after outcome.result decoding_problems

## PRIVATE

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ read_file format file on_problems =
exceptions), we can catch the exception indicating the limit has been
reached and restart parsing with an increased limit.
file.with_input_stream [File.Option.Read] stream->
stream.with_java_stream java_stream->
here.read_stream format java_stream on_problems related_file=file
here.read_stream format stream on_problems related_file=file

read_text : Text -> Delimited -> Problem_Behavior -> Table
read_text text format on_problems =
Expand All @@ -57,7 +56,7 @@ read_text text format on_problems =

Arguments:
- format: The specification of the delimited file format.
- java_stream: A Java `InputStream` used as the data source.
- stream: An `Input_Stream` to be used as the data source.
- on_problems: Specifies the behavior when a problem occurs during the
operation. By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
Expand All @@ -67,17 +66,14 @@ read_text text format on_problems =
integer.
- related_file: The file related to the provided `stream`, if available,
or `Nothing`. It is used for more detailed error reporting.
read_stream : Delimited -> InputStream -> Problem_Behavior -> Integer -> File | Nothing -> Any
read_stream format java_stream on_problems max_columns=4096 related_file=Nothing =
read_stream : Delimited -> Input_Stream -> Problem_Behavior -> Integer -> File | Nothing -> Any
read_stream format stream on_problems max_columns=4096 related_file=Nothing =
handle_io_exception ~action = Panic.catch IOException action caught_panic->
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)

java_charset = format.encoding.to_java_charset
handle_io_exception <|
Encoding_Utils.with_stream_decoder java_stream java_charset reporting_stream_decoder->
result = here.read_from_reader format reporting_stream_decoder on_problems max_columns
decoding_problems = Vector.Vector reporting_stream_decoder.getReportedProblems . map Encoding_Error
on_problems.attach_problems_after result decoding_problems
stream.with_stream_decoder format.encoding on_problems reporting_stream_decoder->
here.read_from_reader format reporting_stream_decoder on_problems max_columns

## PRIVATE
Reads data from the provided `Reader` according to the provided format.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ write_file table format file on_existing_file on_problems =
Errors.unimplemented "Appending to an existing File_Format.Delimited file is not implemented yet."
_ ->
on_existing_file.write file stream->
stream.with_java_stream java_stream->
here.write_to_stream table format java_stream on_problems related_file=file
here.write_to_stream table format stream on_problems related_file=file

## PRIVATE
Returns a Text value representing the table in the delimited format.
Expand All @@ -53,25 +52,21 @@ write_text table format =
Arguments:
- table: The table to serialize.
- format: The specification of the delimited file format.
- java_stream: A Java `OutputStream` used as the data destination.
- stream: An `Output_Stream` used as the data destination.
- on_problems: Specifies the behavior when a problem occurs during the
operation. By default, a warning is issued, but the operation proceeds.
If set to `Report_Error`, the operation fails with a dataflow error.
If set to `Ignore`, the operation proceeds without errors or warnings.
- related_file: The file related to the provided `stream`, if available,
or `Nothing`. It is used for more detailed error reporting.
write_to_stream : Table -> File_Format.Delimited -> OutputStream -> Problem_Behavior -> File | Nothing -> Any
write_to_stream table format java_stream on_problems related_file=Nothing =
write_to_stream : Table -> File_Format.Delimited -> Output_Stream -> Problem_Behavior -> File | Nothing -> Any
write_to_stream table format stream on_problems related_file=Nothing =
handle_io_exception ~action = Panic.catch IOException action caught_panic->
Error.throw (File.wrap_io_exception related_file caught_panic.payload.cause)

# TODO handling encoding
#java_charset = format.encoding.to_java_charset
_ = on_problems
handle_io_exception <|
# TODO create a writer that will use the appropriate encoding and handle mismatches
writer = PrintWriter.new java_stream
here.write_to_writer table format writer
stream.with_stream_encoder format.encoding on_problems reporting_stream_encoder->
here.write_to_writer table format reporting_stream_encoder

## PRIVATE
Writes data to the provided `Writer` according to the provided format.
Expand Down
61 changes: 51 additions & 10 deletions std-bits/base/src/main/java/org/enso/base/Encoding_Utils.java
Original file line number Diff line number Diff line change
@@ -1,21 +1,20 @@
package org.enso.base;

import org.enso.base.encoding.ReportingStreamDecoder;
import org.enso.base.encoding.ReportingStreamEncoder;
import org.enso.base.text.ResultWithWarnings;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.*;
import java.util.Arrays;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.function.IntFunction;
import org.enso.base.encoding.ReportingStreamDecoder;
import org.enso.base.text.ResultWithWarnings;

public class Encoding_Utils {
/** The replacement character used for characters that could not have been decoded. */
Expand Down Expand Up @@ -164,13 +163,55 @@ private static ReportingStreamDecoder create_stream_decoder(InputStream stream,

/**
* A helper function which runs an action with a created stream decoder and closes it afterwards.
*
* <p>It returns the result returned from the executed action and any encoding problems that
* occurred when processing it.
*/
public static <R> R with_stream_decoder(
public static <R> WithProblems<R, String> with_stream_decoder(
InputStream stream, Charset charset, Function<ReportingStreamDecoder, R> action)
throws IOException {
try (ReportingStreamDecoder decoder = create_stream_decoder(stream, charset)) {
return action.apply(decoder);
R result;
ReportingStreamDecoder decoder = create_stream_decoder(stream, charset);
try {
result = action.apply(decoder);
} finally {
decoder.close();
}
return new WithProblems<>(result, decoder.getReportedProblems());
}

/**
 * Builds a {@code ReportingStreamEncoder} that writes to {@code stream} using an encoder for
 * {@code charset}.
 *
 * <p>The underlying {@code CharsetEncoder} is configured to REPORT both malformed input and
 * unmappable characters rather than handle them itself; the replacement sequence is handed to the
 * {@code ReportingStreamEncoder}.
 */
private static ReportingStreamEncoder create_stream_encoder(
    OutputStream stream, Charset charset, byte[] replacementSequence) {
  CharsetEncoder charsetEncoder = charset.newEncoder();
  // Each configuration call mutates and returns the same encoder, so the fluent chain can be
  // written as separate statements.
  charsetEncoder.onMalformedInput(CodingErrorAction.REPORT);
  charsetEncoder.onUnmappableCharacter(CodingErrorAction.REPORT);
  charsetEncoder.reset();
  return new ReportingStreamEncoder(stream, charsetEncoder, replacementSequence);
}

/**
 * A helper function which runs an action with a created stream encoder and closes it afterwards.
 *
 * <p>It returns the result returned from the executed action and any encoding problems that
 * occurred when processing it.
 *
 * <p>The encoder is managed by try-with-resources: if the action throws and closing the encoder
 * throws as well, the action's exception is the one propagated and the close failure is attached
 * to it as a suppressed exception, instead of silently replacing it.
 *
 * @param stream the output stream the encoder is created on
 * @param charset the charset used to create the encoder
 * @param replacementSequence the bytes passed to the encoder as its replacement sequence
 * @param action the callback that receives the encoder
 * @return the action's result paired with the problems reported by the encoder
 * @throws IOException if closing the encoder fails
 */
public static <R> WithProblems<R, String> with_stream_encoder(
    OutputStream stream,
    Charset charset,
    byte[] replacementSequence,
    Function<ReportingStreamEncoder, R> action)
    throws IOException {
  R result;
  ReportingStreamEncoder encoder = create_stream_encoder(stream, charset, replacementSequence);
  // The encoder is declared outside the try so that it is still accessible after it was closed.
  try (encoder) {
    result = action.apply(encoder);
  }
  // Problems are read only after close() - presumably so that anything reported while the
  // remaining data is flushed during closing is included as well.
  return new WithProblems<>(result, encoder.getReportedProblems());
}

/**
Expand Down
5 changes: 5 additions & 0 deletions std-bits/base/src/main/java/org/enso/base/WithProblems.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
package org.enso.base;

import java.util.List;

public record WithProblems<ResultType, ProblemType>(ResultType result, List<ProblemType> problems) {}
Loading

0 comments on commit 7c94fa6

Please sign in to comment.