Skip to content

Commit

Permalink
Reworking Excel support to allow for reading of big files (#8403)
Browse files Browse the repository at this point in the history
- Closes #8111 by making sure that all Excel workbooks are read using a backing file (which should be more memory efficient).
- If the workbook is being opened from an input stream, that stream is materialized to a `Temporary_File`.
- Adds tests fetching Table formats from HTTP.
- Extends `simple-httpbin` with the ability to serve files for our tests.
- Ensures that the `Infer` option on `Excel` format also works with streams, if content-type metadata is available (e.g. from HTTP headers).
- Implements a `Temporary_File` facility that can be used to create a temporary file that is deleted once all references to the `Temporary_File` instance have been garbage-collected.
  • Loading branch information
radeusgd authored Dec 15, 2023
1 parent 95f11ab commit b5c995a
Show file tree
Hide file tree
Showing 47 changed files with 1,926 additions and 553 deletions.
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ resources/python
# The files in the `data` directory of our tests may have specific structure or
# even be malformed on purpose, so we do not want to run prettier on them.
test/**/data
tools/simple-httpbin/www-files

# GUI
**/scala-parser.js
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,7 @@
- [Implemented truncate `Date_Time` for database backend (Postgres only).][8235]
- [Initial Enso Cloud APIs.][8006]
- [Errors thrown inside `map` are wrapped in `Map_Error`.][8307]
- [Support for loading big Excel files.][8403]

[debug-shortcuts]:
https://github.com/enso-org/enso/blob/develop/app/gui/docs/product/shortcuts.md#debug
Expand Down Expand Up @@ -850,6 +851,7 @@
[8150]: https://github.com/enso-org/enso/pull/8150
[8235]: https://github.com/enso-org/enso/pull/8235
[8307]: https://github.com/enso-org/enso/pull/8307
[8403]: https://github.com/enso-org/enso/pull/8403

#### Enso Compiler

Expand Down
18 changes: 13 additions & 5 deletions build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -1330,13 +1330,16 @@ lazy val truffleDslSuppressWarnsSetting = Seq(
)

/** A setting to replace javac with the Frgaal compiler, allowing the use of the latest Java features in the code
* and still compile down to JDK 11
* and still compile down to JDK 17
*/
lazy val frgaalJavaCompilerSetting = Seq(
lazy val frgaalJavaCompilerSetting =
customFrgaalJavaCompilerSettings(targetJavaVersion)

def customFrgaalJavaCompilerSettings(targetJdk: String) = Seq(
Compile / compile / compilers := FrgaalJavaCompiler.compilers(
(Compile / dependencyClasspath).value,
compilers.value,
targetJavaVersion
targetJdk
),
// This dependency is needed only so that developers don't download Frgaal manually.
// Sadly it cannot be placed under plugins either because meta dependencies are not easily
Expand Down Expand Up @@ -2731,11 +2734,16 @@ val allStdBits: Parser[String] =
lazy val `simple-httpbin` = project
.in(file("tools") / "simple-httpbin")
.settings(
frgaalJavaCompilerSetting,
customFrgaalJavaCompilerSettings(targetJdk = "21"),
autoScalaLibrary := false,
Compile / javacOptions ++= Seq("-Xlint:all"),
Compile / run / mainClass := Some("org.enso.shttp.SimpleHTTPBin"),
assembly / mainClass := (Compile / run / mainClass).value,
libraryDependencies ++= Seq(
"org.apache.commons" % "commons-text" % commonsTextVersion
)
),
(Compile / run / fork) := true,
(Compile / run / connectInput) := true
)
.configs(Test)

Expand Down
12 changes: 6 additions & 6 deletions distribution/lib/Standard/AWS/0.0.0-dev/src/S3/S3_File.enso
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@ import Standard.Base.Errors.File_Error.File_Error
import Standard.Base.Errors.Illegal_Argument.Illegal_Argument
import Standard.Base.Errors.Unimplemented.Unimplemented
import Standard.Base.System.File_Format.File_For_Read
import Standard.Base.System.File_Format.File_Format_Metadata
import Standard.Base.System.Input_Stream.Input_Stream
import Standard.Base.System.Output_Stream.Output_Stream
from Standard.Base.System.File import find_extension_from_name

import project.AWS_Credential.AWS_Credential
import project.Errors.S3_Error
Expand Down Expand Up @@ -117,7 +119,9 @@ type S3_File
Auto_Detect -> if self.is_directory then format.read self on_problems else
response = S3.get_object self.bucket self.prefix self.credentials
response.decode Auto_Detect
_ -> self.with_input_stream [File_Access.Read] format.read_stream
_ ->
metadata = File_Format_Metadata.Value file_name=self.name
self.with_input_stream [File_Access.Read] (stream-> format.read_stream stream metadata)

## ALIAS load bytes, open bytes
ICON data_input
Expand Down Expand Up @@ -187,11 +191,7 @@ type S3_File
Returns the extension of the file.
extension : Text
extension self = if self.is_directory then Error.throw (S3_Error.Error "Directories do not have extensions." self.uri) else
name = self.name
last_dot = name.locate "." mode=Matching_Mode.Last
if last_dot.is_nothing then "" else
extension = name.drop (Index_Sub_Range.First last_dot.start)
if extension == "." then "" else extension
find_extension_from_name self.name

## GROUP Standard.Base.Input
Lists files contained in the directory denoted by this file.
Expand Down
12 changes: 11 additions & 1 deletion distribution/lib/Standard/Base/0.0.0-dev/src/Any.enso
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ type Any
is_nothing self = False

## GROUP Logical
If `self` is Nothing then returns `function`.
If `self` is Nothing then returns `other`.

> Example
If the value "Hello" is nothing return "".
Expand All @@ -309,6 +309,16 @@ type Any
if_nothing self ~other =
const self other

## If `self` is Nothing then returns Nothing, otherwise returns the result
of running the provided `action`.

> Example
Transform a value only if it is not nothing.

my_result.if_not_nothing <| my_result + 1
if_not_nothing : Any -> Any
if_not_nothing self ~action = action

## GROUP Errors
Executes the provided handler on an error, or returns the value unchanged.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import project.Network.HTTP.HTTP_Method.HTTP_Method
import project.Nothing.Nothing
import project.System.File.File_Access.File_Access
import project.System.File_Format.File_For_Read
import project.System.File_Format.File_Format_Metadata
import project.System.Input_Stream.Input_Stream
import project.System.Output_Stream.Output_Stream
from project.Data.Boolean import Boolean, False, True
Expand Down Expand Up @@ -129,7 +130,9 @@ type Enso_File
real_format = Auto_Detect.get_reading_format self
if real_format == Nothing then Error.throw (File_Error.Unsupported_Type self) else
self.read real_format on_problems
_ -> self.with_input_stream [File_Access.Read] format.read_stream
_ ->
metadata = File_Format_Metadata.Value file_name=self.name
self.with_input_stream [File_Access.Read] (stream-> format.read_stream stream metadata)

## ALIAS load bytes, open bytes
ICON data_input
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import project.Network.URI.URI
import project.Nothing.Nothing
import project.System.File.File
import project.System.File_Format.File_For_Read
import Standard.Base.System.File_Format.File_Format_Metadata
import project.System.Input_Stream.Input_Stream
from project.Data.Text.Extensions import all

Expand Down Expand Up @@ -45,7 +46,8 @@ type XML_Format
XML_Document.from_file file

## PRIVATE
Implements the `Data.parse` for this `File_Format`
read_stream : Input_Stream -> Any
read_stream self stream:Input_Stream =
Implements decoding the format from a stream.
read_stream : Input_Stream -> File_Format_Metadata -> Any
read_stream self stream:Input_Stream (metadata : File_Format_Metadata = File_Format_Metadata.no_information) =
_ = metadata
XML_Document.from_stream stream
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import project.System.File.File
import project.System.File_Format.File_For_Read
import project.System.File_Format.File_Format

polyglot java import java.io.FileNotFoundException
polyglot java import java.io.IOException
polyglot java import java.nio.file.AccessDeniedException
polyglot java import java.nio.file.FileAlreadyExistsException
Expand All @@ -33,7 +34,7 @@ type File_Error
Arguments:
- file: The file that couldn't be read.
- message: The message for the error.
IO_Error (file : File) (message : Text)
IO_Error (file : File | Nothing) (message : Text)

## Indicates that the given file's type is not supported.
Unsupported_Type (file : File_For_Read)
Expand All @@ -51,7 +52,9 @@ type File_Error
to_display_text : Text
to_display_text self = case self of
File_Error.Not_Found file -> "The file at " + file.path + " does not exist."
File_Error.IO_Error file msg -> msg + " (" + file.path + ")."
File_Error.IO_Error file msg ->
suffix = if file.is_nothing then "" else " (" + file.path + ")."
msg + suffix
File_Error.Already_Exists file -> "The file at "+file.path+" already exists."
File_Error.Access_Denied file -> "Insufficient permissions to perform the desired operation on the file at "+file.path+"."
File_Error.Unsupported_Type file -> "The "+file.path+" has a type that is not supported."
Expand All @@ -65,7 +68,7 @@ type File_Error
## PRIVATE

Utility method for running an action with Java exceptions mapping.
handle_java_exceptions file ~action =
handle_java_exceptions (file : File | Nothing) ~action =
Panic.catch IOException action caught_panic->
File_Error.wrap_io_exception file caught_panic.payload

Expand All @@ -78,8 +81,14 @@ type File_Error
## PRIVATE

Converts a Java `IOException` into its Enso counterpart.
wrap_io_exception file io_exception = case io_exception of
_ : NoSuchFileException -> Error.throw (File_Error.Not_Found file)
_ : FileAlreadyExistsException -> Error.throw (File_Error.Already_Exists file)
_ : AccessDeniedException -> File_Error.access_denied file
_ -> Error.throw (File_Error.IO_Error file "An IO error has occurred: "+io_exception.to_text)
wrap_io_exception (file : File | Nothing) io_exception =
## If the file is not known, all we can do is throw a generic IO error.
This usually only matters for stream operations, where there is no relevant file -
so exceptions like `NoSuchFileException` should not occur in that context.
Rather than risk a Type_Error, we just throw the more generic IO_Error.
if file.is_nothing then Error.throw (File_Error.IO_Error Nothing "An IO error has occurred: "+io_exception.to_text) else case io_exception of
_ : NoSuchFileException -> Error.throw (File_Error.Not_Found file)
_ : FileNotFoundException -> Error.throw (File_Error.Not_Found file)
_ : FileAlreadyExistsException -> Error.throw (File_Error.Already_Exists file)
_ : AccessDeniedException -> File_Error.access_denied file
_ -> Error.throw (File_Error.IO_Error file "An IO error has occurred: "+io_exception.to_text)
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
import project.Data.Text.Text
import project.Error.Error
import project.Nothing.Nothing
import project.Panic.Panic

polyglot java import java.lang.IllegalStateException

type Illegal_State
## PRIVATE
Expand All @@ -19,3 +23,8 @@ type Illegal_State
Provides a human-readable representation of the illegal state error.
to_display_text : Text
to_display_text self = "Illegal State: " + self.message

## PRIVATE
Capture a Java `IllegalStateException` and convert it to an Enso dataflow error - `Illegal_State.Error`.
handle_java_exception =
Panic.catch IllegalStateException handler=(cause-> Error.throw (Illegal_State.Error cause.payload.getMessage cause.payload))
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,10 @@ type HTTP
if fetch_methods.contains req.method || Context.Output.is_enabled then action else
Error.throw (Forbidden_Operation.Error ("Method " + req.method.to_text + " requests are forbidden as the Output context is disabled."))
handle_request_error =
Panic.catch JException handler=(cause-> Error.throw (Request_Error.Error 'IllegalArgumentException' cause.payload.getMessage))
handler caught_panic =
exception = caught_panic.payload
Error.throw (Request_Error.Error (Meta.type_of exception . to_text) exception.getMessage)
Panic.catch JException handler=handler

Panic.recover Any <| handle_request_error <| check_output_context <|
headers = resolve_headers req
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ import project.Network.URI.URI
import project.Nothing.Nothing
import project.Runtime.Context
import project.Runtime.Managed_Resource.Managed_Resource
import project.System.File.Advanced.Temporary_File.Temporary_File
import project.System.File.Existing_File_Behavior.Existing_File_Behavior
import project.System.File.File
import project.System.File.File_Access.File_Access
import project.System.File.Write_Extensions
import project.System.File_Format.Auto_Detect
import project.System.File_Format.Bytes
import project.System.File_Format.File_Format
import project.System.File_Format.File_Format_Metadata
import project.System.File_Format.Plain_Text_Format
import project.System.Input_Stream.Input_Stream
from project.Data.Boolean import Boolean, False, True
Expand Down Expand Up @@ -58,23 +60,23 @@ type Response_Body
Raw_Stream (raw_stream:Input_Stream) (content_type:Text|Nothing) uri:URI

## PRIVATE
Byte_Array (bytes:Vector) (content_type:Text|Nothing) uri:URI
Materialized_Byte_Array (bytes:Vector) (content_type:Text|Nothing) uri:URI

## PRIVATE
Temporary_File (file_resource:Managed_Resource) (content_type:Text|Nothing) uri:URI
Materialized_Temporary_File (temporary_file:Temporary_File) (content_type:Text|Nothing) uri:URI

## PRIVATE
with_stream : (Input_Stream -> Any ! HTTP_Error) -> Any ! HTTP_Error
with_stream self action = case self of
Response_Body.Raw_Stream raw_stream _ _ ->
Managed_Resource.bracket raw_stream (_.close) action
Response_Body.Byte_Array bytes _ _ ->
Response_Body.Materialized_Byte_Array bytes _ _ ->
byte_stream = Input_Stream.new (ByteArrayInputStream.new bytes) (HTTP_Error.handle_java_exceptions self.uri)
Managed_Resource.bracket byte_stream (_.close) action
Response_Body.Temporary_File file_resource _ _ -> file_resource.with file->
Response_Body.Materialized_Temporary_File temporary_file _ _ -> temporary_file.with_file file->
opts = [File_Access.Read.to_java]
stream = HTTP_Error.handle_java_exceptions self.uri (file.input_stream_builtin opts)
file_stream = Input_Stream.new stream (HTTP_Error.handle_java_exceptions self.uri)
file_stream = Input_Stream.new stream (HTTP_Error.handle_java_exceptions self.uri) associated_file=temporary_file
Managed_Resource.bracket (file_stream) (_.close) action

## PRIVATE
Expand All @@ -88,23 +90,19 @@ type Response_Body
body_stream.with_java_stream body_java_stream->
first_block = body_java_stream.readNBytes maximum_body_in_memory
case first_block.length < maximum_body_in_memory of
True -> Response_Body.Byte_Array (Vector.from_polyglot_array first_block) self.content_type self.uri
False ->
file = File.create_temporary_file self.uri.host

## Write contents to temporary file
Context.Output.with_enabled <|
True -> Response_Body.Materialized_Byte_Array (Vector.from_polyglot_array first_block) self.content_type self.uri
False -> Context.Output.with_enabled <|
## Write contents to a temporary file
temp_file = Temporary_File.new self.uri.host
r = temp_file.with_file file->
file.with_output_stream [File_Access.Write, File_Access.Create, File_Access.Truncate_Existing] output_stream->
output_stream.with_java_stream java_output_stream->
java_output_stream.write first_block
body_java_stream.transferTo java_output_stream
java_output_stream.flush
Nothing
output_stream.close

## Have a file with the correct set up
resource = Managed_Resource.register file delete_file
Response_Body.Temporary_File resource self.content_type self.uri
r.if_not_error <|
Response_Body.Materialized_Temporary_File temp_file self.content_type self.uri
_ -> self

## ALIAS parse
Expand All @@ -128,7 +126,9 @@ type Response_Body
_ ->
type_obj = Meta.type_of format
if can_decode type_obj . not then Error.throw (Illegal_Argument.Error type_obj.to_text+" cannot be used to decode from a stream. It must be saved to a file first.") else
self.with_stream format.read_stream
metadata = File_Format_Metadata.Value content_type=self.content_type
self.with_stream stream->
format.read_stream stream metadata

## ALIAS bytes
GROUP Input
Expand Down
11 changes: 11 additions & 0 deletions distribution/lib/Standard/Base/0.0.0-dev/src/Nothing.enso
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import project.Any.Any
import project.Data.Numbers.Integer
import project.Data.Text.Text
from project.Data.Boolean import Boolean, False, True
from project.Function import const

## The type that has only a singleton value. Nothing in Enso is used as a
universal value to indicate the absence of a value.
Expand Down Expand Up @@ -30,6 +31,16 @@ type Nothing
if_nothing : Any -> Any
if_nothing self ~function = function

## If `self` is Nothing then returns Nothing, otherwise returns the result
of running the provided `action`.

> Example
Transform a value only if it is not nothing.

my_result.if_not_nothing <| my_result + 1
if_not_nothing : Any -> Any
if_not_nothing self ~action = const Nothing action

## Get a value for the key of the object.
As `Nothing` has no keys, returns `if_missing`.

Expand Down
Loading

0 comments on commit b5c995a

Please sign in to comment.