From 88aee39894752f8bba4a97bb36149c9673ae47e0 Mon Sep 17 00:00:00 2001 From: Justin Uang Date: Wed, 10 Oct 2018 06:46:01 -0400 Subject: [PATCH] [SPARK-25493][SQL] Use auto-detection for CRLF in CSV datasource multiline mode (#419) --- .../sql/execution/datasources/csv/CSVOptions.scala | 2 ++ sql/core/src/test/resources/test-data/cars-crlf.csv | 7 +++++++ .../sql/execution/datasources/csv/CSVSuite.scala | 12 ++++++++++++ 3 files changed, 21 insertions(+) create mode 100644 sql/core/src/test/resources/test-data/cars-crlf.csv diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala index 492a21be6df3b..222954a32f1c2 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CSVOptions.scala @@ -212,6 +212,8 @@ class CSVOptions( settings.setEmptyValue(emptyValueInRead) settings.setMaxCharsPerColumn(maxCharsPerColumn) settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) + settings.setLineSeparatorDetectionEnabled(multiLine == true) + settings } } diff --git a/sql/core/src/test/resources/test-data/cars-crlf.csv b/sql/core/src/test/resources/test-data/cars-crlf.csv new file mode 100644 index 0000000000000..d018d08ebc6fc --- /dev/null +++ b/sql/core/src/test/resources/test-data/cars-crlf.csv @@ -0,0 +1,7 @@ + +year,make,model,comment,blank +"2012","Tesla","S","No comment", + +1997,Ford,E350,"Go get one now they are going fast", +2015,Chevy,Volt + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala index f70df0bcecde7..adeadb911299b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala @@ -52,6 +52,7 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te private val carsNullFile = "test-data/cars-null.csv" private val carsEmptyValueFile = "test-data/cars-empty-value.csv" private val carsBlankColName = "test-data/cars-blank-column-name.csv" + private val carsCrlf = "test-data/cars-crlf.csv" private val emptyFile = "test-data/empty.csv" private val commentsFile = "test-data/comments.csv" private val disableCommentsFile = "test-data/disable_comments.csv" @@ -220,6 +221,17 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils with Te } } + test("crlf line separators in multiline mode") { + val cars = spark + .read + .format("csv") + .option("multiLine", "true") + .option("header", "true") + .load(testFile(carsCrlf)) + + verifyCars(cars, withHeader = true) + } + test("test aliases sep and encoding for delimiter and charset") { // scalastyle:off val cars = spark