Skip to content
This repository has been archived by the owner on Sep 18, 2023. It is now read-only.

Commit

Permalink
[NSE-333] Arrow Data Source: CSV format support fix (#336)
Browse files · Browse the repository at this point in the history
* [NSE-333] Arrow Data Source: CSV format support fix

* Add unit test
  • Loading branch information
zhztheplayer authored May 25, 2021
1 parent 8fca8a5 commit d992844
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ object ArrowUtils {
options: ArrowOptions): Option[org.apache.arrow.dataset.file.FileFormat] = {
Option(options.originalFormat match {
case "parquet" => org.apache.arrow.dataset.file.FileFormat.PARQUET
case "csv" => org.apache.arrow.dataset.file.FileFormat.CSV
case _ => throw new IllegalArgumentException("Unrecognizable format")
})
}
Expand Down
35 changes: 35 additions & 0 deletions arrow-data-source/standard/src/test/resources/example.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
id1,id2,id3,id4,id5,id6,v1,v2,v3
id016,id016,id0000042202,15,24,5971,5,11,37.211254
id039,id045,id0000029558,40,49,39457,5,4,48.951141
id047,id023,id0000071286,68,20,74463,2,14,60.469241
id043,id057,id0000015141,32,43,63743,1,15,7.692145
id054,id040,id0000011083,9,25,16920,2,9,22.863525
id029,id020,id0000017974,40,43,14435,3,13,87.521355
id047,id023,id0000084849,90,96,35790,2,9,93.348148
id091,id022,id0000031441,50,44,71525,3,11,81.013682
id090,id048,id0000067778,24,2,51862,4,9,30.718821
id070,id008,id0000091167,78,4,23333,5,15,70.95464
id039,id084,id0000013708,94,81,44406,1,3,54.368009
id023,id061,id0000011331,36,67,86498,5,2,13.847979
id070,id054,id0000099110,24,15,47054,4,2,92.057305
id022,id008,id0000038862,38,92,63088,3,10,33.517765
id020,id070,id0000028952,17,57,50831,4,15,48.060814
id078,id022,id0000082008,69,44,15891,1,4,95.75571
id024,id033,id0000074157,1,57,83341,2,1,72.118722
id053,id076,id0000061759,55,43,59469,5,10,10.574836
id058,id087,id0000094028,14,49,72962,4,4,37.914258
id095,id091,id0000066931,35,20,98979,3,3,16.733062
id054,id061,id0000004843,69,58,14096,4,13,53.746802
id019,id078,id0000047661,5,33,13347,5,5,95.013936
id086,id088,id0000039469,45,86,65332,3,11,65.71087
id021,id055,id0000035603,96,97,36475,4,9,90.835613
id004,id034,id0000008260,99,8,73046,3,11,69.540405
id053,id052,id0000008764,47,13,49231,1,15,32.039599
id014,id050,id0000066034,45,32,33268,2,3,93.752279
id099,id057,id0000062408,27,7,63984,5,6,77.454794
id013,id067,id0000046109,69,90,21214,4,6,83.899656
id042,id043,id0000025883,64,21,85711,4,14,84.141247
id024,id062,id0000026824,79,16,49757,2,10,15.822967
id058,id077,id0000016555,71,8,24728,3,9,92.085521
id053,id012,id0000005595,73,28,79781,2,10,6.053862
id100,id096,id0000073858,11,9,25962,1,10,87.268781
3 changes: 3 additions & 0 deletions arrow-data-source/standard/src/test/resources/people.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
name,age,job
Jorge,30,Developer
Bob,32,Developer
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
super.afterAll()
}

test("reading parquet file") {
test("read parquet file") {
val path = ArrowDataSourceTest.locateResourcePath(parquetFile1)
verifyParquet(
spark.read
Expand Down Expand Up @@ -254,39 +254,52 @@ class ArrowDataSourceTest extends QueryTest with SharedSparkSession {
assert(fdGrowth < 100)
}

// CSV test fixture file names; resolved from test resources via
// ArrowDataSourceTest.locateResourcePath at test time.
// NOTE(review): csvFile ("cars.csv") appears unused by the visible tests —
// confirm whether it can be removed.
private val csvFile = "cars.csv"
private val csvFile1 = "people.csv" // header + 2 data rows, 3 columns
private val csvFile2 = "example.csv" // header + 34 data rows, 9 columns

ignore("reading csv file without specifying original format") {
verifyCsv(spark.read.format("arrow").load(csvFile))
ignore("read csv file without specifying original format") {
// not implemented
verifyFrame(spark.read.format("arrow")
.load(ArrowDataSourceTest.locateResourcePath(csvFile1)), 1, 2)
}

ignore("reading csv file") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile)
verifyCsv(
test("read csv file") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile1)
verifyFrame(
spark.read
.format("arrow")
.option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv")
.load(path))
.load(path), 2, 3)
}

ignore("read csv file - programmatic API ") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile)
verifyCsv(
test("read csv file 2") {
  // Resolve the bundled example.csv fixture from the test resources.
  val csvPath = ArrowDataSourceTest.locateResourcePath(csvFile2)
  val frame = spark.read
    .format("arrow")
    .option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv")
    .load(csvPath)
  // example.csv carries 34 data rows across 9 columns.
  verifyFrame(frame, 34, 9)
}

test("read csv file - programmatic API ") {
val path = ArrowDataSourceTest.locateResourcePath(csvFile1)
verifyFrame(
spark.read
.option(ArrowOptions.KEY_ORIGINAL_FORMAT, "csv")
.arrow(path))
.arrow(path), 2, 3)
}

/**
 * Asserts that `frame` has exactly `columnCount` columns (via its schema)
 * and exactly `rowCount` rows (via a full `collect()`).
 */
def verifyFrame(frame: DataFrame, rowCount: Int, columnCount: Int): Unit = {
  val actualColumns = frame.schema.length
  assert(actualColumns === columnCount)
  val actualRows = frame.collect().length
  assert(actualRows === rowCount)
}

// Placeholder verifier for CSV frames — performs no checks.
// NOTE(review): superseded by verifyFrame (which asserts row/column counts);
// confirm no remaining callers and remove, or implement real assertions.
def verifyCsv(frame: DataFrame): Unit = {
  // todo assert something
}

/**
 * Asserts the parquet fixture's exact shape: a single LongType column
 * named "col" containing 5 rows.
 *
 * The row/column counts are delegated to the shared `verifyFrame` helper;
 * the previous direct `frame.collect().length === 5` assert duplicated that
 * check and triggered a redundant second Spark collect job.
 */
def verifyParquet(frame: DataFrame): Unit = {
  assert(
    frame.schema ===
      StructType(Seq(StructField("col", LongType))))
  verifyFrame(frame, 5, 1)
}

def delete(path: String): Unit = {
Expand Down
2 changes: 1 addition & 1 deletion native-sql-engine/cpp/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,7 @@ set(SPARK_COLUMNAR_PLUGIN_SRCS
precompile/unsafe_array.cc
precompile/gandiva_projector.cc
third_party/gandiva/decimal_ops.cc
third_party/gandiva/time.cc
third_party/gandiva/time.cc
)

add_subdirectory(third_party/gandiva)
Expand Down

0 comments on commit d992844

Please sign in to comment.