Skip to content

Commit

Permalink
[SPARK-23849][SQL] Tests for the samplingRatio option of JSON datasource
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

The proposed tests check that only a subset of the input dataset is touched during schema inference.

Author: Maxim Gekk <[email protected]>

Closes apache#20963 from MaxGekk/json-sampling-tests.
  • Loading branch information
MaxGekk authored and gatorsmile committed Apr 8, 2018
1 parent 2c1fe64 commit 6a73457
Showing 1 changed file with 36 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.json

import java.io.{File, StringWriter}
import java.nio.charset.StandardCharsets
import java.nio.file.Files
import java.nio.file.{Files, Paths, StandardOpenOption}
import java.sql.{Date, Timestamp}
import java.util.Locale

Expand Down Expand Up @@ -2127,4 +2127,39 @@ class JsonSuite extends QueryTest with SharedSQLContext with TestJsonData {
assert(df.schema === expectedSchema)
}
}

// Verifies that JSON schema inference touches only a sample of the input file
// when samplingRatio < 1.0. Rows whose index is in `predefinedSample` carry an
// integer "f1" value; every other row carries a double. If the inferred type
// comes out as LongType, only the predefined (integer) rows were inspected.
test("SPARK-23849: schema inferring touches less data if samplingRation < 1.0") {
  // Row indices written with an integer "f1"; all other rows get a double.
  val predefinedSample = Set[Int](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
    57, 62, 68, 72)
  withTempPath { path =>
    val writer = Files.newBufferedWriter(Paths.get(path.getAbsolutePath),
      StandardCharsets.UTF_8, StandardOpenOption.CREATE_NEW)
    // Close in a finally block so the file handle is not leaked if a write
    // throws; the original closed only on the happy path.
    try {
      for (i <- 0 until 100) {
        if (predefinedSample.contains(i)) {
          writer.write(s"""{"f1":${i.toString}}""" + "\n")
        } else {
          writer.write(s"""{"f1":${(i.toDouble + 0.1).toString}}""" + "\n")
        }
      }
    } finally {
      writer.close()
    }

    val ds = spark.read.option("samplingRatio", 0.1).json(path.getCanonicalPath)
    assert(ds.schema == new StructType().add("f1", LongType))
  }
}

// Same sampling check as the file-based test above, but the JSON lines arrive
// as a Dataset[String]. With samplingRatio = 0.1 the inferred schema must be
// LongType, i.e. only the integer-valued rows from `predefinedSample` were
// inspected during inference.
test("SPARK-23849: usage of samplingRation while parsing of dataset of strings") {
  // Hoisted out of the map closure so the Set is built once rather than once
  // per element; an immutable Set[Int] serializes cleanly into the closure.
  val predefinedSample = Set[Int](2, 8, 15, 27, 30, 34, 35, 37, 44, 46,
    57, 62, 68, 72)
  val dstr = spark.sparkContext.parallelize(0 until 100, 1).map { i =>
    if (predefinedSample.contains(i)) {
      s"""{"f1":${i.toString}}""" + "\n"
    } else {
      s"""{"f1":${(i.toDouble + 0.1).toString}}""" + "\n"
    }
  }.toDS()
  val ds = spark.read.option("samplingRatio", 0.1).json(dstr)

  assert(ds.schema == new StructType().add("f1", LongType))
}
}

0 comments on commit 6a73457

Please sign in to comment.