#484 Add unit tests for ASCII indexing and partial record parsing.
yruslan committed Apr 5, 2022
1 parent 59de02b commit cd34692
Showing 1 changed file with 119 additions and 1 deletion.
@@ -189,7 +189,6 @@ class Test18AsciiNulChars extends WordSpec with SparkTestBase with BinaryFileFixture

"allow partial records" in {
withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>

val df = spark
.read
.format("cobol")
@@ -207,5 +206,124 @@ class Test18AsciiNulChars extends WordSpec with SparkTestBase with BinaryFileFixture
        assert(count == 21)
      }
    }

"allow partial records with indexing" in {
withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
        val expected =
          """[ {
            | "Record_Id" : 0,
            | "A" : "1",
            | "B" : ""
            |}, {
            | "Record_Id" : 1,
            | "A" : "1",
            | "B" : "2"
            |}, {
            | "Record_Id" : 2,
            | "A" : "1",
            | "B" : "23"
            |}, {
            | "Record_Id" : 3,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 4,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 5,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 6,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 7,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 8,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 9,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 10,
            | "A" : "5",
            | "B" : "678"
            |} ]
            |""".stripMargin

        val df = spark
          .read
          .format("cobol")
          .option("copybook_contents", copybook)
          .option("pedantic", "true")
          .option("record_format", "D")
.option("input_split_records", 2)
.option("encoding", "ascii")
.option("string_trimming_policy", "keep_all")
.option("generate_record_id", "true")
.load(tmpFileName)
.select("Record_Id", "A", "B")
.orderBy("Record_Id")

val count = df.count()
val actual = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))

assert(count == 11)
assertEqualsMultiline(actual, expected)
}
}

"don't lose any records" in {
val copybook =
""" 01 ENTITY.
05 A PIC X(3).
05 B PIC X(3).
"""

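      // A full record is 6 bytes: A takes the first 3 bytes, B the next 3.
      // The input below ends with a deliberately partial record ("7"), which
      // should still appear in the output with only field A populated.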
      val expected =
        """[ {
          | "Record_Id" : 0,
          | "A" : "123",
          | "B" : "456"
          |}, {
          | "Record_Id" : 1,
          | "A" : "567",
          | "B" : "890"
          |}, {
          | "Record_Id" : 2,
          | "A" : "123",
          | "B" : "456"
          |}, {
          | "Record_Id" : 3,
          | "A" : "7"
          |} ]""".stripMargin

val text = "123456\n567890\n123456\n7"

withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
val df = spark
.read
.format("cobol")
.option("copybook_contents", copybook)
.option("pedantic", "true")
.option("record_format", "D")
.option("input_split_records", 3)
.option("generate_record_id", "true")
.load(tmpFileName)
.select("Record_Id", "A", "B")
.orderBy("Record_Id")

val actual = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))

assertEqualsMultiline(actual, expected)
}
}

}
}
