#484 Add unit tests for ASCII indexing and partial record parsing.
yruslan committed Apr 5, 2022
1 parent 59de02b commit cd34692
Showing 1 changed file with 119 additions and 1 deletion.
@@ -189,7 +189,6 @@ class Test18AsciiNulChars extends WordSpec with SparkTestBase with BinaryFileFixture

"allow partial records" in {
withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>

val df = spark
.read
.format("cobol")
@@ -207,5 +206,124 @@ class Test18AsciiNulChars extends WordSpec with SparkTestBase with BinaryFileFixture
        assert(count == 21)
      }
    }

"allow partial records with indexing" in {
withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
        val expected =
          """[ {
            | "Record_Id" : 0,
            | "A" : "1",
            | "B" : ""
            |}, {
            | "Record_Id" : 1,
            | "A" : "1",
            | "B" : "2"
            |}, {
            | "Record_Id" : 2,
            | "A" : "1",
            | "B" : "23"
            |}, {
            | "Record_Id" : 3,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 4,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 5,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 6,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 7,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 8,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 9,
            | "A" : "1",
            | "B" : "234"
            |}, {
            | "Record_Id" : 10,
            | "A" : "5",
            | "B" : "678"
            |} ]
            |""".stripMargin

        val df = spark
          .read
          .format("cobol")
          .option("copybook_contents", copybook)
          .option("pedantic", "true")
          .option("record_format", "D")
.option("input_split_records", 2)
.option("encoding", "ascii")
.option("string_trimming_policy", "keep_all")
.option("generate_record_id", "true")
.load(tmpFileName)
.select("Record_Id", "A", "B")
.orderBy("Record_Id")

val count = df.count()
val actual = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))

assert(count == 11)
assertEqualsMultiline(actual, expected)
}
}

"don't lose any records" in {
val copybook =
""" 01 ENTITY.
05 A PIC X(3).
05 B PIC X(3).
"""

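      // A full record is 6 bytes: A takes the first 3 bytes, B the next 3.
      // The input below ends with a deliberately partial record ("7"), which
      // should still appear in the output with only field A populated.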
      val expected =
        """[ {
          | "Record_Id" : 0,
          | "A" : "123",
          | "B" : "456"
          |}, {
          | "Record_Id" : 1,
          | "A" : "567",
          | "B" : "890"
          |}, {
          | "Record_Id" : 2,
          | "A" : "123",
          | "B" : "456"
          |}, {
          | "Record_Id" : 3,
          | "A" : "7"
          |} ]""".stripMargin

val text = "123456\n567890\n123456\n7"

withTempTextFile("ascii_nul", ".dat", StandardCharsets.UTF_8, text) { tmpFileName =>
val df = spark
.read
.format("cobol")
.option("copybook_contents", copybook)
.option("pedantic", "true")
.option("record_format", "D")
.option("input_split_records", 3)
.option("generate_record_id", "true")
.load(tmpFileName)
.select("Record_Id", "A", "B")
.orderBy("Record_Id")

val actual = SparkUtils.prettyJSON(df.toJSON.collect().mkString("[", ",", "]"))

assertEqualsMultiline(actual, expected)
}
}

}
}
