diff --git a/docs/src/data/key-change.json b/docs/src/data/key-change.json new file mode 100644 index 0000000000..c2719c54ff --- /dev/null +++ b/docs/src/data/key-change.json @@ -0,0 +1,5 @@ +[ + { "a": 1, "b": 2, "c": 3 }, + { "a": 4, "b": 5, "c": 6 }, + { "a": 7, "X": 8, "c": 9 } +] diff --git a/docs/src/data/under-over.json b/docs/src/data/under-over.json new file mode 100644 index 0000000000..0de486a83f --- /dev/null +++ b/docs/src/data/under-over.json @@ -0,0 +1,6 @@ +[ + { "a": 1, "b": 2, "c": 3 }, + { "a": 4, "b": 5, "c": 6, "d": 7 }, + { "a": 7, "b": 8 }, + { "a": 9, "b": 10, "c": 11 } +] diff --git a/docs/src/file-formats.md b/docs/src/file-formats.md index 3af248ce57..7064b9b49f 100644 --- a/docs/src/file-formats.md +++ b/docs/src/file-formats.md @@ -130,6 +130,74 @@ In particular, no encode/decode of `\r`, `\n`, `\t`, or `\\` is done. * CSV-lite allows changing FS and/or RS to any values, perhaps multi-character. +* CSV-lite and TSV-lite handle schema changes ("schema" meaning "ordered list of field names in a given record") by adding a newline and re-emitting the header. CSV and TSV, by contrast, do the following: + * If there are too few keys, but these match the header, empty fields are emitted. + * If there are too many keys, but these match the header up to the number of header fields, the extra fields are emitted. + * If keys don't match the header, this is an error. + +
+cat data/under-over.json ++
+[ + { "a": 1, "b": 2, "c": 3 }, + { "a": 4, "b": 5, "c": 6, "d": 7 }, + { "a": 7, "b": 8 }, + { "a": 9, "b": 10, "c": 11 } +] ++ +
+mlr --ijson --ocsvlite cat data/under-over.json ++
+a,b,c +1,2,3 + +a,b,c,d +4,5,6,7 + +a,b +7,8 + +a,b,c +9,10,11 ++ +
+mlr --ijson --ocsvlite cat data/key-change.json ++
+a,b,c +1,2,3 +4,5,6 + +a,X,c +7,8,9 ++ +
+mlr --ijson --ocsv cat data/under-over.json ++
+a,b,c +1,2,3 +4,5,6,7 +7,8, +9,10,11 ++ +
+mlr --ijson --ocsv cat data/key-change.json ++
+a,b,c +1,2,3 +4,5,6 +mlr: CSV schema change: first keys "a,b,c"; current keys "a,X,c" +mlr: exiting due to data error. ++ * In short, use-cases for CSV-lite and TSV-lite are often found when dealing with CSV/TSV files which are formatted in some non-standard way -- you have a little more flexibility available to you. (As an example of this flexibility: ASV and USV are nothing more than CSV-lite with different values for FS and RS.) CSV, TSV, CSV-lite, and TSV-lite have in common the `--implicit-csv-header` flag for input and the `--headerless-csv-output` flag for output. diff --git a/docs/src/file-formats.md.in b/docs/src/file-formats.md.in index 7e3d503082..36365a1fb2 100644 --- a/docs/src/file-formats.md.in +++ b/docs/src/file-formats.md.in @@ -42,6 +42,31 @@ In particular, no encode/decode of `\r`, `\n`, `\t`, or `\\` is done. * CSV-lite allows changing FS and/or RS to any values, perhaps multi-character. +* CSV-lite and TSV-lite handle schema changes ("schema" meaning "ordered list of field names in a given record") by adding a newline and re-emitting the header. CSV and TSV, by contrast, do the following: + * If there are too few keys, but these match the header, empty fields are emitted. + * If there are too many keys, but these match the header up to the number of header fields, the extra fields are emitted. + * If keys don't match the header, this is an error. + +GENMD-RUN-COMMAND +cat data/under-over.json +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --ijson --ocsvlite cat data/under-over.json +GENMD-EOF + +GENMD-RUN-COMMAND-TOLERATING-ERROR +mlr --ijson --ocsvlite cat data/key-change.json +GENMD-EOF + +GENMD-RUN-COMMAND +mlr --ijson --ocsv cat data/under-over.json +GENMD-EOF + +GENMD-RUN-COMMAND-TOLERATING-ERROR +mlr --ijson --ocsv cat data/key-change.json +GENMD-EOF + * In short, use-cases for CSV-lite and TSV-lite are often found when dealing with CSV/TSV files which are formatted in some non-standard way -- you have a little more flexibility available to you. (As an example of this flexibility: ASV and USV are nothing more than CSV-lite with different values for FS and RS.) CSV, TSV, CSV-lite, and TSV-lite have in common the `--implicit-csv-header` flag for input and the `--headerless-csv-output` flag for output. diff --git a/docs/src/questions-about-joins.md b/docs/src/questions-about-joins.md index b8bde2d46d..e3974877ed 100644 --- a/docs/src/questions-about-joins.md +++ b/docs/src/questions-about-joins.md @@ -118,9 +118,7 @@ However, if we ask for left-unpaireds, since there's no `color` column, we get a id,code,color 4,ff0000,red 2,00ff00,green - -id,code -3,0000ff +3,0000ff, To fix this, we can use **unsparsify**: diff --git a/docs/src/record-heterogeneity.md b/docs/src/record-heterogeneity.md index d02a524482..de96ae69cd 100644 --- a/docs/src/record-heterogeneity.md +++ b/docs/src/record-heterogeneity.md @@ -375,13 +375,12 @@ record_count=150,resource=/path/to/second/file CSV and pretty-print formats expect rectangular structure. But Miller lets you process non-rectangular using CSV and pretty-print. -Miller simply prints a newline and a new header when there is a schema change --- where by _schema_ we mean simply the list of record keys in the order they -are encountered. When there is no schema change, you get CSV per se as a -special case. Likewise, Miller reads heterogeneous CSV or pretty-print input -the same way. The difference between CSV and CSV-lite is that the former is -[RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter readily -handles heterogeneous data (which is non-compliant). For example: +For CSV-lite and TSV-lite, Miller simply prints a newline and a new header when there is a schema +change -- where by _schema_ we mean simply the list of record keys in the order they are +encountered. When there is no schema change, you get CSV per se as a special case. Likewise, Miller +reads heterogeneous CSV or pretty-print input the same way. The difference between CSV and CSV-lite +is that the former is [RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter +readily handles heterogeneous data (which is non-compliant). For example:
cat data/het.json @@ -446,19 +445,43 @@ record_count resource 150 /path/to/second/file-Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). +
+mlr --ijson --ocsvlite group-like data/het.json ++
+resource,loadsec,ok +/path/to/file,0.45,true +/path/to/second/file,0.32,true +/some/other/path,0.97,false + +record_count,resource +100,/path/to/file +150,/path/to/second/file +
-mlr --csv --ragged cat data/het/ragged.csv +mlr --ijson --ocsv group-like data/het.json
-a,b,c -1,2,3 +resource,loadsec,ok +/path/to/file,0.45,true +/path/to/second/file,0.32,true +/some/other/path,0.97,false +mlr: CSV schema change: first keys "resource,loadsec,ok"; current keys "record_count,resource" +mlr: exiting due to data error. +-a,b -4,5 +Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if +there are implicit header changes (no intervening blank line and new header line) as seen above -- +you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). -a,b,c,4 +
+mlr --csv --allow-ragged-csv-input cat data/het/ragged.csv ++
+a,b,c +1,2,3 +4,5, 7,8,9,10diff --git a/docs/src/record-heterogeneity.md.in b/docs/src/record-heterogeneity.md.in index 1aab9dfaae..677098ee87 100644 --- a/docs/src/record-heterogeneity.md.in +++ b/docs/src/record-heterogeneity.md.in @@ -180,13 +180,12 @@ GENMD-EOF CSV and pretty-print formats expect rectangular structure. But Miller lets you process non-rectangular using CSV and pretty-print. -Miller simply prints a newline and a new header when there is a schema change --- where by _schema_ we mean simply the list of record keys in the order they -are encountered. When there is no schema change, you get CSV per se as a -special case. Likewise, Miller reads heterogeneous CSV or pretty-print input -the same way. The difference between CSV and CSV-lite is that the former is -[RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter readily -handles heterogeneous data (which is non-compliant). For example: +For CSV-lite and TSV-lite, Miller simply prints a newline and a new header when there is a schema +change -- where by _schema_ we mean simply the list of record keys in the order they are +encountered. When there is no schema change, you get CSV per se as a special case. Likewise, Miller +reads heterogeneous CSV or pretty-print input the same way. The difference between CSV and CSV-lite +is that the former is [RFC-4180-compliant](file-formats.md#csvtsvasvusvetc), while the latter +readily handles heterogeneous data (which is non-compliant). For example: GENMD-RUN-COMMAND cat data/het.json @@ -200,10 +199,20 @@ GENMD-RUN-COMMAND mlr --ijson --opprint group-like data/het.json GENMD-EOF -Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if there are implicit header changes (no intervening blank line and new header line) as seen above -- you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). +GENMD-RUN-COMMAND +mlr --ijson --ocsvlite group-like data/het.json +GENMD-EOF GENMD-RUN-COMMAND-TOLERATING-ERROR -mlr --csv --ragged cat data/het/ragged.csv +mlr --ijson --ocsv group-like data/het.json +GENMD-EOF + +Miller handles explicit header changes as just shown. If your CSV input contains ragged data -- if +there are implicit header changes (no intervening blank line and new header line) as seen above -- +you can use `--allow-ragged-csv-input` (or keystroke-saver `--ragged`). + +GENMD-RUN-COMMAND +mlr --csv --allow-ragged-csv-input cat data/het/ragged.csv GENMD-EOF ## Processing heterogeneous data diff --git a/pkg/output/channel_writer.go b/pkg/output/channel_writer.go index 6805ad8902..e7b0e802ea 100644 --- a/pkg/output/channel_writer.go +++ b/pkg/output/channel_writer.go @@ -94,7 +94,11 @@ func channelWriterHandleBatch( } if record != nil { - recordWriter.Write(record, bufferedOutputStream, outputIsStdout) + err := recordWriter.Write(record, bufferedOutputStream, outputIsStdout) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + return true, true + } } outputString := recordAndContext.OutputString @@ -111,8 +115,13 @@ func channelWriterHandleBatch( // queued up. For example, PPRINT needs to see all same-schema // records before printing any, since it needs to compute max width // down columns. - recordWriter.Write(nil, bufferedOutputStream, outputIsStdout) - return true, false + err := recordWriter.Write(nil, bufferedOutputStream, outputIsStdout) + if err != nil { + fmt.Fprintf(os.Stderr, "mlr: %v\n", err) + return true, true + } else { + return true, false + } } } return false, false diff --git a/pkg/output/record_writer.go b/pkg/output/record_writer.go index 37d8a7780e..3ce49743d4 100644 --- a/pkg/output/record_writer.go +++ b/pkg/output/record_writer.go @@ -20,5 +20,5 @@ type IRecordWriter interface { outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, - ) + ) error } diff --git a/pkg/output/record_writer_csv.go b/pkg/output/record_writer_csv.go index 947400275b..b71af63d25 100644 --- a/pkg/output/record_writer_csv.go +++ b/pkg/output/record_writer_csv.go @@ -12,15 +12,13 @@ import ( ) type RecordWriterCSV struct { - writerOptions *cli.TWriterOptions - ofs0 byte // Go's CSV library only lets its 'Comma' be a single character - csvWriter *csv.Writer - // For reporting schema changes: we print a newline and the new header - lastJoinedHeader *string - // Only write one blank line for schema changes / blank input lines - justWroteEmptyLine bool - // For double-quote around all fields - quoteAll bool + writerOptions *cli.TWriterOptions + ofs0 byte // Go's CSV library only lets its 'Comma' be a single character + csvWriter *csv.Writer + needToPrintHeader bool + firstRecordKeys []string + firstRecordNF int64 + quoteAll bool // For double-quote around all fields } func NewRecordWriterCSV(writerOptions *cli.TWriterOptions) (*RecordWriterCSV, error) { @@ -30,23 +28,25 @@ func NewRecordWriterCSV(writerOptions *cli.TWriterOptions) (*RecordWriterCSV, er if writerOptions.ORS != "\n" && writerOptions.ORS != "\r\n" { return nil, fmt.Errorf("for CSV, ORS cannot be altered") } - return &RecordWriterCSV{ - writerOptions: writerOptions, - csvWriter: nil, // will be set on first Write() wherein we have the output stream - lastJoinedHeader: nil, - justWroteEmptyLine: false, - quoteAll: writerOptions.CSVQuoteAll, - }, nil + writer := &RecordWriterCSV{ + writerOptions: writerOptions, + csvWriter: nil, // will be set on first Write() wherein we have the output stream + needToPrintHeader: !writerOptions.HeaderlessOutput, + firstRecordKeys: nil, + firstRecordNF: -1, + quoteAll: writerOptions.CSVQuoteAll, + } + return writer, nil } func (writer *RecordWriterCSV) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { // End of record stream: nothing special for this output format if outrec == nil { - return + return nil } if writer.csvWriter == nil { @@ -54,46 +54,46 @@ func (writer *RecordWriterCSV) Write( writer.csvWriter.Comma = rune(writer.writerOptions.OFS[0]) // xxx temp } - if outrec.IsEmpty() { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString("\n") - } - joinedHeader := "" - writer.lastJoinedHeader = &joinedHeader - writer.justWroteEmptyLine = true - return - } - - needToPrintHeader := false - joinedHeader := strings.Join(outrec.GetKeys(), ",") - if writer.lastJoinedHeader == nil || *writer.lastJoinedHeader != joinedHeader { - if writer.lastJoinedHeader != nil { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString("\n") - } - writer.justWroteEmptyLine = true - } - writer.lastJoinedHeader = &joinedHeader - needToPrintHeader = true + if writer.firstRecordKeys == nil { + writer.firstRecordKeys = outrec.GetKeys() + writer.firstRecordNF = int64(len(writer.firstRecordKeys)) } - if needToPrintHeader && !writer.writerOptions.HeaderlessOutput { + if writer.needToPrintHeader { fields := make([]string, outrec.FieldCount) i := 0 for pe := outrec.Head; pe != nil; pe = pe.Next { fields[i] = pe.Key i++ } - //////writer.csvWriter.Write(fields) writer.WriteCSVRecordMaybeColorized(fields, bufferedOutputStream, outputIsStdout, true, writer.quoteAll) + writer.needToPrintHeader = false + } + + var outputNF int64 = outrec.FieldCount + if outputNF < writer.firstRecordNF { + outputNF = writer.firstRecordNF } - fields := make([]string, outrec.FieldCount) - i := 0 + fields := make([]string, outputNF) + var i int64 = 0 for pe := outrec.Head; pe != nil; pe = pe.Next { + if i < writer.firstRecordNF && pe.Key != writer.firstRecordKeys[i] { + return fmt.Errorf( + "CSV schema change: first keys \"%s\"; current keys \"%s\"", + strings.Join(writer.firstRecordKeys, writer.writerOptions.OFS), + strings.Join(outrec.GetKeys(), writer.writerOptions.OFS), + ) + } fields[i] = pe.Value.String() i++ } + + for ; i < outputNF; i++ { + fields[i] = "" + } + writer.WriteCSVRecordMaybeColorized(fields, bufferedOutputStream, outputIsStdout, false, writer.quoteAll) - writer.justWroteEmptyLine = false + + return nil } diff --git a/pkg/output/record_writer_csvlite.go b/pkg/output/record_writer_csvlite.go index ced670c13a..c59556b30f 100644 --- a/pkg/output/record_writer_csvlite.go +++ b/pkg/output/record_writer_csvlite.go @@ -29,10 +29,10 @@ func (writer *RecordWriterCSVLite) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { // End of record stream: nothing special for this output format if outrec == nil { - return + return nil } if outrec.IsEmpty() { @@ -42,7 +42,7 @@ func (writer *RecordWriterCSVLite) Write( joinedHeader := "" writer.lastJoinedHeader = &joinedHeader writer.justWroteEmptyLine = true - return + return nil } needToPrintHeader := false @@ -79,4 +79,6 @@ func (writer *RecordWriterCSVLite) Write( bufferedOutputStream.WriteString(writer.writerOptions.ORS) writer.justWroteEmptyLine = false + + return nil } diff --git a/pkg/output/record_writer_dkvp.go b/pkg/output/record_writer_dkvp.go index bc60868ca7..d27420eded 100644 --- a/pkg/output/record_writer_dkvp.go +++ b/pkg/output/record_writer_dkvp.go @@ -22,15 +22,15 @@ func (writer *RecordWriterDKVP) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { // End of record stream: nothing special for this output format if outrec == nil { - return + return nil } if outrec.IsEmpty() { bufferedOutputStream.WriteString(writer.writerOptions.ORS) - return + return nil } for pe := outrec.Head; pe != nil; pe = pe.Next { @@ -42,4 +42,6 @@ func (writer *RecordWriterDKVP) Write( } } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + return nil } diff --git a/pkg/output/record_writer_json.go b/pkg/output/record_writer_json.go index 578e9f8ba1..e832f169eb 100644 --- a/pkg/output/record_writer_json.go +++ b/pkg/output/record_writer_json.go @@ -39,7 +39,7 @@ func (writer *RecordWriterJSON) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { if outrec != nil && writer.jvQuoteAll { outrec.StringifyValuesRecursively() } @@ -49,6 +49,7 @@ func (writer *RecordWriterJSON) Write( } else { writer.writeWithoutListWrap(outrec, bufferedOutputStream, outputIsStdout) } + return nil } // ---------------------------------------------------------------- diff --git a/pkg/output/record_writer_markdown.go b/pkg/output/record_writer_markdown.go index 2688c29624..6c2983a59f 100644 --- a/pkg/output/record_writer_markdown.go +++ b/pkg/output/record_writer_markdown.go @@ -31,9 +31,9 @@ func (writer *RecordWriterMarkdown) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { if outrec == nil { // end of record stream - return + return nil } currentJoinedHeader := outrec.GetKeysJoined() @@ -73,4 +73,6 @@ func (writer *RecordWriterMarkdown) Write( bufferedOutputStream.WriteString(" |") } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + return nil } diff --git a/pkg/output/record_writer_nidx.go b/pkg/output/record_writer_nidx.go index d3babd35af..551fe47aae 100644 --- a/pkg/output/record_writer_nidx.go +++ b/pkg/output/record_writer_nidx.go @@ -21,10 +21,10 @@ func (writer *RecordWriterNIDX) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { // End of record stream: nothing special for this output format if outrec == nil { - return + return nil } for pe := outrec.Head; pe != nil; pe = pe.Next { @@ -34,4 +34,6 @@ func (writer *RecordWriterNIDX) Write( } } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + return nil } diff --git a/pkg/output/record_writer_pprint.go b/pkg/output/record_writer_pprint.go index 2fd4aaa703..b9f48cd93c 100644 --- a/pkg/output/record_writer_pprint.go +++ b/pkg/output/record_writer_pprint.go @@ -37,7 +37,7 @@ func (writer *RecordWriterPPRINT) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { // Group records by have-same-schema or not. Pretty-print each // homoegeneous sublist, or "batch". // @@ -83,6 +83,8 @@ func (writer *RecordWriterPPRINT) Write( bufferedOutputStream, outputIsStdout) } } + + return nil } // ---------------------------------------------------------------- diff --git a/pkg/output/record_writer_tsv.go b/pkg/output/record_writer_tsv.go index 48db403d8e..2a79793b2a 100644 --- a/pkg/output/record_writer_tsv.go +++ b/pkg/output/record_writer_tsv.go @@ -12,11 +12,10 @@ import ( ) type RecordWriterTSV struct { - writerOptions *cli.TWriterOptions - // For reporting schema changes: we print a newline and the new header - lastJoinedHeader *string - // Only write one blank line for schema changes / blank input lines - justWroteEmptyLine bool + writerOptions *cli.TWriterOptions + needToPrintHeader bool + firstRecordKeys []string + firstRecordNF int64 } func NewRecordWriterTSV(writerOptions *cli.TWriterOptions) (*RecordWriterTSV, error) { @@ -27,9 +26,10 @@ func NewRecordWriterTSV(writerOptions *cli.TWriterOptions) (*RecordWriterTSV, er return nil, fmt.Errorf("for CSV, ORS cannot be altered") } return &RecordWriterTSV{ - writerOptions: writerOptions, - lastJoinedHeader: nil, - justWroteEmptyLine: false, + writerOptions: writerOptions, + needToPrintHeader: !writerOptions.HeaderlessOutput, + firstRecordKeys: nil, + firstRecordNF: -1, }, nil } @@ -37,42 +37,28 @@ func (writer *RecordWriterTSV) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { // End of record stream: nothing special for this output format if outrec == nil { - return + return nil } - if outrec.IsEmpty() { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString(writer.writerOptions.ORS) - } - joinedHeader := "" - writer.lastJoinedHeader = &joinedHeader - writer.justWroteEmptyLine = true - return + if writer.firstRecordKeys == nil { + writer.firstRecordKeys = outrec.GetKeys() + writer.firstRecordNF = int64(len(writer.firstRecordKeys)) } - needToPrintHeader := false - joinedHeader := strings.Join(outrec.GetKeys(), ",") - if writer.lastJoinedHeader == nil || *writer.lastJoinedHeader != joinedHeader { - if writer.lastJoinedHeader != nil { - if !writer.justWroteEmptyLine { - bufferedOutputStream.WriteString(writer.writerOptions.ORS) - } - writer.justWroteEmptyLine = true + if writer.needToPrintHeader { + fields := make([]string, outrec.FieldCount) + i := 0 + for pe := outrec.Head; pe != nil; pe = pe.Next { + fields[i] = pe.Key + i++ } - writer.lastJoinedHeader = &joinedHeader - needToPrintHeader = true - } - - if needToPrintHeader && !writer.writerOptions.HeaderlessOutput { for pe := outrec.Head; pe != nil; pe = pe.Next { bufferedOutputStream.WriteString( colorizer.MaybeColorizeKey( - lib.TSVEncodeField( - pe.Key, - ), + lib.TSVEncodeField(pe.Key), outputIsStdout, ), ) @@ -83,22 +69,44 @@ func (writer *RecordWriterTSV) Write( } bufferedOutputStream.WriteString(writer.writerOptions.ORS) + + writer.needToPrintHeader = false + } + + var outputNF int64 = outrec.FieldCount + if outputNF < writer.firstRecordNF { + outputNF = writer.firstRecordNF } + fields := make([]string, outputNF) + var i int64 = 0 for pe := outrec.Head; pe != nil; pe = pe.Next { - bufferedOutputStream.WriteString( - colorizer.MaybeColorizeValue( - lib.TSVEncodeField( - pe.Value.String(), - ), - outputIsStdout, - ), + if i < writer.firstRecordNF && pe.Key != writer.firstRecordKeys[i] { + return fmt.Errorf( + "TSV schema change: first keys \"%s\"; current keys \"%s\"", + strings.Join(writer.firstRecordKeys, writer.writerOptions.OFS), + strings.Join(outrec.GetKeys(), writer.writerOptions.OFS), + ) + } + fields[i] = colorizer.MaybeColorizeValue( + lib.TSVEncodeField(pe.Value.String()), + outputIsStdout, ) - if pe.Next != nil { + i++ + } + + for ; i < outputNF; i++ { + fields[i] = "" + } + + for j, field := range fields { + if j > 0 { bufferedOutputStream.WriteString(writer.writerOptions.OFS) } + bufferedOutputStream.WriteString(field) } + bufferedOutputStream.WriteString(writer.writerOptions.ORS) - writer.justWroteEmptyLine = false + return nil } diff --git a/pkg/output/record_writer_xtab.go b/pkg/output/record_writer_xtab.go index 9093935e9a..27f3b1bcbf 100644 --- a/pkg/output/record_writer_xtab.go +++ b/pkg/output/record_writer_xtab.go @@ -45,10 +45,10 @@ func (writer *RecordWriterXTAB) Write( outrec *mlrval.Mlrmap, bufferedOutputStream *bufio.Writer, outputIsStdout bool, -) { +) error { // End of record stream: nothing special for this output format if outrec == nil { - return + return nil } maxKeyLength := 1 @@ -64,6 +64,8 @@ func (writer *RecordWriterXTAB) Write( } else { writer.writeWithLeftAlignedValues(outrec, bufferedOutputStream, outputIsStdout, maxKeyLength) } + + return nil } func (writer *RecordWriterXTAB) writeWithLeftAlignedValues( diff --git a/test/cases/io-csv-auto-unsparsify/at/cmd b/test/cases/io-csv-auto-unsparsify/at/cmd new file mode 100644 index 0000000000..64a5e8c77a --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/at/cmd @@ -0,0 +1 @@ +mlr -i json -o csv cat ${CASEDIR}/input.json diff --git a/test/cases/io-csv-auto-unsparsify/at/experr b/test/cases/io-csv-auto-unsparsify/at/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-csv-auto-unsparsify/at/expout b/test/cases/io-csv-auto-unsparsify/at/expout new file mode 100644 index 0000000000..29e4b3171d --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/at/expout @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +4,5,6 +7,8,9 diff --git a/test/cases/io-csv-auto-unsparsify/at/input.json b/test/cases/io-csv-auto-unsparsify/at/input.json new file mode 100644 index 0000000000..832be9c9e2 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/at/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-csv-auto-unsparsify/key-change/cmd b/test/cases/io-csv-auto-unsparsify/key-change/cmd new file mode 100644 index 0000000000..64a5e8c77a --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/key-change/cmd @@ -0,0 +1 @@ +mlr -i json -o csv cat ${CASEDIR}/input.json diff --git a/test/cases/io-csv-auto-unsparsify/key-change/experr b/test/cases/io-csv-auto-unsparsify/key-change/experr new file mode 100644 index 0000000000..699fbb70f6 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/key-change/experr @@ -0,0 +1,2 @@ +mlr: CSV schema change: first keys "a,b,c"; current keys "a,X,c" +mlr: exiting due to data error. diff --git a/test/cases/io-csv-auto-unsparsify/key-change/expout b/test/cases/io-csv-auto-unsparsify/key-change/expout new file mode 100644 index 0000000000..88700c7147 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/key-change/expout @@ -0,0 +1,3 @@ +a,b,c +1,2,3 +4,5,6 diff --git a/test/cases/io-csv-auto-unsparsify/key-change/input.json b/test/cases/io-csv-auto-unsparsify/key-change/input.json new file mode 100644 index 0000000000..841abab575 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/key-change/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "X": 8, + "c": 9 +} +] diff --git a/test/cases/io-csv-auto-unsparsify/key-change/should-fail b/test/cases/io-csv-auto-unsparsify/key-change/should-fail new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-csv-auto-unsparsify/over/cmd b/test/cases/io-csv-auto-unsparsify/over/cmd new file mode 100644 index 0000000000..64a5e8c77a --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/over/cmd @@ -0,0 +1 @@ +mlr -i json -o csv cat ${CASEDIR}/input.json diff --git a/test/cases/io-csv-auto-unsparsify/over/experr b/test/cases/io-csv-auto-unsparsify/over/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-csv-auto-unsparsify/over/expout b/test/cases/io-csv-auto-unsparsify/over/expout new file mode 100644 index 0000000000..44ad0219a7 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/over/expout @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +4,5,6,7 +7,8,9 diff --git a/test/cases/io-csv-auto-unsparsify/over/input.json b/test/cases/io-csv-auto-unsparsify/over/input.json new file mode 100644 index 0000000000..38b47c2f09 --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/over/input.json @@ -0,0 +1,18 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6, + "d": 7 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-csv-auto-unsparsify/under/cmd b/test/cases/io-csv-auto-unsparsify/under/cmd new file mode 100644 index 0000000000..64a5e8c77a --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/under/cmd @@ -0,0 +1 @@ +mlr -i json -o csv cat ${CASEDIR}/input.json diff --git a/test/cases/io-csv-auto-unsparsify/under/experr b/test/cases/io-csv-auto-unsparsify/under/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-csv-auto-unsparsify/under/expout b/test/cases/io-csv-auto-unsparsify/under/expout new file mode 100644 index 0000000000..48f0b0017b --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/under/expout @@ -0,0 +1,4 @@ +a,b,c +1,2,3 +4,5, +7,8,9 diff --git a/test/cases/io-csv-auto-unsparsify/under/input.json b/test/cases/io-csv-auto-unsparsify/under/input.json new file mode 100644 index 0000000000..e90f7439ad --- /dev/null +++ b/test/cases/io-csv-auto-unsparsify/under/input.json @@ -0,0 +1,16 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-multi/0010/experr b/test/cases/io-multi/0010/experr index e69de29bb2..15e296abb2 100644 --- a/test/cases/io-multi/0010/experr +++ b/test/cases/io-multi/0010/experr @@ -0,0 +1,2 @@ +mlr: CSV schema change: first keys "host"; current keys "df/tmp,uptime" +mlr: exiting due to data error. diff --git a/test/cases/io-multi/0010/expout b/test/cases/io-multi/0010/expout index 0d20e38d97..57d47ff76a 100644 --- a/test/cases/io-multi/0010/expout +++ b/test/cases/io-multi/0010/expout @@ -1,35 +1,2 @@ host jupiter - -df/tmp,uptime -2.43MB,32345sec - -host -saturn - -df/tmp,uptime -1.34MB,234214132sec - -host -mars - -df/tmp,uptime -4.97MB,345089805sec - -host -jupiter - -df/tmp,uptime -0.04MB,890sec - -host -mars - -df/tmp,uptime -8.55MB,787897777sec - -host -saturn - -df/tmp,uptime -9.47MB,234289080sec diff --git a/test/cases/io-multi/0010/should-fail b/test/cases/io-multi/0010/should-fail new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-multi/0033/experr b/test/cases/io-multi/0033/experr index e69de29bb2..15e296abb2 100644 --- a/test/cases/io-multi/0033/experr +++ b/test/cases/io-multi/0033/experr @@ -0,0 +1,2 @@ +mlr: CSV schema change: first keys "host"; current keys "df/tmp,uptime" +mlr: exiting due to data error. diff --git a/test/cases/io-multi/0033/expout b/test/cases/io-multi/0033/expout index 0d20e38d97..57d47ff76a 100644 --- a/test/cases/io-multi/0033/expout +++ b/test/cases/io-multi/0033/expout @@ -1,35 +1,2 @@ host jupiter - -df/tmp,uptime -2.43MB,32345sec - -host -saturn - -df/tmp,uptime -1.34MB,234214132sec - -host -mars - -df/tmp,uptime -4.97MB,345089805sec - -host -jupiter - -df/tmp,uptime -0.04MB,890sec - -host -mars - -df/tmp,uptime -8.55MB,787897777sec - -host -saturn - -df/tmp,uptime -9.47MB,234289080sec diff --git a/test/cases/io-multi/0033/should-fail b/test/cases/io-multi/0033/should-fail new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-multi/0034/experr b/test/cases/io-multi/0034/experr index e69de29bb2..15e296abb2 100644 --- a/test/cases/io-multi/0034/experr +++ b/test/cases/io-multi/0034/experr @@ -0,0 +1,2 @@ +mlr: CSV schema change: first keys "host"; current keys "df/tmp,uptime" +mlr: exiting due to data error. diff --git a/test/cases/io-multi/0034/expout b/test/cases/io-multi/0034/expout index 2a14e7a0ba..9ad9ee3916 100644 --- a/test/cases/io-multi/0034/expout +++ b/test/cases/io-multi/0034/expout @@ -1,23 +1 @@ jupiter - -2.43MB,32345sec - -saturn - -1.34MB,234214132sec - -mars - -4.97MB,345089805sec - -jupiter - -0.04MB,890sec - -mars - -8.55MB,787897777sec - -saturn - -9.47MB,234289080sec diff --git a/test/cases/io-multi/0034/should-fail b/test/cases/io-multi/0034/should-fail new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/at/cmd b/test/cases/io-tsv-auto-unsparsify/at/cmd new file mode 100644 index 0000000000..818cba82b4 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/at/experr b/test/cases/io-tsv-auto-unsparsify/at/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/at/expout b/test/cases/io-tsv-auto-unsparsify/at/expout new file mode 100644 index 0000000000..c0232182d7 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 6 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/at/input.json b/test/cases/io-tsv-auto-unsparsify/at/input.json new file mode 100644 index 0000000000..832be9c9e2 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/at/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/cmd b/test/cases/io-tsv-auto-unsparsify/key-change/cmd new file mode 100644 index 0000000000..818cba82b4 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/experr b/test/cases/io-tsv-auto-unsparsify/key-change/experr new file mode 100644 index 0000000000..ce615563a8 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/experr @@ -0,0 +1,2 @@ +mlr: TSV schema change: first keys "a b c"; current keys "a X c" +mlr: exiting due to data error. diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/expout b/test/cases/io-tsv-auto-unsparsify/key-change/expout new file mode 100644 index 0000000000..c96a25f193 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/expout @@ -0,0 +1,3 @@ +a b c +1 2 3 +4 5 6 diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/input.json b/test/cases/io-tsv-auto-unsparsify/key-change/input.json new file mode 100644 index 0000000000..841abab575 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/key-change/input.json @@ -0,0 +1,17 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6 +}, +{ + "a": 7, + "X": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/key-change/should-fail b/test/cases/io-tsv-auto-unsparsify/key-change/should-fail new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/over/cmd b/test/cases/io-tsv-auto-unsparsify/over/cmd new file mode 100644 index 0000000000..818cba82b4 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/over/experr b/test/cases/io-tsv-auto-unsparsify/over/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/over/expout b/test/cases/io-tsv-auto-unsparsify/over/expout new file mode 100644 index 0000000000..0a61a24061 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 6 7 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/over/input.json b/test/cases/io-tsv-auto-unsparsify/over/input.json new file mode 100644 index 0000000000..38b47c2f09 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/over/input.json @@ -0,0 +1,18 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5, + "c": 6, + "d": 7 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +] diff --git a/test/cases/io-tsv-auto-unsparsify/under/cmd b/test/cases/io-tsv-auto-unsparsify/under/cmd new file mode 100644 index 0000000000..818cba82b4 --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/cmd @@ -0,0 +1 @@ +mlr -i json -o tsv cat ${CASEDIR}/input.json diff --git a/test/cases/io-tsv-auto-unsparsify/under/experr b/test/cases/io-tsv-auto-unsparsify/under/experr new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/cases/io-tsv-auto-unsparsify/under/expout b/test/cases/io-tsv-auto-unsparsify/under/expout new file mode 100644 index 0000000000..7b24f5bdbf --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/expout @@ -0,0 +1,4 @@ +a b c +1 2 3 +4 5 +7 8 9 diff --git a/test/cases/io-tsv-auto-unsparsify/under/input.json b/test/cases/io-tsv-auto-unsparsify/under/input.json new file mode 100644 index 0000000000..e90f7439ad --- /dev/null +++ b/test/cases/io-tsv-auto-unsparsify/under/input.json @@ -0,0 +1,16 @@ +[ +{ + "a": 1, + "b": 2, + "c": 3 +}, +{ + "a": 4, + "b": 5 +}, +{ + "a": 7, + "b": 8, + "c": 9 +} +]