diff --git a/src/cmd/dedup.rs b/src/cmd/dedup.rs index 70445ebe2..7ec7c2f91 100644 --- a/src/cmd/dedup.rs +++ b/src/cmd/dedup.rs @@ -1,15 +1,20 @@ static USAGE: &str = r#" Deduplicates CSV rows. -Note that this requires reading all of the CSV data into memory because because the -rows need to be sorted first. +This requires reading all of the CSV data into memory because the rows need +to be sorted first. -That is, unless the --sorted option is used to indicate the CSV is already sorted -(typically, with the extsort command). This will make dedup run in streaming mode -with constant memory. +That is, unless the --sorted option is used to indicate the CSV is already sorted - +typically, with the sort cmd for more sorting options or the extsort cmd for larger +than memory CSV files. This will make dedup run in streaming mode with constant memory. Either way, the output will not only be deduplicated, it will also be sorted. +Note that dedup's sorting will only be done alphabetically, not numerically. That is, +10 will come before 2. If you need to sort numerically, use the sort command first with +the --numeric option and pipe it to dedup with the --sorted option. +(i.e. qsv sort --numeric in.csv | qsv dedup --sorted) + A duplicate count will also be sent to <stderr>. For examples, see https://github.com/jqnatividad/qsv/blob/master/tests/test_dedup.rs. 
@@ -147,11 +152,19 @@ pub fn run(argv: &[&str]) -> CliResult<()> { util::njobs(args.flag_jobs); let mut all = rdr.byte_records().collect::<Result<Vec<_>, _>>()?; - all.par_sort_by(|r1, r2| { - let a = sel.select(r1); - let b = sel.select(r2); - iter_cmp(a, b) - }); + if ignore_case { + all.par_sort_by(|r1, r2| { + let a = sel.select(r1); + let b = sel.select(r2); + iter_cmp_ignore_case(a, b) + }); + } else { + all.par_sort_by(|r1, r2| { + let a = sel.select(r1); + let b = sel.select(r2); + iter_cmp(a, b) + }); + } for (current, current_record) in all.iter().enumerate() { let a = sel.select(current_record); diff --git a/tests/test_dedup.rs b/tests/test_dedup.rs index 02afdd5ce..b2e7a9138 100644 --- a/tests/test_dedup.rs +++ b/tests/test_dedup.rs @@ -45,7 +45,32 @@ fn dedup_no_case() { cmd.arg("-i").arg("in.csv"); let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); - let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "b"]]; + let expected = vec![svec!["N", "S"], svec!["10", "a"], svec!["2", "B"]]; + assert_eq!(got, expected); +} + +#[test] +fn dedup_issue_1381() { + let wrk = Workdir::new("dedup_issue_1381"); + wrk.create( + "in.csv", + vec![ + svec!["office"], + svec!["Member of legislative assembly"], + svec!["Member of Legislative Assembly"], + svec!["Member of Tamil Nadu Legislative Assembly"], + ], + ); + + let mut cmd = wrk.command("dedup"); + cmd.arg("-i").arg("in.csv"); + + let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd); + let expected = vec![ + svec!["office"], + svec!["Member of Legislative Assembly"], + svec!["Member of Tamil Nadu Legislative Assembly"], + ]; assert_eq!(got, expected); } @@ -183,7 +208,7 @@ fn dedup_alreadysorted_nocase() { svec!["N", "S"], svec!["10", "a"], svec!["100", "a"], - svec!["20", "b"], + svec!["20", "B"], svec!["3", "c"], svec!["4", "d"], ];