Skip to content

Commit

Permalink
Merge pull request #2411 from dathere/2387-fixlengths-remove-empty-co…
Browse files Browse the repository at this point in the history
…lumns

`fixlengths`: add `--remove-empty` option; refactored for performance
  • Loading branch information
jqnatividad authored Jan 5, 2025
2 parents 0c8a827 + 78beb07 commit ff3259c
Show file tree
Hide file tree
Showing 2 changed files with 271 additions and 40 deletions.
159 changes: 119 additions & 40 deletions src/cmd/fixlengths.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ fixlengths options:
-l, --length <arg> Forcefully set the length of each record. If a
record is not the size given, then it is truncated
or expanded as appropriate.
-r, --remove-empty Remove empty columns.
-i, --insert <pos> If empty fields need to be inserted, insert them
at <pos>. If <pos> is zero, then it is inserted
at the end of each record. If <pos> is negative, it
Expand All @@ -33,6 +34,7 @@ Common options:
-o, --output <file> Write output to <file> instead of stdout.
-d, --delimiter <arg> The field delimiter for reading CSV data.
Must be a single character. (default: ,)
-Q, --quiet Don't print removed column information.
"#;

use std::cmp;
Expand All @@ -46,13 +48,15 @@ use crate::{

#[derive(Deserialize)]
struct Args {
arg_input: Option<String>,
flag_length: Option<usize>,
flag_insert: i16,
flag_quote: Delimiter,
flag_escape: Option<Delimiter>,
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
arg_input: Option<String>,
flag_length: Option<usize>,
flag_insert: i16,
flag_remove_empty: bool,
flag_quiet: bool,
flag_quote: Delimiter,
flag_escape: Option<Delimiter>,
flag_output: Option<String>,
flag_delimiter: Option<Delimiter>,
}

pub fn run(argv: &[&str]) -> CliResult<()> {
Expand All @@ -67,17 +71,73 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
config = config.escape(Some(escape.as_byte())).double_quote(false);
}

if config.is_stdin() && args.flag_remove_empty {
return fail_incorrectusage_clierror!(
"<stdin> cannot be used with --remove-empty. Please specify a file path."
);
}

// First determine if we need to identify existing empty columns
let col_is_empty_vec_opt = if args.flag_remove_empty {
// First pass: identify existing empty columns
let mut rdr = config.reader()?;
let mut record = csv::ByteRecord::new();
let mut col_is_empty_vec: Vec<bool> = Vec::new();
let mut first = true;

while rdr.read_byte_record(&mut record)? {
if first {
col_is_empty_vec = vec![true; record.len()];
first = false;
}

for (i, field) in record.iter().enumerate() {
if !field.is_empty() {
col_is_empty_vec[i] = false;
}
}
}

// Count and report removed columns
let empty_count = col_is_empty_vec.iter().filter(|&&x| x).count();
if !args.flag_quiet && empty_count > 0 {
eprintln!("Removed {empty_count} empty column(s)");
}
Some(col_is_empty_vec)
} else {
None
};

let length = if let Some(length) = args.flag_length {
if length == 0 {
return fail_incorrectusage_clierror!("Length must be greater than 0.");
}
length
} else if args.flag_remove_empty {
// When removing empty columns, we need to determine the maximum length
// after filtering out the empty columns. This requires another pass through the data.
let mut rdr = config.reader()?;
let mut record = csv::ByteRecord::new();
let mut maxlen = 0_usize;
// Get reference to vector indicating which columns are empty
let col_is_empty_vec = col_is_empty_vec_opt.as_ref().unwrap();

while rdr.read_byte_record(&mut record)? {
// For each record, count how many fields would remain after filtering
// out the empty columns. We use get_unchecked since we know i is in bounds.
let filtered_len = record
.iter()
.enumerate()
.filter(|(i, _)| unsafe { !*col_is_empty_vec.get_unchecked(*i) })
.count();
// Keep track of maximum filtered length seen so far
maxlen = cmp::max(maxlen, filtered_len);
}
maxlen
} else {
// --length not set, so we need to determine the length of the longest record
// by scanning the entire file
if config.is_stdin() {
return fail_incorrectusage_clierror!(
"<stdin> cannot be used in this command. Please specify a file path."
"<stdin> cannot be used with if --length is not set. Please specify a file path."
);
}
let mut maxlen = 0_usize;
Expand All @@ -99,46 +159,65 @@ pub fn run(argv: &[&str]) -> CliResult<()> {
let mut wtr = Config::new(args.flag_output.as_ref()).writer()?;
let mut record = csv::ByteRecord::new();
let mut record_work = csv::ByteRecord::new();
#[allow(unused_assignments)]
let mut field_idx = 1_i16;
let mut filtered_record = csv::ByteRecord::new();
let mut field_idx: i16;

let insert_pos = if args.flag_insert < 0 {
length as i16 + args.flag_insert
let flag_insert = args.flag_insert;

let insert_pos = if flag_insert < 0 {
length as i16 + flag_insert
} else {
args.flag_insert
flag_insert
};
// log::debug!("length: {length} insert_pos: {insert_pos}");

let mut current_len: usize;
while rdr.read_byte_record(&mut record)? {
if length >= record.len() {
if args.flag_insert == 0 {
for _ in record.len()..length {
record.push_field(b"");
// First remove existing empty columns if needed
if let Some(ref is_empty) = col_is_empty_vec_opt {
filtered_record.clear();
for (i, field) in record.iter().enumerate() {
if i < is_empty.len() && !is_empty[i] {
filtered_record.push_field(field);
}
} else {
record_work.clear();
field_idx = 1_i16;
for field in &record {
if field_idx == insert_pos {
// insert all the empty fields at the insert position
for _ in record.len()..length {
record_work.push_field(b"");
}
record.clone_from(&filtered_record);
}

// Then handle length adjustments by comparing target length with current record length
current_len = record.len();
match length.cmp(&current_len) {
std::cmp::Ordering::Greater => {
// Record is too short - need to add empty fields
if flag_insert == 0 {
// No insert position specified - append empty fields at end
record.extend((0..length - current_len).map(|_| b""));
} else {
// Insert empty fields at specified position
record_work.clear();
field_idx = 1_i16;
for field in &record {
if field_idx == insert_pos {
// When we reach insert position, add all needed empty fields
record_work.extend((0..length - current_len).map(|_| b""));
}
record_work.push_field(field);
field_idx += 1;
}
record_work.push_field(field);

field_idx += 1;
}
if record_work.len() <= length {
// insert all the empty fields at the end
for _ in record_work.len()..length {
record_work.push_field(b"");
if record_work.len() < length {
// If we never hit insert position (it was past end of record),
// append remaining empty fields at the end
record_work.extend((0..length - record_work.len()).map(|_| b""));
}
record.clone_from(&record_work);
}
record.clone_from(&record_work);
}
} else {
record.truncate(length);
},
std::cmp::Ordering::Less => {
// Record is too long - truncate to target length
record.truncate(length);
},
std::cmp::Ordering::Equal => {
// Record is already correct length - no changes needed
},
}
wtr.write_byte_record(&record)?;
}
Expand Down
152 changes: 152 additions & 0 deletions tests/test_fixlengths.rs
Original file line number Diff line number Diff line change
Expand Up @@ -212,3 +212,155 @@ fn prop_fixlengths_explicit_len() {
}
qcheck(p as fn(Vec<CsvRecord>, usize) -> TestResult);
}

#[test]
fn fixlengths_remove_empty_basic() {
let rows = vec![
svec!["a", "", "c", "", "e"],
svec!["f", "", "h", "", "j"],
svec!["k", "", "m", "", "o"],
];

let wrk = Workdir::new("fixlengths_remove_empty_basic").flexible(true);
wrk.create("in.csv", rows);

let mut cmd = wrk.command("fixlengths");
cmd.arg("in.csv").args(["-r"]);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
assert_eq!(
got,
vec![
svec!["a", "c", "e"],
svec!["f", "h", "j"],
svec!["k", "m", "o"],
]
);
}

#[test]
fn fixlengths_remove_empty_with_length() {
let rows = vec![
svec!["a", "", "c", "", "e"],
svec!["f", "", "h", "", "j"],
svec!["k", "", "m", "", "o"],
];

let wrk = Workdir::new("fixlengths_remove_empty_with_length").flexible(true);
wrk.create("in.csv", rows);

let mut cmd = wrk.command("fixlengths");
cmd.arg("in.csv").args(["-r"]).args(["-l", "4"]);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
assert_eq!(
got,
vec![
svec!["a", "c", "e", ""],
svec!["f", "h", "j", ""],
svec!["k", "m", "o", ""],
]
);
}

#[test]
fn fixlengths_remove_empty_with_insert() {
let rows = vec![
svec!["a", "", "c", "", "e"],
svec!["f", "", "h", "", "j"],
svec!["k", "", "m", "", "o"],
];

let wrk = Workdir::new("fixlengths_remove_empty_with_insert").flexible(true);
wrk.create("in.csv", rows);

let mut cmd = wrk.command("fixlengths");
cmd.arg("in.csv").args(["-r"]).args(["-i", "2"]);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
assert_eq!(
got,
vec![
svec!["a", "c", "e"],
svec!["f", "h", "j"],
svec!["k", "m", "o"],
]
);
}

#[test]
fn fixlengths_remove_empty_with_length_and_insert() {
let rows = vec![
svec!["a", "", "c", "", "e"],
svec!["f", "", "h", "", "j"],
svec!["k", "", "m", "", "o"],
];

let wrk = Workdir::new("fixlengths_remove_empty_with_length_and_insert").flexible(true);
wrk.create("in.csv", rows);

let mut cmd = wrk.command("fixlengths");
cmd.arg("in.csv")
.args(["-r"])
.args(["-l", "5"])
.args(["-i", "2"]);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
assert_eq!(
got,
vec![
svec!["a", "", "", "c", "e"],
svec!["f", "", "", "h", "j"],
svec!["k", "", "", "m", "o"],
]
);
}

#[test]
fn fixlengths_remove_empty_all_empty_columns() {
let rows = vec![
svec!["a", "", "", "", "e"],
svec!["f", "", "", "", "j"],
svec!["k", "", "", "", "o"],
];

let wrk = Workdir::new("fixlengths_remove_empty_all_empty_columns").flexible(true);
wrk.create("in.csv", rows);

let mut cmd = wrk.command("fixlengths");
cmd.arg("in.csv").args(["-r"]);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
assert_eq!(
got,
vec![svec!["a", "e"], svec!["f", "j"], svec!["k", "o"],]
);
}

#[test]
fn fixlengths_remove_empty_with_negative_insert() {
let rows = vec![
svec!["a", "", "c", "", "e"],
svec!["f", "", "h", "", "j"],
svec!["k", "", "m", "", "o"],
];

let wrk = Workdir::new("fixlengths_remove_empty_with_negative_insert").flexible(true);
wrk.create("in.csv", rows);

let mut cmd = wrk.command("fixlengths");
cmd.arg("in.csv")
.args(["-r"])
.args(["-l", "5"])
.args(["-i", "-2"]);

let got: Vec<Vec<String>> = wrk.read_stdout(&mut cmd);
assert_eq!(
got,
vec![
svec!["a", "c", "", "", "e"],
svec!["f", "h", "", "", "j"],
svec!["k", "m", "", "", "o"],
]
);
}

0 comments on commit ff3259c

Please sign in to comment.