Skip to content

Commit

Permalink
join: add support for non-unicode field separators
Browse files Browse the repository at this point in the history
This allows `-t` to take values that are not valid Unicode (but are still a
single byte) on Unix-like platforms. Other platforms, which as of this commit
do not support `OsStr::as_bytes()`, could possibly be supported in the future,
but that would require design decisions about what such a separator means there.
  • Loading branch information
jtracey committed Jan 30, 2022
1 parent 7b3cfcf commit 04e5538
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 3 deletions.
20 changes: 17 additions & 3 deletions src/uu/join/src/join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ use clap::{crate_version, App, AppSettings, Arg};
use std::cmp::Ordering;
use std::fs::File;
use std::io::{stdin, stdout, BufRead, BufReader, Split, Stdin, Write};
#[cfg(unix)]
use std::os::unix::ffi::OsStrExt;
use uucore::display::Quotable;
use uucore::error::{set_exit_code, UResult, USimpleError};

Expand Down Expand Up @@ -532,16 +534,27 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
settings.key1 = get_field_number(keys, key1)?;
settings.key2 = get_field_number(keys, key2)?;

if let Some(value_str) = matches.value_of("t") {
let value = value_str.as_bytes();
if let Some(value_os) = matches.value_of_os("t") {
#[cfg(unix)]
let value = value_os.as_bytes();
#[cfg(not(unix))]
let value = match value_os.to_str() {
Some(value) => value.as_bytes(),
None => {
return Err(USimpleError::new(
1,
"unprintable field separators are only supported on unix-like platforms",
))
}
};
settings.separator = match value.len() {
0 => Sep::Line,
1 => Sep::Char(value[0]),
2 if value[0] == b'\\' && value[1] == b'0' => Sep::Char(0),
_ => {
return Err(USimpleError::new(
1,
format!("multi-character tab {}", value_str),
format!("multi-character tab {}", value_os.to_string_lossy()),
))
}
};
Expand Down Expand Up @@ -655,6 +668,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2",
.short('t')
.takes_value(true)
.value_name("CHAR")
.allow_invalid_utf8(true)
.help("use CHAR as input and output field separator"),
)
.arg(
Expand Down
34 changes: 34 additions & 0 deletions tests/by-util/test_join.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@
// spell-checker:ignore (words) autoformat

use crate::common::util::*;
#[cfg(unix)]
use std::ffi::OsStr;
#[cfg(unix)]
use std::os::unix::ffi::OsStrExt;
#[cfg(windows)]
use std::ffi::OsString;
#[cfg(windows)]
use std::os::windows::ffi::OsStringExt;

#[test]
fn empty_files() {
Expand Down Expand Up @@ -364,6 +372,32 @@ fn non_unicode() {
.arg("non-unicode_2.bin")
.succeeds()
.stdout_only_fixture("non-unicode.expected");

#[cfg(unix)]
{
let invalid_utf8: u8 = 167;
new_ucmd!()
.arg("-t")
.arg(OsStr::from_bytes(&[invalid_utf8]))
.arg("non-unicode_1.bin")
.arg("non-unicode_2.bin")
.succeeds()
.stdout_only_fixture("non-unicode_sep.expected");
}

#[cfg(windows)]
{
let invalid_utf16: OsString = OsStringExt::from_wide(&[0xD800]);
new_ucmd!()
.arg("-t")
.arg(&invalid_utf16)
.arg("non-unicode_1.bin")
.arg("non-unicode_2.bin")
.fails()
.stderr_is(
"join: unprintable field separators are only supported on unix-like platforms",
);
}
}

#[test]
Expand Down
Binary file added tests/fixtures/join/non-unicode_sep.expected
Binary file not shown.

0 comments on commit 04e5538

Please sign in to comment.