From 58d65fb953475635bcbbae75a031c31b7dd3f4b6 Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Fri, 21 Jan 2022 14:22:11 -0500 Subject: [PATCH] join: add support for non-unicode field separators This allows for `-t` to take invalid unicode (but still single-byte) values on unix-like platforms. Other platforms, which as of the time of this commit do not support `OsStr::as_bytes()`, could possibly be supported in the future, but would require design decisions as to what that means. --- src/uu/join/src/join.rs | 20 +++++++++++-- tests/by-util/test_join.rs | 30 +++++++++++++++++++ tests/fixtures/join/non-unicode_sep.expected | Bin 0 -> 13 bytes 3 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 tests/fixtures/join/non-unicode_sep.expected diff --git a/src/uu/join/src/join.rs b/src/uu/join/src/join.rs index 559d957d15f..8b1901282b9 100644 --- a/src/uu/join/src/join.rs +++ b/src/uu/join/src/join.rs @@ -14,6 +14,8 @@ use clap::{crate_version, App, AppSettings, Arg}; use std::cmp::Ordering; use std::fs::File; use std::io::{stdin, stdout, BufRead, BufReader, Split, Stdin, Write}; +#[cfg(unix)] +use std::os::unix::ffi::OsStrExt; use uucore::display::Quotable; use uucore::error::{set_exit_code, UResult, USimpleError}; @@ -532,8 +534,19 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { settings.key1 = get_field_number(keys, key1)?; settings.key2 = get_field_number(keys, key2)?; - if let Some(value_str) = matches.value_of("t") { - let value = value_str.as_bytes(); + if let Some(value_os) = matches.value_of_os("t") { + #[cfg(unix)] + let value = value_os.as_bytes(); + #[cfg(not(unix))] + let value = match value_os.to_str() { + Some(value) => value.as_bytes(), + None => { + return Err(USimpleError::new( + 1, + "unprintable field separators are only supported on unix-like platforms", + )) + } + }; settings.separator = match value.len() { 0 => Sep::Line, 1 => Sep::Char(value[0]), @@ -541,7 +554,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { _ => { return Err(USimpleError::new( 1, - format!("multi-character tab {}", value_str), + format!("multi-character tab {}", value_os.to_string_lossy()), )) } }; @@ -655,6 +668,7 @@ FILENUM is 1 or 2, corresponding to FILE1 or FILE2", .short('t') .takes_value(true) .value_name("CHAR") + .allow_invalid_utf8(true) .help("use CHAR as input and output field separator"), ) .arg( diff --git a/tests/by-util/test_join.rs b/tests/by-util/test_join.rs index 84482ea8e5d..8663a43eae4 100644 --- a/tests/by-util/test_join.rs +++ b/tests/by-util/test_join.rs @@ -1,6 +1,10 @@ // spell-checker:ignore (words) autoformat use crate::common::util::*; +#[cfg(unix)] +use std::{ffi::OsStr, os::unix::ffi::OsStrExt}; +#[cfg(windows)] +use std::{ffi::OsString, os::windows::ffi::OsStringExt}; #[test] fn empty_files() { @@ -364,6 +368,32 @@ fn non_unicode() { .arg("non-unicode_2.bin") .succeeds() .stdout_only_fixture("non-unicode.expected"); + + #[cfg(unix)] + { + let invalid_utf8: u8 = 167; + new_ucmd!() + .arg("-t") + .arg(OsStr::from_bytes(&[invalid_utf8])) + .arg("non-unicode_1.bin") + .arg("non-unicode_2.bin") + .succeeds() + .stdout_only_fixture("non-unicode_sep.expected"); + } + + #[cfg(windows)] + { + let invalid_utf16: OsString = OsStringExt::from_wide(&[0xD800]); + new_ucmd!() + .arg("-t") + .arg(&invalid_utf16) + .arg("non-unicode_1.bin") + .arg("non-unicode_2.bin") + .fails() + .stderr_is( + "join: unprintable field separators are only supported on unix-like platforms", + ); + } } #[test] diff --git a/tests/fixtures/join/non-unicode_sep.expected b/tests/fixtures/join/non-unicode_sep.expected new file mode 100644 index 0000000000000000000000000000000000000000..c4041409bcb6116c35ac3ce2f76511d0dc827851 GIT binary patch literal 13 UcmYdPSk92LoFSPZMIns~02_M)ivR!s literal 0 HcmV?d00001