Skip to content

Commit

Permalink
fix: CSV Infer Schema now properly supports escaped characters. (#13214)
Browse files Browse the repository at this point in the history
  • Loading branch information
mnorfolk03 authored Nov 4, 2024
1 parent 659d5a3 commit 0458d30
Showing 1 changed file with 55 additions and 1 deletion.
56 changes: 55 additions & 1 deletion datafusion/core/src/datasource/file_format/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,12 @@ impl CsvFormat {
.has_header
.unwrap_or(state.config_options().catalog.has_header),
)
.with_delimiter(self.options.delimiter);
.with_delimiter(self.options.delimiter)
.with_quote(self.options.quote);

if let Some(escape) = self.options.escape {
format = format.with_escape(escape);
}

if let Some(comment) = self.options.comment {
format = format.with_comment(comment);
Expand Down Expand Up @@ -867,6 +872,55 @@ mod tests {
Ok(())
}

#[tokio::test]
async fn test_infer_schema_escape_chars() -> Result<()> {
let session_ctx = SessionContext::new();
let state = session_ctx.state();
let variable_object_store = Arc::new(VariableStream::new(
Bytes::from(
r#"c1,c2,c3,c4
0.3,"Here, is a comma\"",third,3
0.31,"double quotes are ok, "" quote",third again,9
0.314,abc,xyz,27"#,
),
1,
));
let object_meta = ObjectMeta {
location: Path::parse("/")?,
last_modified: DateTime::default(),
size: usize::MAX,
e_tag: None,
version: None,
};

let num_rows_to_read = 3;
let csv_format = CsvFormat::default()
.with_has_header(true)
.with_schema_infer_max_rec(num_rows_to_read)
.with_quote(b'"')
.with_escape(Some(b'\\'));

let inferred_schema = csv_format
.infer_schema(
&state,
&(variable_object_store.clone() as Arc<dyn ObjectStore>),
&[object_meta],
)
.await?;

let actual_fields: Vec<_> = inferred_schema
.fields()
.iter()
.map(|f| format!("{}: {:?}", f.name(), f.data_type()))
.collect();

assert_eq!(
vec!["c1: Float64", "c2: Utf8", "c3: Utf8", "c4: Int64",],
actual_fields
);
Ok(())
}

#[rstest(
file_compression_type,
case(FileCompressionType::UNCOMPRESSED),
Expand Down

0 comments on commit 0458d30

Please sign in to comment.