diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 468e7f61bca4..e6b6fd5110d5 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -50,6 +50,7 @@ rand = "0.8"
 criterion = "0.3"
 rand = "0.8"
 snap = "1.0"
+tempfile = "3.0"
 brotli = "3.3"
 flate2 = "1.0"
 lz4 = "1.23"
diff --git a/parquet/src/arrow/arrow_reader.rs b/parquet/src/arrow/arrow_reader.rs
index 066f86cfb7fc..94b2e7bfa0c4 100644
--- a/parquet/src/arrow/arrow_reader.rs
+++ b/parquet/src/arrow/arrow_reader.rs
@@ -251,7 +251,7 @@ mod tests {
     use crate::file::writer::{FileWriter, SerializedFileWriter};
     use crate::schema::parser::parse_message_type;
     use crate::schema::types::{Type, TypePtr};
-    use crate::util::test_common::{get_temp_file, get_temp_filename, RandGen};
+    use crate::util::test_common::RandGen;
     use arrow::array::*;
     use arrow::datatypes::{DataType as ArrowDataType, Field};
     use arrow::record_batch::RecordBatchReader;
@@ -261,7 +261,8 @@ mod tests {
     use std::cmp::min;
     use std::convert::TryFrom;
     use std::fs::File;
-    use std::path::{Path, PathBuf};
+    use std::io::Seek;
+    use std::path::PathBuf;
     use std::sync::Arc;

     #[test]
@@ -703,8 +704,6 @@ mod tests {
             })
             .collect();

-        let path = get_temp_filename();
-
         let len = match T::get_physical_type() {
             crate::basic::Type::FIXED_LEN_BYTE_ARRAY => rand_max,
             crate::basic::Type::INT96 => 12,
@@ -731,18 +730,21 @@ mod tests {
             .clone()
             .map(|t| arrow::datatypes::Field::new("leaf", t, false));

+        let mut file = tempfile::tempfile().unwrap();
+
         generate_single_column_file_with_data::<T>(
             &values,
             def_levels.as_ref(),
-            path.as_path(),
+            file.try_clone().unwrap(), // Cannot use &mut File (#1163)
             schema,
             arrow_field,
             &opts,
         )
         .unwrap();

-        let parquet_reader =
-            SerializedFileReader::try_from(File::open(&path).unwrap()).unwrap();
+        file.rewind().unwrap();
+
+        let parquet_reader = SerializedFileReader::try_from(file).unwrap();

         let mut arrow_reader = ParquetFileArrowReader::new(Arc::new(parquet_reader));
         let mut record_reader = arrow_reader
@@ -802,12 +804,11 @@ mod tests {
     fn generate_single_column_file_with_data<T: DataType>(
         values: &[Vec<T::T>],
         def_levels: Option<&Vec<Vec<i16>>>,
-        path: &Path,
+        file: File,
         schema: TypePtr,
         field: Option<Field>,
         opts: &TestOptions,
     ) -> Result<parquet_format::FileMetaData> {
-        let file = File::create(path)?;
         let mut writer_props = opts.writer_props();
         if let Some(field) = field {
             let arrow_schema = arrow::datatypes::Schema::new(vec![field]);
@@ -926,7 +927,7 @@ mod tests {
             }
         }";

-        let file = get_temp_file("nested_nullability.parquet", &[]);
+        let file = tempfile::tempfile().unwrap();
         let schema = Arc::new(parse_message_type(message_type).unwrap());

         {
diff --git a/parquet/src/arrow/arrow_writer.rs b/parquet/src/arrow/arrow_writer.rs
index 7f255db36f95..c7a5f06859d5 100644
--- a/parquet/src/arrow/arrow_writer.rs
+++ b/parquet/src/arrow/arrow_writer.rs
@@ -70,11 +70,8 @@ impl<W: 'static + ParquetWriter> ArrowWriter<W> {

         let max_row_group_size = props.max_row_group_size();

-        let file_writer = SerializedFileWriter::new(
-            writer.try_clone()?,
-            schema.root_schema_ptr(),
-            Arc::new(props),
-        )?;
+        let file_writer =
+            SerializedFileWriter::new(writer, schema.root_schema_ptr(), Arc::new(props))?;

         Ok(Self {
             writer: file_writer,
@@ -605,7 +602,6 @@ mod tests {
         statistics::Statistics,
         writer::InMemoryWriteableCursor,
     };
-    use crate::util::test_common::get_temp_file;

     #[test]
     fn arrow_writer() {
@@ -624,7 +620,7 @@ mod tests {
             RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)])
                 .unwrap();

-        roundtrip("test_arrow_write.parquet", batch, Some(SMALL_SIZE / 2));
+        roundtrip(batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -685,11 +681,7 @@ mod tests {
         // build a record batch
         let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();

-        roundtrip(
-            "test_arrow_writer_non_null.parquet",
-            batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -730,7 +722,7 @@ mod tests {

         // This test fails if the max row group size is less than the batch's length
         // see https://github.com/apache/arrow-rs/issues/518
-        roundtrip("test_arrow_writer_list.parquet", batch, None);
+        roundtrip(batch, None);
     }

     #[test]
@@ -770,7 +762,7 @@ mod tests {
         // see https://github.com/apache/arrow-rs/issues/518
         assert_eq!(batch.column(0).data().null_count(), 0);

-        roundtrip("test_arrow_writer_list_non_null.parquet", batch, None);
+        roundtrip(batch, None);
     }

     #[test]
@@ -799,11 +791,7 @@ mod tests {
         )
         .unwrap();

-        roundtrip(
-            "test_arrow_writer_binary.parquet",
-            batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -822,11 +810,7 @@ mod tests {
             RecordBatch::try_new(Arc::new(schema), vec![Arc::new(decimal_values)])
                 .unwrap();

-        roundtrip(
-            "test_arrow_writer_decimal.parquet",
-            batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -912,17 +896,8 @@ mod tests {
         )
         .unwrap();

-        roundtrip(
-            "test_arrow_writer_complex.parquet",
-            batch.clone(),
-            Some(SMALL_SIZE / 2),
-        );
-
-        roundtrip(
-            "test_arrow_writer_complex_small_batch.parquet",
-            batch,
-            Some(SMALL_SIZE / 3),
-        );
+        roundtrip(batch.clone(), Some(SMALL_SIZE / 2));
+        roundtrip(batch, Some(SMALL_SIZE / 3));
     }

     #[test]
@@ -960,11 +935,7 @@ mod tests {
             RecordBatch::try_new(Arc::new(schema), vec![Arc::new(some_nested_object)])
                 .unwrap();

-        roundtrip(
-            "test_arrow_writer_complex_mixed.parquet",
-            batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -994,7 +965,7 @@ mod tests {
         let mut reader = builder.build(std::io::Cursor::new(json_content)).unwrap();
         let batch = reader.next().unwrap().unwrap();

-        roundtrip("test_arrow_writer_map.parquet", batch, None);
+        roundtrip(batch, None);
     }

     #[test]
@@ -1028,11 +999,7 @@ mod tests {
         // build a racord batch
         let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();

-        roundtrip(
-            "test_arrow_writer_2_level_struct.parquet",
-            batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -1064,11 +1031,7 @@ mod tests {
         // build a racord batch
         let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();

-        roundtrip(
-            "test_arrow_writer_2_level_struct_non_null.parquet",
-            batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -1102,21 +1065,13 @@ mod tests {
         // build a racord batch
         let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap();

-        roundtrip(
-            "test_arrow_writer_2_level_struct_mixed_null.parquet",
-            batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(batch, Some(SMALL_SIZE / 2));
     }

     const SMALL_SIZE: usize = 7;

-    fn roundtrip(
-        filename: &str,
-        expected_batch: RecordBatch,
-        max_row_group_size: Option<usize>,
-    ) -> File {
-        let file = get_temp_file(filename, &[]);
+    fn roundtrip(expected_batch: RecordBatch, max_row_group_size: Option<usize>) -> File {
+        let file = tempfile::tempfile().unwrap();

         let mut writer = ArrowWriter::try_new(
             file.try_clone().unwrap(),
@@ -1154,7 +1109,6 @@ mod tests {
     }

     fn one_column_roundtrip(
-        filename: &str,
         values: ArrayRef,
         nullable: bool,
         max_row_group_size: Option<usize>,
@@ -1167,20 +1121,20 @@ mod tests {
         let expected_batch =
             RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap();

-        roundtrip(filename, expected_batch, max_row_group_size)
+        roundtrip(expected_batch, max_row_group_size)
     }

-    fn values_required<A, I>(iter: I, filename: &str)
+    fn values_required<A, I>(iter: I)
     where
         A: From<Vec<I::Item>> + Array + 'static,
         I: IntoIterator,
     {
         let raw_values: Vec<_> = iter.into_iter().collect();
         let values = Arc::new(A::from(raw_values));
-        one_column_roundtrip(filename, values, false, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2));
     }

-    fn values_optional<A, I>(iter: I, filename: &str)
+    fn values_optional<A, I>(iter: I)
     where
         A: From<Vec<Option<I::Item>>> + Array + 'static,
         I: IntoIterator,
@@ -1191,32 +1145,27 @@ mod tests {
             .map(|(i, v)| if i % 2 == 0 { None } else { Some(v) })
             .collect();
         let optional_values = Arc::new(A::from(optional_raw_values));
-        one_column_roundtrip(filename, optional_values, true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(optional_values, true, Some(SMALL_SIZE / 2));
     }

-    fn required_and_optional<A, I>(iter: I, filename: &str)
+    fn required_and_optional<A, I>(iter: I)
     where
         A: From<Vec<I::Item>> + From<Vec<Option<I::Item>>> + Array + 'static,
         I: IntoIterator + Clone,
     {
-        values_required::<A, I>(iter.clone(), filename);
-        values_optional::<A, I>(iter, filename);
+        values_required::<A, I>(iter.clone());
+        values_optional::<A, I>(iter);
     }

     #[test]
     fn all_null_primitive_single_column() {
         let values = Arc::new(Int32Array::from(vec![None; SMALL_SIZE]));
-        one_column_roundtrip(
-            "all_null_primitive_single_column",
-            values,
-            true,
-            Some(SMALL_SIZE / 2),
-        );
+        one_column_roundtrip(values, true, Some(SMALL_SIZE / 2));
     }

     #[test]
     fn null_single_column() {
         let values = Arc::new(NullArray::new(SMALL_SIZE));
-        one_column_roundtrip("null_single_column", values, true, Some(SMALL_SIZE / 2));
+        one_column_roundtrip(values, true, Some(SMALL_SIZE / 2));
         // null arrays are always nullable, a test with non-nullable nulls fails
     }

@@ -1224,7 +1173,6 @@ mod tests {
     fn bool_single_column() {
         required_and_optional::<BooleanArray, _>(
             [true, false].iter().cycle().copied().take(SMALL_SIZE),
-            "bool_single_column",
         );
     }

@@ -1242,7 +1190,7 @@ mod tests {
             Schema::new(vec![Field::new("col", values.data_type().clone(), true)]);
         let expected_batch =
             RecordBatch::try_new(Arc::new(schema), vec![values]).unwrap();
-        let file = get_temp_file("bool_large_single_column", &[]);
+        let file = tempfile::tempfile().unwrap();

         let mut writer = ArrowWriter::try_new(
             file.try_clone().unwrap(),
@@ -1256,67 +1204,52 @@ mod tests {

     #[test]
     fn i8_single_column() {
-        required_and_optional::<Int8Array, _>(0..SMALL_SIZE as i8, "i8_single_column");
+        required_and_optional::<Int8Array, _>(0..SMALL_SIZE as i8);
     }

     #[test]
     fn i16_single_column() {
-        required_and_optional::<Int16Array, _>(0..SMALL_SIZE as i16, "i16_single_column");
+        required_and_optional::<Int16Array, _>(0..SMALL_SIZE as i16);
     }

     #[test]
     fn i32_single_column() {
-        required_and_optional::<Int32Array, _>(0..SMALL_SIZE as i32, "i32_single_column");
+        required_and_optional::<Int32Array, _>(0..SMALL_SIZE as i32);
     }

     #[test]
     fn i64_single_column() {
-        required_and_optional::<Int64Array, _>(0..SMALL_SIZE as i64, "i64_single_column");
+        required_and_optional::<Int64Array, _>(0..SMALL_SIZE as i64);
     }

     #[test]
     fn u8_single_column() {
-        required_and_optional::<UInt8Array, _>(0..SMALL_SIZE as u8, "u8_single_column");
+        required_and_optional::<UInt8Array, _>(0..SMALL_SIZE as u8);
     }

     #[test]
     fn u16_single_column() {
-        required_and_optional::<UInt16Array, _>(
-            0..SMALL_SIZE as u16,
-            "u16_single_column",
-        );
+        required_and_optional::<UInt16Array, _>(0..SMALL_SIZE as u16);
     }

     #[test]
     fn u32_single_column() {
-        required_and_optional::<UInt32Array, _>(
-            0..SMALL_SIZE as u32,
-            "u32_single_column",
-        );
+        required_and_optional::<UInt32Array, _>(0..SMALL_SIZE as u32);
     }

     #[test]
     fn u64_single_column() {
-        required_and_optional::<UInt64Array, _>(
-            0..SMALL_SIZE as u64,
-            "u64_single_column",
-        );
+        required_and_optional::<UInt64Array, _>(0..SMALL_SIZE as u64);
     }

     #[test]
     fn f32_single_column() {
-        required_and_optional::<Float32Array, _>(
-            (0..SMALL_SIZE).map(|i| i as f32),
-            "f32_single_column",
-        );
+        required_and_optional::<Float32Array, _>((0..SMALL_SIZE).map(|i| i as f32));
     }

     #[test]
     fn f64_single_column() {
-        required_and_optional::<Float64Array, _>(
-            (0..SMALL_SIZE).map(|i| i as f64),
-            "f64_single_column",
-        );
+        required_and_optional::<Float64Array, _>((0..SMALL_SIZE).map(|i| i as f64));
     }

     // The timestamp array types don't implement From<Vec<i64>> because they need the timezone
@@ -1328,7 +1261,7 @@ mod tests {
         let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
         let values = Arc::new(TimestampSecondArray::from_vec(raw_values, None));

-        one_column_roundtrip("timestamp_second_single_column", values, false, Some(3));
+        one_column_roundtrip(values, false, Some(3));
     }

     #[test]
@@ -1336,12 +1269,7 @@ mod tests {
         let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
         let values = Arc::new(TimestampMillisecondArray::from_vec(raw_values, None));

-        one_column_roundtrip(
-            "timestamp_millisecond_single_column",
-            values,
-            false,
-            Some(SMALL_SIZE / 2 + 1),
-        );
+        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2 + 1));
     }

     #[test]
@@ -1349,12 +1277,7 @@ mod tests {
         let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
         let values = Arc::new(TimestampMicrosecondArray::from_vec(raw_values, None));

-        one_column_roundtrip(
-            "timestamp_microsecond_single_column",
-            values,
-            false,
-            Some(SMALL_SIZE / 2 + 2),
-        );
+        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2 + 2));
     }

     #[test]
@@ -1362,20 +1285,12 @@ mod tests {
         let raw_values: Vec<_> = (0..SMALL_SIZE as i64).collect();
         let values = Arc::new(TimestampNanosecondArray::from_vec(raw_values, None));

-        one_column_roundtrip(
-            "timestamp_nanosecond_single_column",
-            values,
-            false,
-            Some(SMALL_SIZE / 2),
-        );
+        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2));
     }

     #[test]
     fn date32_single_column() {
-        required_and_optional::<Date32Array, _>(
-            0..SMALL_SIZE as i32,
-            "date32_single_column",
-        );
+        required_and_optional::<Date32Array, _>(0..SMALL_SIZE as i32);
     }

     #[test]
@@ -1383,92 +1298,61 @@ mod tests {
         // Date64 must be a multiple of 86400000, see ARROW-10925
         required_and_optional::<Date64Array, _>(
             (0..(SMALL_SIZE as i64 * 86400000)).step_by(86400000),
-            "date64_single_column",
         );
     }

     #[test]
     fn time32_second_single_column() {
-        required_and_optional::<Time32SecondArray, _>(
-            0..SMALL_SIZE as i32,
-            "time32_second_single_column",
-        );
+        required_and_optional::<Time32SecondArray, _>(0..SMALL_SIZE as i32);
     }

     #[test]
     fn time32_millisecond_single_column() {
-        required_and_optional::<Time32MillisecondArray, _>(
-            0..SMALL_SIZE as i32,
-            "time32_millisecond_single_column",
-        );
+        required_and_optional::<Time32MillisecondArray, _>(0..SMALL_SIZE as i32);
     }

     #[test]
     fn time64_microsecond_single_column() {
-        required_and_optional::<Time64MicrosecondArray, _>(
-            0..SMALL_SIZE as i64,
-            "time64_microsecond_single_column",
-        );
+        required_and_optional::<Time64MicrosecondArray, _>(0..SMALL_SIZE as i64);
     }

     #[test]
     fn time64_nanosecond_single_column() {
-        required_and_optional::<Time64NanosecondArray, _>(
-            0..SMALL_SIZE as i64,
-            "time64_nanosecond_single_column",
-        );
+        required_and_optional::<Time64NanosecondArray, _>(0..SMALL_SIZE as i64);
     }

     #[test]
     #[should_panic(expected = "Converting Duration to parquet not supported")]
     fn duration_second_single_column() {
-        required_and_optional::<DurationSecondArray, _>(
-            0..SMALL_SIZE as i64,
-            "duration_second_single_column",
-        );
+        required_and_optional::<DurationSecondArray, _>(0..SMALL_SIZE as i64);
     }

     #[test]
     #[should_panic(expected = "Converting Duration to parquet not supported")]
     fn duration_millisecond_single_column() {
-        required_and_optional::<DurationMillisecondArray, _>(
-            0..SMALL_SIZE as i64,
"duration_millisecond_single_column", - ); + required_and_optional::(0..SMALL_SIZE as i64); } #[test] #[should_panic(expected = "Converting Duration to parquet not supported")] fn duration_microsecond_single_column() { - required_and_optional::( - 0..SMALL_SIZE as i64, - "duration_microsecond_single_column", - ); + required_and_optional::(0..SMALL_SIZE as i64); } #[test] #[should_panic(expected = "Converting Duration to parquet not supported")] fn duration_nanosecond_single_column() { - required_and_optional::( - 0..SMALL_SIZE as i64, - "duration_nanosecond_single_column", - ); + required_and_optional::(0..SMALL_SIZE as i64); } #[test] fn interval_year_month_single_column() { - required_and_optional::( - 0..SMALL_SIZE as i32, - "interval_year_month_single_column", - ); + required_and_optional::(0..SMALL_SIZE as i32); } #[test] fn interval_day_time_single_column() { - required_and_optional::( - 0..SMALL_SIZE as i64, - "interval_day_time_single_column", - ); + required_and_optional::(0..SMALL_SIZE as i64); } #[test] @@ -1476,10 +1360,7 @@ mod tests { expected = "Attempting to write an Arrow interval type MonthDayNano to parquet that is not yet implemented" )] fn interval_month_day_nano_single_column() { - required_and_optional::( - 0..SMALL_SIZE as i128, - "interval_month_day_nano_single_column", - ); + required_and_optional::(0..SMALL_SIZE as i128); } #[test] @@ -1489,7 +1370,7 @@ mod tests { let many_vecs_iter = many_vecs.iter().map(|v| v.as_slice()); // BinaryArrays can't be built from Vec>, so only call `values_required` - values_required::(many_vecs_iter, "binary_single_column"); + values_required::(many_vecs_iter); } #[test] @@ -1499,10 +1380,7 @@ mod tests { let many_vecs_iter = many_vecs.iter().map(|v| v.as_slice()); // LargeBinaryArrays can't be built from Vec>, so only call `values_required` - values_required::( - many_vecs_iter, - "large_binary_single_column", - ); + values_required::(many_vecs_iter); } #[test] @@ -1514,12 +1392,7 @@ mod tests { builder.append_value(b"1112").unwrap(); let array = Arc::new(builder.finish()); - one_column_roundtrip( - "fixed_size_binary_single_column", - array, - true, - Some(SMALL_SIZE / 2), - ); + one_column_roundtrip(array, true, Some(SMALL_SIZE / 2)); } #[test] @@ -1527,7 +1400,7 @@ mod tests { let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i| i.to_string()).collect(); let raw_strs = raw_values.iter().map(|s| s.as_str()); - required_and_optional::(raw_strs, "string_single_column"); + required_and_optional::(raw_strs); } #[test] @@ -1535,10 +1408,7 @@ mod tests { let raw_values: Vec<_> = (0..SMALL_SIZE).map(|i| i.to_string()).collect(); let raw_strs = raw_values.iter().map(|s| s.as_str()); - required_and_optional::( - raw_strs, - "large_string_single_column", - ); + required_and_optional::(raw_strs); } #[test] @@ -1563,7 +1433,7 @@ mod tests { let a = ListArray::from(a_list_data); let values = Arc::new(a); - one_column_roundtrip("list_single_column", values, true, Some(SMALL_SIZE / 2)); + one_column_roundtrip(values, true, Some(SMALL_SIZE / 2)); } #[test] @@ -1589,12 +1459,7 @@ mod tests { let a = LargeListArray::from(a_list_data); let values = Arc::new(a); - one_column_roundtrip( - "large_list_single_column", - values, - true, - Some(SMALL_SIZE / 2), - ); + one_column_roundtrip(values, true, Some(SMALL_SIZE / 2)); } #[test] @@ -1604,7 +1469,7 @@ mod tests { let s = StructArray::from(vec![(struct_field_a, Arc::new(a_values) as ArrayRef)]); let values = Arc::new(s); - one_column_roundtrip("struct_single_column", values, false, Some(SMALL_SIZE / 
+        one_column_roundtrip(values, false, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -1627,11 +1492,7 @@ mod tests {
         // build a record batch
         let expected_batch = RecordBatch::try_new(schema, vec![Arc::new(d)]).unwrap();

-        roundtrip(
-            "test_arrow_writer_string_dictionary.parquet",
-            expected_batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(expected_batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -1658,11 +1519,7 @@ mod tests {
         // build a record batch
         let expected_batch = RecordBatch::try_new(schema, vec![Arc::new(d)]).unwrap();

-        roundtrip(
-            "test_arrow_writer_primitive_dictionary.parquet",
-            expected_batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(expected_batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -1685,11 +1542,7 @@ mod tests {
         // build a record batch
         let expected_batch = RecordBatch::try_new(schema, vec![Arc::new(d)]).unwrap();

-        roundtrip(
-            "test_arrow_writer_string_dictionary_unsigned_index.parquet",
-            expected_batch,
-            Some(SMALL_SIZE / 2),
-        );
+        roundtrip(expected_batch, Some(SMALL_SIZE / 2));
     }

     #[test]
@@ -1704,7 +1557,7 @@ mod tests {
             u32::MAX - 1,
             u32::MAX,
         ]));
-        let file = one_column_roundtrip("u32_min_max_single_column", values, false, None);
+        let file = one_column_roundtrip(values, false, None);

         // check statistics are valid
         let reader = SerializedFileReader::new(file).unwrap();
@@ -1735,7 +1588,7 @@ mod tests {
             u64::MAX - 1,
             u64::MAX,
         ]));
-        let file = one_column_roundtrip("u64_min_max_single_column", values, false, None);
+        let file = one_column_roundtrip(values, false, None);

         // check statistics are valid
         let reader = SerializedFileReader::new(file).unwrap();
@@ -1758,7 +1611,7 @@ mod tests {
     fn statistics_null_counts_only_nulls() {
         // check that null-count statistics for "only NULL"-columns are correct
         let values = Arc::new(UInt64Array::from(vec![None, None]));
-        let file = one_column_roundtrip("null_counts", values, true, None);
+        let file = one_column_roundtrip(values, true, None);

         // check statistics are valid
         let reader = SerializedFileReader::new(file).unwrap();
diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs
index 51a7a04aa3c6..198763b2c3d6 100644
--- a/parquet/src/arrow/schema.rs
+++ b/parquet/src/arrow/schema.rs
@@ -1044,7 +1044,6 @@ mod tests {
     use crate::{
         arrow::{ArrowReader, ArrowWriter, ParquetFileArrowReader},
         schema::{parser::parse_message_type, types::SchemaDescriptor},
-        util::test_common::get_temp_file,
     };

     #[test]
@@ -2124,7 +2123,7 @@ mod tests {
         );

         // write to an empty parquet file so that schema is serialized
-        let file = get_temp_file("test_arrow_schema_roundtrip.parquet", &[]);
+        let file = tempfile::tempfile().unwrap();
         let mut writer = ArrowWriter::try_new(
             file.try_clone().unwrap(),
             Arc::new(schema.clone()),
@@ -2195,7 +2194,7 @@ mod tests {
         );

         // write to an empty parquet file so that schema is serialized
-        let file = get_temp_file("test_arrow_schema_roundtrip_lists.parquet", &[]);
+        let file = tempfile::tempfile().unwrap();
         let mut writer = ArrowWriter::try_new(
             file.try_clone().unwrap(),
             Arc::new(schema.clone()),
diff --git a/parquet/src/column/writer.rs b/parquet/src/column/writer.rs
index 162941a54dea..87b25b4d3c70 100644
--- a/parquet/src/column/writer.rs
+++ b/parquet/src/column/writer.rs
@@ -1058,7 +1058,7 @@ mod tests {
     use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType};
     use crate::util::{
         io::{FileSink, FileSource},
-        test_common::{get_temp_file, random_numbers_range},
+        test_common::random_numbers_range,
     };

     use super::*;
@@ -1516,14 +1516,13 @@ mod tests {
     #[test]
     fn test_column_writer_empty_column_roundtrip() {
         let props = WriterProperties::builder().build();
-        column_roundtrip::<Int32Type>("test_col_writer_rnd_1", props, &[], None, None);
+        column_roundtrip::<Int32Type>(props, &[], None, None);
     }

     #[test]
     fn test_column_writer_non_nullable_values_roundtrip() {
         let props = WriterProperties::builder().build();
         column_roundtrip_random::<Int32Type>(
-            "test_col_writer_rnd_2",
             props,
             1024,
             std::i32::MIN,
@@ -1537,7 +1536,6 @@ mod tests {
     fn test_column_writer_nullable_non_repeated_values_roundtrip() {
         let props = WriterProperties::builder().build();
         column_roundtrip_random::<Int32Type>(
-            "test_column_writer_nullable_non_repeated_values_roundtrip",
             props,
             1024,
             std::i32::MIN,
@@ -1551,7 +1549,6 @@ mod tests {
     fn test_column_writer_nullable_repeated_values_roundtrip() {
         let props = WriterProperties::builder().build();
         column_roundtrip_random::<Int32Type>(
-            "test_col_writer_rnd_3",
             props,
             1024,
             std::i32::MIN,
@@ -1568,7 +1565,6 @@ mod tests {
             .set_data_pagesize_limit(32)
             .build();
         column_roundtrip_random::<Int32Type>(
-            "test_col_writer_rnd_4",
             props,
             1024,
             std::i32::MIN,
@@ -1584,7 +1580,6 @@ mod tests {
             let props =
                 WriterProperties::builder().set_write_batch_size(*i).build();
             column_roundtrip_random::<Int32Type>(
-                "test_col_writer_rnd_5",
                 props,
                 1024,
                 std::i32::MIN,
@@ -1602,7 +1597,6 @@ mod tests {
             .set_dictionary_enabled(false)
             .build();
         column_roundtrip_random::<Int32Type>(
-            "test_col_writer_rnd_6",
             props,
             1024,
             std::i32::MIN,
@@ -1619,7 +1613,6 @@ mod tests {
             .set_dictionary_enabled(false)
             .build();
         column_roundtrip_random::<Int32Type>(
-            "test_col_writer_rnd_7",
             props,
             1024,
             std::i32::MIN,
@@ -1636,7 +1629,6 @@ mod tests {
             .set_compression(Compression::SNAPPY)
             .build();
         column_roundtrip_random::<Int32Type>(
-            "test_col_writer_rnd_8",
             props,
             2048,
             std::i32::MIN,
@@ -1653,7 +1645,6 @@ mod tests {
             .set_compression(Compression::SNAPPY)
             .build();
         column_roundtrip_random::<Int32Type>(
-            "test_col_writer_rnd_9",
             props,
             2048,
             std::i32::MIN,
@@ -1667,7 +1658,7 @@ mod tests {
     fn test_column_writer_add_data_pages_with_dict() {
         // ARROW-5129: Test verifies that we add data page in case of dictionary encoding
         // and no fallback occurred so far.
-        let file = get_temp_file("test_column_writer_add_data_pages_with_dict", &[]);
+        let file = tempfile::tempfile().unwrap();
         let sink = FileSink::new(&file);
         let page_writer = Box::new(SerializedPageWriter::new(sink));
         let props = Arc::new(
@@ -1894,7 +1885,6 @@ mod tests {
     /// `max_size` is maximum number of values or levels (if `max_def_level` > 0) to write
     /// for a column.
     fn column_roundtrip_random<T: DataType>(
-        file_name: &str,
         props: WriterProperties,
         max_size: usize,
         min_value: T::T,
@@ -1931,18 +1921,17 @@ mod tests {
         let mut values: Vec<T::T> = Vec::new();
         random_numbers_range(num_values, min_value, max_value, &mut values);

-        column_roundtrip::<T>(file_name, props, &values[..], def_levels, rep_levels);
+        column_roundtrip::<T>(props, &values[..], def_levels, rep_levels);
     }

     /// Performs write-read roundtrip and asserts written values and levels.
-    fn column_roundtrip<'a, T: DataType>(
-        file_name: &'a str,
+    fn column_roundtrip<T: DataType>(
         props: WriterProperties,
         values: &[T::T],
         def_levels: Option<&[i16]>,
         rep_levels: Option<&[i16]>,
     ) {
-        let file = get_temp_file(file_name, &[]);
+        let file = tempfile::tempfile().unwrap();
         let sink = FileSink::new(&file);
         let page_writer = Box::new(SerializedPageWriter::new(sink));
diff --git a/parquet/src/file/footer.rs b/parquet/src/file/footer.rs
index 2e572944868b..48b013f478e7 100644
--- a/parquet/src/file/footer.rs
+++ b/parquet/src/file/footer.rs
@@ -160,12 +160,12 @@ mod tests {
     use crate::basic::SortOrder;
     use crate::basic::Type;
     use crate::schema::types::Type as SchemaType;
-    use crate::util::test_common::get_temp_file;
+    use crate::util::cursor::SliceableCursor;
     use parquet_format::TypeDefinedOrder;

     #[test]
     fn test_parse_metadata_size_smaller_than_footer() {
-        let test_file = get_temp_file("corrupt-1.parquet", &[]);
+        let test_file = tempfile::tempfile().unwrap();
         let reader_result = parse_metadata(&test_file);
         assert!(reader_result.is_err());
         assert_eq!(
@@ -176,8 +176,8 @@ mod tests {

     #[test]
     fn test_parse_metadata_corrupt_footer() {
-        let test_file = get_temp_file("corrupt-2.parquet", &[1, 2, 3, 4, 5, 6, 7, 8]);
-        let reader_result = parse_metadata(&test_file);
+        let data = SliceableCursor::new(Arc::new(vec![1, 2, 3, 4, 5, 6, 7, 8]));
+        let reader_result = parse_metadata(&data);
         assert!(reader_result.is_err());
         assert_eq!(
             reader_result.err().unwrap(),
@@ -188,7 +188,7 @@ mod tests {
     #[test]
     fn test_parse_metadata_invalid_length() {
         let test_file =
-            get_temp_file("corrupt-3.parquet", &[0, 0, 0, 255, b'P', b'A', b'R', b'1']);
+            SliceableCursor::new(Arc::new(vec![0, 0, 0, 255, b'P', b'A', b'R', b'1']));
         let reader_result = parse_metadata(&test_file);
         assert!(reader_result.is_err());
         assert_eq!(
@@ -202,7 +202,7 @@ mod tests {
     #[test]
     fn test_parse_metadata_invalid_start() {
         let test_file =
-            get_temp_file("corrupt-4.parquet", &[255, 0, 0, 0, b'P', b'A', b'R', b'1']);
+            SliceableCursor::new(Arc::new(vec![255, 0, 0, 0, b'P', b'A', b'R', b'1']));
         let reader_result = parse_metadata(&test_file);
         assert!(reader_result.is_err());
         assert_eq!(
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index bec2cf141795..86e4d8d7379f 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -550,11 +550,11 @@ mod tests {
         statistics::{from_thrift, to_thrift, Statistics},
     };
     use crate::record::RowAccessor;
-    use crate::util::{memory::ByteBufferPtr, test_common::get_temp_file};
+    use crate::util::memory::ByteBufferPtr;

     #[test]
     fn test_file_writer_error_after_close() {
-        let file = get_temp_file("test_file_writer_error_after_close", &[]);
+        let file = tempfile::tempfile().unwrap();
         let schema = Arc::new(types::Type::group_type_builder("schema").build().unwrap());
         let props = Arc::new(WriterProperties::builder().build());
         let mut writer = SerializedFileWriter::new(file, schema, props).unwrap();
@@ -577,7 +577,7 @@ mod tests {

     #[test]
     fn test_row_group_writer_error_after_close() {
-        let file = get_temp_file("test_file_writer_row_group_error_after_close", &[]);
+        let file = tempfile::tempfile().unwrap();
         let schema = Arc::new(types::Type::group_type_builder("schema").build().unwrap());
         let props = Arc::new(WriterProperties::builder().build());
         let mut writer = SerializedFileWriter::new(file, schema, props).unwrap();
@@ -596,8 +596,7 @@ mod tests {

     #[test]
     fn test_row_group_writer_error_not_all_columns_written() {
-        let file =
get_temp_file("test_row_group_writer_error_not_all_columns_written", &[]); + let file = tempfile::tempfile().unwrap(); let schema = Arc::new( types::Type::group_type_builder("schema") .with_fields(&mut vec![Arc::new( @@ -623,7 +622,7 @@ mod tests { #[test] fn test_row_group_writer_num_records_mismatch() { - let file = get_temp_file("test_row_group_writer_num_records_mismatch", &[]); + let file = tempfile::tempfile().unwrap(); let schema = Arc::new( types::Type::group_type_builder("schema") .with_fields(&mut vec![ @@ -670,7 +669,7 @@ mod tests { #[test] fn test_file_writer_empty_file() { - let file = get_temp_file("test_file_writer_write_empty_file", &[]); + let file = tempfile::tempfile().unwrap(); let schema = Arc::new( types::Type::group_type_builder("schema") @@ -693,7 +692,7 @@ mod tests { #[test] fn test_file_writer_with_metadata() { - let file = get_temp_file("test_file_writer_write_with_metadata", &[]); + let file = tempfile::tempfile().unwrap(); let schema = Arc::new( types::Type::group_type_builder("schema") @@ -732,7 +731,7 @@ mod tests { #[test] fn test_file_writer_v2_with_metadata() { - let file = get_temp_file("test_file_writer_v2_write_with_metadata", &[]); + let file = tempfile::tempfile().unwrap(); let field_logical_type = Some(LogicalType::INTEGER(IntType { bit_width: 8, is_signed: false, @@ -785,19 +784,19 @@ mod tests { #[test] fn test_file_writer_empty_row_groups() { - let file = get_temp_file("test_file_writer_write_empty_row_groups", &[]); + let file = tempfile::tempfile().unwrap(); test_file_roundtrip(file, vec![]); } #[test] fn test_file_writer_single_row_group() { - let file = get_temp_file("test_file_writer_write_single_row_group", &[]); + let file = tempfile::tempfile().unwrap(); test_file_roundtrip(file, vec![vec![1, 2, 3, 4, 5]]); } #[test] fn test_file_writer_multiple_row_groups() { - let file = get_temp_file("test_file_writer_write_multiple_row_groups", &[]); + let file = tempfile::tempfile().unwrap(); test_file_roundtrip( file, vec![ @@ -811,7 +810,7 @@ mod tests { #[test] fn test_file_writer_multiple_large_row_groups() { - let file = get_temp_file("test_file_writer_multiple_large_row_groups", &[]); + let file = tempfile::tempfile().unwrap(); test_file_roundtrip( file, vec![vec![123; 1024], vec![124; 1000], vec![125; 15], vec![]], diff --git a/parquet/src/util/io.rs b/parquet/src/util/io.rs index 44e99ac0a779..c10e6d6161b5 100644 --- a/parquet/src/util/io.rs +++ b/parquet/src/util/io.rs @@ -207,7 +207,7 @@ mod tests { use std::iter; - use crate::util::test_common::{get_temp_file, get_test_file}; + use crate::util::test_common::get_test_file; #[test] fn test_io_read_fully() { @@ -272,8 +272,8 @@ mod tests { #[test] fn test_io_write_with_pos() { - let mut file = get_temp_file("file_sink_test", &[b'a', b'b', b'c']); - file.seek(SeekFrom::Current(3)).unwrap(); + let mut file = tempfile::tempfile().unwrap(); + file.write_all(&[b'a', b'b', b'c']).unwrap(); // Write into sink let mut sink = FileSink::new(&file); @@ -300,8 +300,9 @@ mod tests { .flatten() .take(3 * DEFAULT_BUF_SIZE) .collect(); - // always use different temp files as test might be run in parallel - let mut file = get_temp_file("large_file_sink_test", &patterned_data); + + let mut file = tempfile::tempfile().unwrap(); + file.write_all(&patterned_data).unwrap(); // seek the underlying file to the first 'd' file.seek(SeekFrom::Start(3)).unwrap(); diff --git a/parquet/src/util/test_common/file_util.rs b/parquet/src/util/test_common/file_util.rs index 7393b55f1ed2..c2dcd677360d 100644 --- 
--- a/parquet/src/util/test_common/file_util.rs
+++ b/parquet/src/util/test_common/file_util.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.

-use std::{env, fs, io::Write, path::PathBuf, str::FromStr};
+use std::{fs, path::PathBuf, str::FromStr};

 /// Returns path to the test parquet file in 'data' directory
 pub fn get_test_path(file_name: &str) -> PathBuf {
@@ -36,38 +36,3 @@ pub fn get_test_file(file_name: &str) -> fs::File {
         )
     })
 }
-
-/// Returns file handle for a temp file in 'target' directory with a provided content
-pub fn get_temp_file(file_name: &str, content: &[u8]) -> fs::File {
-    // build tmp path to a file in "target/debug/testdata"
-    let mut path_buf = env::current_dir().unwrap();
-    path_buf.push("target");
-    path_buf.push("debug");
-    path_buf.push("testdata");
-    fs::create_dir_all(&path_buf).unwrap();
-    path_buf.push(file_name);
-
-    // write file content
-    let mut tmp_file = fs::File::create(path_buf.as_path()).unwrap();
-    tmp_file.write_all(content).unwrap();
-    tmp_file.sync_all().unwrap();
-
-    // return file handle for both read and write
-    let file = fs::OpenOptions::new()
-        .read(true)
-        .write(true)
-        .open(path_buf.as_path());
-    assert!(file.is_ok());
-    file.unwrap()
-}
-
-pub fn get_temp_filename() -> PathBuf {
-    let mut path_buf = env::current_dir().unwrap();
-    path_buf.push("target");
-    path_buf.push("debug");
-    path_buf.push("testdata");
-    fs::create_dir_all(&path_buf).unwrap();
-    path_buf.push(rand::random::<i16>().to_string());
-
-    path_buf
-}
diff --git a/parquet/src/util/test_common/mod.rs b/parquet/src/util/test_common/mod.rs
index ed65bbe8a820..f0beb16ca954 100644
--- a/parquet/src/util/test_common/mod.rs
+++ b/parquet/src/util/test_common/mod.rs
@@ -25,8 +25,6 @@ pub use self::rand_gen::random_numbers;
 pub use self::rand_gen::random_numbers_range;
 pub use self::rand_gen::RandGen;

-pub use self::file_util::get_temp_file;
-pub use self::file_util::get_temp_filename;
 pub use self::file_util::get_test_file;
 pub use self::file_util::get_test_path;
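
Illustrative sketch (not part of the patch): every converted test follows the same pattern — create an anonymous temp file with `tempfile::tempfile()`, hand a writer an independent handle via `try_clone()`, then `rewind()` the shared cursor before reading back. A minimal standalone example, assuming only `std` plus the `tempfile` dev-dependency added above; the `PAR1` bytes are stand-in content, not real parquet data:

    use std::io::{Read, Seek, Write};

    fn main() -> std::io::Result<()> {
        // Anonymous temp file: it has no path and is cleaned up by the OS
        // when the last handle is dropped, so parallel tests cannot collide.
        let mut file = tempfile::tempfile()?;

        // A writer that takes ownership gets its own handle via try_clone();
        // both handles share the same underlying file and cursor position.
        let mut writer = file.try_clone()?;
        writer.write_all(b"PAR1 stand-in contents PAR1")?;

        // Rewind the shared cursor before handing the file to a reader,
        // mirroring the `file.rewind().unwrap()` calls in the patch.
        file.rewind()?;

        let mut contents = Vec::new();
        file.read_to_end(&mut contents)?;
        assert_eq!(&contents[..4], b"PAR1");
        Ok(())
    }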