Commit

Standardise .parquet suffix in docs and tests (#4254)
Signed-off-by: G. D. McBain <[email protected]>
Co-authored-by: Deepyaman Datta <[email protected]>
gdmcbain and deepyaman authored Oct 25, 2024
1 parent c2d7100 commit cbde71f
Showing 6 changed files with 26 additions and 25 deletions.
1 change: 1 addition & 0 deletions RELEASE.md
@@ -11,6 +11,7 @@
## Breaking changes to the API
## Documentation changes
* Updated CLI autocompletion docs with new Click syntax.
+* Standardised `.parquet` suffix in docs and tests.

## Community contributions
* [Hyewon Choi](https://github.com/hyew0nChoi)
26 changes: 13 additions & 13 deletions docs/source/data/kedro_dataset_factories.md
@@ -164,21 +164,21 @@ entries share `type`, `file_format` and `save_args`:
```yaml
processing.factory_data:
  type: spark.SparkDataset
-  filepath: data/processing/factory_data.pq
+  filepath: data/processing/factory_data.parquet
  file_format: parquet
  save_args:
    mode: overwrite

processing.process_data:
  type: spark.SparkDataset
-  filepath: data/processing/process_data.pq
+  filepath: data/processing/process_data.parquet
  file_format: parquet
  save_args:
    mode: overwrite

modelling.metrics:
  type: spark.SparkDataset
-  filepath: data/modelling/factory_data.pq
+  filepath: data/modelling/factory_data.parquet
  file_format: parquet
  save_args:
    mode: overwrite
@@ -189,7 +189,7 @@ This could be generalised to the following pattern:
```yaml
"{layer}.{dataset_name}":
  type: spark.SparkDataset
-  filepath: data/{layer}/{dataset_name}.pq
+  filepath: data/{layer}/{dataset_name}.parquet
  file_format: parquet
  save_args:
    mode: overwrite
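As context for the pattern above: when a pipeline references a concrete dataset name such as `processing.factory_data`, Kedro fills the `{placeholder}` fields from the name and substitutes them into `filepath`. A rough sketch of that resolution, assuming the third-party `parse` package (which Kedro's factory matching is also built on); illustrative only, not Kedro's implementation:

```python
from parse import parse  # pip install parse

# Match a concrete dataset name against the factory pattern.
pattern = "{layer}.{dataset_name}"
match = parse(pattern, "processing.factory_data")
assert match is not None  # the name fits the pattern

# Substitute the captured fields into the filepath template.
filepath = "data/{layer}/{dataset_name}.parquet".format(**match.named)
print(filepath)  # -> data/processing/factory_data.parquet
```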
@@ -202,7 +202,7 @@ You can have multiple dataset factories in your catalog. For example:
```yaml
"{namespace}.{dataset_name}@spark":
  type: spark.SparkDataset
-  filepath: data/{namespace}/{dataset_name}.pq
+  filepath: data/{namespace}/{dataset_name}.parquet
  file_format: parquet

"{dataset_name}@csv":
@@ -255,19 +255,19 @@ Consider a catalog file with the following patterns:
"preprocessed_{dataset_name}":
type: pandas.ParquetDataset
filepath: data/02_intermediate/preprocessed_{dataset_name}.pq
filepath: data/02_intermediate/preprocessed_{dataset_name}.parquet
"processed_{dataset_name}":
type: pandas.ParquetDataset
filepath: data/03_primary/processed_{dataset_name}.pq
filepath: data/03_primary/processed_{dataset_name}.parquet
"{dataset_name}_csv":
type: pandas.CSVDataset
filepath: data/03_primary/{dataset_name}.csv
"{namespace}.{dataset_name}_pq":
type: pandas.ParquetDataset
filepath: data/03_primary/{dataset_name}_{namespace}.pq
filepath: data/03_primary/{dataset_name}_{namespace}.parquet
"{default_dataset}":
type: pickle.PickleDataset
@@ -315,11 +315,11 @@ shuttles:
"preprocessed_{name}":
type: pandas.ParquetDataset
filepath: data/02_intermediate/preprocessed_{name}.pq
filepath: data/02_intermediate/preprocessed_{name}.parquet
"{default}":
type: pandas.ParquetDataset
filepath: data/03_primary/{default}.pq
filepath: data/03_primary/{default}.parquet
```
</details>
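A note on precedence for the two patterns above: a name like `preprocessed_shuttles` matches both `preprocessed_{name}` and the catch-all `{default}`, and Kedro picks the more specific pattern. A toy sketch of that specificity rule (an approximation for illustration, not Kedro's exact ordering, which also breaks ties on placeholder counts):

```python
import re

def specificity(pattern: str) -> int:
    """Count literal characters, i.e. everything outside {placeholders}."""
    return len(re.sub(r"\{[^}]*\}", "", pattern))

patterns = ["{default}", "preprocessed_{name}"]
# The pattern with more literal text wins for a name matching both.
print(max(patterns, key=specificity))  # -> preprocessed_{name}
```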

@@ -365,13 +365,13 @@ companies:
  filepath: data/01_raw/companies.csv
  type: pandas.CSVDataset
model_input_table:
-  filepath: data/03_primary/model_input_table.pq
+  filepath: data/03_primary/model_input_table.parquet
  type: pandas.ParquetDataset
preprocessed_companies:
-  filepath: data/02_intermediate/preprocessed_companies.pq
+  filepath: data/02_intermediate/preprocessed_companies.parquet
  type: pandas.ParquetDataset
preprocessed_shuttles:
-  filepath: data/02_intermediate/preprocessed_shuttles.pq
+  filepath: data/02_intermediate/preprocessed_shuttles.parquet
  type: pandas.ParquetDataset
reviews:
  filepath: data/01_raw/reviews.csv
2 changes: 1 addition & 1 deletion docs/source/integrations/mlflow.md
@@ -195,7 +195,7 @@ For that, you can make use of {ref}`runtime parameters <runtime-params>`:
# Add the intermediate datasets to run only the inference
X_test:
  type: pandas.ParquetDataset
-  filepath: data/05_model_input/X_test.pq
+  filepath: data/05_model_input/X_test.parquet

y_test:
  type: pandas.CSVDataset # https://github.com/pandas-dev/pandas/issues/54638
6 changes: 3 additions & 3 deletions docs/source/tutorial/create_a_pipeline.md
@@ -200,11 +200,11 @@ Each of the nodes outputs a new dataset (`preprocessed_companies` and `preproces
```yaml
preprocessed_companies:
  type: pandas.ParquetDataset
-  filepath: data/02_intermediate/preprocessed_companies.pq
+  filepath: data/02_intermediate/preprocessed_companies.parquet

preprocessed_shuttles:
  type: pandas.ParquetDataset
-  filepath: data/02_intermediate/preprocessed_shuttles.pq
+  filepath: data/02_intermediate/preprocessed_shuttles.parquet
```
</details>
@@ -290,7 +290,7 @@ The following entry in `conf/base/catalog.yml` saves the model input table datas
```yaml
model_input_table:
  type: pandas.ParquetDataset
-  filepath: data/03_primary/model_input_table.pq
+  filepath: data/03_primary/model_input_table.parquet
```

## Test the example again
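One observation on the tutorial entries above: the extension is purely a naming convention; `pandas.ParquetDataset` writes and reads Parquet because of its `type`, not because of the suffix, so existing `.pq` files keep working and the rename is for consistency only. A minimal sketch of the round trip, assuming `kedro-datasets` and `pyarrow` are installed and the tutorial's directory layout:

```python
from pathlib import Path

import pandas as pd
from kedro_datasets.pandas import ParquetDataset

# Ensure the tutorial's target directory exists for this standalone demo.
Path("data/03_primary").mkdir(parents=True, exist_ok=True)

dataset = ParquetDataset(filepath="data/03_primary/model_input_table.parquet")
dataset.save(pd.DataFrame({"id": [1, 2], "price": [9.99, 24.50]}))
print(dataset.load())  # round-trips the same DataFrame
```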
10 changes: 5 additions & 5 deletions tests/framework/cli/test_catalog.py
@@ -38,7 +38,7 @@ def fake_catalog_config():
    config = {
        "parquet_{factory_pattern}": {
            "type": "pandas.ParquetDataset",
-            "filepath": "data/01_raw/{factory_pattern}.pq",
+            "filepath": "data/01_raw/{factory_pattern}.parquet",
            "credentials": "db_connection",
        },
        "csv_{factory_pattern}": {
@@ -55,7 +55,7 @@ def fake_catalog_config_resolved():
    config = {
        "parquet_example": {
            "type": "pandas.ParquetDataset",
-            "filepath": "data/01_raw/example.pq",
+            "filepath": "data/01_raw/example.parquet",
            "credentials": {"con": "foo"},
        },
        "csv_example": {
@@ -99,7 +99,7 @@ def fake_catalog_config_with_factories(fake_metadata):
    config = {
        "parquet_{factory_pattern}": {
            "type": "pandas.ParquetDataset",
-            "filepath": "data/01_raw/{factory_pattern}.pq",
+            "filepath": "data/01_raw/{factory_pattern}.parquet",
        },
        "csv_{factory_pattern}": {
            "type": "pandas.CSVDataset",
@@ -108,7 +108,7 @@
"explicit_ds": {"type": "pandas.CSVDataset", "filepath": "test.csv"},
"{factory_pattern}_ds": {
"type": "pandas.ParquetDataset",
"filepath": "data/01_raw/{factory_pattern}_ds.pq",
"filepath": "data/01_raw/{factory_pattern}_ds.parquet",
},
"partitioned_{factory_pattern}": {
"type": "partitions.PartitionedDataset",
@@ -129,7 +129,7 @@ def fake_catalog_config_with_factories_resolved():
    config = {
        "parquet_example": {
            "type": "pandas.ParquetDataset",
-            "filepath": "data/01_raw/example.pq",
+            "filepath": "data/01_raw/example.parquet",
        },
        "csv_example": {
            "type": "pandas.CSVDataset",
6 changes: 3 additions & 3 deletions tests/io/test_data_catalog.py
@@ -38,7 +38,7 @@ def config_with_dataset_factories():
            },
            "audi_cars": {
                "type": "pandas.ParquetDataset",
-                "filepath": "data/01_raw/audi_cars.pq",
+                "filepath": "data/01_raw/audi_cars.parquet",
            },
            "{type}_boats": {
                "type": "pandas.CSVDataset",
@@ -84,7 +84,7 @@ def config_with_dataset_factories_with_default(config_with_dataset_factories):
def config_with_dataset_factories_bad_pattern(config_with_dataset_factories):
    config_with_dataset_factories["catalog"]["{type}@planes"] = {
        "type": "pandas.ParquetDataset",
-        "filepath": "data/01_raw/{brand}_plane.pq",
+        "filepath": "data/01_raw/{brand}_plane.parquet",
    }
    return config_with_dataset_factories

@@ -95,7 +95,7 @@ def config_with_dataset_factories_only_patterns():
"catalog": {
"{namespace}_{dataset}": {
"type": "pandas.CSVDataset",
"filepath": "data/01_raw/{namespace}_{dataset}.pq",
"filepath": "data/01_raw/{namespace}_{dataset}.parquet",
},
"{country}_companies": {
"type": "pandas.CSVDataset",
