feat(connector): introduce azblob file scan #20046

Open · wants to merge 18 commits into base: main
Changes from 12 commits
31 changes: 31 additions & 0 deletions proto/batch_plan.proto
@@ -88,6 +88,8 @@ message FileScanNode {
  enum StorageType {
    STORAGE_TYPE_UNSPECIFIED = 0;
    S3 = 1;
    GCS = 2;
    AZBLOB = 3;
  }

  repeated plan_common.ColumnDesc columns = 1;
@@ -99,6 +101,33 @@
  repeated string file_location = 7;
}

message GcsFileScanNode {
  enum FileFormat {
    FILE_FORMAT_UNSPECIFIED = 0;
    PARQUET = 1;
  }

  repeated plan_common.ColumnDesc columns = 1;
  FileFormat file_format = 2;
  string credential = 3;
  repeated string file_location = 4;
}

message AzblobFileScanNode {
  enum FileFormat {
    FILE_FORMAT_UNSPECIFIED = 0;
    PARQUET = 1;
  }

  repeated plan_common.ColumnDesc columns = 1;
  FileFormat file_format = 2;
  string account_name = 3;
  string account_key = 4;
  string endpoint = 5;

  repeated string file_location = 6;
}

// NOTE(kwannoel): This will only be used in batch mode. We can change the definition as needed.
message PostgresQueryNode {
  repeated plan_common.ColumnDesc columns = 1;
@@ -405,6 +434,8 @@ message PlanNode {
    IcebergScanNode iceberg_scan = 39;
    PostgresQueryNode postgres_query = 40;
    MySqlQueryNode mysql_query = 41;
    GcsFileScanNode gcs_file_scan = 42;
    AzblobFileScanNode azblob_file_scan = 43;
    // The following nodes are used for testing.
    bool block_executor = 100;
    bool busy_loop_executor = 101;
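For orientation, here is a minimal sketch of how the new `AzblobFileScanNode` message could be populated from Rust via the prost-generated types. The module path `azblob_file_scan_node` follows prost's usual convention for nested enums, and every value is a placeholder; none of this code is taken from the PR itself.

```rust
use risingwave_pb::batch_plan::azblob_file_scan_node::FileFormat;
use risingwave_pb::batch_plan::AzblobFileScanNode;

// Illustrative only: a frontend would fill `columns` with the ColumnDescs of the
// scanned schema and pass real credentials; the values below are placeholders.
fn example_azblob_scan_node() -> AzblobFileScanNode {
    AzblobFileScanNode {
        columns: vec![],
        file_format: FileFormat::Parquet as i32,
        account_name: "my_account".to_owned(),
        account_key: "<redacted>".to_owned(),
        endpoint: "https://my_account.blob.core.windows.net".to_owned(),
        file_location: vec!["azblob://my-container/path/to/data.parquet".to_owned()],
    }
}
```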
6 changes: 6 additions & 0 deletions src/batch/executors/src/executor.rs
@@ -15,9 +15,11 @@
pub use risingwave_batch::executor::*;

pub mod aggregation;
mod azblob_file_scan;
mod delete;
mod expand;
mod filter;
mod gcs_file_scan;
mod generic_exchange;
mod group_top_n;
mod hash_agg;
@@ -49,9 +51,11 @@ mod update;
mod utils;
mod values;

use azblob_file_scan::AzblobFileScanExecutorBuilder;
pub use delete::*;
pub use expand::*;
pub use filter::*;
use gcs_file_scan::GcsFileScanExecutorBuilder;
pub use generic_exchange::*;
pub use group_top_n::*;
pub use hash_agg::*;
@@ -112,6 +116,8 @@ register_executor!(Source, SourceExecutor);
register_executor!(SortOverWindow, SortOverWindowExecutor);
register_executor!(MaxOneRow, MaxOneRowExecutor);
register_executor!(FileScan, FileScanExecutorBuilder);
register_executor!(GcsFileScan, GcsFileScanExecutorBuilder);
register_executor!(AzblobFileScan, AzblobFileScanExecutorBuilder);
register_executor!(IcebergScan, IcebergScanExecutorBuilder);
register_executor!(PostgresQuery, PostgresQueryExecutorBuilder);
register_executor!(MysqlQuery, MySqlQueryExecutorBuilder);
132 changes: 132 additions & 0 deletions src/batch/executors/src/executor/azblob_file_scan.rs
@@ -0,0 +1,132 @@
// Copyright 2025 RisingWave Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use futures_async_stream::try_stream;
use futures_util::stream::StreamExt;
use risingwave_common::array::DataChunk;
use risingwave_common::catalog::{Field, Schema};
use risingwave_connector::source::iceberg::{
    extract_bucket_and_file_name, new_azblob_operator, read_parquet_file, FileScanBackend,
};
use risingwave_pb::batch_plan::file_scan_node;
use risingwave_pb::batch_plan::plan_node::NodeBody;

use crate::error::BatchError;
use crate::executor::{BoxedExecutor, BoxedExecutorBuilder, Executor, ExecutorBuilder};

#[derive(PartialEq, Debug)]
pub enum FileFormat {
    Parquet,
}

/// Azblob file scan executor. Currently only supports the Parquet file format.
pub struct AzblobFileScanExecutor {
    file_format: FileFormat,
    file_location: Vec<String>,
    account_name: String,
    account_key: String,
    endpoint: String,
    batch_size: usize,
    schema: Schema,
    identity: String,
}

impl Executor for AzblobFileScanExecutor {
    fn schema(&self) -> &risingwave_common::catalog::Schema {
        &self.schema
    }

    fn identity(&self) -> &str {
        &self.identity
    }

    fn execute(self: Box<Self>) -> super::BoxedDataChunkStream {
        self.do_execute().boxed()
    }
}

impl AzblobFileScanExecutor {
    pub fn new(
        file_format: FileFormat,
        file_location: Vec<String>,
        account_name: String,
        account_key: String,
        endpoint: String,
        batch_size: usize,
        schema: Schema,
        identity: String,
    ) -> Self {
        Self {
            file_format,
            file_location,
            account_name,
            account_key,
            endpoint,
            batch_size,
            schema,
            identity,
        }
    }

    #[try_stream(ok = DataChunk, error = BatchError)]
    async fn do_execute(self: Box<Self>) {
        assert_eq!(self.file_format, FileFormat::Parquet);
        for file in self.file_location {
            let (bucket, file_name) =
                extract_bucket_and_file_name(&file, &FileScanBackend::Azblob)?;
            let op = new_azblob_operator(
                self.account_name.clone(),
                self.account_key.clone(),
                self.endpoint.clone(),
                bucket.clone(),
            )?;
            let chunk_stream =
                read_parquet_file(op, file_name, None, None, self.batch_size, 0).await?;
            #[for_await]
            for stream_chunk in chunk_stream {
                let stream_chunk = stream_chunk?;
                let (data_chunk, _) = stream_chunk.into_parts();
                yield data_chunk;
            }
        }
    }
}

pub struct AzblobFileScanExecutorBuilder {}

impl BoxedExecutorBuilder for AzblobFileScanExecutorBuilder {
    async fn new_boxed_executor(
        source: &ExecutorBuilder<'_>,
        _inputs: Vec<BoxedExecutor>,
    ) -> crate::error::Result<BoxedExecutor> {
        let file_scan_node = try_match_expand!(
            source.plan_node().get_node_body().unwrap(),
            NodeBody::AzblobFileScan
        )?;

        Ok(Box::new(AzblobFileScanExecutor::new(
            match file_scan_node::FileFormat::try_from(file_scan_node.file_format).unwrap() {
                file_scan_node::FileFormat::Parquet => FileFormat::Parquet,
                file_scan_node::FileFormat::Unspecified => unreachable!(),
            },
            file_scan_node.file_location.clone(),
            file_scan_node.account_name.clone(),
            file_scan_node.account_key.clone(),
            file_scan_node.endpoint.clone(),
            source.context().get_config().developer.chunk_size,
            Schema::from_iter(file_scan_node.columns.iter().map(Field::from)),
            source.plan_node().get_identity().clone(),
        )))
    }
}
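`new_azblob_operator` lives in `risingwave_connector::source::iceberg` and is not part of this diff. As a rough illustration only, a helper like it could be built on opendal's Azblob service along these lines (builder method names and ownership vary across opendal versions; this is an assumption, not the PR's code):

```rust
// Hypothetical sketch (not the PR's implementation): building an opendal Operator
// for Azure Blob Storage from the four connection properties the executor carries.
use opendal::layers::RetryLayer;
use opendal::services::Azblob;
use opendal::Operator;

pub fn new_azblob_operator_sketch(
    account_name: String,
    account_key: String,
    endpoint: String,
    container: String,
) -> opendal::Result<Operator> {
    // These builder methods exist in recent opendal releases; exact signatures
    // (consuming vs. &mut) differ between versions.
    let builder = Azblob::default()
        .container(&container)
        .endpoint(&endpoint)
        .account_name(&account_name)
        .account_key(&account_key);
    Ok(Operator::new(builder)?
        .layer(RetryLayer::default())
        .finish())
}
```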
118 changes: 118 additions & 0 deletions src/batch/executors/src/executor/gcs_file_scan.rs
@@ -0,0 +1,118 @@
// Copyright 2025 RisingWave Labs
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use futures_async_stream::try_stream;
use futures_util::stream::StreamExt;
use risingwave_common::array::DataChunk;
use risingwave_common::catalog::{Field, Schema};
use risingwave_connector::source::iceberg::{
    extract_bucket_and_file_name, new_gcs_operator, read_parquet_file, FileScanBackend,
};
use risingwave_pb::batch_plan::file_scan_node;
use risingwave_pb::batch_plan::plan_node::NodeBody;

use crate::error::BatchError;
use crate::executor::{BoxedExecutor, BoxedExecutorBuilder, Executor, ExecutorBuilder};

#[derive(PartialEq, Debug)]
pub enum FileFormat {
    Parquet,
}

/// Gcs file scan executor. Currently only supports the Parquet file format.
pub struct GcsFileScanExecutor {
    file_format: FileFormat,
    file_location: Vec<String>,
    gcs_credential: String,
    batch_size: usize,
    schema: Schema,
    identity: String,
}

impl Executor for GcsFileScanExecutor {
    fn schema(&self) -> &risingwave_common::catalog::Schema {
        &self.schema
    }

    fn identity(&self) -> &str {
        &self.identity
    }

    fn execute(self: Box<Self>) -> super::BoxedDataChunkStream {
        self.do_execute().boxed()
    }
}

impl GcsFileScanExecutor {
    pub fn new(
        file_format: FileFormat,
        file_location: Vec<String>,
        gcs_credential: String,
        batch_size: usize,
        schema: Schema,
        identity: String,
    ) -> Self {
        Self {
            file_format,
            file_location,
            gcs_credential,
            batch_size,
            schema,
            identity,
        }
    }

    #[try_stream(ok = DataChunk, error = BatchError)]
    async fn do_execute(self: Box<Self>) {
        assert_eq!(self.file_format, FileFormat::Parquet);
        for file in self.file_location {
            let (bucket, file_name) = extract_bucket_and_file_name(&file, &FileScanBackend::Gcs)?;
            let op = new_gcs_operator(self.gcs_credential.clone(), bucket.clone())?;
            let chunk_stream =
                read_parquet_file(op, file_name, None, None, self.batch_size, 0).await?;
            #[for_await]
            for stream_chunk in chunk_stream {
                let stream_chunk = stream_chunk?;
                let (data_chunk, _) = stream_chunk.into_parts();
                yield data_chunk;
            }
        }
    }
}

pub struct GcsFileScanExecutorBuilder {}

impl BoxedExecutorBuilder for GcsFileScanExecutorBuilder {
    async fn new_boxed_executor(
        source: &ExecutorBuilder<'_>,
        _inputs: Vec<BoxedExecutor>,
    ) -> crate::error::Result<BoxedExecutor> {
        let file_scan_node = try_match_expand!(
            source.plan_node().get_node_body().unwrap(),
            NodeBody::GcsFileScan
        )?;

        Ok(Box::new(GcsFileScanExecutor::new(
            match file_scan_node::FileFormat::try_from(file_scan_node.file_format).unwrap() {
                file_scan_node::FileFormat::Parquet => FileFormat::Parquet,
                file_scan_node::FileFormat::Unspecified => unreachable!(),
            },
            file_scan_node.file_location.clone(),
            file_scan_node.credential.clone(),
            source.context().get_config().developer.chunk_size,
            Schema::from_iter(file_scan_node.columns.iter().map(Field::from)),
            source.plan_node().get_identity().clone(),
        )))
    }
}
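Similarly, `new_gcs_operator` is defined outside this diff. A hedged sketch of what such a helper might look like on top of opendal's GCS backend, assuming the credential string is the service-account key that opendal's `credential` builder method accepts:

```rust
// Hypothetical sketch (not the PR's implementation): wiring a service-account
// credential and bucket into opendal's GCS backend.
use opendal::layers::RetryLayer;
use opendal::services::Gcs;
use opendal::Operator;

pub fn new_gcs_operator_sketch(credential: String, bucket: String) -> opendal::Result<Operator> {
    // Exact builder method names can differ between opendal versions.
    let builder = Gcs::default().bucket(&bucket).credential(&credential);
    Ok(Operator::new(builder)?
        .layer(RetryLayer::default())
        .finish())
}
```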
4 changes: 2 additions & 2 deletions src/batch/executors/src/executor/s3_file_scan.rs
@@ -17,7 +17,7 @@ use futures_util::stream::StreamExt;
use risingwave_common::array::DataChunk;
use risingwave_common::catalog::{Field, Schema};
use risingwave_connector::source::iceberg::{
-    extract_bucket_and_file_name, new_s3_operator, read_parquet_file,
+    extract_bucket_and_file_name, new_s3_operator, read_parquet_file, FileScanBackend,
};
use risingwave_pb::batch_plan::file_scan_node;
use risingwave_pb::batch_plan::file_scan_node::StorageType;
@@ -84,7 +84,7 @@ impl S3FileScanExecutor {
    async fn do_execute(self: Box<Self>) {
        assert_eq!(self.file_format, FileFormat::Parquet);
        for file in self.file_location {
-            let (bucket, file_name) = extract_bucket_and_file_name(&file)?;
+            let (bucket, file_name) = extract_bucket_and_file_name(&file, &FileScanBackend::S3)?;
            let op = new_s3_operator(
                self.s3_region.clone(),
                self.s3_access_key.clone(),
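The new `FileScanBackend` argument suggests the bucket/file parsing is now backend-aware. A self-contained sketch of that idea follows; the enum and function here are illustrative stand-ins, not the actual `risingwave_connector` implementation:

```rust
// Hypothetical sketch: split a location such as "azblob://container/path/file.parquet"
// into (bucket/container, object key) using a per-backend URL scheme.
#[derive(Debug)]
pub enum FileScanBackend {
    S3,
    Gcs,
    Azblob,
}

fn extract_bucket_and_file_name_sketch(
    location: &str,
    backend: &FileScanBackend,
) -> Result<(String, String), String> {
    let prefix = match backend {
        FileScanBackend::S3 => "s3://",
        FileScanBackend::Gcs => "gcs://",
        FileScanBackend::Azblob => "azblob://",
    };
    let rest = location
        .strip_prefix(prefix)
        .ok_or_else(|| format!("invalid {prefix} url: {location}"))?;
    // The first path segment is the bucket/container; the remainder is the object key.
    match rest.split_once('/') {
        Some((bucket, key)) => Ok((bucket.to_owned(), key.to_owned())),
        None => Err(format!("missing object key in {location}")),
    }
}
```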