
Commit

Merge pull request #7867 from zhang2014/feat/ISSUE-7861
feat(cluster): experimental distributed eval index
zhang2014 authored Oct 4, 2022
2 parents edcc1bf + 9f3cda0 commit 3fd0b72
Showing 31 changed files with 436 additions and 260 deletions.
2 changes: 1 addition & 1 deletion scripts/ci/ci-run-stateful-tests-cluster-s3.sh
@@ -27,4 +27,4 @@ SCRIPT_PATH="$(cd "$(dirname "$0")" >/dev/null 2>&1 && pwd)"
cd "$SCRIPT_PATH/../../tests" || exit

echo "Starting databend-test"
./databend-test --mode 'cluster' --run-dir 1_stateful
./databend-test --mode 'cluster' --run-dir 1_stateful --skip '02_0001_create_table_with_external_location'
5 changes: 2 additions & 3 deletions src/query/catalog/src/table_context.rs
@@ -25,6 +25,7 @@ use common_datablocks::DataBlock;
use common_exception::Result;
use common_functions::scalars::FunctionContext;
use common_io::prelude::FormatSettings;
use common_legacy_planners::PartInfoPtr;
use common_legacy_planners::Partitions;
use common_legacy_planners::ReadDataSourcePlan;
use common_meta_types::UserInfo;
@@ -64,9 +65,7 @@ pub trait TableContext: Send + Sync {
fn get_write_progress_value(&self) -> ProgressValues;
fn get_result_progress(&self) -> Arc<Progress>;
fn get_result_progress_value(&self) -> ProgressValues;
// Steal n partitions from the partition pool by the pipeline worker.
// This also can steal the partitions from distributed node.
fn try_get_partitions(&self, num: u64) -> Result<Partitions>;
fn try_get_part(&self) -> Option<PartInfoPtr>;
// Update the context partition pool from the pipeline builder.
fn try_set_partitions(&self, partitions: Partitions) -> Result<()>;
fn attach_query_str(&self, kind: String, query: &str);
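The `TableContext` change above replaces batch partition stealing (`try_get_partitions(num)`) with a single-part pull (`try_get_part()`), so each pipeline worker takes one partition at a time from the context's queue. A minimal sketch of that pull pattern, using stand-in types rather than the real `PartInfoPtr` from `common_legacy_planners`:

```rust
use std::collections::VecDeque;
use std::sync::{Arc, Mutex};

// Stand-in for PartInfoPtr; the real type is a shared pointer to a PartInfo trait object.
type Part = String;

struct PartQueue {
    parts: Mutex<VecDeque<Part>>,
}

impl PartQueue {
    // Mirrors the new `try_get_part`: hand out one part per call until the queue is empty.
    fn try_get_part(&self) -> Option<Part> {
        self.parts.lock().unwrap().pop_front()
    }
}

fn main() {
    let queue = Arc::new(PartQueue {
        parts: Mutex::new(VecDeque::from(vec!["part-0".into(), "part-1".into()])),
    });

    // A source operator pulls parts one by one instead of stealing a batch of `num` up front.
    while let Some(part) = queue.try_get_part() {
        println!("processing {part}");
    }
}
```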
26 changes: 26 additions & 0 deletions src/query/pipeline/core/src/pipeline.rs
@@ -45,9 +45,12 @@ use crate::TransformPipeBuilder;
pub struct Pipeline {
max_threads: usize,
pub pipes: Vec<Pipe>,
on_init: Option<InitCallback>,
on_finished: Option<FinishedCallback>,
}

pub type InitCallback = Arc<Box<dyn Fn() -> Result<()> + Send + Sync + 'static>>;

pub type FinishedCallback =
Arc<Box<dyn Fn(&Option<ErrorCode>) -> Result<()> + Send + Sync + 'static>>;

@@ -56,6 +59,7 @@ impl Pipeline {
Pipeline {
max_threads: 0,
pipes: Vec::new(),
on_init: None,
on_finished: None,
}
}
@@ -159,6 +163,21 @@ impl Pipeline {
}
}

pub fn set_on_init<F: Fn() -> Result<()> + Send + Sync + 'static>(&mut self, f: F) {
if let Some(on_init) = &self.on_init {
let old_on_init = on_init.clone();

self.on_init = Some(Arc::new(Box::new(move || {
old_on_init()?;
f()
})));

return;
}

self.on_init = Some(Arc::new(Box::new(f)));
}

pub fn set_on_finished<F: Fn(&Option<ErrorCode>) -> Result<()> + Send + Sync + 'static>(
&mut self,
f: F,
@@ -177,6 +196,13 @@
self.on_finished = Some(Arc::new(Box::new(f)));
}

pub fn take_on_init(&mut self) -> InitCallback {
match self.on_init.take() {
None => Arc::new(Box::new(|| Ok(()))),
Some(on_init) => on_init,
}
}

pub fn take_on_finished(&mut self) -> FinishedCallback {
match self.on_finished.take() {
None => Arc::new(Box::new(|_may_error| Ok(()))),
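In the `Pipeline` change above, `set_on_init` does not overwrite an existing callback: it wraps the old one so both run in registration order, and `take_on_init` falls back to a no-op when nothing was registered, so the executor can always invoke it unconditionally. A hedged, self-contained sketch of that composition pattern (simplified error type, not the real `common_exception::Result`):

```rust
use std::sync::Arc;

type InitCallback = Arc<Box<dyn Fn() -> Result<(), String> + Send + Sync + 'static>>;

#[derive(Default)]
struct Pipeline {
    on_init: Option<InitCallback>,
}

impl Pipeline {
    fn set_on_init<F: Fn() -> Result<(), String> + Send + Sync + 'static>(&mut self, f: F) {
        if let Some(old) = self.on_init.take() {
            // Compose: run the previously registered callback first, then the new one.
            self.on_init = Some(Arc::new(Box::new(move || {
                old()?;
                f()
            })));
            return;
        }
        self.on_init = Some(Arc::new(Box::new(f)));
    }

    fn take_on_init(&mut self) -> InitCallback {
        // Default to a no-op so callers never have to special-case "no callback".
        self.on_init
            .take()
            .unwrap_or_else(|| Arc::new(Box::new(|| Ok(()))))
    }
}

fn main() -> Result<(), String> {
    let mut pipeline = Pipeline::default();
    pipeline.set_on_init(|| { println!("first"); Ok(()) });
    pipeline.set_on_init(|| { println!("second"); Ok(()) });

    let on_init = pipeline.take_on_init();
    on_init()?; // prints "first" then "second"
    Ok(())
}
```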
3 changes: 2 additions & 1 deletion src/query/service/src/api/rpc/exchange/exchange_manager.rs
@@ -668,7 +668,8 @@ impl FragmentCoordinator {

match &self.payload {
FragmentPayload::PlanV2(plan) => {
let pipeline_builder = PipelineBuilderV2::create(ctx);
let pipeline_ctx = QueryContext::create_from(ctx);
let pipeline_builder = PipelineBuilderV2::create(pipeline_ctx);
self.pipeline_build_res = Some(pipeline_builder.finalize(plan)?);
}
};
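The coordinator now builds each fragment's pipeline against a child context created with `QueryContext::create_from(ctx)` rather than reusing `ctx` directly. Presumably this keeps per-context state (such as the partition queue) private to each fragment pipeline while query-wide state stays shared; a rough sketch of that split with placeholder types, not the real `QueryContext` API:

```rust
use std::collections::VecDeque;
use std::sync::{Arc, Mutex};

// Query-wide state shared by every context derived from the same query.
struct Shared {
    query_id: String,
}

// Per-pipeline context: its own partition queue, but the same shared core.
struct Context {
    partition_queue: Mutex<VecDeque<String>>,
    shared: Arc<Shared>,
}

impl Context {
    fn create_from(other: &Context) -> Context {
        Context {
            partition_queue: Mutex::new(VecDeque::new()), // fresh queue per fragment pipeline
            shared: other.shared.clone(),                 // query-wide state stays shared
        }
    }
}

fn main() {
    let root = Context {
        partition_queue: Mutex::new(VecDeque::from(vec!["p0".to_string()])),
        shared: Arc::new(Shared { query_id: "q-1".to_string() }),
    };

    let fragment_ctx = Context::create_from(&root);
    assert!(fragment_ctx.partition_queue.lock().unwrap().is_empty());
    assert_eq!(fragment_ctx.shared.query_id, root.shared.query_id);
    println!("fragment runs under query {}", fragment_ctx.shared.query_id);
}
```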
@@ -164,7 +164,6 @@ impl Interpreter for InsertInterpreterV2 {
_ => unreachable!(),
};

table1.get_table_info();
let catalog = self.plan.catalog.clone();
let is_distributed_plan = select_plan.is_distributed_plan();

@@ -14,6 +14,7 @@

use std::sync::Arc;

use common_base::base::Thread;
use common_exception::ErrorCode;
use common_exception::Result;

@@ -67,7 +68,13 @@ impl PipelineCompleteExecutor {
}

pub fn execute(&self) -> Result<()> {
self.executor.execute()
let executor = self.executor.clone();
let execute_thread =
Thread::named_spawn(Some(String::from("CompleteExecutor")), move || {
executor.execute()
});

execute_thread.join().flatten()
}
}

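`PipelineCompleteExecutor::execute` now runs the inner executor on a dedicated, named thread and joins it instead of executing on the caller's thread; the name makes the executor easy to spot in stack traces and profilers. A minimal sketch of the spawn-and-join pattern using `std::thread` (the real code goes through databend's `Thread::named_spawn` wrapper and flattens its own `Result` type):

```rust
use std::thread;

// Placeholder for `executor.execute()`; any Result-returning pipeline body works here.
fn run_pipeline() -> Result<(), String> {
    println!("executing pipeline");
    Ok(())
}

fn execute() -> Result<(), String> {
    // Spawn the pipeline on a named thread so it shows up as "CompleteExecutor"
    // in stack traces and profilers.
    let handle = thread::Builder::new()
        .name("CompleteExecutor".to_string())
        .spawn(run_pipeline)
        .map_err(|e| format!("failed to spawn executor thread: {e}"))?;

    // join() returns the pipeline's own Result if the thread did not panic;
    // map a panic to an error, then pass the inner Result straight through.
    handle
        .join()
        .map_err(|_| "executor thread panicked".to_string())?
}

fn main() {
    execute().expect("pipeline failed");
}
```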
75 changes: 52 additions & 23 deletions src/query/service/src/pipelines/executor/pipeline_executor.rs
@@ -36,6 +36,8 @@ use crate::pipelines::executor::executor_worker_context::ExecutorWorkerContext;
use crate::pipelines::executor::ExecutorSettings;
use crate::pipelines::pipeline::Pipeline;

pub type InitCallback = Arc<Box<dyn Fn() -> Result<()> + Send + Sync + 'static>>;

pub type FinishedCallback =
Arc<Box<dyn Fn(&Option<ErrorCode>) -> Result<()> + Send + Sync + 'static>>;

@@ -45,6 +47,7 @@ pub struct PipelineExecutor {
workers_condvar: Arc<WorkersCondvar>,
pub async_runtime: Arc<Runtime>,
pub global_tasks_queue: Arc<ExecutorTasksQueue>,
on_init_callback: InitCallback,
on_finished_callback: FinishedCallback,
settings: ExecutorSettings,
finished_notify: Notify,
@@ -57,12 +60,14 @@ impl PipelineExecutor {
settings: ExecutorSettings,
) -> Result<Arc<PipelineExecutor>> {
let threads_num = pipeline.get_max_threads();
let on_init_callback = pipeline.take_on_init();
let on_finished_callback = pipeline.take_on_finished();

assert_ne!(threads_num, 0, "Pipeline max threads cannot equals zero.");
Self::try_create(
RunningGraph::create(pipeline)?,
threads_num,
on_init_callback,
on_finished_callback,
settings,
)
@@ -82,6 +87,11 @@
.max()
.unwrap_or(0);

let on_init_callbacks = pipelines
.iter_mut()
.map(|x| x.take_on_init())
.collect::<Vec<_>>();

let on_finished_callbacks = pipelines
.iter_mut()
.map(|x| x.take_on_finished())
@@ -91,6 +101,13 @@
Self::try_create(
RunningGraph::from_pipelines(pipelines)?,
threads_num,
Arc::new(Box::new(move || {
for on_init_callback in &on_init_callbacks {
on_init_callback()?;
}

Ok(())
})),
Arc::new(Box::new(move |may_error| {
for on_finished_callback in &on_finished_callbacks {
on_finished_callback(may_error)?;
@@ -105,33 +122,25 @@
fn try_create(
graph: RunningGraph,
threads_num: usize,
on_init_callback: InitCallback,
on_finished_callback: FinishedCallback,
settings: ExecutorSettings,
) -> Result<Arc<PipelineExecutor>> {
unsafe {
let workers_condvar = WorkersCondvar::create(threads_num);
let global_tasks_queue = ExecutorTasksQueue::create(threads_num);

let mut init_schedule_queue = graph.init_schedule_queue()?;
let workers_condvar = WorkersCondvar::create(threads_num);
let global_tasks_queue = ExecutorTasksQueue::create(threads_num);

let mut tasks = VecDeque::new();
while let Some(task) = init_schedule_queue.pop_task() {
tasks.push_back(task);
}
global_tasks_queue.init_tasks(tasks);

Ok(Arc::new(PipelineExecutor {
graph,
threads_num,
workers_condvar,
global_tasks_queue,
on_finished_callback,
async_runtime: GlobalIORuntime::instance(),
settings,
finished_notify: Notify::new(),
finished_error: Mutex::new(None),
}))
}
Ok(Arc::new(PipelineExecutor {
graph,
threads_num,
workers_condvar,
global_tasks_queue,
on_init_callback,
on_finished_callback,
async_runtime: GlobalIORuntime::instance(),
settings,
finished_notify: Notify::new(),
finished_error: Mutex::new(None),
}))
}

pub fn finish(&self, cause: Option<ErrorCode>) {
@@ -145,6 +154,8 @@
}

pub fn execute(self: &Arc<Self>) -> Result<()> {
self.init()?;

self.start_executor_daemon()?;

let mut thread_join_handles = self.execute_threads(self.threads_num);
@@ -171,6 +182,24 @@
Ok(())
}

fn init(self: &Arc<Self>) -> Result<()> {
unsafe {
// TODO: the on init callback cannot be killed.
(self.on_init_callback)()?;

let mut init_schedule_queue = self.graph.init_schedule_queue()?;

let mut tasks = VecDeque::new();
while let Some(task) = init_schedule_queue.pop_task() {
tasks.push_back(task);
}

self.global_tasks_queue.init_tasks(tasks);

Ok(())
}
}

fn start_executor_daemon(self: &Arc<Self>) -> Result<()> {
if !self.settings.max_execute_time.is_zero() {
let this = self.clone();
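With the init callback threaded through, executor startup becomes an explicit sequence: run the composed `on_init` callbacks first, only then build the initial schedule queue and hand tasks to the worker threads; previously the schedule queue was built eagerly inside `try_create`. A schematic sketch of that ordering (heavily simplified; the real executor derives tasks from `RunningGraph` and feeds them into `ExecutorTasksQueue`):

```rust
use std::collections::VecDeque;

type InitCallback = Box<dyn Fn() -> Result<(), String>>;

struct Executor {
    on_init: InitCallback,
    tasks: VecDeque<&'static str>,
}

impl Executor {
    fn init(&mut self) -> Result<(), String> {
        // 1. Run the deferred on_init work (composed from every pipeline) before
        //    anything is scheduled.
        (self.on_init)()?;

        // 2. Only now build the initial schedule queue.
        self.tasks.push_back("scan");
        self.tasks.push_back("aggregate");
        Ok(())
    }

    fn execute(&mut self) -> Result<(), String> {
        self.init()?;

        // 3. Worker threads would drain the task queue from here on.
        while let Some(task) = self.tasks.pop_front() {
            println!("running {task}");
        }
        Ok(())
    }
}

fn main() -> Result<(), String> {
    let mut executor = Executor {
        on_init: Box::new(|| {
            println!("on_init: prepare sources before scheduling");
            Ok(())
        }),
        tasks: VecDeque::new(),
    };
    executor.execute()
}
```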
28 changes: 6 additions & 22 deletions src/query/service/src/sessions/query_ctx.rs
@@ -68,7 +68,6 @@ pub struct QueryContext {
version: String,
partition_queue: Arc<RwLock<VecDeque<PartInfoPtr>>>,
shared: Arc<QueryContextShared>,
precommit_blocks: Arc<RwLock<Vec<DataBlock>>>,
fragment_id: Arc<AtomicUsize>,
}

@@ -84,7 +83,6 @@ impl QueryContext {
partition_queue: Arc::new(RwLock::new(VecDeque::new())),
version: format!("DatabendQuery {}", *crate::version::DATABEND_COMMIT_VERSION),
shared,
precommit_blocks: Arc::new(RwLock::new(Vec::new())),
fragment_id: Arc::new(AtomicUsize::new(0)),
})
}
@@ -225,20 +223,11 @@ impl TableContext for QueryContext {
fn get_result_progress_value(&self) -> ProgressValues {
self.shared.result_progress.as_ref().get_values()
}
// Steal n partitions from the partition pool by the pipeline worker.
// This also can steal the partitions from distributed node.
fn try_get_partitions(&self, num: u64) -> Result<Partitions> {
let mut partitions = vec![];
for _ in 0..num {
match self.partition_queue.write().pop_back() {
None => break,
Some(partition) => {
partitions.push(partition);
}
}
}
Ok(partitions)

fn try_get_part(&self) -> Option<PartInfoPtr> {
self.partition_queue.write().pop_front()
}

// Update the context partition pool from the pipeline builder.
fn try_set_partitions(&self, partitions: Partitions) -> Result<()> {
let mut partition_queue = self.partition_queue.write();
@@ -328,15 +317,10 @@ impl TableContext for QueryContext {
self.shared.dal_ctx.as_ref()
}
fn push_precommit_block(&self, block: DataBlock) {
let mut blocks = self.precommit_blocks.write();
blocks.push(block);
self.shared.push_precommit_block(block)
}
fn consume_precommit_blocks(&self) -> Vec<DataBlock> {
let mut blocks = self.precommit_blocks.write();

let mut swaped_precommit_blocks = vec![];
std::mem::swap(&mut *blocks, &mut swaped_precommit_blocks);
swaped_precommit_blocks
self.shared.consume_precommit_blocks()
}
fn try_get_function_context(&self) -> Result<FunctionContext> {
let tz = self.get_settings().get_timezone()?;
16 changes: 16 additions & 0 deletions src/query/service/src/sessions/query_ctx_shared.rs
@@ -22,6 +22,7 @@ use std::sync::Weak;
use common_base::base::Progress;
use common_base::base::Runtime;
use common_contexts::DalContext;
use common_datablocks::DataBlock;
use common_exception::ErrorCode;
use common_exception::Result;
use common_meta_types::UserInfo;
@@ -79,6 +80,7 @@ pub struct QueryContextShared {
pub(in crate::sessions) catalog_manager: Arc<CatalogManager>,
pub(in crate::sessions) storage_operator: StorageOperator,
pub(in crate::sessions) executor: Arc<RwLock<Weak<PipelineExecutor>>>,
pub(in crate::sessions) precommit_blocks: Arc<RwLock<Vec<DataBlock>>>,
}

impl QueryContextShared {
@@ -108,6 +110,7 @@ impl QueryContextShared {
auth_manager: AuthMgr::create(config).await?,
affect: Arc::new(Mutex::new(None)),
executor: Arc::new(RwLock::new(Weak::new())),
precommit_blocks: Arc::new(RwLock::new(vec![])),
}))
}

@@ -297,4 +300,17 @@ impl QueryContextShared {
let mut executor = self.executor.write();
*executor = weak_ptr;
}

pub fn push_precommit_block(&self, block: DataBlock) {
let mut blocks = self.precommit_blocks.write();
blocks.push(block);
}

pub fn consume_precommit_blocks(&self) -> Vec<DataBlock> {
let mut blocks = self.precommit_blocks.write();

let mut swaped_precommit_blocks = vec![];
std::mem::swap(&mut *blocks, &mut swaped_precommit_blocks);
swaped_precommit_blocks
}
}
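Moving the precommit buffer from `QueryContext` into `QueryContextShared` means blocks pushed through one context are visible to every sibling context derived from the same query, which matters once fragment pipelines build against contexts created via `QueryContext::create_from` (see the exchange manager change above). A small sketch of that sharing with stand-in types (`String` in place of `DataBlock`):

```rust
use std::sync::{Arc, Mutex};

// Stand-in for QueryContextShared: the precommit buffer lives here now.
#[derive(Default)]
struct Shared {
    precommit_blocks: Mutex<Vec<String>>,
}

// Stand-in for QueryContext: a thin wrapper that forwards to the shared state.
struct Context {
    shared: Arc<Shared>,
}

impl Context {
    fn push_precommit_block(&self, block: String) {
        self.shared.precommit_blocks.lock().unwrap().push(block);
    }

    fn consume_precommit_blocks(&self) -> Vec<String> {
        // Same swap-out-and-return behaviour as the diff, written with mem::take.
        std::mem::take(&mut *self.shared.precommit_blocks.lock().unwrap())
    }
}

fn main() {
    let shared = Arc::new(Shared::default());
    let writer_ctx = Context { shared: shared.clone() };
    let committer_ctx = Context { shared };

    // A block pushed through one context ...
    writer_ctx.push_precommit_block("block-0".to_string());

    // ... is consumed through a sibling context derived from the same query.
    assert_eq!(committer_ctx.consume_precommit_blocks(), vec!["block-0".to_string()]);
    assert!(committer_ctx.consume_precommit_blocks().is_empty());
}
```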