Fix GCWorker ownership.
This commit fixes an inconsistency in the VMCollection interface where
a GCWorker is exposed to the binding via `Collection::spawn_worker_thread`
as a `&GCWorker`, but is later received back from the binding via
`memory_manager::start_worker` as a `&mut GCWorker`.  The root cause is
that the GCWorker is wrongly owned by the scheduler.
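
A sketch of the mismatch (signatures abbreviated, and assumed from the
description above rather than copied from the source):

    // The binding receives the worker as a shared reference...
    fn spawn_worker_thread(tls: VMThread, worker: Option<&GCWorker<VM>>);

    // ...but is expected to hand back an exclusive reference later.
    fn start_worker(tls: VMWorkerThread, worker: &mut GCWorker<VM>, mmtk: &'static MMTK<VM>);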

We make each GC worker thread the owner of its `GCWorker` struct, and
we pass the `GCWorker` struct across the API boundary as an owning
`Box`, fixing the interface.
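
As a self-contained toy sketch of the new discipline (the `Worker` type
below is a stand-in for `GCWorker`, not the real struct), the spawning
side gives the `Box` away, and the worker thread owns the struct
outright, so no `&` vs `&mut` mismatch can arise:

    // Toy stand-in for `GCWorker`; the real fields and types differ.
    struct Worker {
        ordinal: usize,
    }

    impl Worker {
        // The worker thread consumes the `Box` and has exclusive access.
        fn run(mut self: Box<Self>) {
            self.ordinal += 1; // mutation needs no shared-to-exclusive cast
            println!("worker {} running", self.ordinal);
        }
    }

    // Analogue of `Collection::spawn_worker_thread`: ownership moves in.
    fn spawn_worker_thread(worker: Box<Worker>) {
        std::thread::spawn(move || worker.run()).join().unwrap();
    }

    fn main() {
        spawn_worker_thread(Box::new(Worker { ordinal: 0 }));
    }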

We isolate the part of `GCWorker` shared with the GC scheduler into a
`GCWorkerShared` struct, and ensure the fields are properly
synchronized.
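
A minimal sketch of the resulting split (types simplified; the real
definitions appear in src/scheduler/worker.rs below): the scheduler
keeps only `Arc` handles to the shared halves, while each thread owns
its private half.

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::sync::Arc;

    // Stand-ins for `GCWorkerShared` / `GCWorker`, reduced to one flag.
    struct Shared {
        parked: AtomicBool,
    }

    struct Worker {
        shared: Arc<Shared>, // plus thread-private fields in the real struct
    }

    fn main() {
        let worker = Box::new(Worker {
            shared: Arc::new(Shared { parked: AtomicBool::new(true) }),
        });
        // The scheduler's view: shared halves only.
        let scheduler_view: Vec<Arc<Shared>> = vec![worker.shared.clone()];
        // The thread owns the worker; the scheduler still reads the flag.
        let handle = std::thread::spawn(move || {
            worker.shared.parked.store(false, Ordering::SeqCst);
        });
        handle.join().unwrap();
        assert!(!scheduler_view[0].parked.load(Ordering::SeqCst));
    }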

In particular, we now use `AtomicRefCell` for `WorkerLocalStat`.
`AtomicRefCell` allows it to be borrowed mutably by the GC worker and
by the mutator (via the `harness_begin/end` methods) at different
times.  However, it is a known issue that in concurrent GC, a GC may
happen while `harness_begin/end` is being called; in that case, the
borrow will panic.
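
A small illustration of that borrow discipline with the atomic_refcell
crate (the `Stat` type is a hypothetical stand-in for
`WorkerLocalStat`): non-overlapping borrows succeed, while an
overlapping borrow fails, which mmtk-core turns into a panic via
`expect`.

    use atomic_refcell::AtomicRefCell;

    struct Stat {
        packets: usize,
    }

    fn main() {
        let stat = AtomicRefCell::new(Stat { packets: 0 });

        // The GC worker and the mutator borrow at different times: fine.
        stat.borrow_mut().packets += 1;
        assert_eq!(stat.borrow().packets, 1);

        // If a concurrent GC held a borrow while harness_begin/end ran,
        // the overlapping borrow would fail instead of being silent UB.
        let held = stat.borrow_mut();
        assert!(stat.try_borrow().is_err());
        drop(held);
    }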

Fixes: mmtk#522
wks committed Jan 24, 2022
1 parent 8b58a78 commit 6e287a9
Showing 8 changed files with 133 additions and 59 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
@@ -34,6 +34,7 @@ atomic = "0.4.6"
 spin = "0.5.2"
 env_logger = "0.8.2"
 pfm = {version = "0.1.0-beta.1", optional = true}
+atomic_refcell = "0.1.7"
 
 [dev-dependencies]
 crossbeam = "0.7.3"
4 changes: 2 additions & 2 deletions src/scheduler/gc_work.rs
@@ -49,7 +49,7 @@ impl<C: GCWorkContext + 'static> GCWork<C::VM> for Prepare<C> {
             mmtk.scheduler.work_buckets[WorkBucketStage::Prepare]
                 .add(PrepareMutator::<C::VM>::new(mutator));
         }
-        for w in &mmtk.scheduler.worker_group().workers {
+        for w in &mmtk.scheduler.worker_group().workers_shared {
             w.local_work_bucket.add(PrepareCollector);
         }
     }
@@ -117,7 +117,7 @@ impl<C: GCWorkContext + 'static> GCWork<C::VM> for Release<C> {
             mmtk.scheduler.work_buckets[WorkBucketStage::Release]
                 .add(ReleaseMutator::<C::VM>::new(mutator));
         }
-        for w in &mmtk.scheduler.worker_group().workers {
+        for w in &mmtk.scheduler.worker_group().workers_shared {
             w.local_work_bucket.add(ReleaseCollector);
         }
         // TODO: Process weak references properly
59 changes: 41 additions & 18 deletions src/scheduler/scheduler.rs
@@ -22,7 +22,7 @@ pub struct GCWorkScheduler<VM: VMBinding> {
     /// Work for the coordinator thread
     pub coordinator_work: WorkBucket<VM>,
     /// workers
-    worker_group: Option<Arc<WorkerGroup<VM>>>,
+    worker_group: Option<WorkerGroup<VM>>,
     /// Condition Variable for worker synchronization
     pub worker_monitor: Arc<(Mutex<()>, Condvar)>,
     mmtk: Option<&'static MMTK<VM>>,
@@ -53,6 +53,11 @@ pub struct GCWorkScheduler<VM: VMBinding> {
 // the workers.
 unsafe impl<VM: VMBinding> Sync for GCWorkScheduler<VM> {}
 
+// Error message for borrowing `GCWorkerShared::stat`.
+const STAT_BORROWED_MSG: &str =
+    "`stat` is already borrowed. This may happen when GC is running concurrently. \
+     The current statistics algorithm does not support concurrent GC.";
+
 impl<VM: VMBinding> GCWorkScheduler<VM> {
     pub fn new() -> Arc<Self> {
         let worker_monitor: Arc<(Mutex<()>, Condvar)> = Default::default();
@@ -104,16 +109,22 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
         self_mut.mmtk = Some(mmtk);
         self_mut.coordinator_worker = Some(RwLock::new(GCWorker::new(
             0,
-            Arc::downgrade(self),
+            self.clone(),
             true,
             self.channel.0.clone(),
         )));
         self_mut.worker_group = Some(WorkerGroup::new(
             num_workers,
-            Arc::downgrade(self),
+            self.clone(),
             self.channel.0.clone(),
         ));
-        self.worker_group.as_ref().unwrap().spawn_workers(tls, mmtk);
+        // FIXME: because of the `Arc::get_mut_unchanged` above, we may mutate the scheduler
+        // while the spawned workers already have access to the scheduler.
+        self_mut
+            .worker_group
+            .as_mut()
+            .unwrap()
+            .spawn_workers(tls, mmtk);
 
         {
             // Unconstrained is always open. Prepare will be opened at the beginning of a GC.
@@ -223,8 +234,8 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
         *self.closure_end.lock().unwrap() = Some(f);
     }
 
-    pub fn worker_group(&self) -> Arc<WorkerGroup<VM>> {
-        self.worker_group.as_ref().unwrap().clone()
+    pub fn worker_group(&self) -> &WorkerGroup<VM> {
+        self.worker_group.as_ref().unwrap()
     }
 
     fn all_buckets_empty(&self) -> bool {
@@ -330,8 +341,8 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
 
     #[inline]
     fn pop_scheduable_work(&self, worker: &GCWorker<VM>) -> Option<(Box<dyn GCWork<VM>>, bool)> {
-        if let Some(work) = worker.local_work_bucket.poll() {
-            return Some((work, worker.local_work_bucket.is_empty()));
+        if let Some(work) = worker.shared.local_work_bucket.poll() {
+            return Some((work, worker.shared.local_work_bucket.is_empty()));
         }
         for work_bucket in self.work_buckets.values() {
             if let Some(work) = work_bucket.poll() {
@@ -360,10 +371,10 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
 
     #[cold]
     fn poll_slow(&self, worker: &GCWorker<VM>) -> Box<dyn GCWork<VM>> {
-        debug_assert!(!worker.is_parked());
+        debug_assert!(!worker.shared.is_parked());
         let mut guard = self.worker_monitor.0.lock().unwrap();
         loop {
-            debug_assert!(!worker.is_parked());
+            debug_assert!(!worker.shared.is_parked());
             if let Some((work, bucket_is_empty)) = self.pop_scheduable_work(worker) {
                 if bucket_is_empty {
                     worker
@@ -374,7 +385,7 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
                 return work;
             }
             // Park this worker
-            worker.parked.store(true, Ordering::SeqCst);
+            worker.shared.parked.store(true, Ordering::SeqCst);
             if self.worker_group().all_parked() {
                 worker
                     .sender
@@ -384,25 +395,37 @@ impl<VM: VMBinding> GCWorkScheduler<VM> {
             // Wait
             guard = self.worker_monitor.1.wait(guard).unwrap();
             // Unpark this worker
-            worker.parked.store(false, Ordering::SeqCst);
+            worker.shared.parked.store(false, Ordering::SeqCst);
         }
     }
 
     pub fn enable_stat(&self) {
-        for worker in &self.worker_group().workers {
-            worker.stat.enable();
+        for worker in &self.worker_group().workers_shared {
+            let worker_stat = worker.stat.try_borrow().expect(STAT_BORROWED_MSG);
+            worker_stat.enable();
         }
         let coordinator_worker = self.coordinator_worker.as_ref().unwrap().read().unwrap();
-        coordinator_worker.stat.enable();
+        let coordinator_worker_stat = coordinator_worker
+            .shared
+            .stat
+            .try_borrow()
+            .expect(STAT_BORROWED_MSG);
+        coordinator_worker_stat.enable();
     }
 
     pub fn statistics(&self) -> HashMap<String, String> {
         let mut summary = SchedulerStat::default();
-        for worker in &self.worker_group().workers {
-            summary.merge(&worker.stat);
+        for worker in &self.worker_group().workers_shared {
+            let worker_stat = worker.stat.try_borrow().expect(STAT_BORROWED_MSG);
+            summary.merge(&worker_stat);
         }
         let coordinator_worker = self.coordinator_worker.as_ref().unwrap().read().unwrap();
-        summary.merge(&coordinator_worker.stat);
+        let coordinator_worker_stat = coordinator_worker
+            .shared
+            .stat
+            .try_borrow()
+            .expect(STAT_BORROWED_MSG);
+        summary.merge(&coordinator_worker_stat);
         summary.harness_stat()
     }

26 changes: 22 additions & 4 deletions src/scheduler/work.rs
@@ -11,6 +11,12 @@ use std::any::{type_name, TypeId};
 /// For this case, use `WorkBucket::add_with_priority_unsync` instead.
 pub trait CoordinatorWork<VM: VMBinding>: 'static + Send + GCWork<VM> {}
 
+// Error message for borrowing `GCWorkerShared::stat`.
+#[cfg(feature = "work_packet_stats")]
+const STAT_BORROWED_MSG: &'static str =
+    "`stat` is already borrowed. This may happen when GC is running concurrently. \
+     The current statistics algorithm does not support concurrent GC.";
+
 pub trait GCWork<VM: VMBinding>: 'static + Send {
     /// Define the work for this packet. However, this is not supposed to be called directly.
     /// Usually `do_work_with_stat()` should be used.
@@ -28,16 +34,28 @@ pub trait GCWork<VM: VMBinding>: 'static + Send {
 
         #[cfg(feature = "work_packet_stats")]
         // Start collecting statistics
-        let stat = worker
-            .stat
-            .measure_work(TypeId::of::<Self>(), type_name::<Self>(), mmtk);
+        let stat = {
+            let mut worker_stat = worker
+                .shared
+                .stat
+                .try_borrow_mut()
+                .expect(STAT_BORROWED_MSG);
+            worker_stat.measure_work(TypeId::of::<Self>(), type_name::<Self>(), mmtk)
+        };
 
         // Do the actual work
         self.do_work(worker, mmtk);
 
         #[cfg(feature = "work_packet_stats")]
         // Finish collecting statistics
-        stat.end_of_work(&mut worker.stat);
+        {
+            let mut worker_stat = worker
+                .shared
+                .stat
+                .try_borrow_mut()
+                .expect(STAT_BORROWED_MSG);
+            stat.end_of_work(&mut worker_stat);
+        }
     }
 }

92 changes: 62 additions & 30 deletions src/scheduler/worker.rs
@@ -5,50 +5,66 @@ use crate::mmtk::MMTK;
 use crate::util::copy::GCWorkerCopyContext;
 use crate::util::opaque_pointer::*;
 use crate::vm::{Collection, VMBinding};
+use atomic_refcell::AtomicRefCell;
 use std::sync::atomic::{AtomicBool, Ordering};
 use std::sync::mpsc::Sender;
-use std::sync::{Arc, Weak};
+use std::sync::Arc;
 
 const LOCALLY_CACHED_WORKS: usize = 1;
 
+/// The part shared between a GCWorker and the scheduler.
+/// This structure is used for communication, e.g. adding new work packets.
+pub struct GCWorkerShared<VM: VMBinding> {
+    pub parked: AtomicBool,
+    pub stat: AtomicRefCell<WorkerLocalStat<VM>>,
+    pub local_work_bucket: WorkBucket<VM>,
+}
+
+/// A GC worker. This part is privately owned by a worker thread.
 pub struct GCWorker<VM: VMBinding> {
     pub tls: VMWorkerThread,
     pub ordinal: usize,
-    pub parked: AtomicBool,
     scheduler: Arc<GCWorkScheduler<VM>>,
     copy: GCWorkerCopyContext<VM>,
-    pub local_work_bucket: WorkBucket<VM>,
     pub sender: Sender<CoordinatorMessage<VM>>,
-    pub stat: WorkerLocalStat<VM>,
     mmtk: Option<&'static MMTK<VM>>,
     is_coordinator: bool,
     local_work_buffer: Vec<(WorkBucketStage, Box<dyn GCWork<VM>>)>,
+    pub shared: Arc<GCWorkerShared<VM>>,
 }
 
-unsafe impl<VM: VMBinding> Sync for GCWorker<VM> {}
-unsafe impl<VM: VMBinding> Send for GCWorker<VM> {}
+unsafe impl<VM: VMBinding> Sync for GCWorkerShared<VM> {}
+unsafe impl<VM: VMBinding> Send for GCWorkerShared<VM> {}
+
+impl<VM: VMBinding> GCWorkerShared<VM> {
+    pub fn is_parked(&self) -> bool {
+        self.parked.load(Ordering::SeqCst)
+    }
+}
 
 impl<VM: VMBinding> GCWorker<VM> {
     pub fn new(
         ordinal: usize,
-        scheduler: Weak<GCWorkScheduler<VM>>,
+        scheduler: Arc<GCWorkScheduler<VM>>,
         is_coordinator: bool,
         sender: Sender<CoordinatorMessage<VM>>,
     ) -> Self {
-        let scheduler = scheduler.upgrade().unwrap();
+        let worker_monitor = scheduler.worker_monitor.clone();
         Self {
             tls: VMWorkerThread(VMThread::UNINITIALIZED),
             ordinal,
-            parked: AtomicBool::new(true),
             // We will set this later
             copy: GCWorkerCopyContext::new_non_copy(),
-            local_work_bucket: WorkBucket::new(true, scheduler.worker_monitor.clone()),
             sender,
             scheduler,
-            stat: Default::default(),
             mmtk: None,
             is_coordinator,
             local_work_buffer: Vec::with_capacity(LOCALLY_CACHED_WORKS),
+            shared: Arc::new(GCWorkerShared {
+                parked: AtomicBool::new(true),
+                stat: Default::default(),
+                local_work_bucket: WorkBucket::new(true, worker_monitor),
+            }),
         }
     }

@@ -73,10 +89,6 @@ impl<VM: VMBinding> GCWorker<VM> {
         }
     }
 
-    pub fn is_parked(&self) -> bool {
-        self.parked.load(Ordering::SeqCst)
-    }
-
     pub fn is_coordinator(&self) -> bool {
         self.is_coordinator
     }
@@ -97,52 +109,72 @@ impl<VM: VMBinding> GCWorker<VM> {
         self.tls = tls;
         self.copy = crate::plan::create_gc_worker_context(tls, mmtk);
        self.mmtk = Some(mmtk);
-        self.parked.store(false, Ordering::SeqCst);
+        self.shared.parked.store(false, Ordering::SeqCst);
         loop {
             while let Some((bucket, mut work)) = self.local_work_buffer.pop() {
                 debug_assert!(self.scheduler.work_buckets[bucket].is_activated());
                 work.do_work_with_stat(self, mmtk);
             }
             let mut work = self.scheduler().poll(self);
-            debug_assert!(!self.is_parked());
+            debug_assert!(!self.shared.is_parked());
             work.do_work_with_stat(self, mmtk);
         }
     }
 }
 
 pub struct WorkerGroup<VM: VMBinding> {
-    pub workers: Vec<GCWorker<VM>>,
+    pub workers_shared: Vec<Arc<GCWorkerShared<VM>>>,
+    // Allow vec_box: GCWorker are supposed to be heap-allocated. This Vec, however, is temporary.
+    // When spawning mutators, the boxes will be moved out of the vec, and become owned by worker threads.
+    #[allow(clippy::vec_box)]
+    workers_to_spawn: Vec<Box<GCWorker<VM>>>,
 }
 
 impl<VM: VMBinding> WorkerGroup<VM> {
     pub fn new(
         workers: usize,
-        scheduler: Weak<GCWorkScheduler<VM>>,
+        scheduler: Arc<GCWorkScheduler<VM>>,
         sender: Sender<CoordinatorMessage<VM>>,
-    ) -> Arc<Self> {
-        Arc::new(Self {
-            workers: (0..workers)
-                .map(|i| GCWorker::new(i, scheduler.clone(), false, sender.clone()))
-                .collect(),
-        })
+    ) -> Self {
+        let mut workers_shared = Vec::new();
+        let mut workers_to_spawn = Vec::new();
+
+        for ordinal in 0..workers {
+            let worker = Box::new(GCWorker::new(
+                ordinal,
+                scheduler.clone(),
+                false,
+                sender.clone(),
+            ));
+            let worker_shared = worker.shared.clone();
+            workers_shared.push(worker_shared);
+            workers_to_spawn.push(worker);
+            // NOTE: We cannot call spawn_worker_thread here,
+            // because the worker will access the WorkerGroup in the scheduler which is not yet constructed.
+        }
+
+        Self {
+            workers_shared,
+            workers_to_spawn,
+        }
     }
 
     pub fn worker_count(&self) -> usize {
-        self.workers.len()
+        self.workers_shared.len()
     }
 
     pub fn parked_workers(&self) -> usize {
-        self.workers.iter().filter(|w| w.is_parked()).count()
+        self.workers_shared.iter().filter(|w| w.is_parked()).count()
    }
 
     pub fn all_parked(&self) -> bool {
         self.parked_workers() == self.worker_count()
     }
 
-    pub fn spawn_workers(&'static self, tls: VMThread, _mmtk: &'static MMTK<VM>) {
-        for i in 0..self.worker_count() {
-            let worker = &self.workers[i];
+    pub fn spawn_workers(&mut self, tls: VMThread, _mmtk: &'static MMTK<VM>) {
+        for worker in self.workers_to_spawn.drain(..) {
             VM::VMCollection::spawn_worker_thread(tls, Some(worker));
         }
+        assert!(self.workers_to_spawn.is_empty());
     }
 }