From af7a8ffad9c58deac791333a65c62d7fc72f9e9c Mon Sep 17 00:00:00 2001 From: Daniel Vetter Date: Wed, 11 Oct 2017 11:10:19 +0200 Subject: [PATCH] drm/i915: Use rcu instead of stop_machine in set_wedged stop_machine is not really a locking primitive we should use, except when the hw folks tell us the hw is broken and that's the only way to work around it. This patch tries to address the locking abuse of stop_machine() from commit 20e4933c478a1ca694b38fa4ac44d99e659941f5 Author: Chris Wilson Date: Tue Nov 22 14:41:21 2016 +0000 drm/i915: Stop the machine as we install the wedged submit_request handler Chris said parts of the reasons for going with stop_machine() was that it's no overhead for the fast-path. But these callbacks use irqsave spinlocks and do a bunch of MMIO, and rcu_read_lock is _real_ fast. To stay as close as possible to the stop_machine semantics we first update all the submit function pointers to the nop handler, then call synchronize_rcu() to make sure no new requests can be submitted. This should give us exactly the huge barrier we want. I pondered whether we should annotate engine->submit_request as __rcu and use rcu_assign_pointer and rcu_dereference on it. But the reason behind those is to make sure the compiler/cpu barriers are there for when you have an actual data structure you point at, to make sure all the writes are seen correctly on the read side. But we just have a function pointer, and .text isn't changed, so no need for these barriers and hence no need for annotations. Unfortunately there's a complication with the call to intel_engine_init_global_seqno: - Without stop_machine we must hold the corresponding spinlock. - Without stop_machine we must ensure that all requests are marked as having failed with dma_fence_set_error() before we call it. That means we need to split the nop request submission into two phases, both synchronized with rcu: 1. Only stop submitting the requests to hw and mark them as failed. 2. After all pending requests in the scheduler/ring are suitably marked up as failed and we can force complete them all, also force complete by calling intel_engine_init_global_seqno(). This should fix the followwing lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 4.14.0-rc3-CI-CI_DRM_3179+ #1 Tainted: G U ------------------------------------------------------ kworker/3:4/562 is trying to acquire lock: (cpu_hotplug_lock.rw_sem){++++}, at: [] stop_machine+0x1c/0x40 but task is already holding lock: (&dev->struct_mutex){+.+.}, at: [] i915_reset_device+0x1e8/0x260 [i915] which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #6 (&dev->struct_mutex){+.+.}: __lock_acquire+0x1420/0x15e0 lock_acquire+0xb0/0x200 __mutex_lock+0x86/0x9b0 mutex_lock_interruptible_nested+0x1b/0x20 i915_mutex_lock_interruptible+0x51/0x130 [i915] i915_gem_fault+0x209/0x650 [i915] __do_fault+0x1e/0x80 __handle_mm_fault+0xa08/0xed0 handle_mm_fault+0x156/0x300 __do_page_fault+0x2c5/0x570 do_page_fault+0x28/0x250 page_fault+0x22/0x30 -> #5 (&mm->mmap_sem){++++}: __lock_acquire+0x1420/0x15e0 lock_acquire+0xb0/0x200 __might_fault+0x68/0x90 _copy_to_user+0x23/0x70 filldir+0xa5/0x120 dcache_readdir+0xf9/0x170 iterate_dir+0x69/0x1a0 SyS_getdents+0xa5/0x140 entry_SYSCALL_64_fastpath+0x1c/0xb1 -> #4 (&sb->s_type->i_mutex_key#5){++++}: down_write+0x3b/0x70 handle_create+0xcb/0x1e0 devtmpfsd+0x139/0x180 kthread+0x152/0x190 ret_from_fork+0x27/0x40 -> #3 ((complete)&req.done){+.+.}: __lock_acquire+0x1420/0x15e0 lock_acquire+0xb0/0x200 wait_for_common+0x58/0x210 wait_for_completion+0x1d/0x20 devtmpfs_create_node+0x13d/0x160 device_add+0x5eb/0x620 device_create_groups_vargs+0xe0/0xf0 device_create+0x3a/0x40 msr_device_create+0x2b/0x40 cpuhp_invoke_callback+0xc9/0xbf0 cpuhp_thread_fun+0x17b/0x240 smpboot_thread_fn+0x18a/0x280 kthread+0x152/0x190 ret_from_fork+0x27/0x40 -> #2 (cpuhp_state-up){+.+.}: __lock_acquire+0x1420/0x15e0 lock_acquire+0xb0/0x200 cpuhp_issue_call+0x133/0x1c0 __cpuhp_setup_state_cpuslocked+0x139/0x2a0 __cpuhp_setup_state+0x46/0x60 page_writeback_init+0x43/0x67 pagecache_init+0x3d/0x42 start_kernel+0x3a8/0x3fc x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x6d/0x70 verify_cpu+0x0/0xfb -> #1 (cpuhp_state_mutex){+.+.}: __lock_acquire+0x1420/0x15e0 lock_acquire+0xb0/0x200 __mutex_lock+0x86/0x9b0 mutex_lock_nested+0x1b/0x20 __cpuhp_setup_state_cpuslocked+0x53/0x2a0 __cpuhp_setup_state+0x46/0x60 page_alloc_init+0x28/0x30 start_kernel+0x145/0x3fc x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x6d/0x70 verify_cpu+0x0/0xfb -> #0 (cpu_hotplug_lock.rw_sem){++++}: check_prev_add+0x430/0x840 __lock_acquire+0x1420/0x15e0 lock_acquire+0xb0/0x200 cpus_read_lock+0x3d/0xb0 stop_machine+0x1c/0x40 i915_gem_set_wedged+0x1a/0x20 [i915] i915_reset+0xb9/0x230 [i915] i915_reset_device+0x1f6/0x260 [i915] i915_handle_error+0x2d8/0x430 [i915] hangcheck_declare_hang+0xd3/0xf0 [i915] i915_hangcheck_elapsed+0x262/0x2d0 [i915] process_one_work+0x233/0x660 worker_thread+0x4e/0x3b0 kthread+0x152/0x190 ret_from_fork+0x27/0x40 other info that might help us debug this: Chain exists of: cpu_hotplug_lock.rw_sem --> &mm->mmap_sem --> &dev->struct_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&dev->struct_mutex); lock(&mm->mmap_sem); lock(&dev->struct_mutex); lock(cpu_hotplug_lock.rw_sem); *** DEADLOCK *** 3 locks held by kworker/3:4/562: #0: ("events_long"){+.+.}, at: [] process_one_work+0x1aa/0x660 #1: ((&(&i915->gpu_error.hangcheck_work)->work)){+.+.}, at: [] process_one_work+0x1aa/0x660 #2: (&dev->struct_mutex){+.+.}, at: [] i915_reset_device+0x1e8/0x260 [i915] stack backtrace: CPU: 3 PID: 562 Comm: kworker/3:4 Tainted: G U 4.14.0-rc3-CI-CI_DRM_3179+ #1 Hardware name: /NUC7i5BNB, BIOS BNKBL357.86A.0048.2017.0704.1415 07/04/2017 Workqueue: events_long i915_hangcheck_elapsed [i915] Call Trace: dump_stack+0x68/0x9f print_circular_bug+0x235/0x3c0 ? lockdep_init_map_crosslock+0x20/0x20 check_prev_add+0x430/0x840 ? irq_work_queue+0x86/0xe0 ? wake_up_klogd+0x53/0x70 __lock_acquire+0x1420/0x15e0 ? __lock_acquire+0x1420/0x15e0 ? lockdep_init_map_crosslock+0x20/0x20 lock_acquire+0xb0/0x200 ? stop_machine+0x1c/0x40 ? i915_gem_object_truncate+0x50/0x50 [i915] cpus_read_lock+0x3d/0xb0 ? stop_machine+0x1c/0x40 stop_machine+0x1c/0x40 i915_gem_set_wedged+0x1a/0x20 [i915] i915_reset+0xb9/0x230 [i915] i915_reset_device+0x1f6/0x260 [i915] ? gen8_gt_irq_ack+0x170/0x170 [i915] ? work_on_cpu_safe+0x60/0x60 i915_handle_error+0x2d8/0x430 [i915] ? vsnprintf+0xd1/0x4b0 ? scnprintf+0x3a/0x70 hangcheck_declare_hang+0xd3/0xf0 [i915] ? intel_runtime_pm_put+0x56/0xa0 [i915] i915_hangcheck_elapsed+0x262/0x2d0 [i915] process_one_work+0x233/0x660 worker_thread+0x4e/0x3b0 kthread+0x152/0x190 ? process_one_work+0x660/0x660 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x27/0x40 Setting dangerous option reset - tainting kernel i915 0000:00:02.0: Resetting chip after gpu hang Setting dangerous option reset - tainting kernel i915 0000:00:02.0: Resetting chip after gpu hang v2: Have 1 global synchronize_rcu() barrier across all engines, and improve commit message. v3: We need to protect the seqno update with the timeline spinlock (in set_wedged) to avoid racing with other updates of the seqno, like we already do in nop_submit_request (Chris). v4: Use two-phase sequence to plug the race Chris spotted where we can complete requests before they're marked up with -EIO. v5: Review from Chris: - simplify nop_submit_request. - Add comment to rcu_read_lock section. - Align comments with the new style. v6: Remove unused variable to appease CI. Reviewed-by: Chris Wilson Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102886 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103096 Cc: Chris Wilson Cc: Mika Kuoppala Cc: Thomas Gleixner Cc: Marta Lofstedt Signed-off-by: Daniel Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20171011091019.1425-1-daniel.vetter@ffwll.ch --- drivers/gpu/drm/i915/i915_gem.c | 80 ++++++++++++------- drivers/gpu/drm/i915/i915_gem_request.c | 9 +++ .../gpu/drm/i915/selftests/i915_gem_request.c | 2 + 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index f76890b74d000..20fcac37c85a0 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -3086,6 +3086,14 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv) } static void nop_submit_request(struct drm_i915_gem_request *request) +{ + GEM_BUG_ON(!i915_terminally_wedged(&request->i915->gpu_error)); + dma_fence_set_error(&request->fence, -EIO); + + i915_gem_request_submit(request); +} + +static void nop_complete_submit_request(struct drm_i915_gem_request *request) { unsigned long flags; @@ -3098,45 +3106,59 @@ static void nop_submit_request(struct drm_i915_gem_request *request) spin_unlock_irqrestore(&request->engine->timeline->lock, flags); } -static void engine_set_wedged(struct intel_engine_cs *engine) +void i915_gem_set_wedged(struct drm_i915_private *i915) { - /* We need to be sure that no thread is running the old callback as - * we install the nop handler (otherwise we would submit a request - * to hardware that will never complete). In order to prevent this - * race, we wait until the machine is idle before making the swap - * (using stop_machine()). + struct intel_engine_cs *engine; + enum intel_engine_id id; + + /* + * First, stop submission to hw, but do not yet complete requests by + * rolling the global seqno forward (since this would complete requests + * for which we haven't set the fence error to EIO yet). + */ + for_each_engine(engine, i915, id) + engine->submit_request = nop_submit_request; + + /* + * Make sure no one is running the old callback before we proceed with + * cancelling requests and resetting the completion tracking. Otherwise + * we might submit a request to the hardware which never completes. */ - engine->submit_request = nop_submit_request; + synchronize_rcu(); + + for_each_engine(engine, i915, id) { + /* Mark all executing requests as skipped */ + engine->cancel_requests(engine); - /* Mark all executing requests as skipped */ - engine->cancel_requests(engine); + /* + * Only once we've force-cancelled all in-flight requests can we + * start to complete all requests. + */ + engine->submit_request = nop_complete_submit_request; + } - /* Mark all pending requests as complete so that any concurrent - * (lockless) lookup doesn't try and wait upon the request as we - * reset it. + /* + * Make sure no request can slip through without getting completed by + * either this call here to intel_engine_init_global_seqno, or the one + * in nop_complete_submit_request. */ - intel_engine_init_global_seqno(engine, - intel_engine_last_submit(engine)); -} + synchronize_rcu(); -static int __i915_gem_set_wedged_BKL(void *data) -{ - struct drm_i915_private *i915 = data; - struct intel_engine_cs *engine; - enum intel_engine_id id; + for_each_engine(engine, i915, id) { + unsigned long flags; - for_each_engine(engine, i915, id) - engine_set_wedged(engine); + /* Mark all pending requests as complete so that any concurrent + * (lockless) lookup doesn't try and wait upon the request as we + * reset it. + */ + spin_lock_irqsave(&engine->timeline->lock, flags); + intel_engine_init_global_seqno(engine, + intel_engine_last_submit(engine)); + spin_unlock_irqrestore(&engine->timeline->lock, flags); + } set_bit(I915_WEDGED, &i915->gpu_error.flags); wake_up_all(&i915->gpu_error.reset_queue); - - return 0; -} - -void i915_gem_set_wedged(struct drm_i915_private *dev_priv) -{ - stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL); } bool i915_gem_unset_wedged(struct drm_i915_private *i915) diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index d5f4023e5d630..d140fcf5c6a39 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -556,7 +556,16 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state) switch (state) { case FENCE_COMPLETE: trace_i915_gem_request_submit(request); + /* + * We need to serialize use of the submit_request() callback with its + * hotplugging performed during an emergency i915_gem_set_wedged(). + * We use the RCU mechanism to mark the critical section in order to + * force i915_gem_set_wedged() to wait until the submit_request() is + * completed before proceeding. + */ + rcu_read_lock(); request->engine->submit_request(request); + rcu_read_unlock(); break; case FENCE_FREE: diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_request.c b/drivers/gpu/drm/i915/selftests/i915_gem_request.c index 78b9f811707f9..a999161e8db1e 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_request.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_request.c @@ -215,7 +215,9 @@ static int igt_request_rewind(void *arg) } i915_gem_request_get(vip); i915_add_request(vip); + rcu_read_lock(); request->engine->submit_request(request); + rcu_read_unlock(); mutex_unlock(&i915->drm.struct_mutex);