feat(profiling): Introduce different profiler schedulers (#1616)

Previously, the only scheduling mechanism was via `signals.SIGPROF`. This was limited to UNIX platforms and was not always consistent. This PR introduces more ways to schedule the sampling. They are the following: - `_SigprofScheduler` uses `signals.SIGPROF` to schedule - `_SigalrmScheduler` uses `signals.SIGALRM` to schedule - `_SleepScheduler` uses threads and `time.sleep` to schedule - `_EventScheduler` uses threads and `threading.Event().wait` to schedule
getsentry · Sep 19, 2022 · 19720e6 · 19720e6
1 parent e32f224
commit 19720e6
Show file tree

Hide file tree

Showing 2 changed files with 243 additions and 45 deletions.
diff --git a/sentry_sdk/client.py b/sentry_sdk/client.py
@@ -134,9 +134,9 @@ def _capture_envelope(envelope):
         profiles_sample_rate = self.options["_experiments"].get("profiles_sample_rate")
         if profiles_sample_rate is not None and profiles_sample_rate > 0:
             try:
-                setup_profiler()
-            except ValueError:
-                logger.debug("Profiling can only be enabled from the main thread.")
+                setup_profiler(self.options)
+            except ValueError as e:
+                logger.debug(str(e))
 
     @property
     def dsn(self):

diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py
@@ -64,18 +64,15 @@ def nanosecond_time():
 _scheduler = None  # type: Optional[_Scheduler]
 
 
-def setup_profiler(buffer_secs=60, frequency=101):
-    # type: (int, int) -> None
+def setup_profiler(options):
+    # type: (Dict[str, Any]) -> None
 
     """
-    This method sets up the application so that it can be profiled.
-    It MUST be called from the main thread. This is a limitation of
-    python's signal library where it only allows the main thread to
-    set a signal handler.
-
     `buffer_secs` determines the max time a sample will be buffered for
     `frequency` determines the number of samples to take per second (Hz)
     """
+    buffer_secs = 60
+    frequency = 101
 
     global _sample_buffer
     global _scheduler
@@ -86,11 +83,19 @@ def setup_profiler(buffer_secs=60, frequency=101):
     # a capcity of `buffer_secs * frequency`.
     _sample_buffer = _SampleBuffer(capacity=buffer_secs * frequency)
 
-    _scheduler = _Scheduler(frequency=frequency)
+    profiler_mode = options["_experiments"].get("profiler_mode", _SigprofScheduler.mode)
+    if profiler_mode == _SigprofScheduler.mode:
+        _scheduler = _SigprofScheduler(frequency=frequency)
+    elif profiler_mode == _SigalrmScheduler.mode:
+        _scheduler = _SigalrmScheduler(frequency=frequency)
+    elif profiler_mode == _SleepScheduler.mode:
+        _scheduler = _SleepScheduler(frequency=frequency)
+    elif profiler_mode == _EventScheduler.mode:
+        _scheduler = _EventScheduler(frequency=frequency)
+    else:
+        raise ValueError("Unknown profiler mode: {}".format(profiler_mode))
+    _scheduler.setup()
 
-    # This setups a process wide signal handler that will be called
-    # at an interval to record samples.
-    signal.signal(signal.SIGPROF, _sample_stack)
     atexit.register(teardown_profiler)
 
 
@@ -100,32 +105,18 @@ def teardown_profiler():
     global _sample_buffer
     global _scheduler
 
+    if _scheduler is not None:
+        _scheduler.teardown()
+
     _sample_buffer = None
     _scheduler = None
 
-    # setting the timer with 0 will stop will clear the timer
-    signal.setitimer(signal.ITIMER_PROF, 0)
-
-    # put back the default signal handler
-    signal.signal(signal.SIGPROF, signal.SIG_DFL)
 
-
-def _sample_stack(_signal_num, _frame):
-    # type: (int, Frame) -> None
+def _sample_stack(*args, **kwargs):
+    # type: (*Any, **Any) -> None
     """
     Take a sample of the stack on all the threads in the process.
-    This handler is called to handle the signal at a set interval.
-
-    See https://www.gnu.org/software/libc/manual/html_node/Alarm-Signals.html
-
-    This is not based on wall time, and you may see some variances
-    in the frequency at which this handler is called.
-
-    Notably, it looks like only threads started using the threading
-    module counts towards the time elapsed. It is unclear why that
-    is the case right now. However, we are able to get samples from
-    threading._DummyThread if this handler is called as a result of
-    another thread (e.g. the main thread).
+    This should be called at a regular interval to collect samples.
     """
 
     assert _sample_buffer is not None
@@ -298,33 +289,240 @@ def slice_profile(self, start_ns, stop_ns):
 
 
 class _Scheduler(object):
+    mode = "unknown"
+
     def __init__(self, frequency):
         # type: (int) -> None
         self._lock = threading.Lock()
         self._count = 0
         self._interval = 1.0 / frequency
 
+    def setup(self):
+        # type: () -> None
+        raise NotImplementedError
+
+    def teardown(self):
+        # type: () -> None
+        raise NotImplementedError
+
     def start_profiling(self):
         # type: () -> bool
         with self._lock:
-            # we only need to start the timer if we're starting the first profile
-            should_start_timer = self._count == 0
             self._count += 1
-
-        if should_start_timer:
-            signal.setitimer(signal.ITIMER_PROF, self._interval, self._interval)
-        return should_start_timer
+            return self._count == 1
 
     def stop_profiling(self):
         # type: () -> bool
         with self._lock:
-            # we only need to stop the timer if we're stoping the last profile
-            should_stop_timer = self._count == 1
             self._count -= 1
+            return self._count == 0
+
+
+class _ThreadScheduler(_Scheduler):
+    """
+    This abstract scheduler is based on running a daemon thread that will call
+    the sampler at a regular interval.
+    """
+
+    mode = "thread"
+
+    def __init__(self, frequency):
+        # type: (int) -> None
+        super(_ThreadScheduler, self).__init__(frequency)
+        self.event = threading.Event()
+
+    def setup(self):
+        # type: () -> None
+        pass
+
+    def teardown(self):
+        # type: () -> None
+        pass
+
+    def start_profiling(self):
+        # type: () -> bool
+        if super(_ThreadScheduler, self).start_profiling():
+            # make sure to clear the event as we reuse the same event
+            # over the lifetime of the scheduler
+            self.event.clear()
+
+            # make sure the thread is a daemon here otherwise this
+            # can keep the application running after other threads
+            # have exited
+            thread = threading.Thread(target=self.run, daemon=True)
+            thread.start()
+            return True
+        return False
+
+    def stop_profiling(self):
+        # type: () -> bool
+        if super(_ThreadScheduler, self).stop_profiling():
+            # make sure the set the event here so that the thread
+            # can check to see if it should keep running
+            self.event.set()
+            return True
+        return False
+
+    def run(self):
+        # type: () -> None
+        raise NotImplementedError
+
+
+class _SleepScheduler(_ThreadScheduler):
+    """
+    This scheduler uses time.sleep to wait the required interval before calling
+    the sampling function.
+    """
+
+    mode = "sleep"
+
+    def run(self):
+        # type: () -> None
+        while True:
+            if self.event.is_set():
+                break
+            time.sleep(self._interval)
+            _sample_stack()
+
+
+class _EventScheduler(_ThreadScheduler):
+    """
+    This scheduler uses threading.Event to wait the required interval before
+    calling the sampling function.
+    """
+
+    mode = "event"
 
-        if should_stop_timer:
-            signal.setitimer(signal.ITIMER_PROF, 0)
-        return should_stop_timer
+    def run(self):
+        # type: () -> None
+        while True:
+            if self.event.is_set():
+                break
+            self.event.wait(timeout=self._interval)
+            _sample_stack()
+
+
+class _SignalScheduler(_Scheduler):
+    """
+    This abstract scheduler is based on UNIX signals. It sets up a
+    signal handler for the specified signal, and the matching itimer in order
+    for the signal handler to fire at a regular interval.
+
+    See https://www.gnu.org/software/libc/manual/html_node/Alarm-Signals.html
+    """
+
+    mode = "signal"
+
+    @property
+    def signal_num(self):
+        # type: () -> signal.Signals
+        raise NotImplementedError
+
+    @property
+    def signal_timer(self):
+        # type: () -> int
+        raise NotImplementedError
+
+    def setup(self):
+        # type: () -> None
+        """
+        This method sets up the application so that it can be profiled.
+        It MUST be called from the main thread. This is a limitation of
+        python's signal library where it only allows the main thread to
+        set a signal handler.
+        """
+
+        # This setups a process wide signal handler that will be called
+        # at an interval to record samples.
+        try:
+            signal.signal(self.signal_num, _sample_stack)
+        except ValueError:
+            raise ValueError(
+                "Signal based profiling can only be enabled from the main thread."
+            )
+
+        # Ensures that system calls interrupted by signals are restarted
+        # automatically. Otherwise, we may see some strage behaviours
+        # such as IOErrors caused by the system call being interrupted.
+        signal.siginterrupt(self.signal_num, False)
+
+    def teardown(self):
+        # type: () -> None
+
+        # setting the timer with 0 will stop will clear the timer
+        signal.setitimer(self.signal_timer, 0)
+
+        # put back the default signal handler
+        signal.signal(self.signal_num, signal.SIG_DFL)
+
+    def start_profiling(self):
+        # type: () -> bool
+        if super(_SignalScheduler, self).start_profiling():
+            signal.setitimer(self.signal_timer, self._interval, self._interval)
+            return True
+        return False
+
+    def stop_profiling(self):
+        # type: () -> bool
+        if super(_SignalScheduler, self).stop_profiling():
+            signal.setitimer(self.signal_timer, 0)
+            return True
+        return False
+
+
+class _SigprofScheduler(_SignalScheduler):
+    """
+    This scheduler uses SIGPROF to regularly call a signal handler where the
+    samples will be taken.
+
+    This is not based on wall time, and you may see some variances
+    in the frequency at which this handler is called.
+
+    This has some limitations:
+    - Only the main thread counts towards the time elapsed. This means that if
+      the main thread is blocking on a sleep() or select() system call, then
+      this clock will not count down. Some examples of this in practice are
+        - When using uwsgi with multiple threads in a worker, the non main
+          threads will only be profiled if the main thread is actively running
+          at the same time.
+        - When using gunicorn with threads, the main thread does not handle the
+          requests directly, so the clock counts down slower than expected since
+          its mostly idling while waiting for requests.
+    """
+
+    mode = "sigprof"
+
+    @property
+    def signal_num(self):
+        # type: () -> signal.Signals
+        return signal.SIGPROF
+
+    @property
+    def signal_timer(self):
+        # type: () -> int
+        return signal.ITIMER_PROF
+
+
+class _SigalrmScheduler(_SignalScheduler):
+    """
+    This scheduler uses SIGALRM to regularly call a signal handler where the
+    samples will be taken.
+
+    This is based on real time, so it *should* be called close to the expected
+    frequency.
+    """
+
+    mode = "sigalrm"
+
+    @property
+    def signal_num(self):
+        # type: () -> signal.Signals
+        return signal.SIGALRM
+
+    @property
+    def signal_timer(self):
+        # type: () -> int
+        return signal.ITIMER_REAL
 
 
 def _should_profile(transaction, hub):