From 0dec0707a2803d6ba5fe30d779e3358e19c04d75 Mon Sep 17 00:00:00 2001 From: Tony Xiao Date: Wed, 14 Sep 2022 17:24:23 -0400 Subject: [PATCH 1/5] feat(profiling): Introduce different profiler schedulers Previously, the only scheduling mechanism was via `signals.SIGPROF`. This was limited to UNIX platforms and was not always consistent. This PR introduces more ways to schedule the sampling. They are the following: - `_SigprofScheduler` uses `signals.SIGPROF` to schedule - `_SigalrmScheduler` uses `signals.SIGALRM` to schedule - `_SleepScheduler` uses threads and `time.sleep` to schedule - `_EventScheduler` uses threads and `threading.Event().wait` to schedule --- sentry_sdk/profiler.py | 169 +++++++++++++++++++++++++++++++++++------ 1 file changed, 145 insertions(+), 24 deletions(-) diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py index fcfde6ef0d..95e7ff6ec4 100644 --- a/sentry_sdk/profiler.py +++ b/sentry_sdk/profiler.py @@ -76,7 +76,6 @@ def setup_profiler(buffer_secs=60, frequency=101): `buffer_secs` determines the max time a sample will be buffered for `frequency` determines the number of samples to take per second (Hz) """ - global _sample_buffer global _scheduler @@ -86,11 +85,9 @@ def setup_profiler(buffer_secs=60, frequency=101): # a capcity of `buffer_secs * frequency`. _sample_buffer = _SampleBuffer(capacity=buffer_secs * frequency) - _scheduler = _Scheduler(frequency=frequency) + _scheduler = _SigprofScheduler(frequency=frequency) + _scheduler.setup() - # This setups a process wide signal handler that will be called - # at an interval to record samples. - signal.signal(signal.SIGPROF, _sample_stack) atexit.register(teardown_profiler) @@ -100,18 +97,15 @@ def teardown_profiler(): global _sample_buffer global _scheduler + if _scheduler is not None: + _scheduler.teardown() + _sample_buffer = None _scheduler = None - # setting the timer with 0 will stop will clear the timer - signal.setitimer(signal.ITIMER_PROF, 0) - - # put back the default signal handler - signal.signal(signal.SIGPROF, signal.SIG_DFL) - -def _sample_stack(_signal_num, _frame): - # type: (int, Frame) -> None +def _sample_stack(): + # type: () -> None """ Take a sample of the stack on all the threads in the process. This handler is called to handle the signal at a set interval. @@ -304,27 +298,154 @@ def __init__(self, frequency): self._count = 0 self._interval = 1.0 / frequency + def setup(self): + # type: () -> None + raise NotImplementedError + + def teardown(self): + # type: () -> None + raise NotImplementedError + def start_profiling(self): # type: () -> bool with self._lock: - # we only need to start the timer if we're starting the first profile - should_start_timer = self._count == 0 self._count += 1 - - if should_start_timer: - signal.setitimer(signal.ITIMER_PROF, self._interval, self._interval) - return should_start_timer + return self._count == 1 def stop_profiling(self): # type: () -> bool with self._lock: - # we only need to stop the timer if we're stoping the last profile - should_stop_timer = self._count == 1 self._count -= 1 + return self._count == 0 + + +class _ThreadScheduler(_Scheduler): + event = threading.Event() + + def setup(self): + # type: () -> None + pass + + def teardown(self): + # type: () -> None + pass + + def start_profiling(self): + # type: () -> bool + if super(_ThreadScheduler, self).start_profiling(): + thread = threading.Thread(target=self.run, daemon=True) + thread.start() + return True + return False + + def stop_profiling(self): + # type: () -> bool + if super(_ThreadScheduler, self).stop_profiling(): + self.event.set() + return True + return False + + def run(self): + # type: () -> None + raise NotImplementedError + + +class _SleepScheduler(_ThreadScheduler): + def run(self): + # type: () -> None + while True: + if self.event.is_set(): + self.event.clear() + break + time.sleep(self._interval) + _sample_stack() - if should_stop_timer: - signal.setitimer(signal.ITIMER_PROF, 0) - return should_stop_timer + +class _EventScheduler(_ThreadScheduler): + def run(self): + # type: () -> None + while True: + if self.event.is_set(): + self.event.clear() + break + self.event.wait(timeout=self._interval) + _sample_stack() + + +class _SignalScheduler(_Scheduler): + @property + def signal_num(self): + # type: () -> signal.Signals + raise NotImplementedError + + @property + def signal_timer(self): + # type: () -> int + raise NotImplementedError + + def _signal_handler(self, _signal_num, _frame): + # type: (int, Frame) -> None + """ + Wraps around the _sample_stack function. The signal handler + is called with the signal number and the interrupted frame. + The _sample_stack function does not take any argumnents so + we just use this wrapper to discard the arguments. + """ + _sample_stack() + + def setup(self): + # type: () -> None + + # This setups a process wide signal handler that will be called + # at an interval to record samples. + signal.signal(self.signal_num, self._signal_handler) + + def teardown(self): + # type: () -> None + + # setting the timer with 0 will stop will clear the timer + signal.setitimer(self.signal_timer, 0) + + # put back the default signal handler + signal.signal(self.signal_num, signal.SIG_DFL) + + def start_profiling(self): + # type: () -> bool + if super(_SignalScheduler, self).start_profiling(): + signal.setitimer(self.signal_timer, self._interval, self._interval) + return True + return False + + def stop_profiling(self): + # type: () -> bool + if super(_SignalScheduler, self).stop_profiling(): + signal.setitimer(self.signal_timer, 0) + return True + return False + + +class _SigprofScheduler(_SignalScheduler): + @property + def signal_num(self): + # type: () -> signal.Signals + return signal.SIGPROF + + @property + def signal_timer(self): + # type: () -> int + return signal.ITIMER_PROF + + +class _SigalrmScheduler(_SignalScheduler): + @property + def signal_num(self): + # type: () -> signal.Signals + return signal.SIGALRM + + @property + def signal_timer(self): + # type: () -> int + return signal.ITIMER_REAL def _should_profile(hub): From f97d358139760cdcfd712cb72f158c50113a13a3 Mon Sep 17 00:00:00 2001 From: Tony Xiao Date: Wed, 14 Sep 2022 18:11:01 -0400 Subject: [PATCH 2/5] clean up comments --- sentry_sdk/profiler.py | 118 ++++++++++++++++++++++++++++++----------- 1 file changed, 86 insertions(+), 32 deletions(-) diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py index 95e7ff6ec4..17d821ec64 100644 --- a/sentry_sdk/profiler.py +++ b/sentry_sdk/profiler.py @@ -68,11 +68,6 @@ def setup_profiler(buffer_secs=60, frequency=101): # type: (int, int) -> None """ - This method sets up the application so that it can be profiled. - It MUST be called from the main thread. This is a limitation of - python's signal library where it only allows the main thread to - set a signal handler. - `buffer_secs` determines the max time a sample will be buffered for `frequency` determines the number of samples to take per second (Hz) """ @@ -104,22 +99,11 @@ def teardown_profiler(): _scheduler = None -def _sample_stack(): +def _sample_stack(*args, **kwargs): # type: () -> None """ Take a sample of the stack on all the threads in the process. - This handler is called to handle the signal at a set interval. - - See https://www.gnu.org/software/libc/manual/html_node/Alarm-Signals.html - - This is not based on wall time, and you may see some variances - in the frequency at which this handler is called. - - Notably, it looks like only threads started using the threading - module counts towards the time elapsed. It is unclear why that - is the case right now. However, we are able to get samples from - threading._DummyThread if this handler is called as a result of - another thread (e.g. the main thread). + This should be called at a regular interval to collect samples. """ assert _sample_buffer is not None @@ -292,6 +276,8 @@ def slice_profile(self, start_ns, stop_ns): class _Scheduler(object): + mode = "unknown" + def __init__(self, frequency): # type: (int) -> None self._lock = threading.Lock() @@ -320,7 +306,17 @@ def stop_profiling(self): class _ThreadScheduler(_Scheduler): - event = threading.Event() + """ + This abstract scheduler is based on running a daemon thread that will call + the sampler at a regular interval. + """ + + mode = "thread" + + def __init__(self, frequency): + # type: (int) -> None + super(_ThreadScheduler, self).__init__(frequency) + self.event = threading.Event() def setup(self): # type: () -> None @@ -333,6 +329,13 @@ def teardown(self): def start_profiling(self): # type: () -> bool if super(_ThreadScheduler, self).start_profiling(): + # make sure to clear the event as we reuse the same event + # over the lifetime of the scheduler + self.event.clear() + + # make sure the thread is a daemon here otherwise this + # can keep the application running after other threads + # have exited thread = threading.Thread(target=self.run, daemon=True) thread.start() return True @@ -341,6 +344,8 @@ def start_profiling(self): def stop_profiling(self): # type: () -> bool if super(_ThreadScheduler, self).stop_profiling(): + # make sure the set the event here so that the thread + # can check to see if it should keep running self.event.set() return True return False @@ -351,28 +356,50 @@ def run(self): class _SleepScheduler(_ThreadScheduler): + """ + This scheduler uses time.sleep to wait the required interval before calling + the sampling function. + """ + + mode = "sleep" + def run(self): # type: () -> None while True: if self.event.is_set(): - self.event.clear() break time.sleep(self._interval) _sample_stack() class _EventScheduler(_ThreadScheduler): + """ + This scheduler uses threading.Event to wait the required interval before + calling the sampling function. + """ + + mode = "event" + def run(self): # type: () -> None while True: if self.event.is_set(): - self.event.clear() break self.event.wait(timeout=self._interval) _sample_stack() class _SignalScheduler(_Scheduler): + """ + This abstract scheduler is based on UNIX signals. It sets up a + signal handler for the specified signal, and the matching itimer in order + for the signal handler to fire at a regular interval. + + See https://www.gnu.org/software/libc/manual/html_node/Alarm-Signals.html + """ + + mode = "signal" + @property def signal_num(self): # type: () -> signal.Signals @@ -383,22 +410,18 @@ def signal_timer(self): # type: () -> int raise NotImplementedError - def _signal_handler(self, _signal_num, _frame): - # type: (int, Frame) -> None - """ - Wraps around the _sample_stack function. The signal handler - is called with the signal number and the interrupted frame. - The _sample_stack function does not take any argumnents so - we just use this wrapper to discard the arguments. - """ - _sample_stack() - def setup(self): # type: () -> None + """ + This method sets up the application so that it can be profiled. + It MUST be called from the main thread. This is a limitation of + python's signal library where it only allows the main thread to + set a signal handler. + """ # This setups a process wide signal handler that will be called # at an interval to record samples. - signal.signal(self.signal_num, self._signal_handler) + signal.signal(self.signal_num, _sample_stack) def teardown(self): # type: () -> None @@ -425,6 +448,27 @@ def stop_profiling(self): class _SigprofScheduler(_SignalScheduler): + """ + This scheduler uses SIGPROF to regularly call a signal handler where the + samples will be taken. + + This is not based on wall time, and you may see some variances + in the frequency at which this handler is called. + + This has some limitations: + - Only the main thread counts towards the time elapsed. This means that if + the main thread is blocking on a sleep() or select() system call, then + this clock will not count down. Some examples of this in practice are + - When using uwsgi with multiple threads in a worker, the non main + threads will only be profiled if the main thread is actively running + at the same time. + - When using gunicorn with threads, the main thread does not handle the + requests directly, so the clock counts down slower than expected since + its mostly idling while waiting for requests. + """ + + mode = "sigprof" + @property def signal_num(self): # type: () -> signal.Signals @@ -437,6 +481,16 @@ def signal_timer(self): class _SigalrmScheduler(_SignalScheduler): + """ + This scheduler uses SIGALRM to regularly call a signal handler where the + samples will be taken. + + This is based on real time, so it *should* be called close to the expected + frequency. + """ + + mode = "sigalrm" + @property def signal_num(self): # type: () -> signal.Signals From c0af7e3f044316da767f6b7b4fb347471faa69e8 Mon Sep 17 00:00:00 2001 From: Tony Xiao Date: Thu, 15 Sep 2022 10:09:06 -0400 Subject: [PATCH 3/5] add type annotation --- sentry_sdk/profiler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py index 17d821ec64..0b487c26ec 100644 --- a/sentry_sdk/profiler.py +++ b/sentry_sdk/profiler.py @@ -100,7 +100,7 @@ def teardown_profiler(): def _sample_stack(*args, **kwargs): - # type: () -> None + # type: (*Any, **Any) -> None """ Take a sample of the stack on all the threads in the process. This should be called at a regular interval to collect samples. From d0e355870dcd59daeba30682d00218bb62629c54 Mon Sep 17 00:00:00 2001 From: Tony Xiao Date: Thu, 15 Sep 2022 16:11:31 -0400 Subject: [PATCH 4/5] add experiment to enable various profiler modes --- sentry_sdk/client.py | 6 +++--- sentry_sdk/profiler.py | 29 +++++++++++++++++++++++++---- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/sentry_sdk/client.py b/sentry_sdk/client.py index dec9018154..a0b0bc233f 100644 --- a/sentry_sdk/client.py +++ b/sentry_sdk/client.py @@ -134,9 +134,9 @@ def _capture_envelope(envelope): profiles_sample_rate = self.options["_experiments"].get("profiles_sample_rate") if profiles_sample_rate is not None and profiles_sample_rate > 0: try: - setup_profiler() - except ValueError: - logger.debug("Profiling can only be enabled from the main thread.") + setup_profiler(self.options) + except ValueError as e: + logger.debug(str(e)) @property def dsn(self): diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py index 0b487c26ec..45e44ded6a 100644 --- a/sentry_sdk/profiler.py +++ b/sentry_sdk/profiler.py @@ -64,13 +64,16 @@ def nanosecond_time(): _scheduler = None # type: Optional[_Scheduler] -def setup_profiler(buffer_secs=60, frequency=101): - # type: (int, int) -> None +def setup_profiler(options): + # type: (Dict[str, Any]) -> None """ `buffer_secs` determines the max time a sample will be buffered for `frequency` determines the number of samples to take per second (Hz) """ + buffer_secs = 60 + frequency = 101 + global _sample_buffer global _scheduler @@ -80,7 +83,17 @@ def setup_profiler(buffer_secs=60, frequency=101): # a capcity of `buffer_secs * frequency`. _sample_buffer = _SampleBuffer(capacity=buffer_secs * frequency) - _scheduler = _SigprofScheduler(frequency=frequency) + profiler_mode = options["_experiments"].get("profiler_mode", _SigprofScheduler.mode) + if profiler_mode == _SigprofScheduler.mode: + _scheduler = _SigprofScheduler(frequency=frequency) + elif profiler_mode == _SigalrmScheduler.mode: + _scheduler = _SigalrmScheduler(frequency=frequency) + elif profiler_mode == _SleepScheduler.mode: + _scheduler = _SleepScheduler(frequency=frequency) + elif profiler_mode == _EventScheduler.mode: + _scheduler = _EventScheduler(frequency=frequency) + else: + raise ValueError("Unknown profiler mode: {}".format(profiler_mode)) _scheduler.setup() atexit.register(teardown_profiler) @@ -421,7 +434,15 @@ def setup(self): # This setups a process wide signal handler that will be called # at an interval to record samples. - signal.signal(self.signal_num, _sample_stack) + try: + signal.signal(self.signal_num, _sample_stack) + except ValueError: + raise ValueError("Signal based profiling can only be enabled from the main thread.") + + # Ensures that system calls interrupted by signals are restarted + # automatically. Otherwise, we may see some strage behaviours + # such as IOErrors caused by the system call being interrupted. + signal.siginterrupt(self.signal_num, False) def teardown(self): # type: () -> None From d8091721f5099f3ebcf2a6046f8138cc371f14c2 Mon Sep 17 00:00:00 2001 From: Tony Xiao Date: Thu, 15 Sep 2022 16:20:59 -0400 Subject: [PATCH 5/5] lint --- sentry_sdk/profiler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sentry_sdk/profiler.py b/sentry_sdk/profiler.py index 45e44ded6a..21bf599560 100644 --- a/sentry_sdk/profiler.py +++ b/sentry_sdk/profiler.py @@ -437,7 +437,9 @@ def setup(self): try: signal.signal(self.signal_num, _sample_stack) except ValueError: - raise ValueError("Signal based profiling can only be enabled from the main thread.") + raise ValueError( + "Signal based profiling can only be enabled from the main thread." + ) # Ensures that system calls interrupted by signals are restarted # automatically. Otherwise, we may see some strage behaviours