From 8a9fec4d09045c929dbb19b5fb54ef48c1c37f9d Mon Sep 17 00:00:00 2001 From: Emmett Butler <723615+emmettbutler@users.noreply.github.com> Date: Mon, 30 Jan 2023 19:49:28 -0800 Subject: [PATCH] chore(gunicorn): test PeriodicService under various gunicorn+ddtracepy configs (#4962) ## Description This pull request tries to answer two questions: * Is the [public documentation about gunicorn](https://ddtrace.readthedocs.io/en/stable/advanced_usage.html#gunicorn) accurate and complete? * What problematic ddtracepy behavior was [this commit](https://github.com/DataDog/dd-trace-py/commit/e1ab88c78050f0fbaf1232a7cc8431b097b4f048) fixing? It does so by implementing a series of integration tests against a real gunicorn server. Each test configures the server and ddtracepy in a different way. These tests check each configuration against known failure modes. The set of configurations that avoid all known failure modes is the one that the updated documentation recommends. There were a lot more tests exercising various buggy configurations, but I removed them to make CI run faster. Let me know if you think some of those other configurations need to be included in automated testing. By "known failure modes", I mean degraded performance, duplicated work, or crashes that I found through experimentation based on following logical threads from the relevant issues below. The big comment in the diff explains it in detail. ## Checklist - [x] [Library documentation](https://github.com/DataDog/dd-trace-py/tree/1.x/docs) - [ ] update these https://docs.datadoghq.com/ - [ ] update these https://app.datadoghq.com/apm/service-setup?architecture=host-based&language=python (I'll do the other documentation locations once we're in agreement about the documentation decisions made in this PR) ## Relevant issue(s) https://github.com/DataDog/dd-trace-py/pull/2894 https://github.com/DataDog/dd-trace-py/pull/4810 https://github.com/DataDog/dd-trace-py/pull/4070 https://github.com/DataDog/dd-trace-py/pull/1799 ## Testing strategy After the experimentation process that informed the integration test writing, the testing strategy is the tests themselves. ## Reviewer Checklist - [ ] Title is accurate. - [ ] Description motivates each change. - [ ] No unnecessary changes were introduced in this PR. - [ ] Avoid breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes unless absolutely necessary. - [ ] Tests provided or description of manual testing performed is included in the code or PR. - [ ] Release note has been added and follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/contributing.html#Release-Note-Guidelines), or else `changelog/no-changelog` label added. - [ ] All relevant GitHub issues are correctly linked. - [ ] Change contains telemetry where appropriate (logs, metrics, etc.). - [ ] Telemetry is meaningful, actionable and does not have the potential to leak sensitive data. --------- Co-authored-by: Munir Abdinur Co-authored-by: Brett Langdon --- .circleci/config.yml | 12 +- ddtrace/contrib/gunicorn/__init__.py | 32 +++- ddtrace/internal/periodic.py | 49 ++++-- docs/index.rst | 2 +- .../gunicorn-issue-09308901fa00e76c.yaml | 4 + riotfile.py | 7 +- tests/contrib/gunicorn/post_fork.py | 21 +++ tests/contrib/gunicorn/simple_app.py | 4 - tests/contrib/gunicorn/test_gunicorn.py | 150 +++++++++++++----- tests/contrib/gunicorn/wsgi_mw_app.py | 41 ++++- 10 files changed, 247 insertions(+), 75 deletions(-) create mode 100644 releasenotes/notes/gunicorn-issue-09308901fa00e76c.yaml create mode 100644 tests/contrib/gunicorn/post_fork.py delete mode 100644 tests/contrib/gunicorn/simple_app.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 74fdc1879c5..dd4ad2bcbe4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -762,9 +762,15 @@ jobs: gunicorn: <<: *machine_executor steps: - - run_test: - pattern: "gunicorn" - snapshot: true + - attach_workspace: + at: . + - checkout + - start_docker_services: + services: ddagent + - run: + command: | + mv .riot .ddriot + ./scripts/ddtest riot -v run --pass-env -s 'gunicorn' httplib: <<: *machine_executor diff --git a/ddtrace/contrib/gunicorn/__init__.py b/ddtrace/contrib/gunicorn/__init__.py index 03f69f18328..d8e6db0452a 100644 --- a/ddtrace/contrib/gunicorn/__init__.py +++ b/ddtrace/contrib/gunicorn/__init__.py @@ -1,17 +1,33 @@ """ -``ddtrace`` supports `Gunicorn `__. +**Note:** ``ddtrace-run`` and Python 2 are both not supported with `Gunicorn `__. -If the application is using the ``gevent`` worker class, ``gevent`` monkey patching must be performed before loading the -``ddtrace`` library. +``ddtrace`` only supports Gunicorn's ``gevent`` worker type when configured as follows: -There are different options to ensure this happens: +- The application is running under Python 3 +- `ddtrace-run` is not used +- The `DD_GEVENT_PATCH_ALL=1` environment variable is set +- Gunicorn's ```post_fork`` `__ hook does not import from + ``ddtrace`` +- ``import ddtrace.bootstrap.sitecustomize`` is called either in the application's main process or in the + ```post_worker_init`` `__ hook. -- If using ``ddtrace-run``, set the environment variable ``DD_GEVENT_PATCH_ALL=1``. +.. code-block:: python -- Replace ``ddtrace-run`` by using ``import ddtrace.bootstrap.sitecustomize`` as the first import of the application. + # gunicorn.conf.py + def post_fork(server, worker): + # don't touch ddtrace here + pass -- Use a `post_worker_init `_ - hook to import ``ddtrace.bootstrap.sitecustomize``. + def post_worker_init(worker): + import ddtrace.bootstrap.sitecustomize + + workers = 4 + worker_class = "gevent" + bind = "8080" + +.. code-block:: bash + + DD_GEVENT_PATCH_ALL=1 gunicorn --config gunicorn.conf.py path.to.my:app """ diff --git a/ddtrace/internal/periodic.py b/ddtrace/internal/periodic.py index e50c952b82e..ed0b7dff98f 100644 --- a/ddtrace/internal/periodic.py +++ b/ddtrace/internal/periodic.py @@ -54,15 +54,46 @@ def stop(self): self.quit.set() def _is_proper_class(self): - # DEV: Some frameworks, like e.g. gevent, seem to resuscitate some - # of the threads that were running prior to the fork of the worker - # processes. These threads are normally created via the native API - # and are exposed to the child process as _DummyThreads. We check - # whether the current thread is no longer an instance of the - # original thread class to prevent it from running in the child - # process while the state copied over from the parent is being - # cleaned up. The restarting of the thread is responsibility to the - # registered forksafe hooks. + """ + Picture this: you're running a gunicorn server under ddtrace-run (`ddtrace-run gunicorn...`). + Profiler._profiler() (a PeriodicThread) is running in the main process. + Gunicorn forks the process and sets up the resulting child process as a "gevent worker". + Because of the Profiler's _restart_on_fork hook, inside the child process, the Profiler is stopped, copied, and + started for the purpose of profiling the child process. + In _restart_on_fork, the Profiler being stopped is the one that was running before the fork, ie the one in the + main process. Copying that Profiler takes place in the child process. Inside the child process, we've now got + *one* process-local Profiler running in a thread, and that's the only Profiler running. + + ...or is it? + + As it turns out, gevent has some of its own post-fork logic that complicates things. All of the above is + accurate, except for the bit about the child process' Profiler being the only one alive. In fact, gunicorn's + "gevent worker" notices that there was a Profiler (PeriodicThread) running before the fork, and attempts to + bring it back to life after the fork. + + Outside of ddtrace-run, the only apparent problem this causes is duplicated work and decreased performance. + Under ddtrace-run, because it triggers an additional fork to which gevent's post-fork logic responds, the + thread ends up being restarted twice in the child process. This means that there are a bunch of instances of + the thread running simultaneously: + + the "correct" one started by ddtrace's _restart_on_fork + the copy of the pre-fork one restarted by gevent after the fork done by gunicorn + the copy of the pre-fork one restarted by gevent after the fork done by ddtrace-run + + This causes even more problems for PeriodicThread uses like the Profiler that rely on running as singletons per + process. + + In these situations where there are many copies of the restarted thread, the copies are conveniently marked as + such by being instances of gevent.threading._DummyThread. + + This function _is_proper_class exists as a thread-local release valve that lets these _DummyThreads stop + themselves, because there's no other sane way to join all _DummyThreads from outside of those threads + themselves. Not doing this causes the _DummyThreads to be orphaned and hang, which can degrade performance + and cause crashes in threads that assume they're singletons. + + That's why it's hard to write a test for - doing so requires waiting for the threads to die on their own, which + can make tests take a really long time. + """ return isinstance(threading.current_thread(), self.__class__) def run(self): diff --git a/docs/index.rst b/docs/index.rst index 444566290ba..5ea5d1b38b5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -101,7 +101,7 @@ contacting support. +--------------------------------------------------+---------------+----------------+ | :ref:`graphql-core ` | >= 2.0.0 | Yes | +--------------------------------------------------+---------------+----------------+ -| :ref:`gunicorn ` | >= 19.10.0 | No | +| :ref:`gunicorn ` | >= 20.0 | No | +--------------------------------------------------+---------------+----------------+ | :ref:`httplib` | \* | Yes | +--------------------------------------------------+---------------+----------------+ diff --git a/releasenotes/notes/gunicorn-issue-09308901fa00e76c.yaml b/releasenotes/notes/gunicorn-issue-09308901fa00e76c.yaml new file mode 100644 index 00000000000..8baf41ec4e7 --- /dev/null +++ b/releasenotes/notes/gunicorn-issue-09308901fa00e76c.yaml @@ -0,0 +1,4 @@ +--- +issues: + - | + gunicorn: ddtrace-run does not work with gunicorn. To instrument a gunicorn application, follow the instructions `here `_. diff --git a/riotfile.py b/riotfile.py index c93420eef1e..3d254d0a1b4 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2595,13 +2595,8 @@ def select_pys(min_version=MIN_PYTHON_VERSION, max_version=MAX_PYTHON_VERSION): Venv( name="gunicorn", command="pytest {cmdargs} tests/contrib/gunicorn", - pkgs={"requests": latest}, + pkgs={"requests": latest, "gevent": latest}, venvs=[ - Venv( - pys="2.7", - # Gunicorn ended Python 2 support after 19.10.0 - pkgs={"gunicorn": "==19.10.0"}, - ), Venv( pys=select_pys(min_version="3.5"), pkgs={"gunicorn": ["==19.10.0", "==20.0.4", latest]}, diff --git a/tests/contrib/gunicorn/post_fork.py b/tests/contrib/gunicorn/post_fork.py new file mode 100644 index 00000000000..85da8039fd3 --- /dev/null +++ b/tests/contrib/gunicorn/post_fork.py @@ -0,0 +1,21 @@ +import sys + +import gevent.monkey + +from ddtrace.debugging import DynamicInstrumentation +from ddtrace.internal.remoteconfig import RemoteConfig +from ddtrace.profiling.profiler import Profiler + + +# take some notes about the relative ordering of thread creation and +# monkeypatching +monkeypatch_happened = gevent.monkey.is_module_patched("threading") + +# enabling DI here allows test cases to exercise the code paths that handle +# gevent monkeypatching of running threads +# post_fork is called before gevent.monkey.patch_all() +if sys.version_info < (3, 11): + DynamicInstrumentation.enable() + +RemoteConfig._was_enabled_after_gevent_monkeypatch = monkeypatch_happened +Profiler._was_enabled_after_gevent_monkeypatch = monkeypatch_happened diff --git a/tests/contrib/gunicorn/simple_app.py b/tests/contrib/gunicorn/simple_app.py deleted file mode 100644 index ed02339949e..00000000000 --- a/tests/contrib/gunicorn/simple_app.py +++ /dev/null @@ -1,4 +0,0 @@ -def app(_, start_response): - data = b"Hello, World!\n" - start_response("200 OK", [("Content-Type", "text/plain"), ("Content-Length", str(len(data)))]) - return iter([data]) diff --git a/tests/contrib/gunicorn/test_gunicorn.py b/tests/contrib/gunicorn/test_gunicorn.py index fa3495d9a2c..7e702dd14ae 100644 --- a/tests/contrib/gunicorn/test_gunicorn.py +++ b/tests/contrib/gunicorn/test_gunicorn.py @@ -1,8 +1,12 @@ +from contextlib import contextmanager +import json import os import subprocess import sys +import time from typing import Dict from typing import NamedTuple +from typing import Optional # noqa import pytest import tenacity @@ -11,6 +15,11 @@ from tests.webclient import Client +SERVICE_INTERVAL = 1 +# this is the most direct manifestation i can find of a bug caused by misconfigured gunicorn+ddtrace +MOST_DIRECT_KNOWN_GUNICORN_RELATED_PROFILER_ERROR_SIGNAL = b"RuntimeError: the memalloc module is already started" + + GunicornServerSettings = NamedTuple( "GunicornServerSettings", [ @@ -21,29 +30,62 @@ ("worker_class", str), ("bind", str), ("use_ddtracerun", bool), - ("post_worker_init", str), + ("import_sitecustomize_in_postworkerinit", bool), + ("start_service_in_hook_named", str), ], ) -_post_worker_init_ddtrace = """ -def post_worker_init(worker): - import ddtrace.bootstrap.sitecustomize -""" +IMPORT_SITECUSTOMIZE = "import ddtrace.bootstrap.sitecustomize" +with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "post_fork.py"), "r") as f: + code = f.readlines() +START_SERVICE = " " + " ".join(code) + + +def assert_no_profiler_error(server_process): + assert MOST_DIRECT_KNOWN_GUNICORN_RELATED_PROFILER_ERROR_SIGNAL not in server_process.stderr.read() + + +def parse_payload(data): + decoded = data + if sys.version_info[1] == 5: + decoded = data.decode("utf-8") + return json.loads(decoded) + + +def assert_remoteconfig_started_successfully(response, check_patch=True): + if sys.version_info[1] in (5, 11): + return + assert response.status_code == 200 + payload = parse_payload(response.content) + assert payload["remoteconfig"]["worker_alive"] is True + if check_patch: + assert payload["remoteconfig"]["enabled_after_gevent_monkeypatch"] is True def _gunicorn_settings_factory( - env=os.environ.copy(), # type: Dict[str, str] + env=None, # type: Dict[str, str] directory=os.getcwd(), # type: str - app_path="tests.contrib.gunicorn.simple_app:app", # type: str + app_path="tests.contrib.gunicorn.wsgi_mw_app:app", # type: str num_workers="4", # type: str worker_class="sync", # type: str bind="0.0.0.0:8080", # type: str use_ddtracerun=True, # type: bool - post_worker_init="", # type: str + import_sitecustomize_in_postworkerinit=False, # type: bool + patch_gevent=None, # type: Optional[bool] + import_sitecustomize_in_app=None, # type: Optional[bool] + start_service_in_hook_named="post_fork", # type: str ): # type: (...) -> GunicornServerSettings """Factory for creating gunicorn settings with simple defaults if settings are not defined.""" + if env is None: + env = os.environ.copy() + if patch_gevent is not None: + env["DD_GEVENT_PATCH_ALL"] = str(patch_gevent) + if import_sitecustomize_in_app is not None: + env["_DD_TEST_IMPORT_SITECUSTOMIZE"] = str(import_sitecustomize_in_app) + env["DD_REMOTECONFIG_POLL_SECONDS"] = str(SERVICE_INTERVAL) + env["DD_PROFILING_UPLOAD_INTERVAL"] = str(SERVICE_INTERVAL) return GunicornServerSettings( env=env, directory=directory, @@ -52,79 +94,103 @@ def _gunicorn_settings_factory( worker_class=worker_class, bind=bind, use_ddtracerun=use_ddtracerun, - post_worker_init=post_worker_init, + import_sitecustomize_in_postworkerinit=import_sitecustomize_in_postworkerinit, + start_service_in_hook_named=start_service_in_hook_named, ) -@pytest.fixture -def gunicorn_server_settings(): - yield _gunicorn_settings_factory() - - -@pytest.fixture -def gunicorn_server(gunicorn_server_settings, tmp_path): - cfg_file = tmp_path / "gunicorn.conf.py" +def build_config_file(gunicorn_server_settings): + post_fork = START_SERVICE if gunicorn_server_settings.start_service_in_hook_named != "post_worker_init" else "" + post_worker_init = " {sitecustomize}\n{service_start}".format( + sitecustomize=IMPORT_SITECUSTOMIZE if gunicorn_server_settings.import_sitecustomize_in_postworkerinit else "", + service_start=START_SERVICE + if gunicorn_server_settings.start_service_in_hook_named == "post_worker_init" + else "", + ) cfg = """ +def post_fork(server, worker): + pass +{post_fork} + +def post_worker_init(worker): + pass {post_worker_init} workers = {num_workers} worker_class = "{worker_class}" bind = "{bind}" """.format( - post_worker_init=gunicorn_server_settings.post_worker_init, + post_fork=post_fork, + post_worker_init=post_worker_init, bind=gunicorn_server_settings.bind, num_workers=gunicorn_server_settings.num_workers, worker_class=gunicorn_server_settings.worker_class, ) + return cfg + + +@contextmanager +def gunicorn_server(gunicorn_server_settings, tmp_path): + cfg_file = tmp_path / "gunicorn.conf.py" + cfg = build_config_file(gunicorn_server_settings) cfg_file.write_text(stringify(cfg)) cmd = [] if gunicorn_server_settings.use_ddtracerun: cmd = ["ddtrace-run"] cmd += ["gunicorn", "--config", str(cfg_file), str(gunicorn_server_settings.app_path)] print("Running %r with configuration file %s" % (" ".join(cmd), cfg)) - p = subprocess.Popen( + + server_process = subprocess.Popen( cmd, env=gunicorn_server_settings.env, cwd=gunicorn_server_settings.directory, - stdout=sys.stdout, - stderr=sys.stderr, + stderr=subprocess.PIPE, close_fds=True, preexec_fn=os.setsid, ) try: client = Client("http://%s" % gunicorn_server_settings.bind) - # Wait for the server to start up try: - client.wait(max_tries=20, delay=0.5) + client.wait(max_tries=100, delay=0.1) except tenacity.RetryError: - # if proc.returncode is not None: - # process failed raise TimeoutError("Server failed to start, see stdout and stderr logs") - - yield client - + time.sleep(SERVICE_INTERVAL) + yield server_process, client try: client.get_ignored("/shutdown") except Exception: pass finally: - p.terminate() - p.wait() + server_process.terminate() + server_process.wait() -def test_basic(gunicorn_server): - r = gunicorn_server.get("/") - assert r.status_code == 200 - assert r.content == b"Hello, World!\n" +SETTINGS_GEVENT_APPIMPORT_PATCH_POSTWORKERSERVICE = _gunicorn_settings_factory( + worker_class="gevent", + use_ddtracerun=False, + import_sitecustomize_in_app=True, + patch_gevent=True, + start_service_in_hook_named="post_worker_init", +) +SETTINGS_GEVENT_POSTWORKERIMPORT_PATCH_POSTWORKERSERVICE = _gunicorn_settings_factory( + worker_class="gevent", + use_ddtracerun=False, + import_sitecustomize_in_postworkerinit=True, + patch_gevent=True, + start_service_in_hook_named="post_worker_init", +) -@pytest.mark.snapshot(ignores=["meta.result_class"]) @pytest.mark.parametrize( - "gunicorn_server_settings", [_gunicorn_settings_factory(app_path="tests.contrib.gunicorn.wsgi_mw_app:app")] + "gunicorn_server_settings", + [ + SETTINGS_GEVENT_APPIMPORT_PATCH_POSTWORKERSERVICE, + SETTINGS_GEVENT_POSTWORKERIMPORT_PATCH_POSTWORKERSERVICE, + ], ) -def test_traced_basic(gunicorn_server_settings, gunicorn_server): - # meta.result_class is listiterator vs list_iterator in PY2 vs PY3. - # Ignore this field to avoid having to create mostly duplicate snapshots in Python 2 and 3. - r = gunicorn_server.get("/") - assert r.status_code == 200 - assert r.content == b"Hello, World!\n" +def test_no_known_errors_occur(gunicorn_server_settings, tmp_path): + with gunicorn_server(gunicorn_server_settings, tmp_path) as context: + server_process, client = context + r = client.get("/") + assert_no_profiler_error(server_process) + assert_remoteconfig_started_successfully(r) diff --git a/tests/contrib/gunicorn/wsgi_mw_app.py b/tests/contrib/gunicorn/wsgi_mw_app.py index 87128dc9dd6..98898349836 100644 --- a/tests/contrib/gunicorn/wsgi_mw_app.py +++ b/tests/contrib/gunicorn/wsgi_mw_app.py @@ -1,20 +1,57 @@ +""" +This app exists to replicate and report on failures and degraded behavior that can arise when using ddtrace with +gunicorn +""" +import json +import os +import sys + from ddtrace import tracer from ddtrace.contrib.wsgi import DDWSGIMiddleware +from ddtrace.internal.remoteconfig import RemoteConfig +from ddtrace.profiling import bootstrap +import ddtrace.profiling.auto # noqa from tests.webclient import PingFilter +if os.getenv("_DD_TEST_IMPORT_SITECUSTOMIZE"): + import ddtrace.bootstrap.sitecustomize # noqa: F401 # isort: skip + tracer.configure( settings={ "FILTERS": [PingFilter()], } ) +if sys.version_info < (3, 11): + from ddtrace.debugging import DynamicInstrumentation + + +def aggressive_shutdown(): + RemoteConfig.disable() + if sys.version_info < (3, 11): + DynamicInstrumentation.disable() + tracer.shutdown(timeout=1) + if hasattr(bootstrap, "profiler"): + bootstrap.profiler._scheduler.stop() + bootstrap.profiler.stop() + def simple_app(environ, start_response): if environ["RAW_URI"] == "/shutdown": - tracer.shutdown() + aggressive_shutdown() + data = bytes("goodbye", encoding="utf-8") + else: + has_config_worker = hasattr(RemoteConfig._worker, "_worker") + payload = { + "remoteconfig": { + "worker_alive": has_config_worker and RemoteConfig._worker._worker.is_alive(), + "enabled_after_gevent_monkeypatch": RemoteConfig._was_enabled_after_gevent_monkeypatch, + }, + } + json_payload = json.dumps(payload) + data = bytes(json_payload, encoding="utf-8") - data = b"Hello, World!\n" start_response("200 OK", [("Content-Type", "text/plain"), ("Content-Length", str(len(data)))]) return iter([data])