
Track ongoing event fetches correctly in the presence of failure #11240

Merged 1 commit on Nov 4, 2021
1 change: 1 addition & 0 deletions changelog.d/11240.bugfix
@@ -0,0 +1 @@
+Fix a long-standing bug where all requests that read events from the database could get stuck as a result of losing the database connection.
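
The failure mode behind this fix: _do_fetch previously decremented self._event_fetch_ongoing only on its normal return path, so a fetcher that died with an exception (for example, when the database connection dropped) leaked its slot. Once EVENT_QUEUE_THREADS slots had leaked, _enqueue_events always concluded that enough fetchers were running and never started another, so every request that read events from the database waited forever. Below is a minimal, self-contained sketch of the pattern and of the fix; the names are hypothetical stand-ins, not Synapse's actual classes:

import threading

EVENT_QUEUE_THREADS = 3  # plays the same role as Synapse's constant

_lock = threading.Lock()
_ongoing = 0  # how many fetcher threads we believe are running


def fetcher_loop(fetch_batch):
    """Drain the queue; fetch_batch may raise if the DB connection dies."""
    global _ongoing
    try:
        while True:
            batch = fetch_batch()
            if batch is None:
                return  # queue drained, exit cleanly
    finally:
        # The fix: release the slot on every exit path. Without the
        # finally, an exception leaks the slot forever; once all
        # EVENT_QUEUE_THREADS slots leak, no fetcher ever starts again.
        with _lock:
            _ongoing -= 1


def maybe_start_fetcher(fetch_batch):
    global _ongoing
    with _lock:
        if _ongoing >= EVENT_QUEUE_THREADS:
            return  # assume a live fetcher will service the queue
        _ongoing += 1
    threading.Thread(target=fetcher_loop, args=(fetch_batch,)).start()

The diff below applies exactly this try/finally shape to _do_fetch, replacing the early return with a break so that the bookkeeping runs exactly once, in the finally block.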
56 changes: 34 additions & 22 deletions synapse/storage/databases/main/events_worker.py
@@ -28,6 +28,7 @@
 
 import attr
 from constantly import NamedConstant, Names
+from prometheus_client import Gauge
 from typing_extensions import Literal
 
 from twisted.internet import defer
@@ -81,6 +82,12 @@
 EVENT_QUEUE_TIMEOUT_S = 0.1  # Timeout when waiting for requests for events
 
 
+event_fetch_ongoing_gauge = Gauge(
+    "synapse_event_fetch_ongoing",
+    "The number of event fetchers that are running",
+)
+
+
 @attr.s(slots=True, auto_attribs=True)
 class _EventCacheEntry:
     event: EventBase
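
A note on the new metric: a prometheus_client Gauge is a metric that can move both up and down, and set() overwrites it with an absolute value. Calling set() from self._event_fetch_ongoing after each change keeps the exported number equal to the internal counter, instead of maintaining a parallel count with inc()/dec() that could drift from it. A quick illustration, with a made-up metric name:

from prometheus_client import Gauge, generate_latest

demo_gauge = Gauge("demo_event_fetch_ongoing", "Demo: number of fetchers running")
demo_gauge.set(2)  # mirror some internally tracked counter

# generate_latest() renders the default registry in the text format that a
# /metrics endpoint serves; the output includes: demo_event_fetch_ongoing 2.0
print(generate_latest().decode("utf-8"))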
@@ -222,6 +229,7 @@ def __init__(self, database: DatabasePool, db_conn, hs):
         self._event_fetch_lock = threading.Condition()
         self._event_fetch_list = []
         self._event_fetch_ongoing = 0
+        event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
 
         # We define this sequence here so that it can be referenced from both
         # the DataStore and PersistEventStore.
@@ -732,28 +740,31 @@ def _do_fetch(self, conn: Connection) -> None:
         """Takes a database connection and waits for requests for events from
         the _event_fetch_list queue.
         """
-        i = 0
-        while True:
-            with self._event_fetch_lock:
-                event_list = self._event_fetch_list
-                self._event_fetch_list = []
-
-                if not event_list:
-                    single_threaded = self.database_engine.single_threaded
-                    if (
-                        not self.USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING
-                        or single_threaded
-                        or i > EVENT_QUEUE_ITERATIONS
-                    ):
-                        self._event_fetch_ongoing -= 1
-                        return
-                    else:
-                        self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S)
-                        i += 1
-                        continue
-                i = 0
-
-            self._fetch_event_list(conn, event_list)
+        try:
+            i = 0
+            while True:
+                with self._event_fetch_lock:
+                    event_list = self._event_fetch_list
+                    self._event_fetch_list = []
+
+                    if not event_list:
+                        single_threaded = self.database_engine.single_threaded
+                        if (
+                            not self.USE_DEDICATED_DB_THREADS_FOR_EVENT_FETCHING
+                            or single_threaded
+                            or i > EVENT_QUEUE_ITERATIONS
+                        ):
+                            break
+                        else:
+                            self._event_fetch_lock.wait(EVENT_QUEUE_TIMEOUT_S)
+                            i += 1
+                            continue
+                    i = 0
+
+                self._fetch_event_list(conn, event_list)
+        finally:
+            self._event_fetch_ongoing -= 1
+            event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
 
     def _fetch_event_list(
         self, conn: Connection, event_list: List[Tuple[List[str], defer.Deferred]]
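
This diff adds no test, but the property the finally block buys is easy to check in isolation: the slot count must return to its previous value even when the fetch raises. A self-contained sketch using a hypothetical helper, not Synapse's real test harness:

import threading

_lock = threading.Lock()
_ongoing = 0


def run_fetcher(work):
    """Mimics the fixed _do_fetch: slot bookkeeping lives in a finally."""
    global _ongoing
    with _lock:
        _ongoing += 1
    try:
        work()
    finally:
        with _lock:
            _ongoing -= 1


def dying_fetch():
    raise ConnectionError("database connection lost")


try:
    run_fetcher(dying_fetch)
except ConnectionError:
    pass

assert _ongoing == 0, "slot must be released despite the exception"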
@@ -977,6 +988,7 @@ async def _enqueue_events(self, events: Iterable[str]) -> Dict[str, _EventRow]:
 
             if self._event_fetch_ongoing < EVENT_QUEUE_THREADS:
                 self._event_fetch_ongoing += 1
+                event_fetch_ongoing_gauge.set(self._event_fetch_ongoing)
                 should_start = True
             else:
                 should_start = False