-
-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Handle race between persisting an event and un-partial stating a room #13100
Changes from 17 commits
6c0ab7a
72c9ccc
2509549
e49796d
cdb4f3c
565b17c
97bbf97
10cd406
8b1e6a7
49dd92b
97d27ef
8af0caa
a720f3b
4bc63cd
87175c7
2bd5e2f
d709513
999de65
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Faster room joins: Handle race between persisting an event and un-partial stating a room. |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -45,6 +45,7 @@ | |
FederationDeniedError, | ||
FederationError, | ||
HttpResponseException, | ||
LimitExceededError, | ||
NotFoundError, | ||
RequestSendFailed, | ||
SynapseError, | ||
|
@@ -64,6 +65,7 @@ | |
ReplicationCleanRoomRestServlet, | ||
ReplicationStoreRoomOnOutlierMembershipRestServlet, | ||
) | ||
from synapse.storage.databases.main.events import PartialStateConflictError | ||
from synapse.storage.databases.main.events_worker import EventRedactBehaviour | ||
from synapse.storage.state import StateFilter | ||
from synapse.types import JsonDict, StateMap, get_domain_from_id | ||
|
@@ -549,15 +551,29 @@ async def do_invite_join( | |
# https://github.com/matrix-org/synapse/issues/12998 | ||
await self.store.store_partial_state_room(room_id, ret.servers_in_room) | ||
|
||
max_stream_id = await self._federation_event_handler.process_remote_join( | ||
origin, | ||
room_id, | ||
auth_chain, | ||
state, | ||
event, | ||
room_version_obj, | ||
partial_state=ret.partial_state, | ||
) | ||
try: | ||
max_stream_id = ( | ||
await self._federation_event_handler.process_remote_join( | ||
origin, | ||
room_id, | ||
auth_chain, | ||
state, | ||
event, | ||
room_version_obj, | ||
partial_state=ret.partial_state, | ||
) | ||
) | ||
except PartialStateConflictError as e: | ||
# The homeserver was already in the room and it is no longer partial | ||
# stated. We ought to be doing a local join instead. Turn the error into | ||
# a 429, as a hint to the client to try again. | ||
# TODO(faster_joins): `_should_perform_remote_join` suggests that we may | ||
# do a remote join for restricted rooms even if we have full state. | ||
logger.error( | ||
"Room %s was un-partial stated while processing remote join.", | ||
room_id, | ||
) | ||
raise LimitExceededError(msg=e.msg, errcode=e.errcode, retry_after_ms=0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should we consider opening an issue as to talk about whether we want to make this better? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that's fair. I've filed it as #13173. |
||
|
||
if ret.partial_state: | ||
# Kick off the process of asynchronously fetching the state for this | ||
|
@@ -1567,11 +1583,6 @@ async def _sync_partial_state_room( | |
|
||
# we raced against more events arriving with partial state. Go round | ||
# the loop again. We've already logged a warning, so no need for more. | ||
# TODO(faster_joins): there is still a race here, whereby incoming events which raced | ||
# with us will fail to be persisted after the call to `clear_partial_state_room` due to | ||
# having partial state. | ||
# https://github.com/matrix-org/synapse/issues/12988 | ||
# | ||
continue | ||
|
||
events = await self.store.get_events_as_list( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ | |
AuthError, | ||
Codes, | ||
ConsentNotGivenError, | ||
LimitExceededError, | ||
NotFoundError, | ||
ShadowBanError, | ||
SynapseError, | ||
|
@@ -53,6 +54,7 @@ | |
from synapse.logging.context import make_deferred_yieldable, run_in_background | ||
from synapse.metrics.background_process_metrics import run_as_background_process | ||
from synapse.replication.http.send_event import ReplicationSendEventRestServlet | ||
from synapse.storage.databases.main.events import PartialStateConflictError | ||
from synapse.storage.databases.main.events_worker import EventRedactBehaviour | ||
from synapse.storage.state import StateFilter | ||
from synapse.types import ( | ||
|
@@ -1247,6 +1249,8 @@ async def handle_new_client_event( | |
|
||
Raises: | ||
ShadowBanError if the requester has been shadow-banned. | ||
SynapseError(503) if attempting to persist a partial state event in | ||
a room that has been un-partial stated. | ||
""" | ||
extra_users = extra_users or [] | ||
|
||
|
@@ -1297,24 +1301,35 @@ async def handle_new_client_event( | |
|
||
# We now persist the event (and update the cache in parallel, since we | ||
# don't want to block on it). | ||
result, _ = await make_deferred_yieldable( | ||
gather_results( | ||
( | ||
run_in_background( | ||
self._persist_event, | ||
requester=requester, | ||
event=event, | ||
context=context, | ||
ratelimit=ratelimit, | ||
extra_users=extra_users, | ||
try: | ||
result, _ = await make_deferred_yieldable( | ||
gather_results( | ||
( | ||
run_in_background( | ||
self._persist_event, | ||
requester=requester, | ||
event=event, | ||
context=context, | ||
ratelimit=ratelimit, | ||
extra_users=extra_users, | ||
), | ||
run_in_background( | ||
self.cache_joined_hosts_for_event, event, context | ||
).addErrback( | ||
log_failure, "cache_joined_hosts_for_event failed" | ||
), | ||
), | ||
run_in_background( | ||
self.cache_joined_hosts_for_event, event, context | ||
).addErrback(log_failure, "cache_joined_hosts_for_event failed"), | ||
), | ||
consumeErrors=True, | ||
consumeErrors=True, | ||
) | ||
).addErrback(unwrapFirstError) | ||
except PartialStateConflictError as e: | ||
# The event context needs to be recomputed. | ||
# Turn the error into a 429, as a hint to the client to try again. | ||
logger.info( | ||
"Room %s was un-partial stated while persisting client event.", | ||
event.room_id, | ||
) | ||
).addErrback(unwrapFirstError) | ||
raise LimitExceededError(msg=e.msg, errcode=e.errcode, retry_after_ms=0) | ||
|
||
return result | ||
|
||
|
@@ -1329,6 +1344,9 @@ async def _persist_event( | |
"""Actually persists the event. Should only be called by | ||
`handle_new_client_event`, and see its docstring for documentation of | ||
the arguments. | ||
|
||
PartialStateConflictError: if attempting to persist a partial state event in | ||
a room that has been un-partial stated. | ||
""" | ||
|
||
# Skip push notification actions for historical messages | ||
|
@@ -1345,16 +1363,21 @@ async def _persist_event( | |
# If we're a worker we need to hit out to the master. | ||
writer_instance = self._events_shard_config.get_instance(event.room_id) | ||
if writer_instance != self._instance_name: | ||
result = await self.send_event( | ||
instance_name=writer_instance, | ||
event_id=event.event_id, | ||
store=self.store, | ||
requester=requester, | ||
event=event, | ||
context=context, | ||
ratelimit=ratelimit, | ||
extra_users=extra_users, | ||
) | ||
try: | ||
result = await self.send_event( | ||
instance_name=writer_instance, | ||
event_id=event.event_id, | ||
store=self.store, | ||
requester=requester, | ||
event=event, | ||
context=context, | ||
ratelimit=ratelimit, | ||
extra_users=extra_users, | ||
) | ||
except SynapseError as e: | ||
if e.code == HTTPStatus.CONFLICT: | ||
raise PartialStateConflictError() | ||
raise | ||
Comment on lines
+1380
to
+1383
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The way we transport the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it feels ugly but it also is straight forward so it has that going for it. I think it's fine and we can always change it later |
||
stream_id = result["stream_id"] | ||
event_id = result["event_id"] | ||
if event_id != event.event_id: | ||
|
@@ -1482,6 +1505,10 @@ async def persist_and_notify_client_event( | |
The persisted event. This may be different than the given event if | ||
it was de-duplicated (e.g. because we had already persisted an | ||
event with the same transaction ID.) | ||
|
||
Raises: | ||
PartialStateConflictError: if attempting to persist a partial state event in | ||
a room that has been un-partial stated. | ||
""" | ||
extra_users = extra_users or [] | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
so to clarify, there should only ever be one retry needed because once it's un-partial-stated, it can't conflict anymore?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's right, a room can only be un-partial-stated once.
Unless we leave it or purge it, but I don't know what happens in that case, even in the absence of faster room joins.