From a5eff46f15d898690939dda57e35a566b1b3449e Mon Sep 17 00:00:00 2001 From: Rohit Nayak Date: Sat, 7 Jan 2023 20:57:14 +0100 Subject: [PATCH 01/16] Move vreplication design docs to reference/vreplication/internal. Remove file pos doc Signed-off-by: Rohit Nayak --- .../vreplication/internal}/_index.md | 0 .../vreplication/internal}/character-sets.md | 0 .../vreplication/internal}/cutover.md | 0 .../reference/vreplication/internal}/keys.md | 0 .../internal}/life-of-a-stream.md | 0 .../vreplication/internal}/vstream/_index.md | 0 .../internal}/vstream/skew-detection.md | 0 .../internal}/vstream/stream-migration.md | 0 .../vreplication/internal}/vstream/tracker.md | 0 .../vreplication/internal}/vstream/vscopy.md | 0 .../docs/design-docs/vreplication/file-pos.md | 46 ------------------- 11 files changed, 46 deletions(-) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/_index.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/character-sets.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/cutover.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/keys.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/life-of-a-stream.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/vstream/_index.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/vstream/skew-detection.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/vstream/stream-migration.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/vstream/tracker.md (100%) rename content/en/docs/{design-docs/vreplication => 16.0/reference/vreplication/internal}/vstream/vscopy.md (100%) delete mode 100644 
content/en/docs/design-docs/vreplication/file-pos.md diff --git a/content/en/docs/design-docs/vreplication/_index.md b/content/en/docs/16.0/reference/vreplication/internal/_index.md similarity index 100% rename from content/en/docs/design-docs/vreplication/_index.md rename to content/en/docs/16.0/reference/vreplication/internal/_index.md diff --git a/content/en/docs/design-docs/vreplication/character-sets.md b/content/en/docs/16.0/reference/vreplication/internal/character-sets.md similarity index 100% rename from content/en/docs/design-docs/vreplication/character-sets.md rename to content/en/docs/16.0/reference/vreplication/internal/character-sets.md diff --git a/content/en/docs/design-docs/vreplication/cutover.md b/content/en/docs/16.0/reference/vreplication/internal/cutover.md similarity index 100% rename from content/en/docs/design-docs/vreplication/cutover.md rename to content/en/docs/16.0/reference/vreplication/internal/cutover.md diff --git a/content/en/docs/design-docs/vreplication/keys.md b/content/en/docs/16.0/reference/vreplication/internal/keys.md similarity index 100% rename from content/en/docs/design-docs/vreplication/keys.md rename to content/en/docs/16.0/reference/vreplication/internal/keys.md diff --git a/content/en/docs/design-docs/vreplication/life-of-a-stream.md b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md similarity index 100% rename from content/en/docs/design-docs/vreplication/life-of-a-stream.md rename to content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md diff --git a/content/en/docs/design-docs/vreplication/vstream/_index.md b/content/en/docs/16.0/reference/vreplication/internal/vstream/_index.md similarity index 100% rename from content/en/docs/design-docs/vreplication/vstream/_index.md rename to content/en/docs/16.0/reference/vreplication/internal/vstream/_index.md diff --git a/content/en/docs/design-docs/vreplication/vstream/skew-detection.md 
b/content/en/docs/16.0/reference/vreplication/internal/vstream/skew-detection.md similarity index 100% rename from content/en/docs/design-docs/vreplication/vstream/skew-detection.md rename to content/en/docs/16.0/reference/vreplication/internal/vstream/skew-detection.md diff --git a/content/en/docs/design-docs/vreplication/vstream/stream-migration.md b/content/en/docs/16.0/reference/vreplication/internal/vstream/stream-migration.md similarity index 100% rename from content/en/docs/design-docs/vreplication/vstream/stream-migration.md rename to content/en/docs/16.0/reference/vreplication/internal/vstream/stream-migration.md diff --git a/content/en/docs/design-docs/vreplication/vstream/tracker.md b/content/en/docs/16.0/reference/vreplication/internal/vstream/tracker.md similarity index 100% rename from content/en/docs/design-docs/vreplication/vstream/tracker.md rename to content/en/docs/16.0/reference/vreplication/internal/vstream/tracker.md diff --git a/content/en/docs/design-docs/vreplication/vstream/vscopy.md b/content/en/docs/16.0/reference/vreplication/internal/vstream/vscopy.md similarity index 100% rename from content/en/docs/design-docs/vreplication/vstream/vscopy.md rename to content/en/docs/16.0/reference/vreplication/internal/vstream/vscopy.md diff --git a/content/en/docs/design-docs/vreplication/file-pos.md b/content/en/docs/design-docs/vreplication/file-pos.md deleted file mode 100644 index 71d0c97ee..000000000 --- a/content/en/docs/design-docs/vreplication/file-pos.md +++ /dev/null @@ -1,46 +0,0 @@ ---- -title: File:Position based VReplication -description: -weight: 1 ---- - -## Problem Statement - -In order to support migration from legacy databases that may not have GTID turned on, there is a need for VReplication to support file:position based tracking. - -It is understood that this type of tracking will work only for a single physical mysql source. Replication cannot be automatically resumed from a different source. 
In the worst case, it is possible to manually figure out the continuation point, and change the VReplication parameters to resume replication from a different source. - -Supporting file:position based vreplication will allow for one-time "shoveling" of data from an existing source to a destination. It could also be used to continuously keep the target data up-to-date just like we do for resharding. In this situation, it should also be possible to reverse the replication after a cut-over from legacy to vitess. This will give us the option to rollback a migration should anything go wrong. - -## Requirements - -The VReplication design is entrenched in using the concept of GTID. So, it will be substantial work to change its DNA to recognize file:position based binlog tracking. However, Vitess has the design elements needed to support multiple GTID formats. This means that we can build a low level abstraction layer that encapsulates a file:position as a GTID. There are situations where this abstraction breaks down. We'll have to handle those corner cases. - -### Forcing file:position based tracking - -Although not anticipated, it's possible that a user may want to use file:position based tracking even if the source has GTID turned on. This means that file:position based tracking should not be inferred through auto-detection. - -An environment variable (like MYSQL_FLAVOR) should not be used because the tracking mode depends on a specific source. Since VReplication can have multiple sources, this should be dictated by the source. - -### Current position - -In GTID mode, the current position is guaranteed to be at a transaction boundary. But a file:position based binlog can report a position for every event, including stray ones that are not material for replication. In such cases, the mechanism of getting the current position from the source and asking the target to stop at that position should work no matter what that position is. 
- -### Single Source - -As mentioned above, only a fixed mysql instance will be supported as source. - -## Design - -A prototype work was done by PlanetScale for file:position based tracking. This was developed as an "RDS" flavor. This is because RDS did not support GTIDs when this was developed. With PlanetScale's permission, this work will be leveraged to implement the new proposal. The work will be published as part of the Vitess license. The following high level tasks will need to be performed. - -* **Rename rdsFlavor->filePosFlavor**: This rename will more accurately represent the functionality. -* **Flavor as connection parameter**: Since we need to control the flavor on a per-source basis, the best way to achieve this is to extend the `ConnParams` structure to include a `Flavor`. If empty, auto-detection will be used. If specified, the name will map to a registered flavor implementation. This approach will retain backward compatibility. -* **Track sub-flavor**: Since we want to support file:position based tracking even if GTID is turned on, we need the ability to recognize GTID events. This means that we have to understand MySQL and MariaDB flavors under the covers. -* **Standardize on when to send GTID**: Currently, the binlog dictates when to report a GTID. In some cases, it's before the next transaction, and sometimes it's within. We'll change the design to report the GTID just before the "COMMIT". This is the point where it's actually useful. Knowing exactly when a GTID will be received will simplify the design of the vplayer. -* **Introduce Pseudo-GTID**: In order to report positions outside of transaction boundaries, one possibility is to report them as pseudo-GTIDs. Although it's possible to convert them to fake empty transactions, it may be more readable to use a separate category. -* **Stop Position**: The vplayer currently uses ambiguous rules about how it handles the case where a stop position was exceeded. 
As part of this change, we'll standardize on: _A stop position is considered to be successfully reached if the new position is greater than or equal to the specified position_. The main motivation for this change is that the possibility of position mismatch is higher in the case of file:pos tracking. We're likely to hit many false positives if we're too strict. - -## Future improvements - -Once a GTID gets more implicitly associated with saveable events, we can later deprecate GTID as an explicit event. Instead, we can send the GTID as part of the COMMIT or DDL message, which is where it's actually used. This will allow us to tighten some code in vplayer. Without this, the association of GTID with a COMMIT is more ambiguous, and there's extra code to glue them together. From 3218017ea7907fa37383daef0dfa0db1e0077e04 Mon Sep 17 00:00:00 2001 From: Rohit Nayak Date: Sun, 8 Jan 2023 00:35:47 +0100 Subject: [PATCH 02/16] Update life of a stream Signed-off-by: Rohit Nayak --- .../reference/vreplication/internal/_index.md | 8 +- .../vreplication/internal/life-of-a-stream.md | 109 ++++++++++++------ 2 files changed, 80 insertions(+), 37 deletions(-) diff --git a/content/en/docs/16.0/reference/vreplication/internal/_index.md b/content/en/docs/16.0/reference/vreplication/internal/_index.md index cf88ae8ec..f49a9ab9e 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/_index.md +++ b/content/en/docs/16.0/reference/vreplication/internal/_index.md @@ -1,7 +1,7 @@ --- -title: VReplication -description: VReplication related design docs -weight: 3 +title: Internals +description: Selected vreplication-related design docs and implementation details +weight: 1000 skip_sections: true -aliases: ['/docs/design/vreplication/'] +aliases: ['/docs/reference/vreplication/internal'] --- diff --git a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md index cda84a645..56f98147b 100644 
--- a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md @@ -1,94 +1,130 @@ --- title: Life of a stream -description: Replicating data in a VRep Workflow +description: How VReplication replicates data weight: 1 aliases: ['/docs/reference/vreplication/internals'] --- ### Introduction -The diagram above outlines how a VReplication workflow is performed. VReplication can be asked to start from a specific GTID or from the start. When starting from a GTID the _replication_ mode is used where it streams events from the binlog. +When a VReplication workflow runs, data is copied from source to target shards. Each target primary runs one +vreplication stream (vstream) for each source shard that the +target's [keyrange](https://vitess.io/docs/16.0/reference/features/sharding/#key-ranges-and-partitions) overlaps with. -![VReplication Flow](/img/VReplicationFlow.png) +The diagram below outlines how one such stream operates. VReplication can be asked to start from a specific +GTID or from the start. When starting from a GTID the _replication_ mode is used where it streams events from the +binlog. +![VReplication Flow](/img/VReplicationFlow.png) #### Full table copy -When starting from the beginning the simple streaming done by _replication_ can create an avalanche of events (think 10s of millions of rows). To speed things up a _copy/catchup_ mode is initiated first: data in the tables are copied over in a consistent manner using bulk inserts. Once we have copied enough data so that we are close enough to the current position (when replication lag is low) it switches over (and stays in) the _replication_ mode. +If an entire table data is requested simple streaming done by _replication_ can create an avalanche of events (think 10s +of millions of rows). Moreover, it is highly likely that earlier binlogs are no longer available. 
+ +So a _copy/catchup_ mode is initiated first: data in the tables are copied over in +a consistent manner using bulk inserts. Once we have copied enough data so that we are close enough to the current +position (when replication lag is low) it switches over to, and stays in, the _replication_ mode. All future replication +is done only by streaming binlog events. -While we may have multiple database sources in a workflow each vstream has just one source and one target. The source is always a vttablet (and hence one mysql instance). The target could be another vttablet (resharding) or a streaming grpc response (vstream api clients). +While we may have multiple database sources in a workflow each vstream has just one source and one target. The source is +always a vttablet (and hence one mysql instance). The target could be another vttablet (resharding) or a streaming grpc +response (vstream api clients). #### Transformation and Filtering -Note that for all steps the data selected from the source will only be from the list of tables specified (specified via Match). Furthermore if a Filter is specified for a table it will be applied before being sent to the target. Columns may also be transformed in the Filter’s select clause. +Note that for all steps the data selected from the source will only be from the tables specified +in the [Match](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL128C5) field of the Rule +specification of the VReplication workflow. Furthermore, if a +[Filter](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL133C5) is specified for a table it will +be applied before being sent to the target. Columns may also be transformed based on the Filter’s select clause. #### Source and Sink -Each stream has two parts. The target initiates streaming by making grpc calls to the source tablet. The source sources the data connecting to mysql as a replica or using sql queries and streams it to the target. 
The target takes appropriate action: in case of resharding it will convert the events into CRUDs and apply it to the target database. In case of vstream clients the events are forwarded by vtgate to the client. +Each stream has two actors: the target initiates streaming by making grpc calls to the source tablet and the source +tablet sources the data by connecting to its underlying mysql server as a replica (while replicating) or using sql +queries (in the copy phase) and streams it to the target. The target takes appropriate action: in case of resharding it +will convert the events into CRUD sql statements and apply them to the target database. In case of vstream clients the +events are forwarded by vtgate to the client. -Note that the target always pulls the data. This ensures that there is no problems of buffer overruns that can occur if the source is pushing the data since (especially in sharding) it is possible that the application of events can be substantially cpu intensive especially in the case of bulk inserts. +Note that the target always pulls data. If the source pushes data, there are chances of buffer overruns if the target is +not able to process them in time. For example, in resharding workflows we need to convert the events to sql insert +statements and execute them on the target's mysql server, which are usually much slower than just selecting data on the +source. ### Modes, in detail - #### Replicate -This is the easiest step to understand. The source stream just mimics a mysql replica and processes events as they are received. Events (after filtering and transformation) are sent to the target. Replication runs continuously with short sleeps when there are no more events to source. +This is the easiest to understand. The source stream just acts like a mysql replica and processes events as they are +received. Events, after any necessary filtering and transformation, are sent to the target. 
Replication runs +continuously with short sleeps when there are no more events to source. Periodic heartbeats are sent to the target to +signal liveliness. #### Initialize -Initialize is called at the start of the copy phase. For each table to be copied an entry is created in \_vt.copy_state with a zero primary key. As each table copy is completed the related entry is deleted and when there are no more entries for this workflow the copy phase is considered complete and the workflow moves into the Replication mode. +Initialize is called at the start of the copy phase. For each table to be copied an entry is created in \_vt.copy_state +with a null primary key. As each table copy is completed the related entry is deleted and when there are no more entries +for this workflow the copy phase is considered complete and the workflow moves into the Replication mode. #### Copy -Copy works on one table at a time. The source selects a set of rows from the table with higher primary keys that the one copied so far using a consistent snapshot. This results in a stream of rows to be sent to the target which generates a bulk insert of these rows. +Copy works on one table at a time. The source selects a set of rows from the table, for primary keys greater than the +ones copied so far, using a consistent snapshot. This results in a stream of rows to be sent to the target which +generates a bulk insert of these rows. -However there are a couple of factors which complicate our story:: +However, there are a couple of factors which complicate our story: -* Each copy selects all rows until the current position of the binlog. -* Since transactions continue to be applied (presuming the database is online) the gtid positions are continuously moving forward +* Each copy selects all rows until the current position of the binlog, but, +* Since transactions continue to be applied (presuming the database is online) the gtid positions are continuously + moving forward Consider this example. 
-We have two tables t1 and t2 and this is how the copy state proceeds: Each has 20 rows and we copy 10 rows at a time. -(Queries are not exact but simplified for readability). +We have two tables X and Y. Each table has 20 rows and we copy 10 rows at a time. +(The queries below are simplified for readability). -If we follow this we get: +The queries for the copy phase of X will be: ``` -T1: select * from t1 where pk > 0 limit 10. GTID: 100, Last PK 10 +T1: select * from X where pk > 0 limit 10. GTID: 100, Last PK 10 send rows to target -T2: select * from t1 where pk > 10 limit 10 GTID: 110, Last PK 20 +T2: select * from X where pk > 10 limit 10 GTID: 110, Last PK 20 send rows to target ``` -Gotcha: however we see that 10 new txs have occurred since T1. Some of these can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have only rows from pk 11 to 20) we will have an inconsistent state on the target: the updates to rows with PK between 1 and 10 will not be present. +There is a gotcha here: consider that there are 10 new txs between times T1 and T2. Some of these can potentially modify +the rows returned from the query at T1. Hence if we just return the rows from T2 (which have only rows from pk 11 to 20) +we will have an inconsistent state on the target: the updates to rows with PK between 1 and 10 will not be present. -This means that we need to first stream the events between 100 to 110 for PK between 1 and 10 first and then do the second select: +This means that we need to first stream the events between GTIDs 100 and 110 for primary keys between 1 and 10 first and +then do the second select: ``` -T1: select * from t1 where pk > 0 limit 10. +T1: select * from X where pk > 0 limit 10. 
GTID: 100, Last PK 10 send rows to target T2: replicate from 100 to current position (110 from previous example), - only pass events for pks 1 to 10 + only pass events for pks 1 to 10 of X -T3: select * from t1 where pk > 10 limit 10 GTID: 112, Last PK 20 +T3: select * from X where pk > 10 limit 10 GTID: 112, Last PK 20 send rows to target ``` -Another gotcha!: Note that at T3 when we selected the pks from 11 to 20 the gtid position has moved further! This happened because of transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. +Another gotcha!: note that at time T3 when we selected the pks from 11 to 20 the gtid position could have moved further! +This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would +still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. This leads us to the following flow: ``` -T1: select * from t1 where pk > 0 limit 10. GTID: 100, Last PK 10 +T1: select * from X where pk > 0 limit 10. GTID: 100, Last PK 10 send rows to target @@ -96,7 +132,7 @@ T2: replicate from 100 to current position (110 from previous example), only pass events for pks 1 to 10 -T3: select * from t1 where pk > 10 limit 10 GTID: 112, Last PK 20 +T3: select * from X where pk > 10 limit 10 GTID: 112, Last PK 20 T4: replicate from 111 to 112 @@ -105,16 +141,23 @@ T4: replicate from 111 to 112 T5: Send rows for pks 11 to 20 to target ``` -This flow actually works! +This flow actually works and is the one used in Vitess VReplication! -T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is quick. So the position can diverge much more at T2 than at T4. Hence we call the step in T2 as Catchup and Step T4 is called Fast Forward. 
+The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is +quick. So the position can diverge much more at T2 than at T4. Hence, we call the step in T2 as Catchup and Step T4 as a +Fast Forward. #### Catchup -As detailed above the catchup phase runs between two copy phases. During the copy phase the gtid position can move significantly ahead. So we run a replicate till we come close to the current position i.e.the replication lag is small. At this point we call Copy again. +As detailed above the catchup phase runs between two copy phases. During the copy phase the gtid position can move +significantly ahead. So we run a replicate till we come close to the current position i.e.the replication lag is small. +At this point we call Copy again. #### Fast forward -During the copy phase we first take a snapshot. Then we fast forward: we run another replicate from the gtid position where we stopped the Catchup to the position of the snapshot. +During the copy phase we first take a snapshot. Then we fast forward: we run another replicate from the gtid position +where we stopped the Catchup to the position of the snapshot. -Finally once we have finished copying all the tables we proceed to replicate until our job is done: for example if we have resharded and switched over the reads and writes to the new shards or when the vstream client closes its connection. +Finally once we have finished copying all the tables we proceed to replicate until our job is done: for example if we +have resharded and switched over the reads and writes to the new shards or when the vstream client closes its +connection. 
From 4ed42e5057010bb3e89399d1179a243e1058b579 Mon Sep 17 00:00:00 2001 From: Rohit Nayak Date: Sun, 8 Jan 2023 00:40:27 +0100 Subject: [PATCH 03/16] Remove character set page which is no longer in sync with current code Signed-off-by: Rohit Nayak --- .../vreplication/internal/character-sets.md | 105 ------------------ 1 file changed, 105 deletions(-) delete mode 100644 content/en/docs/16.0/reference/vreplication/internal/character-sets.md diff --git a/content/en/docs/16.0/reference/vreplication/internal/character-sets.md b/content/en/docs/16.0/reference/vreplication/internal/character-sets.md deleted file mode 100644 index 507bd54dd..000000000 --- a/content/en/docs/16.0/reference/vreplication/internal/character-sets.md +++ /dev/null @@ -1,105 +0,0 @@ ---- -title: Character set support -description: Supported character sets and configuration -weight: 30 ---- - -# Overview - -Ideally all textual data should be unicode. The best character set to be used today is `utf8mb4` (4 byte UTF8), the successor to `utf8`. However, legacy systems may carry non-UTF characters sets, specific to European, Chinese, or other languages. - -VReplication supports copying & streaming across multiple character sets. Moreover, it supports conversion from one character set to another. An important use case is importing from an external data source that uses non-UTF8 encoding, into a UTF8-encoded Vitess cluster. - -Unless told otherwise, VReplication assumes the stream's source and target both use _trivial_ character sets that do not require any special encodings. These are: - -- `utf8` -- `utf8mb4` -- `ascii` -- `binary` - -To be able to work with other character sets: - -- Verify VReplication supports the specific character sets. -- VReplication needs to be told how which character sets it's converting from/to. - -# Supported character sets - -The list of supported character sets is dynamic and may grow. 
You will find it under `CharacterSetEncoding` in https://github.com/vitessio/vitess/blob/main/go/mysql/constants.go - -The current list of supported character sets/encodings is: - -- ascii -- binary -- cp1250 -- cp1251 -- cp1256 -- cp1257 -- cp850 -- cp852 -- cp866 -- gbk -- greek -- hebrew -- koi8r -- latin1 -- latin2 -- latin5 -- latin7 -- utf8 -- utf8mb4 - -# Converting/encoding - -- In VRecpliation's filter query, make sure to convert all non-trivial character sets to UTF like so: - -``` -select ..., convert(column_name using utf8mb4) as column_name, ... -``` - -- In VReplication's rule, add one or more `convert_charset` entries. Each entry is of the form: - -``` -convert_charset:{key:"" value:{from_charset:"" to_charset:""}} -``` - -### Example - -In this simplified example, we wish to stream from this source table: - -```sql -create table source_names ( - id int, - name varchar(64) charset latin1 collate latin1_swedish_ci, - primary key(id) -) -``` - -And into this target table: - -```sql -create table target_names ( - id int, - name varchar(64) charset utf8mb4, - primary key(id) -) -``` - -Note that we wish to convert column `name` from `latin1` to `utf8mb4`. - -The rule would looks like this: - -``` -keyspace:"commerce" shard:"0" filter:{ - rules:{ - match:"target_names" - filter:"select `id` as `id`, convert(`name` using utf8mb4) as `name` from `source_names`" - convert_charset:{key:"name" value:{from_charset:"latin1" to_charset:"utf8mb4"}} - } -} -``` - -# Internal notes - -Right now `to_charset` is not actually used in the code. The write works correctly whether `to_charset` is specified or not, and irrespective of its value. It "just works"" because the data gets encoded from a `utf8` in Go-plane, via MySQL connector and onto the specific column. However, future implementations may require explicit definition of `to_charset`. - -As for the filter query, right now it's the user's responsibility to identify non-UTF columns in the source table. 
In the future, Vitess should be able to auto detect those, and automatically select `convert(col_name using utf8mb4) as col_name`. From d85ec45b4e8f76d7bd2cf1c9448beb2bf7fb9d6a Mon Sep 17 00:00:00 2001 From: Rohit Nayak Date: Sun, 8 Jan 2023 01:48:58 +0100 Subject: [PATCH 04/16] Update cutover doc Signed-off-by: Rohit Nayak --- .../vreplication/internal/cutover.md | 82 ++++++++++++++----- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/content/en/docs/16.0/reference/vreplication/internal/cutover.md b/content/en/docs/16.0/reference/vreplication/internal/cutover.md index 40e1455e0..adc3cd514 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/16.0/reference/vreplication/internal/cutover.md @@ -7,7 +7,13 @@ weight: 30 # Related persistent Vitess objects {{< info >}} -As the objects or keys noted below are stored in [the topo server](../../../reference/features/topology-service/) and cached locally, the processes involved will refresh their topo data throughout the cutover process. For example, each tablet on the source and target shards that are involved in a [VReplication](../../../reference/vreplication/) workflow will refresh their topo data multiple times as the state of things transition during the cutover. If we are *not* able to confirm that all tablets involved in a VReplication worfklow are able to refresh their topo data then the cutover command — e.g. [`vtctlclient SwitchTraffic`](../../../reference/vreplication/switchtraffic/) — will cancel the operation and return an error indicating which tablet(s) is unhealthy (including `--dry_run` executions). +As the objects or keys noted below are stored in [the topo server](../../../reference/features/topology-service/) and +cached locally, the processes involved will refresh their topo data throughout the cutover process. 
For example, each +tablet on the source and target shards that are involved in a [VReplication](../../../reference/vreplication/) workflow +will refresh their topo data multiple times as the state of things transition during the cutover. If we are *not* able +to confirm that all tablets involved in a VReplication workflow are able to refresh their topo data then the cutover +command — e.g. [`vtctlclient SwitchTraffic`](../../../reference/vreplication/switchtraffic/) — will cancel the operation +and return an error indicating which tablet(s) is unhealthy (including `--dry_run` executions). {{< /info >}} ## VSchema @@ -16,7 +22,13 @@ A [VSchema](../../../concepts/vschema/) allows you to describe how data is organ ## Shard Info -The [`global` topo](../../../reference/features/topology-service/#global-vs-local) contains one [`Shard`](../../../reference/features/topology-service/#shard) key per keyspace which then contains one key per shard that has been created within the keyspace. For each shard that is healthy there is an attribute `is_primary_serving` which is set to true. The other shards which have been created but are still not healthy and serving within the keyspace will not have this attribute set. Here is an example shard info record from an unsharded keyspace named commerce (without the `--cell` flag being passed the `global` topo base path is used): +The [`global` topo](../../../reference/features/topology-service/#global-vs-local) contains +one [`Shard`](../../../reference/features/topology-service/#shard) key per keyspace which then contains one key per +shard that has been created within the keyspace. For each shard that is healthy there is an +attribute `is_primary_serving` which is set to true. The other shards which have been created but are still not healthy +and serving within the keyspace will not have this attribute set. 
Here is an example shard info record from an unsharded +keyspace named commerce (without the `--cell` flag being passed the `global` topo base path is used): + ```bash $ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/keyspaces/commerce/shards/0/Shard' primary_alias:{cell:"zone1" uid:100} primary_term_start_time:{seconds:1650341417 nanoseconds:374817485} is_primary_serving:true @@ -24,9 +36,18 @@ primary_alias:{cell:"zone1" uid:100} primary_term_start_time:{seconds:1650341417 ## SrvKeyspace -Each cell has a [`SrvKeyspace`](../../../reference/features/topology-service/#srvkeyspace) key in the [`local` topo](../../../reference/features/topology-service/#global-vs-local) (per cell info) for each keyspace. For each tablet type (primary/replica) there is one `partitions` object. The `partitions` objects contain all of the current shards in the keyspace. For sharded keyspaces, the tablets which are healthy and serving have a key range specified for that shard. +Each cell has a [`SrvKeyspace`](../../../reference/features/topology-service/#srvkeyspace) key in +the [`local` topo](../../../reference/features/topology-service/#global-vs-local) (per cell info) for each keyspace. For +each tablet type (primary/replica) there is one `partitions` object. The `partitions` objects contain all of the current +shards in the keyspace. For sharded keyspaces, the tablets which are healthy and serving have a key range specified for +that shard. + +Also the primary can contain a `query_service_disabled` attribute which is set to `true` during resharding cutovers. +This tells the primary in that shard to reject any queries made to it, as a signal to vtgate in case vtgate routes +queries to this primary during the cutover or before it discovers the new serving graph. 
Here is an example using the +same unsharded commerce keyspace and here we specify the `--cell` flag so that cell's topo base path — stored in +its `CellInfo` record in the `global` topo — is used: -Also the primary can contain a `query_service_disabled` attribute which is set to false during resharding cutovers. This tells the primary in that shard to reject any queries made to it, as a signal to vtgate in case vtgate routes queries to this primary during the cutover or before it discovers the new serving graph. Here is an example using the same unsharded commerce keyspace and here we specify the `--cell` flag so that cell's topo base path — stored in its `CellInfo` record in the `global` topo — is used: ```bash $ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/cells/zone1/CellInfo' server_address:"localhost:2379" root:"/vitess/zone1" @@ -37,29 +58,44 @@ partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_ ## Routing Rules -[Routing Rules](../../../reference/features/schema-routing-rules) are stored in the `RoutingRules` key within the `global` topo. Routing Rules contain a list of table-specific routes. You can route a table for all or specific tablet types to another table in the same or different keyspace. Here is an example using the same commerce keyspace where we have an active [`MoveTables`](../../../reference/vreplication/movetables/) workflow to move tables to the customer keyspace but we have not switched any traffic yet: +[Routing Rules](../../../reference/features/schema-routing-rules) are stored in the `RoutingRules` key within +the `global` topo. Routing Rules contain a list of table-specific routes. You can route a table for all or specific +tablet types to another table in the same or different keyspace. 
Here is an example using the same commerce keyspace +where we have an active [`MoveTables`](../../../reference/vreplication/movetables/) workflow to move tables to the +customer keyspace but we have not switched any traffic yet: + ```bash $ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/RoutingRules' rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer.corder" to_tables:"commerce.corder"} rules:{from_table:"customer.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"commerce.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"corder" to_tables:"commerce.corder"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} ``` {{< info >}} -In practice you would instead typically view the routing rules via the dedicated [`vtctl GetRoutingRules`](../../../reference/programs/vtctl/schema-version-permissions/#getroutingrules) command which will return the rules for all keyspaces in the topo. +In practice you would instead typically view the routing rules via the +dedicated [`vtctl GetRoutingRules`](../../../reference/programs/vtctl/schema-version-permissions/#getroutingrules) +command which will return the rules for all keyspaces in the topo. 
{{< /info >}} # How VTGate routes a query -This section gives a simplified logic used to determine which keyspace and table vtgate will route a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` (a _write_ query). +This section gives a simplified logic used to determine which keyspace and table vtgate will route a simple query of the +form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` (a _write_ query). * Check to see if t1 has an appropriate routing rule defined. If so, use the specified target table as an alias for t1 * Locate the keyspace for t1 using the VSchema -* For a non-sharded keyspace locate the appropriate tablet (primary, by default) from the (cached) `SrvKeyspace` `local` (per cell) topo record. -* For a sharded keyspace the `SrvKeyspace` record is used to find the currently active shards. This is done by checking the list of partitions for the specific tablet type selected for the query (primary, by default, for reads and writes) and selecting the ones whose `query_service_disabled` is not set and whose `is_primary_serving` is true. -* Finally, based on the vindex for the table from the cached `VSchema` (stored in the `global` topo), the shard for the relevant row is computed based on the keyrange to which the id is mapped to using the declared vindex function/type. +* For a non-sharded keyspace locate the appropriate tablet (primary, by default) from the ( + cached) `SrvKeyspace` `local` (per cell) topo record. +* For a sharded keyspace the `SrvKeyspace` record is used to find the currently active shards. This is done by checking + the list of partitions for the specific tablet type selected for the query (primary, by default, for reads and writes) + and selecting the ones whose `query_service_disabled` is not set and whose `is_primary_serving` is true. 
+* Finally, based on the vindex for the table from the cached `VSchema` (stored in the `global` topo), the shard for the + relevant row is computed based on the keyrange to which the id is mapped to using the declared vindex function/type. # Changes made to the topo when traffic is switched -This document outlines the steps involved in the cutover process of [`MoveTables`](../../../reference/vreplication/movetables/) and [`Reshard`](../../../reference/vreplication/reshard/) workflows when traffic is switched from the source tables/shards to the target tables/shards. We use the resharding flow provided in the local examples and show the relevant snippets from the topo for each step in the workflow. +This document outlines the steps involved in the cutover process +of [`MoveTables`](../../../reference/vreplication/movetables/) and [`Reshard`](../../../reference/vreplication/reshard/) +workflows when traffic is switched from the source tables/shards to the target tables/shards. We use the resharding flow +provided in the local examples and show the relevant snippets from the topo for each step in the workflow. Note: Items in italics are topo keys and the following snippet the value of the key @@ -69,7 +105,8 @@ For brevity we only show the records for the 80- shard. There will be similar re #### Before Resharding, after -80/80- shards are created -Only shard 0 has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to 0 for both primary and replica. +Only shard 0 has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to 0 for both primary +and replica. 
_global/keyspaces/customer/shards/0/Shard_ @@ -94,7 +131,7 @@ partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} ``` -### After replica traffic is switched (aka SwitchReads) +### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) Shard 0 still has the `is_primary_serving` set as true. The primary partition is still the same. @@ -134,7 +171,7 @@ shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}}} ``` -#### After primary traffic is switched (aka SwitchWrites) +#### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) * `is_primary_serving` is removed from shard 0 * `is_primary_serving` is added to shards -80 and 80- @@ -182,7 +219,8 @@ VSchema for the source keyspace contains the table name, so vtgate routes to tha #### During MoveTables -Both the source and target now contain the tables and both VSchemas refer to them. However we have routing rules that map the tables for each tablet type from the target keyspace to the other +Both the source and target now contain the tables and both VSchemas refer to them. However we have routing rules that +map the tables for each tablet type from the target keyspace to the other _global/RoutingRules_ @@ -207,7 +245,9 @@ rules:{from_table:"customer@replica" to_tables:"customer.customer"} #### On switching primary traffic -The routing rules for the primary are updated to map the table on the source to the target. In addition the tables are added to the “denylist” on the source keyspace which vttablet uses to reject writes for tables that have moved. The denylist/routing rules are temporary and can be removed since the moved tables will only appear in the target VSchema +The routing rules for the primary are updated to map the table on the source to the target. 
In addition the tables are +added to the “denylist” on the source keyspace which vttablet uses to reject writes for tables that have moved. The +denylist/routing rules are temporary and can be removed since the moved tables will only appear in the target VSchema _global/RoutingRules_ @@ -229,8 +269,8 @@ is_primary_serving:true # Miscellaneous Notes: -* In VReplication workflows, cutovers are performed manually by the user executing the noted `vtctl` commands -* [`SwitchReads`](../../../reference/vreplication/v1/switchreads/) and [`SwitchWrites`](../../../reference/vreplication/v1/switchwrites/) are deprecated terms from the [“v1” workflows](../../../reference/vreplication/v1/) and are now replaced by [`SwitchTraffic`](../../../reference/vreplication/switchtraffic/) and [`ReverseTraffic`](../../../reference/vreplication/reversetraffic/) in the [“v2” workflows](../../../reference/vreplication/) . This section mentions both terms since the nomenclature has changed in recent versions and the v1 names may be more well known. -* The term [`SwitchReads`](../../../reference/vreplication/v1/switchreads/) refers to switching traffic for replica and rdonly tablets. Of course this is by definition read-only traffic. Traffic to the primary tablets is not affected. This is equivalent to [`SwitchTraffic`](../../../reference/vreplication/switchtraffic/) for replica and rdonly tablets (if you do not specify primary as a tablet_type for the [`SwitchTraffic`](../../../reference/vreplication/switchtraffic/) command). -* [`SwitchWrites`](../../../reference/vreplication/v1/switchwrites/) refers to switching all traffic for the primary tablets. Equivalent to [`SwitchTraffic`](../../../reference/vreplication/switchtraffic/) for primary tablet types. 
-* [`SwitchReads`](../../../reference/vreplication/v1/switchreads/) and [`SwitchWrites`](../../../reference/vreplication/v1/switchwrites/) can also reverse traffic based on the options/parameters provided to them +* In VReplication workflows, cutovers are performed manually by the user executing the `vtctl` + commands [`SwitchTraffic`](../../../reference/vreplication/switchtraffic/) + and [`ReverseTraffic`](../../../reference/vreplication/reversetraffic/) +* When traffic for replica and rdonly tablets is switched not all read traffic is switched: primary reads will still be + from the source shards, until primary traffic is also switched. From 3c8e29b674623ac1e3d571a7037b08f9130f5a55 Mon Sep 17 00:00:00 2001 From: Rohit Nayak Date: Sat, 14 Jan 2023 21:40:06 +0100 Subject: [PATCH 05/16] Reorganized design docs and refactored a few Signed-off-by: Rohit Nayak --- .../vreplication/internal/cutover.md | 2 +- .../reference/vreplication/internal/keys.md | 172 ++++++++++++------ .../internal/{vstream => }/tracker.md | 65 ++++--- .../internal/vstream-skew-detection.md | 81 +++++++++ ...gration.md => vstream-stream-migration.md} | 17 +- .../vreplication/internal/vstream/_index.md | 9 - .../internal/vstream/skew-detection.md | 55 ------ .../vreplication/internal/vstream/vscopy.md | 134 -------------- 8 files changed, 254 insertions(+), 281 deletions(-) rename content/en/docs/16.0/reference/vreplication/internal/{vstream => }/tracker.md (55%) create mode 100644 content/en/docs/16.0/reference/vreplication/internal/vstream-skew-detection.md rename content/en/docs/16.0/reference/vreplication/internal/{vstream/stream-migration.md => vstream-stream-migration.md} (88%) delete mode 100644 content/en/docs/16.0/reference/vreplication/internal/vstream/_index.md delete mode 100644 content/en/docs/16.0/reference/vreplication/internal/vstream/skew-detection.md delete mode 100644 content/en/docs/16.0/reference/vreplication/internal/vstream/vscopy.md diff --git 
a/content/en/docs/16.0/reference/vreplication/internal/cutover.md b/content/en/docs/16.0/reference/vreplication/internal/cutover.md index adc3cd514..8c3276c9e 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/16.0/reference/vreplication/internal/cutover.md @@ -1,7 +1,7 @@ --- title: How traffic is switched description: How Vitess signals traffic cutover for Reshard and MoveTables -weight: 30 +weight: 2 --- # Related persistent Vitess objects diff --git a/content/en/docs/16.0/reference/vreplication/internal/keys.md b/content/en/docs/16.0/reference/vreplication/internal/keys.md index 0bc65a653..6dbd1b0a7 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/16.0/reference/vreplication/internal/keys.md @@ -1,71 +1,102 @@ --- -title: Keys and iteration -description: How VReplication utilizes keys on source and target tables, requirements and limitations -weight: 30 +title: Role of table keys in VReplication +description: Uses and requirements for primary and unique keys in source and target table in VReplication Workflows +weight: 3 --- # The use of unique keys -A VReplication stream copies data from a table on a source target to a table on a target tablet. In some cases the two tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing rows from the source table onto the target table, as well as identify binary log events from the source tablet, and apply them onto the target table. To that effect, VReplication needs to be able to uniquely identify rows, so that it can apply a specific `UPDATE` on the correct row, or so that it knows all rows _up to a given row_ have been copied. +A VReplication stream copies data from a table on a source target to a table on a target tablet. In some cases, the two +tablets may be the same one, but the stream is oblivious to such nuance. 
VReplication needs to be able to copy existing +rows from the source table to the target table, as well as identify binary log events from the source tablet, and +apply them to the target table. To that effect, VReplication needs to be able to uniquely identify rows, so that it +can apply a specific `UPDATE` on the correct row, or so that it knows all rows _up to a given row_ have been copied. -Each row needs to be uniquely identifiable. In the relational model this is trivially done by utilizing `UNIQUE KEY`s, preferably `PRIMARY KEY`s. A `UNIQUE KEY` made up of non-`NULL`able columns is considered a `PRIMARY KEY` equivalent (PKE) for this purpose. +Each row needs to be uniquely identifiable. In the relational model, this is trivially done by utilizing `UNIQUE KEY`s, +preferably `PRIMARY KEY`s. A `UNIQUE KEY` made up of non-`NULL`able columns is considered a `PRIMARY KEY` equivalent ( +PKE) for this purpose. -Typically, both source and target table have similar structure, and same keys. +Typically, both the source and the target table have a similar structure and the same keys. -In fact, In the most common use case both tables will have same `PRIMARY KEY` covering same set of columns in same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, and it is possible to have different keys on source and target table. +In fact, in the most common use case, both tables will have the same `PRIMARY KEY` covering the same set of columns in +the same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, +and it is possible to have different keys on the source and the target table. ## Which keys are eligible? -Any `UNIQUE KEY` that is non-`NULL`able potentially qualifies. A `NULL`able `UNIQUE KEY` is a key that covers one or more `NULL`able columns. It doesn't matter if column values do or do not actually contain `NULL`s. 
If a column is `NULL`able, then a `UNIQUE KEY` that includes that column is not eligible. +Any `UNIQUE KEY` that is non-`NULL`able potentially qualifies. A `NULL`able `UNIQUE KEY` is a key that covers one or +more `NULL`able columns. It doesn't matter if column values do or do not actually contain `NULL`s. If a column is `NULL` +able, then a `UNIQUE KEY` that includes that column is not eligible. -`PRIMARY KEY`s are by definition always non-`NULL`able. A `PRIMARY KEY` (PK) is typically the best choice. It gives best iteration/read performance on InnoDB tables, as those are clustered by PK order. +`PRIMARY KEY`s are by definition always non-`NULL`able. A `PRIMARY KEY` (PK) is typically the best choice. It gives best +iteration/read performance on InnoDB tables, as those are clustered by PK order. -`PRIMARY KEY` aside, `VReplication` prioritizes keys that utilize e.g. integers rather than texts, and more generally prioritizes smaller data types over larger data types. +`PRIMARY KEY` aside, `VReplication` prioritizes keys that utilize e.g. integers rather than texts, and more generally +prioritizes smaller data types over larger data types. -However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described below. +However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described +below. ## Comparable rows VReplication needs to be able to determine, given a row in the source table, which row it maps to in the target table. -In the case both tables share the same `PRIMARY KEY`, the answer is trivial: given a row from the source table, take the PK column values (say the table has `PRIMARY KEY(col1, col2)`), and compare with/apply to target table via `... WHERE col1= AND col2=`. 
+In the case both tables share the same `PRIMARY KEY`, the answer is trivial: given a row from the source table, take the +PK column values (say the table has `PRIMARY KEY(col1, col2)`), and compare with/apply to target table +via `... WHERE col1= AND col2=`. -However, other scenarios are valid. Consider an OnlineDDL operation that modifies the `PRIMARY KEY` as follows: `DROP PRIMARY KEY, ADD PRIMARY KEY(col1)`. On the source table, a row is identified by `col1, col2`. On the target table, a row is only identifiable by `col1`. This scenario still feels comfortable: all we need to do when we apply e.g. an `UPDATE` statement on target table is to drop `col2` from the statement: `... WHERE col1=`. +However, other scenarios are valid. Consider an OnlineDDL operation that modifies the `PRIMARY KEY` as +follows: `DROP PRIMARY KEY, ADD PRIMARY KEY(col1)`. On the source table, a row is identified by `col1, col2`. On the +target table, a row is only identifiable by `col1`. This scenario still feels comfortable: all we need to do when we +apply e.g. an `UPDATE` statement on the target table is to drop `col2` from the statement: `... WHERE col1=`. -_Note that it is the user's responsibility to make sure the data will comply with the new constraints. If not, VReplication will fail the operation._ +_Note that it is the user's responsibility to make sure the data will comply with the new constraints. If not, +VReplication will fail the operation._ -But consider the opposite case, there's a `PRIMARY KEY(col1)` and an OnlineDDL operation turns it into `PRIMARY KEY(col1, col2)`. Now we need to apply changes `... WHERE col1= AND col2=`. But `col2` is not part of the source `PRIMARY KEY`. +But consider the opposite case, there's a `PRIMARY KEY(col1)` and an OnlineDDL operation turns it +into `PRIMARY KEY(col1, col2)`. Now we need to apply changes `... WHERE col1= AND col2=`. But `col2` is not +part of the source `PRIMARY KEY`. 
-An extreme case is when the keys on source table and target table do not share _any columns_ between them. Say source table has `PRIMARY KEY(col1)` and target table has `PRIMARY KEY(col2)` and with no other potential keys. We still need to identify which row in source table maps to which row in the target table. VReplication still supports this scenario. +An extreme case is when the keys on the source table and the target table do not share _any columns_ between them. Say +the source table has `PRIMARY KEY(col1)` and the target table has `PRIMARY KEY(col2)` and with no other potential keys. +We still +need to identify which row in the source table maps to which row in the target table. VReplication still supports this +scenario. -Yet another complication is when columns are renamed along the way. Consider a `ALTER TABLE CHANGE COLUMN col2 col_two INT UNSIGNED ...`. A row on the source table is identified by `col1, col2`, but on the target table it is identified by `col1, col_two`. +Yet another complication is when columns are renamed along the way. Consider +an `ALTER TABLE CHANGE COLUMN col2 col_two INT UNSIGNED ...`. A row on the source table is identified by `col1, col2`, +but on the target table it is identified by `col1, col_two`. Let's now discuss what the exact requirements are for unique keys, and then discuss implementation. 
## Requirements -To be able to create a VReplication stream between source table and target table: +To be able to create a VReplication stream between the source table and target table: -- The source table must have a non-`NULL`able `UNIQUE/PRIMARY` key (PK or PKE) whose columns all exist in the target table (possibly under different names) -- The target table must have a non-`NULL`able `UNIQUE/PRIMARY` key whose columns all exist in the source table (possibly under different names) -- Except in the trivial case where both tables share same `PRIMARY KEY` (of same columns in same order), VReplication can automatically determine which keys to utilize (more on this later on) +- The source table must have a non-`NULL`able `UNIQUE/PRIMARY` key (PK or PKE) whose columns all exist in the target + table (possibly under different names) +- The target table must have a non-`NULL`able `UNIQUE/PRIMARY` key whose columns all exist in the source table (possibly + under different names) +- Except in the trivial case where both tables share the same `PRIMARY KEY` (of the same columns in the same order), + VReplication can automatically determine which keys to utilize (more on this later on) To clarify, it is **OK** if: -- Keys in source table and target table go by different names -- Chosen key in source table and chosen key in target table do not share any columns -- Chosen key in source table and chosen key in target table share some or all columns -- Chosen key in source table and chosen key in target table share some or all columns but in different order -- There are keys in source table that cover columns not present in target table -- There are keys in target table that cover columns not present in source table -- There are `NULL`able columns in source and target table -- There are `NULL`able keys in source and target table +- Keys in the source table and the target table go by different names +- Chosen key in the source table and chosen key in the target table do not share 
any columns +- Chosen key in the source table and chosen key in the target table share some or all columns +- Chosen key in the source table and chosen key in the target table share some or all columns but in a different order +- There are keys in the source table that cover columns not present in the target table +- There are keys in the target table that cover columns not present in the source table +- There are `NULL`able columns in the source and the target table +- There are `NULL`able keys in the source and the target table -All it takes is _one_ viable key that can be used to uniqely identify rows in the source table, and one such viable key in target table to allow VReplication to work. +All it takes is _one_ viable key that can be used to uniquely identify rows in the source table, and one such viable key +in the target table to allow VReplication to work. -### Examples for valid cases +### Examples of valid cases -#### Source table and target table are same: +#### Source table and target table are the same: ```sql CREATE TABLE `entry` ( @@ -77,16 +108,16 @@ CREATE TABLE `entry` ( ) ``` -The above is the trivial scenario. +The above is a trivial scenario. -#### Source table and target table share same PRIMARY KEY +#### Source table and target table share the same PRIMARY KEY ```sql CREATE TABLE `source` ( `id` int NOT NULL, `uuid` varchar(40) DEFAULT NULL, `ts` timestamp NULL DEFAULT NULL, - `customer_id` int , + `customer_id` int, PRIMARY KEY (`id`), KEY ts_idx(`ts`) ) @@ -100,7 +131,7 @@ CREATE TABLE `target` ( ) ``` -The differences in structure are interesting, but irrelevant to VReplication's ability to copy the data. +The differences in structure are interesting but irrelevant to VReplication's ability to copy the data. 
#### Subset PRIMARY KEY @@ -192,11 +223,11 @@ The only eligible solution in the above is: - Use `source`'s `PRIMARY KEY` (the column `uuid` is found in `target`) - Use `target`'s `uuid_idx` key (again using column `uuid` which is found in `source`). -`target`'s `PRIMARY KEY` is not valid because covered column `id` does not exist in `source`. +`target`'s `PRIMARY KEY` is not valid because the covered column `id` does not exist in `source`. Incidentally, in the above, the chosen keys differ by name, but share the same columns (`uuid`). -### Examples for invalid cases +### Examples of invalid cases #### NULLable columns @@ -243,7 +274,8 @@ CREATE TABLE `target` ( ## Configuring the stream -If both source and target table share the same `PRIMARY KEY` (covering same columns in same order) then there's nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. +If both source and target table share the same `PRIMARY KEY` (covering the same columns in the same order) then there's +nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. In all other cases, VReplication must determine which keys are involved and which ones to use. @@ -261,7 +293,8 @@ CREATE TABLE `corder` ( ) ``` -And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition (prettified for readability): +And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition ( +prettified for readability): ``` keyspace:"commerce" shard:"0" filter:{ @@ -279,11 +312,12 @@ In the above: - `source_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on source table - `target_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on target table -- `source_unique_key_target_columns` is the (comma delimited) list of column names in target table, which map to `source_unique_key_columns`. 
This mapping is necessary because columns may change their names. +- `source_unique_key_target_columns` is the (comma delimited) list of column names in target table, which map + to `source_unique_key_columns`. This mapping is necessary because columns may change their names. ### Example 2 -Again both source and target table share same `PRIMARY KEY`, but this time it covers two columns: +Again both the source and the target table share same `PRIMARY KEY`, but this time it covers two columns: ```sql CREATE TABLE `shipment` ( @@ -335,41 +369,67 @@ keyspace:"commerce" shard:"0" filter:{ Note: -- `source_unique_key_columns` indicates names of columns on source table -- `target_unique_key_columns` indicates names of columns on target table +- `source_unique_key_columns` indicates the names of columns on the source table +- `target_unique_key_columns` indicates the names of columns on the target table - `source_unique_key_target_columns` repeats `source_unique_key_columns`, but replaces `customer_id` with `cust_id` ## Automation -OnlineDDL has a mechanism to automatically analyze the differences between source and target tables, evaluate eligible keys, choose the keys on source and target tables, and populate the filter's `source_unique_key_columns`, `target_unique_key_columns`, `source_unique_key_target_columns` fields. Indeed, OnlineDDL operations are most susceptible to differences in keys. The user can also supply their chosen values as an override — using those fields in the workflow definition — in the rare case it's needed. +OnlineDDL has a mechanism to automatically analyze the differences between source and target tables, evaluate eligible +keys, choose the keys on source and target tables, and populate the +filter's `source_unique_key_columns`, `target_unique_key_columns`, `source_unique_key_target_columns` fields. Indeed, +OnlineDDL operations are most susceptible to differences in keys. 
The user can also supply their chosen values as an +override — using those fields in the workflow definition — in the rare case it's needed. -VReplication more broadly will automatically use the most efficient `PRIMARY KEY` equivalent (non-`NULL`able unique key) when there's no defined `PRIMARY KEY` on the table. +VReplication more broadly will automatically use the most efficient `PRIMARY KEY` equivalent (non-`NULL`able unique key) +when there's no defined `PRIMARY KEY` on the table. ## Implementation At a high level, this is how VReplication is able to work with different keys/columns. -Originally, VReplication was only designed to work with identical `PRIMARY KEY`s. If not specified, VReplication assumed the source table's `PRIMARY KEY` _can be used_ on target table, and that target table's `PRIMARY KEY` applied to the source table. If not, it would error out and the workflow would fail. +Originally, VReplication was only designed to work with identical `PRIMARY KEY`s. If not specified, VReplication assumed +the source table's `PRIMARY KEY` _can be used_ on the target table, and that the target table's `PRIMARY KEY` is applied +to the source table. If not, it would error out and the workflow would fail. -With the introduction of mechanisms to automatically determine the optimal key to use and of the `source_unique_key_columns`, `target_unique_key_columns`, `source_unique_key_target_columns` fields for more fine-grained control, VReplication changes behavior as follows: +With the introduction of mechanisms to automatically determine the optimal key to use and of +the `source_unique_key_columns`, `target_unique_key_columns`, `source_unique_key_target_columns` fields for more +fine-grained control, VReplication changes behavior as follows: #### Notes about the code -Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology becomes incorrect. 
However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses a `PRIMARY KEY` or pkColumns etc., it may refer to a non-PK Unique Key (PKE). +Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology +becomes incorrect. However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses +a `PRIMARY KEY` or pkColumns, etc., it may refer to a non-PK Unique Key (PKE). ### Streamer -Streaming is done using the `source_unique_key_columns` value, if present. When present, `rowstreamer` trusts the information in `source_unique_key_columns` to be correct. It does not validate that there is indeed a valid unique key covering those columns, it only validates that the columns exist. When a `source_unique_key_columns` value is not present, `rowstreamer` uses the `PRIMARY KEY` columns if they exist, otherwise it will determine the best available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it will use all of the columns in the table. +Streaming is done using the `source_unique_key_columns` value if present. When present, `rowstreamer` trusts the +information in `source_unique_key_columns` to be correct. It does not validate that there is indeed a valid unique key +covering those columns, it only validates that the columns exist. When a `source_unique_key_columns` value is not +present, `rowstreamer` uses the `PRIMARY KEY` columns if they exist, otherwise it will determine the best +available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it will use all of the +columns in the table. -The streamer iterates the table by the chosen index's column order. It then tracks its progress in `lastPk` as if this was indeed a true `PRIMARY KEY`. +The streamer iterates the table by the chosen index's column order. It then tracks its progress in `lastPk` as if this +was indeed a true `PRIMARY KEY`. 
### Copier -VCopier receives rows from the streamer in the chosen index's column order. It complies with the streamer's ordering. When tracking progress in `_vt.copy_state` it uses `lastPk` values from the streamer, which means it uses the same index columns as the streamer in that order. +VCopier receives rows from the streamer in the chosen index's column order. It complies with the streamer's ordering. +When tracking progress in `_vt.copy_state` it uses `lastPk` values from the streamer, which means it uses the same index +columns as the streamer in that order. ### Player -VPlayer adhers to both `source_unique_key_columns` and `target_unique_key_columns` when present. If not present, again it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it will determine the best available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it will use all of the columns in the table. - -- `TablePlan`'s `isOutsidePKRange()` function needs to compare values according to `rowstreamer`'s ordering, therefore uses the chosen index columns in order -- `tablePlanBuilder`'s `generateWhere()` function uses the target table's `target_unique_key_columns`, and then also appends any supplemental columns from `source_unique_key_target_columns` not included in `target_unique_key_columns` when they are present. If not present, again it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it will determine the best available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it will use all of the columns in the table. +VPlayer adheres to both `source_unique_key_columns` and `target_unique_key_columns` when present. If not present, again +it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it will determine the best available `PRIMARY KEY` +equivalent if one exists, and lastly if none of these are available it will use all of the columns in the table. 
+ +- `TablePlan`'s `isOutsidePKRange()` function needs to compare values according to `rowstreamer`'s ordering, therefore + uses the chosen index columns in order +- `tablePlanBuilder`'s `generateWhere()` function uses the target table's `target_unique_key_columns`, and then also + appends any supplemental columns from `source_unique_key_target_columns` not included in `target_unique_key_columns` + when they are present. If not present, again it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it + will determine the best available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it + will use all of the columns in the table. diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream/tracker.md b/content/en/docs/16.0/reference/vreplication/internal/tracker.md similarity index 55% rename from content/en/docs/16.0/reference/vreplication/internal/vstream/tracker.md rename to content/en/docs/16.0/reference/vreplication/internal/tracker.md index def41d817..9535f234e 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/vstream/tracker.md +++ b/content/en/docs/16.0/reference/vreplication/internal/tracker.md @@ -2,21 +2,26 @@ title: Schema Tracker description: Tracking schema changes in Vstreams aliases: ["/user-guide/update-stream"] -weight: 1 +weight: 4 --- # Tracking schema changes in Vstreams ## Motivation -Currently, Vstreams work with a single (the latest) database schema. On every DDL the schema engine reloads the schema from the database engine. +Currently, Vstreams work with a single (the latest) database schema. On every DDL the schema engine reloads the schema +from the database engine. -All Vstreams on a tablet share a common engine. Vstreams that are lagging might be seeing a newer (and hence incorrect) version of the schema in case ddls were applied in between. +All Vstreams on a tablet share a common schema engine. 
Vstreams that are lagging might be seeing a newer (and hence +incorrect) version of the schema in case ddls were applied in between. -In addition reloading schemas is an expensive operation. If there are multiple Vstreams each of them will separately receive a DDL event resulting in multiple reloads for the same DDL. +In addition, reloading schemas is an expensive operation. If there are multiple Vstreams each of them will separately +receive a DDL event resulting in multiple reloads for the same DDL. {{< info >}} -For full functionality, schema tracking relies on non-default Vitess vttablet options: `-watch_replication_stream` and `-track_schema_versions`. Specifically, performing a Vstream from a non-primary tablet while concurrently making DDL changes to the keyspace without one or both of these tablet options will result in incorrect Vstream results. +For full functionality, schema tracking relies on non-default Vitess vttablet options: `-watch_replication_stream` +and `-track_schema_versions`. Specifically, performing a Vstream from a non-primary tablet while concurrently making DDL +changes to the keyspace without one or both of these tablet options could result in incorrect Vstream results. {{< /info >}} ## Goals @@ -26,13 +31,16 @@ For full functionality, schema tracking relies on non-default Vitess vttablet op ## Model -We add a new schema_version table in \_vt with columns, including, the gtid position, the schema as of that position, and the ddl that led to this schema. Inserting into this table generates a Version event in Vstream. +We add a new schema_version table in \_vt with columns, including, the gtid position, the schema as of that position, +and the ddl that led to this schema. Inserting into this table generates a Version event in Vstream. ## Actors #### Schema Engine -Schema engine gets the schema from the database and only keeps the last (latest) copy it loaded. It notifies subscribers if the schema changes. 
It polls for the latest schema at intervals or can be explicitly requested to load the schema. +Schema engine gets the schema from the database and only keeps the last (latest) copy it loaded. It notifies subscribers +if the schema changes. It polls for the latest schema at intervals or can be explicitly requested to load the schema for +a tablet using `vtctl`'s `ReloadSchema`. #### Replication watcher @@ -40,16 +48,21 @@ Replication watcher is a Vstream that is started by the tabletserver. It notifie #### Version Tracker -Version tracker runs on the primary. It subscribes to the replication watcher and inserts a new row into the schema_version table with the latest schema. +Version tracker runs on the primary. It subscribes to the replication watcher and inserts a new row into the +\_vt.schema_version table with the latest schema. #### Version Historian -Version historian runs on both primary and replica and handles DDL events. For a given GTID it looks up its cache to check if it has a schema valid for that GTID. If not, on the replica, it looks up the schema_version table. If no schema is found then it provides the latest schema which is updated by subscribing to the schema engine’s change notification. +Version historian runs on both primary and replica and handles DDL events. For a given GTID it looks up its cache to +check if it has a schema valid for that GTID. If not, on the replica, it looks up the schema_version table. If no schema +is found then it provides the latest schema -- which is updated by subscribing to the schema engine’s change notification. ### Notes - Schema Engine is an existing service -- Replication Watcher already exists and is used as an optional Vstream that the user can run. It doesn’t do anything specific: it is used for the side-effect that a Vstream loads the schema on a DDL, to proactively load the latest schema. +- Replication Watcher is used as an optional Vstream that the user can run. 
It doesn’t do anything + specific: it is used for the side-effect that a Vstream loads the schema on a DDL, to proactively load the latest + schema. ## Basic Flow for version tracking @@ -57,21 +70,24 @@ Version historian runs on both primary and replica and handles DDL events. For a #### Version tracker: -1. When the primary comes up the replication watcher (a Vstream) is started from the current GTID position. Tracker subscribes to the watcher. +1. When the primary comes up the replication watcher (a Vstream) is started from the current GTID position. The Tracker + subscribes to the watcher. 1. Say, a DDL is applied 1. The watcher Vstream sees the DDL and - 1. asks the schema engine to reload the schema, also providing the corresponding gtid position - 2. notifies the tracker of a schema change +1. asks the schema engine to reload the schema, also providing the corresponding gtid position +1. notifies the tracker of a schema change 1. Tracker stores its latest schema into the \_vt.schema_version table associated with the given GTID and DDL #### Historian/Vstreams: 1. Historian warms its cache from the schema_version table when it loads -2. When the tracker inserts the latest schema into \_vt.schema_version table, the Vstream converts it into a (new) Version event +2. When the tracker inserts the latest schema into \_vt.schema_version table, the Vstream converts it into a (new) + Version event 3. For every Version event the Vstream registers it with the Historian 4. On the Version event, the tracker loads the new row from the \_vt.schema_version table 5. When a Vstream needs a new TableMap it asks the Historian for it along with the corresponding GTID. -6. Historian looks up its cache for a schema version for that GTID. If not present just provides the latest schema it has received from the schema engine. +6. Historian looks up its cache for a schema version for that GTID. If not present just provides the latest schema it + has received from the schema engine. 
#### Replica @@ -82,14 +98,17 @@ Version historian runs on both primary and replica and handles DDL events. For a ### Primary -Schema version snapshots are stored only on the primary. This is done when the Replication Watcher gets a DDL event resulting in a SchemaUpdated(). There are two independent flows here: +Schema version snapshots are stored only on the primary. This is done when the Replication Watcher gets a DDL event +resulting in a SchemaUpdated(). There are two independent flows here: 1. Replication Watcher is running 2. Schema snapshots are saved to \_vt.schema_version when SchemaUpdated is called -Point 2 is performed only when the flag TrackSchemaVersions is enabled. This implies that #1 also has to happen when TrackSchemaVersions is enabled independently of the WatchReplication flag +Point 2 is performed only when the flag TrackSchemaVersions is enabled. This implies that #1 also has to happen when +TrackSchemaVersions is enabled independently of the WatchReplication flag -However if the WatchReplication flag is enabled but TrackSchemaVersions is disabled we still need to run the Replication Watcher since the user has requested it, but we should not store schema versions. +However if the WatchReplication flag is enabled but TrackSchemaVersions is disabled we still need to run the Replication +Watcher since the user has requested it, but we should not store schema versions. So the logic is: @@ -103,11 +122,13 @@ So the logic is: => Replication Watcher is running \ => SchemaUpdated is handled -The Historian behavior is identical to that of the replica: of course if versions are not stored in \_vt.schema_versions it will always provide the latest version of the scheme. +The Historian behavior is identical to that of the replica: of course if versions are not stored in \_vt.schema_versions +it will always provide the latest version of the schema. ### Replica -Schema versions are never stored on replicas, so SchemaUpdated is always a Noop. 
Versions are provided as appropriate by the historian. The historian provides the latest schema if there is no appropriate version. +Schema versions are never stored on replicas, so SchemaUpdated is always a Noop. Versions are provided as appropriate by +the historian. The historian provides the latest schema if there is no appropriate version. So the logic is: @@ -139,7 +160,9 @@ So now on the replica, at T4, the version historian will incorrectly provide the ### Situation 2 -If version tracking is turned off on the primary for some time, correct versions may not be available to the historian which will always return the latest schema. This might result in an incorrect schema when a Vstream is processing events in the past. +If version tracking is turned off on the primary for some time, correct versions may not be available to the historian +which will always return the latest schema. This might result in an incorrect schema when a Vstream is processing events +in the past. #### Possible new features around this functionality diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream-skew-detection.md b/content/en/docs/16.0/reference/vreplication/internal/vstream-skew-detection.md new file mode 100644 index 000000000..34ccdeecb --- /dev/null +++ b/content/en/docs/16.0/reference/vreplication/internal/vstream-skew-detection.md @@ -0,0 +1,81 @@ +--- +title: VStream Skew Minimization +description: Aligning streams from different shards in the VStream API +weight: 7 +--- + +## VStream Skew Detection + +### Motivation + +When the VStream API is streaming from multiple shards we have multiple sources of events: one primary or replica tablet +for each shard in the provided VGTID. 
The rate at which the events will be streamed from the underlying sources can vary +depending on various factors, like: + +* the replication lag on the source tablets (if a replica is selected as the source for the VStream) +* the cpu load on the source tablet +* possible network partitions or network delays + +This can result in the events in the VStream from some shards being well ahead of other shards. So, for example, if a +row moves from the faster shard to a slower shard we might see the delete event in the VStream from the faster shard +long before the insert from the second. This would result in the row going "invisible" for the duration of the skew. +This can affect user experience in applications where the VStream events are used to refresh UI, for example. + +For most applications where VStream API events feed into change data capture systems for auditing or reporting purposes +these delays may be acceptable. However for applications which are using these events for user-facing functions this can +cause unexpected behavior. See https://github.com/vitessio/vitess/issues/7402 for one such case. + +### Goal + +It is not practically possible to provide exact ordering of events across Vitess shards. The VStream API will inherently +stream events from one shard independently of another. However, VStreamer events do keep track of the binlog event +timestamps which we can use to loosely coordinate the streams. Since binlog timestamp granularity is only to the nearest +second, we attempt to align the streams to within a second. + +### Implementation + +The skew minimization feature adds a flag that the client can set. This flag enables skew detection between the various +streams. Once a skew is detected, events for streams that are ahead are held back until the lagging streams catch up +causing the skew to reach an acceptable level. 
+ +Each VStreamer event (_vevent_) contains two timestamps: one when the database transaction occurred, and the other, the +current time on the source tablet where the vevent was created. This lets us compute how far in the past the event we +just received was created. We use this to determine which shard has the most recent event and which one has the oldest +event. Note that, for shards where there is no activity, VStreamer sends a heartbeat event every second. The +transaction time for a heartbeat is the same as the current time on the source. (These heartbeats are not forwarded to +the VStream since they are synthetic vreplication events.) + +If the difference between the fastest and slowest streams is greater than a threshold, we declare that we have detected +a skew. MySQL binlogs store the transaction timestamp in seconds. Also, on the VTGate serving the VStream, we adjust +this time for clock skews between the VTGate and the source MySQL server. When the user sets the `MinimizeSkew` flag we +want to keep the events across shards to be in the same second: each transaction timestamp is within 1 second of each +other. To account for rounding-off of the transaction timestamp and the clock-skew we set the threshold to be 2 seconds, +instead of 1 second, so that we don't keep stalling the streams due to cumulative round-offs. + +### Possible unexpected behavior + +If there are no events for a second in a shard then a heartbeat is sent. On receiving a heartbeat we reset the skew. +This is necessary to avoid shards with no events starving other shards. The current logic will align streams only if +they are all getting events faster than the heartbeat frequency. + +This means that we cannot guarantee the skew alignment feature will work as expected in certain conditions. This could +happen mainly while streaming from replicas with high replication lags, say, due to high write qps or a network +partition. 
+ +Thus it is recommended that you stream from primaries when using this feature. Note, however, that even primaries with +skewed loads could trigger such a situation. + +### API + +This is how you would turn on the skew detection and alignment feature in a VStream client: + +``` + import vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" + ... + ... + flags := &vtgatepb.VStreamFlags{}; + flags.MinimizeSkew = true; + + reader, err := conn.VStream(ctx, topodatapb.TabletType_PRIMARY, vgtid, filter, flags) + +``` diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream/stream-migration.md b/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md similarity index 88% rename from content/en/docs/16.0/reference/vreplication/internal/vstream/stream-migration.md rename to content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md index 4bbf1d851..9797d6cb9 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/vstream/stream-migration.md +++ b/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md @@ -1,20 +1,25 @@ --- title: VStream API and Resharding description: How VStream API handles a reshard -weight: 3 +weight: 8 --- ## Stream migration on a resharding operation -While subscribing to the VStream API you need to specify the shards from which to stream events. While streaming it is possible that the underlying keyspace is resharded. Thus some or all of the shards which were specified may be replaced by new shards after the resharding is completed. +While subscribing to the VStream API you need to specify the shards from which to stream events. While streaming it is +possible that the underlying keyspace is resharded. Thus some or all of the shards which were specified may be replaced +by new shards after the resharding is completed. -Stream migration logic within VReplication handles this transparently within VTGate. 
Event sending will be put on hold momentarily during the actual cutover (when writes are switched) and you will start getting the events (and vgtids) for the new set of shards once the cutover is completed. +Stream migration logic within VReplication handles this transparently within VTGate. Event sending will be put on hold +momentarily during the actual cutover (when writes are switched) and you will start getting the events (and vgtids) for +the new set of shards once the cutover is completed. ### An illustration Here is a sample session using the scripts from the [local example](/docs/get-started/local). -Run the steps up to and including `205_clean_commerce.sh`. Now start a vstream api client in a separate terminal to stream events from the customer table in the customer keyspace, which is currently unsharded. +Run the steps up to and including `205_clean_commerce.sh`. Now start a vstream api client in a separate terminal to +stream events from the customer table in the customer keyspace, which is currently unsharded. ``` { @@ -54,11 +59,13 @@ Run the 305 script to switch writes. You will see that vgtids will include the n [type:BEGIN timestamp:1616748733 current_time:1616748733520355854 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733520403210 ] ``` -Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of resharding. +Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of +resharding. 
``` $ mysql -u root --host=127.0.0.1 -P 15306 -e "insert into customer(customer_id, email) values(6,'sougou@planetscale.com'), (7, 'deepthi@planetscale.com');" ``` + ``` [type:BEGIN timestamp:1616749631 current_time:1616749631516372189 type:FIELD timestamp:1616749631 field_event: fields: > current_time:1616749631517765487 type:ROW timestamp:1616749631 row_event: > row_changes: > > current_time:1616749631517779353 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616749631 current_time:1616749631517789376 ] ``` diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream/_index.md b/content/en/docs/16.0/reference/vreplication/internal/vstream/_index.md deleted file mode 100644 index bf7526b75..000000000 --- a/content/en/docs/16.0/reference/vreplication/internal/vstream/_index.md +++ /dev/null @@ -1,9 +0,0 @@ ---- -title: Vstream -description: Change notification service accessible via VTGate -weight: 5 ---- - -References: - -[Streaming Vitess at Bolt](https://medium.com/bolt-labs/streaming-vitess-at-bolt-f8ea93211c3f) diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream/skew-detection.md b/content/en/docs/16.0/reference/vreplication/internal/vstream/skew-detection.md deleted file mode 100644 index dfd60e1b7..000000000 --- a/content/en/docs/16.0/reference/vreplication/internal/vstream/skew-detection.md +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: VStream Skew Minimization -description: Aligning streams from different shards in the VStream API -weight: 3 ---- - -## VStream Skew Detection - -### Motivation - -When the VStream API is streaming from multiple shards we have multiple sources of events: one primary or replica tablet for each shard in the provided VGTID. 
The rate at which the events will be streamed from the underlying sources can vary depending on various factors, like: - -* the replication lag on the source tablets (if a replica is selected as the source for the VStream) -* the cpu load on the source tablet -* possible network partitions or network delays - -This can result in the events in the VStream from some shards being well ahead of other shards. So, for example, if a row moves from the faster shard to a slower shard we might see the delete event in the VStream from the faster shard long before the insert from the second. This would result in the row going "invisible" for the duration of the skew. This can affect user experience in applications where the VStream events are used to refresh UI, for example. - -For most applications where VStream API events feed into change data capture systems for auditing or reporting purposes these delays may be acceptable. However for applications which are using these events for user-facing functions this can cause unexpected behavior. See https://github.com/vitessio/vitess/issues/7402 for one such case. - -### Goal - -It is not practically possible to provide exact ordering of events across Vitess shards. The VStream API will inherently stream events from one shard independently of another. However, VStreamer events do keep track of the binlog event timestamps which we can use to loosely coordinate the streams. Since binlog timestamp granularity is only to the nearest second, we attempt to align the streams to within a second. - - -### Implementation - -The skew minimization feature adds a flag that the client can set. This flag enables skew detection between the various streams. Once a skew is detected, events for streams that are ahead are held back until the lagging streams catch up causing the skew to reach an acceptable level. 
- -Each VStreamer event (_vevent_) contains two timestamps: one when the database transaction occurred, and the other, the current time on the source tablet where the vevent was created. This lets us compute how far in the past the event we just received was created. We use this to determine which shard has the most recent event and which one has the oldest event. Note that, for shards where there are no activity, VStreamer sends a heartbeat event every second. The transaction time for an heartbeat is the same as the current time on the source. (These heartbeats are not forwarded to the VStream since they are synthetic vreplication events.) - -If the difference between the fastest and slowest streams is greater than a threshold, we declare that we have detected a skew. MySQL binlogs store the transaction timestamp in seconds. Also, on the VTGate serving the VStream, we adjust this time for clock skews between the VTGate and the source MySQL server. When the user sets the `MinimizeSkew` flag we want to keep the events across shards to be in the same second: each transaction timestamp is within 1 second of each other. To account for rounding-off of the transaction timestamp and the clock-skew we set the threshold to be 2 seconds, instead of 1 second, so that we don't keep stalling the streams due to cumulative round-offs. - -### Possible unexpected behavior - -If there are no events for a second in a shard then a heartbeat is sent. On receiving a heartbeat we reset the skew. This is necessary to avoid shards with no events starving other shards. The current logic will align streams only if they are all getting events faster than the heartbeat frequency. - -This means that we cannot guarantee the skew alignment feature will work as expected in certain conditions. This could happen mainly while streaming from replicas with high replication lags, say, due to high write qps or a network partition. 
- -Thus it is recommended that you stream from primaries when using this feature. Note, however, that even primaries with skewed loads could trigger such a situation. - -### API - -This is how you would turn on the skew detection and alignment feature in a VStream client: - -``` - import vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" - ... - ... - flags := &vtgatepb.VStreamFlags{}; - flags.MinimizeSkew = true; - - reader, err := conn.VStream(ctx, topodatapb.TabletType_PRIMARY, vgtid, filter, flags) - -``` diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream/vscopy.md b/content/en/docs/16.0/reference/vreplication/internal/vstream/vscopy.md deleted file mode 100644 index 6304f17e2..000000000 --- a/content/en/docs/16.0/reference/vreplication/internal/vstream/vscopy.md +++ /dev/null @@ -1,134 +0,0 @@ ---- -title: VStream Copy -description: Streaming events from the beginning -weight: 1 ---- - -## VStream Copy - -### Allow vstreams to stream entire databases or tables - -## Motivation - -Currently, the vstream API streams events starting either from the current position of the binlog or from a position specified by the client. The VStream Copy feature adds support to send all events starting from the first position of the binlog. - -A naive extension of the current mechanism is to stream from the starting position. However, this is impractical for any database/table of a reasonable size. We will extend VStream to make use of the bulk copy based mechanism similar to vreplication streams, used in MoveTables or Reshard sharding workflows. - -Note that with vstream copy the client vstream will not faithfully reproduce the events from the binlog. The aim is to be eventually (and rapidly) consistent with the current database snapshot. This improves performance since we will be merging multiple row updates into a single transaction. -Once we have caught up (i.e. 
the replication lag is small) binlog events will again be directly streamed similar to the current implementation. - -### Previous API - -Clients create vstreams by grpc-ing to VTGate using the Vstream API call. In golang: - -``` -conn, _ := VTGate.Dial(ctx, "localhost:15991") -// tabletType is one of replica/primary/rdonly, filter,vgtid: see below -reader, _ := VStream(ctx, tabletType, vgtid, filter) -e, _ := reader.Recv() //receive VEvents in a loop until io.EOF -``` - -It is possible for network errors to occur or for the client process to fail. In addition, the vstreamer itself might fail at VTGate or VTTablet. Thus, VTGate needs to send state frequently allowing VTGate to be stateless and clients to recover properly from failures. - -Also, while creating the stream the client can specify multiple shards and/or keyspaces from which to stream events. - -The vgtid structure facilitates both: determining the stream sources and maintaining state. vgtid is a list of tuples: (keyspace, shard, gtid). When a stream is created, gtid can either be “current” or a valid binlog position at which the vstream starts streaming events. - -Some examples: -``` -// stream from current position from two shards -vgtid := &binlogdatapb.VGtid{ - ShardGtids: []*binlogdatapb.ShardGtid{{ - Keyspace: "ks", - Shard: "-40", - Gtid: "current", - },{ - Keyspace: "ks", - Shard: "80-c0", - Gtid: "current", - }} - } - -// stream from specific position from all shards in keyspace ks -vgtid := &binlogdatapb.VGtid{ - ShardGtids: []*binlogdatapb.ShardGtid{{ - Keyspace: "ks", - Gtid: "MariaDB/0-41983-20", - }} - } - -// stream from current position from all keyspaces -vgtid := &binlogdatapb.VGtid{ - ShardGtids: []*binlogdatapb.ShardGtid{{ - Gtid: "current", - }} - } -``` - -The data streamed is sourced from the list of keyspace/shards after applying the specified filter. - -To achieve this VTGate sends a vgtid event whenever it encounters a gtid event with the current vgtid state at VTGate. 
Thus if the stream is broken, for any reason, the client needs to simply create a new vstream using the last vgtid that it received. - - -## Architecture/Design - -During a copy there will two distinct phases: - -1. Copy phase: where the vstreamer is sending row data in bulk using the primary key to “paginate” the table -1. Replication phase: once copying is completed and going forward we only stream events - -The copy phase is nuanced: we copy a batch of rows until a particular PK using a consistent snapshot. However, once the copy is completed the binlog position would have moved possibly containing updates to the rows already transmitted. -Hence we need to perform a “catchup” where we play the events up to the current position. We can only send updates to rows that we have already sent to the stream. - -After the catchup, we send the next batch of rows and perform the related catchup. This copy-catchup loop continues until all tables are copied, after which it is business as usual and events are streamed as they appear in the binlog. - -### API Changes for VStream Copy - -To use VStream Copy you just need to pass an empty string as the position. -The only other change is in the vgtid structure. It now becomes a list of - -`(keyspace, shard, gtid,[]LastTablePK)` - -While the copy is in progress, the LastPK list contains the last seen primary key for each table in that shard. Once the copy is completed and we are replicating the stream this parameter will be nil. - -Note that the vgtid is opaque to the consumer of the vstream API once the vstream starts and the ongoing state does not need to be interpreted on the client. - -To start a VStream Copy user is expected to provide an empty gtid along with a list of tables to copy - (essentially a LastTablePK list with a nil PK for each). 
Some examples - (see https://github.com/vitessio/contrib/blob/main/vstream_client/vstream_client.go for a sample client): - -``` -// vstream copy two tables table from two shards -filter := &binlogdatapb.Filter{ - Rules: []*binlogdatapb.Rule{{ - Match: "t2", - Filter: "select id, val from t2", - },{ - Match: "t1", - Filter: "select * from t1", - }}, - } -vgtid := &binlogdatapb.VGtid{ - ShardGtids: []*binlogdatapb.ShardGtid{{ - Keyspace: "ks", - Shard: "-40", - Gtid: "", - },{ - Keyspace: "ks", - Shard: "80-c0", - Gtid: "", - }} - } - -// stream the entire database: vstream copy from all tables in all keyspaces -filter := &binlogdatapb.Filter{ - Rules: []*binlogdatapb.Rule{{ - Match: "/.*/", - }}, - } -vgtid := &binlogdatapb.VGtid{ - ShardGtids: []*binlogdatapb.ShardGtid{{ - Gtid: "", - }} - } -``` From 2f32be8e05e11940c711ddecdac6c8c931d1b517 Mon Sep 17 00:00:00 2001 From: Rohit Nayak Date: Sat, 14 Jan 2023 23:03:39 +0100 Subject: [PATCH 06/16] Selected VReplication RFCs Signed-off-by: Rohit Nayak --- .../16.0/reference/vreplication/internal/_index.md | 1 + .../docs/16.0/reference/vreplication/internal/rfcs.md | 10 ++++++++++ 2 files changed, 11 insertions(+) create mode 100644 content/en/docs/16.0/reference/vreplication/internal/rfcs.md diff --git a/content/en/docs/16.0/reference/vreplication/internal/_index.md b/content/en/docs/16.0/reference/vreplication/internal/_index.md index f49a9ab9e..4f9a0179a 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/_index.md +++ b/content/en/docs/16.0/reference/vreplication/internal/_index.md @@ -5,3 +5,4 @@ weight: 1000 skip_sections: true aliases: ['/docs/reference/vreplication/internal'] --- + diff --git a/content/en/docs/16.0/reference/vreplication/internal/rfcs.md b/content/en/docs/16.0/reference/vreplication/internal/rfcs.md new file mode 100644 index 000000000..ff32584be --- /dev/null +++ b/content/en/docs/16.0/reference/vreplication/internal/rfcs.md @@ -0,0 +1,10 @@ +--- +title: Selected VReplication RFCs 
+description: Links to the RFC Issues in the vitess repo, for convenience +weight: 100 +--- + +- [VDiff2: Reimplementing VDiff on tablets](https://github.com/vitessio/vitess/issues/10134) +- [VStream Copy: streaming events from the beginning](https://github.com/vitessio/vitess/issues/6277) +- [Cross-Cluster Data Migration](https://github.com/vitessio/vitess/issues/7545) +- [File:Position based VReplication](https://github.com/vitessio/vitess/issues/5424) From 8b320eba3f5ccac35a1347a2997e0f57f9677d0b Mon Sep 17 00:00:00 2001 From: Rohit Nayak Date: Wed, 18 Jan 2023 15:09:52 +0100 Subject: [PATCH 07/16] Address review comments Signed-off-by: Rohit Nayak --- .../16.0/reference/vreplication/internal/tracker.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/content/en/docs/16.0/reference/vreplication/internal/tracker.md b/content/en/docs/16.0/reference/vreplication/internal/tracker.md index 9535f234e..8d761f26d 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/16.0/reference/vreplication/internal/tracker.md @@ -12,8 +12,9 @@ weight: 4 Currently, Vstreams work with a single (the latest) database schema. On every DDL the schema engine reloads the schema from the database engine. -All Vstreams on a tablet share a common schema engine. Vstreams that are lagging might be seeing a newer (and hence -incorrect) version of the schema in case ddls were applied in between. +All Vstreams on a tablet share a common schema engine. Vstreams that are lagging can see a more recent schema than when +the older binlog events occurred. So the lagging vstreams will see an incorrect version of the schema in case ddls were +applied in between that affect the schema of the tables involved in those lagging events. In addition, reloading schemas is an expensive operation. If there are multiple Vstreams each of them will separately receive a DDL event resulting in multiple reloads for the same DDL. 
@@ -55,7 +56,8 @@ Version tracker runs on the primary. It subscribes to the replication watcher an Version historian runs on both primary and replica and handles DDL events. For a given GTID it looks up its cache to check if it has a schema valid for that GTID. If not, on the replica, it looks up the schema_version table. If no schema -is found then it provides the latest schema -- which is updated by subscribing to the schema engine’s change notification. +is found then it provides the latest schema -- which is updated by subscribing to the schema engine’s change +notification. ### Notes @@ -74,8 +76,8 @@ is found then it provides the latest schema -- which is updated by subscribing t subscribes to the watcher. 1. Say, a DDL is applied 1. The watcher Vstream sees the DDL and -1. asks the schema engine to reload the schema, also providing the corresponding gtid position -1 notifies the tracker of a schema change +1. Asks the schema engine to reload the schema, also providing the corresponding gtid position +1. Notifies the tracker of a schema change 1. 
Tracker stores its latest schema into the \_vt.schema_version table associated with the given GTID and DDL #### Historian/Vstreams: From 1ca58e457219d7c8c6d678d09057e5cf9985a563 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 10:16:08 -0500 Subject: [PATCH 08/16] Initial work on link fixups and aliases Signed-off-by: Matt Lord --- .../reference/vreplication/internal/life-of-a-stream.md | 2 +- content/en/docs/16.0/reference/vreplication/vreplication.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md index 56f98147b..8de4cbb94 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md @@ -2,7 +2,7 @@ title: Life of a stream description: How VReplication replicates data weight: 1 -aliases: ['/docs/reference/vreplication/internals'] +aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] --- ### Introduction diff --git a/content/en/docs/16.0/reference/vreplication/vreplication.md b/content/en/docs/16.0/reference/vreplication/vreplication.md index 4b22c1e98..d127d20c2 100644 --- a/content/en/docs/16.0/reference/vreplication/vreplication.md +++ b/content/en/docs/16.0/reference/vreplication/vreplication.md @@ -39,7 +39,7 @@ many features. It can be used for the following use cases: ## Feature Description -VReplication works as [a stream or set of streams](../../../../design-docs/vreplication/life-of-a-stream/). +VReplication works as [a stream or set of streams](../internal/life-of-a-stream/). Each stream establishes replication from a source keyspace/shard to a target keyspace/shard. @@ -62,14 +62,14 @@ the relationship between them may not be one to one. 
VReplication performs the following essential functions: -* [Copy data](../../../../design-docs/vreplication/life-of-a-stream/#copy) +* [Copy data](../internal/life-of-a-stream/#copy) from the source to the destination table in a consistent fashion. For a large table, this copy can be long-running. It can be interrupted and resumed. If interrupted, VReplication can keep the copied portion up-to-date with respect to the source, and it can resume the copy process at a point that is consistent with the current replication position. -* After copying is finished, it can continuously [replicate](../../../../design-docs/vreplication/life-of-a-stream/#replicate) +* After copying is finished, it can continuously [replicate](../internal/life-of-a-stream/#replicate) the data from the source to destination. * The copying rule can be expressed as a `SELECT` statement. The statement should be simple enough that the materialized table can From f4c888994a08f4f9af448c8f388d4f6d2475e0ec Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 12:26:42 -0500 Subject: [PATCH 09/16] Life of a stream, cutover, RFCs Signed-off-by: Matt Lord --- .../vreplication/internal/cutover.md | 286 ++++++++++++++++++ .../vreplication/internal/life-of-a-stream.md | 168 ++++++++++ .../reference/vreplication/internal/rfcs.md | 10 + .../vreplication/internal/cutover.md | 286 ++++++++++++++++++ .../vreplication/internal/life-of-a-stream.md | 168 ++++++++++ .../reference/vreplication/internal/rfcs.md | 10 + .../vreplication/internal/cutover.md | 286 ++++++++++++++++++ .../vreplication/internal/life-of-a-stream.md | 168 ++++++++++ .../reference/vreplication/internal/rfcs.md | 10 + .../15.0/reference/vreplication/progress.md | 2 +- .../vreplication/internal/cutover.md | 192 ++++++------ .../vreplication/internal/life-of-a-stream.md | 129 ++++---- 12 files changed, 1561 insertions(+), 154 deletions(-) create mode 100644 content/en/docs/13.0/reference/vreplication/internal/cutover.md create mode 100644 
content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md create mode 100644 content/en/docs/13.0/reference/vreplication/internal/rfcs.md create mode 100644 content/en/docs/14.0/reference/vreplication/internal/cutover.md create mode 100644 content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md create mode 100644 content/en/docs/14.0/reference/vreplication/internal/rfcs.md create mode 100644 content/en/docs/15.0/reference/vreplication/internal/cutover.md create mode 100644 content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md create mode 100644 content/en/docs/15.0/reference/vreplication/internal/rfcs.md diff --git a/content/en/docs/13.0/reference/vreplication/internal/cutover.md b/content/en/docs/13.0/reference/vreplication/internal/cutover.md new file mode 100644 index 000000000..f733a5e3c --- /dev/null +++ b/content/en/docs/13.0/reference/vreplication/internal/cutover.md @@ -0,0 +1,286 @@ +--- +title: How traffic is switched +description: How Vitess signals traffic cutover for Reshard and MoveTables +weight: 2 +aliases: ['/docs/design-docs/vreplication/cutover/'] +--- + +# Related persistent Vitess objects + +{{< info >}} +As the objects or keys noted below are stored in [the topo server](../../../features/topology-service/) and +cached locally, the processes involved will refresh their topo data throughout the cutover process. For example, each +tablet on the source and target shards that are involved in a [VReplication](../../) workflow +will refresh their topo data multiple times as the state of things transition during the cutover. If we are *not* able +to confirm that all tablets involved in a VReplication workflow are able to refresh their topo data then the cutover +command — e.g. [`vtctlclient SwitchTraffic`](../../../reference/vreplication/switchtraffic/) — will cancel the operation +and return an error indicating which tablet(s) are unhealthy (including for `--dry_run` executions). 
+{{< /info >}} + +## VSchema + +A [VSchema](../../../../concepts/vschema/) allows you to describe how data is organized within keyspaces and shards. + +## Shard Info + +The [`global` topo](../../../features/topology-service/#global-vs-local) contains +one [`Shard`](../../../features/topology-service/#shard) key per keyspace which then contains one key per +shard that has been created within the keyspace. For each shard that is healthy there is an +attribute `is_primary_serving` which is set to true. The other shards which have been created but are still not healthy +and serving within the keyspace will not have this attribute set. Here is an example shard info record from an unsharded +keyspace named commerce (without the `--cell` flag being passed the `global` topo base path is used): + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/keyspaces/commerce/shards/0/Shard' +primary_alias:{cell:"zone1" uid:100} primary_term_start_time:{seconds:1650341417 nanoseconds:374817485} is_primary_serving:true +``` + +## SrvKeyspace + +Each cell has a [`SrvKeyspace`](../../../features/topology-service/#srvkeyspace) key in +the [`local` topo](../../../features/topology-service/#global-vs-local) (per cell info) for each keyspace. For +each tablet type (e.g. `PRIMARY` or `REPLICA`) there is one `partitions` object. The `partitions` objects contain all of the +current shards in the keyspace. For sharded keyspaces, the tablets which are healthy and serving have a key range specified +for that shard. + +Also the primary can contain a `query_service_disabled` attribute which is set to `true` during resharding cutovers. +This tells the primary in that shard to reject any queries made to it, as a signal to vtgate in case vtgate routes +queries to this primary during the cutover or before it discovers the new serving graph. 
Here is an example using the +same unsharded commerce keyspace and here we specify the `--cell` flag so that cell's topo base path — stored in +its `CellInfo` record in the `global` topo — is used: + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/cells/zone1/CellInfo' +server_address:"localhost:2379" root:"/vitess/zone1" + +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto --cell=zone1 '/keyspaces/commerce/SrvKeyspace' +partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} partitions:{served_type:RDONLY shard_references:{name:"0"}} +``` + +## Routing Rules + +[Routing Rules](../../../features/schema-routing-rules) are stored in the `RoutingRules` key within +the `global` topo. Routing Rules contain a list of table-specific routes. You can route a table for all or specific +tablet types to another table in the same or different keyspace. Here is an example using the same commerce keyspace +where we have an active [`MoveTables`](../../../reference/vreplication/movetables/) workflow to move tables to the +customer keyspace but we have not switched any traffic yet: + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/RoutingRules' +rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer.corder" to_tables:"commerce.corder"} rules:{from_table:"customer.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@rdonly" to_tables:"commerce.corder"} 
rules:{from_table:"commerce.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"corder" to_tables:"commerce.corder"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} +``` + +{{< info >}} +In practice you would instead typically view the routing rules via the +dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions/#getroutingrules) +vtctl client command which will return the rules for all keyspaces in the topo. +{{< /info >}} + +# How VTGate routes a query + +This section walks through a simplified version of the logic used to determine which keyspace and table vtgate will route +a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` +(a _write_ query). + +1. Check to see if `t1` has an appropriate routing rule defined. If so, use the specified target table as an alias for `t1`. +2. Locate the keyspace for `t1` using the [`VSchema`](../../../features/vschema/). +3. For a non-sharded keyspace locate the appropriate tablet (`PRIMARY`, by default) from the (cached) `SrvKeyspace` `local` +(per cell) topo record. +4. For a sharded keyspace the `SrvKeyspace` record is used to find the currently active shards. This is done by checking +the list of partitions for the specific tablet type selected for the query (`PRIMARY`, by default, for both reads and writes) +and selecting the ones whose `query_service_disabled` field is *not* set and whose `is_primary_serving` value is true. +5. 
Finally, based on the [`VIndex`](../../../features/vindexes/) defined for the table from the cached +[`VSchema`](../../../features/vschema/) (stored in the `global` topo), the shard for the relevant row is computed based +on the keyrange to which the id is mapped to using the declared [`VIndex` function/type](../../../features/vindexes/#predefined-vindexes). + +# Changes made to the topo when traffic is switched + +This document outlines the steps involved in the cutover process +of [`MoveTables`](../../movetables/) and [`Reshard`](../../reshard/) +workflows when traffic is switched from the source tables/shards to the target tables/shards. We use the resharding flow +provided in the [local examples](../../../../get-started/local/) and show the relevant snippets from the topo for each step +in the workflow. + +{{< info >}} +Items in italics are topo keys and the following snippet the value of the key +{{< /info >}} + +## What happens when a Reshard is cutover + +For brevity we only show the records for the `80-` shard. There will be similar records for the `-80` shard. + +#### Before Resharding, after -80/80- shards are created + +Only shard `0` has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to `0` for both `PRIMARY` +and `REPLICA` tablet types. 
+ +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627465761 nanoseconds:600070156} +is_primary_serving:true +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627465833 nanoseconds:536524508} +key_range:{start:"\x80"} +``` + +*zone1/keyspaces/customer/SrvKeyspace* + +```proto +partitions:{served_type:PRIMARY shard_references:{name:"0"}} +partitions:{served_type:REPLICA shard_references:{name:"0"}} +``` + +### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) + +Shard `0` still has the `is_primary_serving` set as true. The primary partition is still the same. + +The replica partition has the following changes: + +* Two more shard_references for `-80` and `80-` +* Key ranges are specified for these shards +* The key range for shard `0` has been removed +* `query_service_disabled` is set to true for shard `0` + +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627466189 nanoseconds:587021377} +is_primary_serving:true +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627466263 nanoseconds:16201490} +key_range:{start:"\x80"}`` +``` + +_zone1/keyspaces/customer/SrvKeyspace_ + +```proto +partitions:{served_type:PRIMARY shard_references:{name:"0"}} + +partitions:{served_type:REPLICA + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" key_range:{start:"\x80"}} + shard_tablet_controls:{name:"0" query_service_disabled:true} + shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} + shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} +} +``` + +#### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) + +* `is_primary_serving` 
is removed from shard `0` +* `is_primary_serving` is added to shards `-80` and `80-` +* In the primary partition the shards `-80` and `80-` are added with their associated key ranges +* In the primary partition the key range for shard `0` is removed +* The replica partition remains the same as in the previous step + +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627466636 nanoseconds:405646818} +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627466710 nanoseconds:579634511} +key_range:{start:"\x80"} +is_primary_serving:true +``` + +*zone1/keyspaces/customer/SrvKeyspace* + +```proto +partitions:{served_type:PRIMARY + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" + key_range:{start:"\x80"}} +} {name:"0"} + +partitions:{served_type:REPLICA + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" key_range:{start:"\x80"}}} + shard_tablet_controls:{name:"0" query_service_disabled:true} + shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} + shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} +} +``` + +## What happens when a MoveTables workflow is cutover + +#### Before MoveTables is initiated + +The [`VSchema`](../../../features/vschema/) for the source keyspace contains the table name, so vtgate routes queries to that +keyspace. + +#### During MoveTables + +Both the source and target now contain the tables and both [`VSchemas`](../../../features/vschema/) refer to them. However we +have routing rules that map the tables for each tablet type from the target keyspace to the source keyspace. 
+ +*global/RoutingRules* + +```proto +rules:{from_table:"customer" to_tables:"commerce.customer"} +rules:{from_table:"customer.customer" to_tables:"commerce.customer"} +rules:{from_table:"customer@replica" to_tables:"commerce.customer"} +rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} +``` + +#### On switching replica traffic to target + +The routing rules for replica targeted reads are updated to map the table on the source to the target. + +*global/RoutingRules* + +```proto +rules:{from_table:"customer.customer" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"customer" to_tables:"commerce.customer"} +rules:{from_table:"customer@replica" to_tables:"customer.customer"} +``` + +#### On switching primary traffic + +The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the +tables are added to the “denylist” on the source keyspace which `vttablet` uses to reject queries for these tables on the +old/inactive shards. + +*global/RoutingRules* + +```proto +rules:{from_table:"commerce.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"customer.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"commerce.customer" to_tables:"customer.customer"} +rules:{from_table:"customer" to_tables:"customer.customer"} +``` + +*global/keyspaces/commerce/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:100} +primary_term_start_time:{seconds:1627477340 nanoseconds:740407602} +tablet_controls:{tablet_type:PRIMARY denylisted_tables:"customer"} +is_primary_serving:true +``` + +# Miscellaneous Notes: + +* In VReplication workflows, cutovers are performed manually by the user executing the `SwitchTraffic` and `ReverseTraffic` +actions e.g. for a [`MoveTables`](../../movetables/#switchtraffic) or [`Reshard`](../../reshard/#reversetraffic) vtctl +client command. 
+* When traffic for `REPLICA` and `RDONLY` tablets is switched not all read traffic is switched: primary/default reads will +still be served from the source shards, until `PRIMARY` tablet traffic is also switched. diff --git a/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md new file mode 100644 index 000000000..1d58d9e97 --- /dev/null +++ b/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md @@ -0,0 +1,168 @@ +--- +title: Life of a stream +description: How VReplication replicates data +weight: 1 +aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] +--- + +### Introduction + +When a VReplication workflow runs, data is copied from source to target shards. Each target `PRIMARY` tablet runs one +vreplication stream (`vstream`) for each source shard that the target's +[keyrange](../../../features/sharding/#key-ranges-and-partitions) overlaps with. + +The diagram below outlines how one such stream operates. VReplication can be asked to start from a specific +[`GTID`](https://dev.mysql.com/doc/refman/en/replication-gtids-concepts.html) or from the start. When starting from a +[`GTID`](https://dev.mysql.com/doc/refman/en/replication-gtids-concepts.html) the _replication_ mode is used where it +streams events from the binlog. + +![VReplication Flow](/img/VReplicationFlow.png) + +#### Full table copy + +If the entire table data is requested then the simple streaming done by the _replication_ mode can create an avalanche +of events (think 100s of millions of rows). Moreover, and more importantly, it is highly likely that necessary older +binlogs are no longer available. + +So a _copy/catchup_ mode is initiated first: data in the tables are copied over in a consistent manner using bulk +inserts. 
Once we have copied enough data so that we are close enough to the current position (when replication lag is +low) it switches over to, and forever stays in, the _replication_ mode. All future replication is done only by +streaming binlog events. + +While we may have multiple database sources in a workflow each `vstream` has just one source and one target. The source is +always a `vttablet` (and hence one `mysqld` instance). The target could be another `vttablet` (when resharding) or a +streaming gRPC response (for [`vtgate` `vstream` API](../../vstream/) clients). + +#### Transformation and Filtering + +Note that for all steps the data selected from the source will only be from the tables specified +in the [`Match`](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL128C5) field of the Rule +specification of the VReplication workflow. Furthermore, if a +[`Filter`](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL133C5) is specified for a table it will +be applied before being sent to the target. Columns may also be transformed based on the Filter’s `SELECT` clause. + +#### Source and Sink + +Each stream has two actors: the target initiates streaming by making gRPC calls to the source tablet and the source +tablet sources the data by connecting to its underlying `mysqld` server as a replica (while replicating) or using SQL +queries (in the copy phase) and streams it to the target. The target takes appropriate action: in case of resharding it +will convert the events into CRUD SQL statements and apply them to the target database. In case of [`vtgate` `vstream` +API](../../vstream/) clients the events are forwarded by `vtgate` to the client. + +Note that the target always pulls data. If the source pushes data, there are chances of buffer overruns if the target is +not able to process them in time. 
For example, in resharding workflows we need to convert the events to SQL `INSERT` +statements and execute them on the target's mysqld instance, which are usually much slower than just selecting data on +the source. + +### Modes, in detail + +#### Replicate + +This is the easiest to understand. The source stream just acts like a MySQL replica and processes events as they are +received. Events, after any necessary filtering and transformation, are sent to the target. Replication runs +continuously with short sleeps when there are no more events to source. Periodic heartbeats are sent to the target to +signal liveness. You will see this reflected with the `Running` state for the workflow. + +#### Initialize + +Initialize is called at the start of the copy phase. For each table to be copied an entry is created in the internal +`_vt.copy_state` table with a null primary key (PK). As each table copy is completed the related entries are deleted +and when there are no more entries for this workflow the copy phase is considered complete and the workflow moves into +the replication mode which you will see reflected with the `Running` state for the workflow. + +#### Copy + +Copy works on one table at a time. The source selects a set of rows from the table, for primary keys greater than the +ones copied so far, using a consistent snapshot. This results in a stream of rows to be sent to the target which +generates a bulk `INSERT` for these rows. You will see this reflected with the `Copying` state for the workflow. + +However, there are a couple of factors which complicate our story: + +* Each copy selects all rows until the current position of the binlog, but, +* Since transactions continue to be applied (presuming the database is online) the GTID position is continuously +moving forward + +Consider this example: + +We have two tables `X` and `Y`. Each table has 20 rows and we copy 10 rows at a time. (The queries below simplified +for readability). 
+ +The queries for the copy phase of `X` will be: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: select * from X where pk > 10 limit 10; GTID: 110, Last PK 20 + + send rows to target +``` + +There is a gotcha here: consider that there are 10 new transactions or GTIDs between times T1 and T2. Some of these +can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have +only rows from PK 11 to 20) then we will have an inconsistent state on the target: the updates to rows with PK +between 1 and 10 will not be present. + +This means that we need to first stream the events between GTIDs 100 and 110 for primary keys between 1 and 10 +and then do the second select: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: replicate from 100 to current position (110 from previous example), + + only pass events for pks 1 to 10 of X + +T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 + + send rows to target +``` + +Another gotcha: note that at time T3 when we selected the PKs from 11 to 20 the GTID position could have moved further! +This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would +still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. + +This leads us to the following flow: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: replicate from 100 to current position (110 from previous example), + + only pass events for pks 1 to 10 + +T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 + +T4: replicate from 111 to 112 + + only pass events for pks 1 to 10 + +T5: Send rows for pks 11 to 20 to target +``` + +This flow actually works and is the one used in Vitess VReplication! 
+ +The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is +quick. So the position can diverge much more at T2 than at T4. Hence, we call step T2 "Catchup" and step T4 +"Fast Forward". + +#### Catchup + +As detailed above the catchup phase runs between copy phase cycles (time limited by the +[`vreplication_copy_phase_duration`](../../flags/#vreplication_copy_phase_duration) flag). During the copy phase the +GTID position can move significantly ahead. So we run a catchup and fast-forward phase until we come close to the current +position — i.e. the replication lag is small. At that point we execute another Copy cycle. + +#### Fast forward + +During the copy phase we first take a snapshot. Then we fast-forward: we replicate from the gtid position where we stopped +the Catchup to the position of the new snapshot. + +Finally once we have finished copying all the tables we proceed to the replicate or `Running` phase until our job is done: +for example if we have resharded and switched over the reads and writes to the new shards or when the +[`vstream` API](../../vstream/) client closes its connection. 
diff --git a/content/en/docs/13.0/reference/vreplication/internal/rfcs.md b/content/en/docs/13.0/reference/vreplication/internal/rfcs.md new file mode 100644 index 000000000..ff32584be --- /dev/null +++ b/content/en/docs/13.0/reference/vreplication/internal/rfcs.md @@ -0,0 +1,10 @@ +--- +title: Selected VReplication RFCs +description: Links to the RFC Issues in the vitess repo, for convenience +weight: 100 +--- + +- [VDiff2: Reimplementing VDiff on tablets](https://github.com/vitessio/vitess/issues/10134) +- [VStream Copy: streaming events from the beginning](https://github.com/vitessio/vitess/issues/6277) +- [Cross-Cluster Data Migration](https://github.com/vitessio/vitess/issues/7545) +- [File:Position based VReplication](https://github.com/vitessio/vitess/issues/5424) diff --git a/content/en/docs/14.0/reference/vreplication/internal/cutover.md b/content/en/docs/14.0/reference/vreplication/internal/cutover.md new file mode 100644 index 000000000..f733a5e3c --- /dev/null +++ b/content/en/docs/14.0/reference/vreplication/internal/cutover.md @@ -0,0 +1,286 @@ +--- +title: How traffic is switched +description: How Vitess signals traffic cutover for Reshard and MoveTables +weight: 2 +aliases: ['/docs/design-docs/vreplication/cutover/'] +--- + +# Related persistent Vitess objects + +{{< info >}} +As the objects or keys noted below are stored in [the topo server](../../../features/topology-service/) and +cached locally, the processes involved will refresh their topo data throughout the cutover process. For example, each +tablet on the source and target shards that are involved in a [VReplication](../../) workflow +will refresh their topo data multiple times as the state of things transition during the cutover. If we are *not* able +to confirm that all tablets involved in a VReplication worfklow are able to refresh their topo data then the cutover +command — e.g. 
[`vtctlclient SwitchTraffic`](../../../reference/vreplication/switchtraffic/) — will cancel the operation +and return an error indicating which tablet(s) are unhealthy (including for `--dry_run` executions). +{{< /info >}} + +## VSchema + +A [VSchema](../../../../concepts/vschema/) allows you to describe how data is organized within keyspaces and shards. + +## Shard Info + +The [`global` topo](../../../features/topology-service/#global-vs-local) contains +one [`Shard`](../../../features/topology-service/#shard) key per keyspace which then contains one key per +shard that has been created within the keyspace. For each shard that is healthy there is an +attribute `is_primary_serving` which is set to true. The other shards which have been created but are still not healthy +and serving within the keyspace will not have this attribute set. Here is an example shard info record from an unsharded +keyspace named commerce (without the `--cell` flag being passed the `global` topo base path is used): + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/keyspaces/commerce/shards/0/Shard' +primary_alias:{cell:"zone1" uid:100} primary_term_start_time:{seconds:1650341417 nanoseconds:374817485} is_primary_serving:true +``` + +## SrvKeyspace + +Each cell has a [`SrvKeyspace`](../../../features/topology-service/#srvkeyspace) key in +the [`local` topo](../../../features/topology-service/#global-vs-local) (per cell info) for each keyspace. For +each tablet type (e.g. `PRIMARY` or `REPLICA`) there is one `partitions` object. The `partitions` objects contain all of the +current shards in the keyspace. For sharded keyspaces, the tablets which are healthy and serving have a key range specified +for that shard. + +Also the primary can contain a `query_service_disabled` attribute which is set to `true` during resharding cutovers. 
+This tells the primary in that shard to reject any queries made to it, as a signal to vtgate in case vtgate routes +queries to this primary during the cutover or before it discovers the new serving graph. Here is an example using the +same unsharded commerce keyspace and here we specify the `--cell` flag so that cell's topo base path — stored in +its `CellInfo` record in the `global` topo — is used: + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/cells/zone1/CellInfo' +server_address:"localhost:2379" root:"/vitess/zone1" + +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto --cell=zone1 '/keyspaces/commerce/SrvKeyspace' +partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} partitions:{served_type:RDONLY shard_references:{name:"0"}} +``` + +## Routing Rules + +[Routing Rules](../../../features/schema-routing-rules) are stored in the `RoutingRules` key within +the `global` topo. Routing Rules contain a list of table-specific routes. You can route a table for all or specific +tablet types to another table in the same or different keyspace. 
Here is an example using the same commerce keyspace +where we have an active [`MoveTables`](../../../reference/vreplication/movetables/) workflow to move tables to the +customer keyspace but we have not switched any traffic yet: + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/RoutingRules' +rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer.corder" to_tables:"commerce.corder"} rules:{from_table:"customer.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"commerce.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"corder" to_tables:"commerce.corder"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} +``` + +{{< info >}} +In practice you would instead typically view the routing rules via the +dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions/#getroutingrules) +vtctl client command which will return the rules for all keyspaces in the topo. 
+{{< /info >}} + +# How VTGate routes a query + +This section walks through a simplified version of the logic used to determine which keyspace and table vtgate will route +a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` +(a _write_ query). + +1. Check to see if `t1` has an appropriate routing rule defined. If so, use the specified target table as an alias for `t1`. +2. Locate the keyspace for `t1` using the [`VSchema`](../../../features/vschema/). +3. For a non-sharded keyspace locate the appropriate tablet (`PRIMARY`, by default) from the (cached) `SrvKeyspace` `local` +(per cell) topo record. +4. For a sharded keyspace the `SrvKeyspace` record is used to find the currently active shards. This is done by checking +the list of partitions for the specific tablet type selected for the query (`PRIMARY`, by default, for both reads and writes) +and selecting the ones whose `query_service_disabled` field is *not* set and whose `is_primary_serving` value is true. +5. Finally, based on the [`VIndex`](../../../features/vindexes/) defined for the table from the cached +[`VSchema`](../../../features/vschema/) (stored in the `global` topo), the shard for the relevant row is computed based +on the keyrange to which the id is mapped to using the declared [`VIndex` function/type](../../../features/vindexes/#predefined-vindexes). + +# Changes made to the topo when traffic is switched + +This document outlines the steps involved in the cutover process +of [`MoveTables`](../../movetables/) and [`Reshard`](../../reshard/) +workflows when traffic is switched from the source tables/shards to the target tables/shards. We use the resharding flow +provided in the [local examples](../../../../get-started/local/) and show the relevant snippets from the topo for each step +in the workflow. 
+ +{{< info >}} +Items in italics are topo keys and the following snippet the value of the key +{{< /info >}} + +## What happens when a Reshard is cutover + +For brevity we only show the records for the `80-` shard. There will be similar records for the `-80` shard. + +#### Before Resharding, after -80/80- shards are created + +Only shard `0` has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to `0` for both `PRIMARY` +and `REPLICA` tablet types. + +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627465761 nanoseconds:600070156} +is_primary_serving:true +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627465833 nanoseconds:536524508} +key_range:{start:"\x80"} +``` + +*zone1/keyspaces/customer/SrvKeyspace* + +```proto +partitions:{served_type:PRIMARY shard_references:{name:"0"}} +partitions:{served_type:REPLICA shard_references:{name:"0"}} +``` + +### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) + +Shard `0` still has the `is_primary_serving` set as true. The primary partition is still the same. 
+ +The replica partition has the following changes: + +* Two more shard_references for `-80` and `80-` +* Key ranges are specified for these shards +* The key range for shard `0` has been removed +* `query_service_disabled` is set to true for shard `0` + +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627466189 nanoseconds:587021377} +is_primary_serving:true +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627466263 nanoseconds:16201490} +key_range:{start:"\x80"}`` +``` + +_zone1/keyspaces/customer/SrvKeyspace_ + +```proto +partitions:{served_type:PRIMARY shard_references:{name:"0"}} + +partitions:{served_type:REPLICA + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" key_range:{start:"\x80"}} + shard_tablet_controls:{name:"0" query_service_disabled:true} + shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} + shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} +} +``` + +#### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) + +* `is_primary_serving` is removed from shard `0` +* `is_primary_serving` is added to shards `-80` and `80-` +* In the primary partition the shards `-80` and `80-` are added with their associated key ranges +* In the primary partition the key range for shard `0` is removed +* The replica partition remains the same as in the previous step + +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627466636 nanoseconds:405646818} +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627466710 nanoseconds:579634511} +key_range:{start:"\x80"} +is_primary_serving:true +``` + +*zone1/keyspaces/customer/SrvKeyspace* + +```proto 
+partitions:{served_type:PRIMARY + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" + key_range:{start:"\x80"}} +} {name:"0"} + +partitions:{served_type:REPLICA + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" key_range:{start:"\x80"}}} + shard_tablet_controls:{name:"0" query_service_disabled:true} + shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} + shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} +} +``` + +## What happens when a MoveTables workflow is cutover + +#### Before MoveTables is initiated + +The [`VSchema`](../../../features/vschema/) for the source keyspace contains the table name, so vtgate routes queries to that +keyspace. + +#### During MoveTables + +Both the source and target now contain the tables and both [`VSchemas`](../../../features/vschema/) refer to them. However we +have routing rules that map the tables for each tablet type from the target keyspace to the source keyspace. + +*global/RoutingRules* + +```proto +rules:{from_table:"customer" to_tables:"commerce.customer"} +rules:{from_table:"customer.customer" to_tables:"commerce.customer"} +rules:{from_table:"customer@replica" to_tables:"commerce.customer"} +rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} +``` + +#### On switching replica traffic to target + +The routing rules for replica targeted reads are updated to map the table on the source to the target. + +*global/RoutingRules* + +```proto +rules:{from_table:"customer.customer" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"customer" to_tables:"commerce.customer"} +rules:{from_table:"customer@replica" to_tables:"customer.customer"} +``` + +#### On switching primary traffic + +The routing rules for default read-write traffic are updated to map the table on the source to the target. 
In addition the +tables are added to the “denylist” on the source keyspace which `vttablet` uses to reject queries for these tables on the +old/inactive shards. + +*global/RoutingRules* + +```proto +rules:{from_table:"commerce.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"customer.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"commerce.customer" to_tables:"customer.customer"} +rules:{from_table:"customer" to_tables:"customer.customer"} +``` + +*global/keyspaces/commerce/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:100} +primary_term_start_time:{seconds:1627477340 nanoseconds:740407602} +tablet_controls:{tablet_type:PRIMARY denylisted_tables:"customer"} +is_primary_serving:true +``` + +# Miscellaneous Notes: + +* In VReplication workflows, cutovers are performed manually by the user executing the `SwitchTraffic` and `ReverseTraffic` +actions e.g. for a [`MoveTables`](../../movetables/#switchtraffic) or [`Reshard`](../../reshard/#reversetraffic) vtctl +client command. +* When traffic for `REPLICA` and `RDONLY` tablets is switched not all read traffic is switched: primary/default reads will +still be served from the source shards, until `PRIMARY` tablet traffic is also switched. diff --git a/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md new file mode 100644 index 000000000..1d58d9e97 --- /dev/null +++ b/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md @@ -0,0 +1,168 @@ +--- +title: Life of a stream +description: How VReplication replicates data +weight: 1 +aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] +--- + +### Introduction + +When a VReplication workflow runs, data is copied from source to target shards. 
Each target `PRIMARY` tablet runs one +vreplication stream (`vstream`) for each source shard that the target's +[keyrange](../../../features/sharding/#key-ranges-and-partitions) overlaps with. + +The diagram below outlines how one such stream operates. VReplication can be asked to start from a specific +[`GTID`](https://dev.mysql.com/doc/refman/en/replication-gtids-concepts.html) or from the start. When starting from a +[`GTID`](https://dev.mysql.com/doc/refman/en/replication-gtids-concepts.html) the _replication_ mode is used where it +streams events from the binlog. + +![VReplication Flow](/img/VReplicationFlow.png) + +#### Full table copy + +If the entire table data is requested then the simple streaming done by the _replication_ mode can create an avalanche +of events (think 100s of millions of rows). Moreover, and more importantly, it is highly likely that necessary older +binlogs are no longer available. + +So a _copy/catchup_ mode is initiated first: data in the tables are copied over in a consistent manner using bulk +inserts. Once we have copied enough data so that we are close enough to the current position (when replication lag is +low) it switches over to, and forever stays in, the _replication_ mode. All future replication is done only by +streaming binlog events. + +While we may have multiple database sources in a workflow each `vstream` has just one source and one target. The source is +always a `vttablet` (and hence one `mysqld` instance). The target could be another `vttablet` (when resharding) or a +streaming gRPC response (for [`vtgate` `vstream` API](../../vstream/) clients). + +#### Transformation and Filtering + +Note that for all steps the data selected from the source will only be from the tables specified +in the [`Match`](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL128C5) field of the Rule +specification of the VReplication workflow. 
Furthermore, if a +[`Filter`](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL133C5) is specified for a table it will +be applied before being sent to the target. Columns may also be transformed based on the Filter’s `SELECT` clause. + +#### Source and Sink + +Each stream has two actors: the target initiates streaming by making gRPC calls to the source tablet and the source +tablet sources the data by connecting to its underlying `mysqld` server as a replica (while replicating) or using SQL +queries (in the copy phase) and streams it to the target. The target takes appropriate action: in case of resharding it +will convert the events into CRUD SQL statements and apply them to the target database. In case of [`vtgate` `vstream` +API](../../vstream/) clients the events are forwarded by `vtgate` to the client. + +Note that the target always pulls data. If the source pushes data, there are chances of buffer overruns if the target is +not able to process them in time. For example, in resharding workflows we need to convert the events to SQL `INSERT` +statements and execute them on the target's mysqld instance, which are usually much slower than just selecting data on +the source. + +### Modes, in detail + +#### Replicate + +This is the easiest to understand. The source stream just acts like a MySQL replica and processes events as they are +received. Events, after any necessary filtering and transformation, are sent to the target. Replication runs +continuously with short sleeps when there are no more events to source. Periodic heartbeats are sent to the target to +signal liveness. You will see this reflected with the `Running` state for the workflow. + +#### Initialize + +Initialize is called at the start of the copy phase. For each table to be copied an entry is created in the internal +`_vt.copy_state` table with a null primary key (PK). 
As each table copy is completed the related entries are deleted +and when there are no more entries for this workflow the copy phase is considered complete and the workflow moves into +the replication mode which you will see reflected with the `Running` state for the workflow. + +#### Copy + +Copy works on one table at a time. The source selects a set of rows from the table, for primary keys greater than the +ones copied so far, using a consistent snapshot. This results in a stream of rows to be sent to the target which +generates a bulk `INSERT` for these rows. You will see this reflected with the `Copying` state for the workflow. + +However, there are a couple of factors which complicate our story: + +* Each copy selects all rows until the current position of the binlog, but, +* Since transactions continue to be applied (presuming the database is online) the GTID position is continuously +moving forward + +Consider this example: + +We have two tables `X` and `Y`. Each table has 20 rows and we copy 10 rows at a time. (The queries below are simplified +for readability). + +The queries for the copy phase of `X` will be: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: select * from X where pk > 10 limit 10; GTID: 110, Last PK 20 + + send rows to target +``` + +There is a gotcha here: consider that there are 10 new transactions or GTIDs between times T1 and T2. Some of these +can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have +only rows from PK 11 to 20) then we will have an inconsistent state on the target: the updates to rows with PK +between 1 and 10 will not be present. 
+ +This means that we need to first stream the events between GTIDs 100 and 110 for primary keys between 1 and 10 first +and then do the second select: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: replicate from 100 to current position (110 from previous example), + + only pass events for pks 1 to 10 of X + +T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 + + send rows to target +``` + +Another gotcha: note that at time T3 when we selected the PKs from 11 to 20 the GTID position could have moved further! +This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would +still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. + +This leads us to the following flow: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: replicate from 100 to current position (110 from previous example), + + only pass events for pks 1 to 10 + +T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 + +T4: replicate from 111 to 112 + + only pass events for pks 1 to 10 + +T5: Send rows for pks 11 to 20 to target +``` + +This flow actually works and is the one used in Vitess VReplication! + +The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is +quick. So the position can diverge much more at T2 than at T4. Hence, we call step T2 "Catchup" and step T4 +"Fast Forward". + +#### Catchup + +As detailed above the catchup phase runs between copy phase cycles (time limited by the +[`vreplication_copy_phase_max_duration`](../../flags/#vreplication_copy_phase_duration) flag). During the copy phase the +GTID position can move significantly ahead. So we run a catchup and fast-forward phase until we come close to the current +position — i.e. the replication lag is small. 
At that point we execute another Copy cycle. + +#### Fast forward + +During the copy phase we first take a snapshot. Then we fast-forward: we replicate from the gtid position where we stopped +the Catchup to the position of the new snapshot. + +Finally once we have finished copying all the tables we proceed to the replicate or `Running` phase until our job is done: +for example if we have resharded and switched over the reads and writes to the new shards or when the +[`vstream` API](../../vstream/) client closes its connection. diff --git a/content/en/docs/14.0/reference/vreplication/internal/rfcs.md b/content/en/docs/14.0/reference/vreplication/internal/rfcs.md new file mode 100644 index 000000000..ff32584be --- /dev/null +++ b/content/en/docs/14.0/reference/vreplication/internal/rfcs.md @@ -0,0 +1,10 @@ +--- +title: Selected VReplication RFCs +description: Links to the RFC Issues in the vitess repo, for convenience +weight: 100 +--- + +- [VDiff2: Reimplementing VDiff on tablets](https://github.com/vitessio/vitess/issues/10134) +- [VStream Copy: streaming events from the beginning](https://github.com/vitessio/vitess/issues/6277) +- [Cross-Cluster Data Migration](https://github.com/vitessio/vitess/issues/7545) +- [File:Position based VReplication](https://github.com/vitessio/vitess/issues/5424) diff --git a/content/en/docs/15.0/reference/vreplication/internal/cutover.md b/content/en/docs/15.0/reference/vreplication/internal/cutover.md new file mode 100644 index 000000000..f733a5e3c --- /dev/null +++ b/content/en/docs/15.0/reference/vreplication/internal/cutover.md @@ -0,0 +1,286 @@ +--- +title: How traffic is switched +description: How Vitess signals traffic cutover for Reshard and MoveTables +weight: 2 +aliases: ['/docs/design-docs/vreplication/cutover/'] +--- + +# Related persistent Vitess objects + +{{< info >}} +As the objects or keys noted below are stored in [the topo server](../../../features/topology-service/) and +cached locally, the processes involved 
will refresh their topo data throughout the cutover process. For example, each +tablet on the source and target shards that are involved in a [VReplication](../../) workflow +will refresh their topo data multiple times as the state of things transition during the cutover. If we are *not* able +to confirm that all tablets involved in a VReplication workflow are able to refresh their topo data then the cutover +command — e.g. [`vtctlclient SwitchTraffic`](../../../reference/vreplication/switchtraffic/) — will cancel the operation +and return an error indicating which tablet(s) are unhealthy (including for `--dry_run` executions). +{{< /info >}} + +## VSchema + +A [VSchema](../../../../concepts/vschema/) allows you to describe how data is organized within keyspaces and shards. + +## Shard Info + +The [`global` topo](../../../features/topology-service/#global-vs-local) contains +one [`Shard`](../../../features/topology-service/#shard) key per keyspace which then contains one key per +shard that has been created within the keyspace. For each shard that is healthy there is an +attribute `is_primary_serving` which is set to true. The other shards which have been created but are still not healthy +and serving within the keyspace will not have this attribute set. Here is an example shard info record from an unsharded +keyspace named commerce (without the `--cell` flag being passed the `global` topo base path is used): + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/keyspaces/commerce/shards/0/Shard' +primary_alias:{cell:"zone1" uid:100} primary_term_start_time:{seconds:1650341417 nanoseconds:374817485} is_primary_serving:true +``` + +## SrvKeyspace + +Each cell has a [`SrvKeyspace`](../../../features/topology-service/#srvkeyspace) key in +the [`local` topo](../../../features/topology-service/#global-vs-local) (per cell info) for each keyspace. For +each tablet type (e.g. `PRIMARY` or `REPLICA`) there is one `partitions` object. 
The `partitions` objects contain all of the +current shards in the keyspace. For sharded keyspaces, the tablets which are healthy and serving have a key range specified +for that shard. + +Also the primary can contain a `query_service_disabled` attribute which is set to `true` during resharding cutovers. +This tells the primary in that shard to reject any queries made to it, as a signal to vtgate in case vtgate routes +queries to this primary during the cutover or before it discovers the new serving graph. Here is an example using the +same unsharded commerce keyspace and here we specify the `--cell` flag so that cell's topo base path — stored in +its `CellInfo` record in the `global` topo — is used: + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/cells/zone1/CellInfo' +server_address:"localhost:2379" root:"/vitess/zone1" + +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto --cell=zone1 '/keyspaces/commerce/SrvKeyspace' +partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} partitions:{served_type:RDONLY shard_references:{name:"0"}} +``` + +## Routing Rules + +[Routing Rules](../../../features/schema-routing-rules) are stored in the `RoutingRules` key within +the `global` topo. Routing Rules contain a list of table-specific routes. You can route a table for all or specific +tablet types to another table in the same or different keyspace. 
Here is an example using the same commerce keyspace +where we have an active [`MoveTables`](../../../reference/vreplication/movetables/) workflow to move tables to the +customer keyspace but we have not switched any traffic yet: + +```bash +$ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/RoutingRules' +rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer.corder" to_tables:"commerce.corder"} rules:{from_table:"customer.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"commerce.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"corder" to_tables:"commerce.corder"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} +``` + +{{< info >}} +In practice you would instead typically view the routing rules via the +dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions/#getroutingrules) +vtctl client command which will return the rules for all keyspaces in the topo. 
+{{< /info >}} + +# How VTGate routes a query + +This section walks through a simplified version of the logic used to determine which keyspace and table vtgate will route +a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` +(a _write_ query). + +1. Check to see if `t1` has an appropriate routing rule defined. If so, use the specified target table as an alias for `t1`. +2. Locate the keyspace for `t1` using the [`VSchema`](../../../features/vschema/). +3. For a non-sharded keyspace locate the appropriate tablet (`PRIMARY`, by default) from the (cached) `SrvKeyspace` `local` +(per cell) topo record. +4. For a sharded keyspace the `SrvKeyspace` record is used to find the currently active shards. This is done by checking +the list of partitions for the specific tablet type selected for the query (`PRIMARY`, by default, for both reads and writes) +and selecting the ones whose `query_service_disabled` field is *not* set and whose `is_primary_serving` value is true. +5. Finally, based on the [`VIndex`](../../../features/vindexes/) defined for the table from the cached +[`VSchema`](../../../features/vschema/) (stored in the `global` topo), the shard for the relevant row is computed based +on the keyrange to which the id is mapped to using the declared [`VIndex` function/type](../../../features/vindexes/#predefined-vindexes). + +# Changes made to the topo when traffic is switched + +This document outlines the steps involved in the cutover process +of [`MoveTables`](../../movetables/) and [`Reshard`](../../reshard/) +workflows when traffic is switched from the source tables/shards to the target tables/shards. We use the resharding flow +provided in the [local examples](../../../../get-started/local/) and show the relevant snippets from the topo for each step +in the workflow. 
+ +{{< info >}} +Items in italics are topo keys and the following snippet the value of the key +{{< /info >}} + +## What happens when a Reshard is cutover + +For brevity we only show the records for the `80-` shard. There will be similar records for the `-80` shard. + +#### Before Resharding, after -80/80- shards are created + +Only shard `0` has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to `0` for both `PRIMARY` +and `REPLICA` tablet types. + +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627465761 nanoseconds:600070156} +is_primary_serving:true +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627465833 nanoseconds:536524508} +key_range:{start:"\x80"} +``` + +*zone1/keyspaces/customer/SrvKeyspace* + +```proto +partitions:{served_type:PRIMARY shard_references:{name:"0"}} +partitions:{served_type:REPLICA shard_references:{name:"0"}} +``` + +### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) + +Shard `0` still has the `is_primary_serving` set as true. The primary partition is still the same. 
+ +The replica partition has the following changes: + +* Two more shard_references for `-80` and `80-` +* Key ranges are specified for these shards +* The key range for shard `0` has been removed +* `query_service_disabled` is set to true for shard `0` + +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627466189 nanoseconds:587021377} +is_primary_serving:true +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627466263 nanoseconds:16201490} +key_range:{start:"\x80"}`` +``` + +_zone1/keyspaces/customer/SrvKeyspace_ + +```proto +partitions:{served_type:PRIMARY shard_references:{name:"0"}} + +partitions:{served_type:REPLICA + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" key_range:{start:"\x80"}} + shard_tablet_controls:{name:"0" query_service_disabled:true} + shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} + shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} +} +``` + +#### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) + +* `is_primary_serving` is removed from shard `0` +* `is_primary_serving` is added to shards `-80` and `80-` +* In the primary partition the shards `-80` and `80-` are added with their associated key ranges +* In the primary partition the key range for shard `0` is removed +* The replica partition remains the same as in the previous step + +*global/keyspaces/customer/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:200} +primary_term_start_time:{seconds:1627466636 nanoseconds:405646818} +``` + +*global/keyspaces/customer/shards/80-/Shard* + +```proto +primary_alias:{cell:"zone1" uid:400} +primary_term_start_time:{seconds:1627466710 nanoseconds:579634511} +key_range:{start:"\x80"} +is_primary_serving:true +``` + +*zone1/keyspaces/customer/SrvKeyspace* + +```proto 
+partitions:{served_type:PRIMARY + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" + key_range:{start:"\x80"}} +} {name:"0"} + +partitions:{served_type:REPLICA + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" key_range:{start:"\x80"}}} + shard_tablet_controls:{name:"0" query_service_disabled:true} + shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} + shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} +} +``` + +## What happens when a MoveTables workflow is cutover + +#### Before MoveTables is initiated + +The [`VSchema`](../../../features/vschema/) for the source keyspace contains the table name, so vtgate routes queries to that +keyspace. + +#### During MoveTables + +Both the source and target now contain the tables and both [`VSchemas`](../../../features/vschema/) refer to them. However we +have routing rules that map the tables for each tablet type from the target keyspace to the source keyspace. + +*global/RoutingRules* + +```proto +rules:{from_table:"customer" to_tables:"commerce.customer"} +rules:{from_table:"customer.customer" to_tables:"commerce.customer"} +rules:{from_table:"customer@replica" to_tables:"commerce.customer"} +rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} +``` + +#### On switching replica traffic to target + +The routing rules for replica targeted reads are updated to map the table on the source to the target. + +*global/RoutingRules* + +```proto +rules:{from_table:"customer.customer" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"customer" to_tables:"commerce.customer"} +rules:{from_table:"customer@replica" to_tables:"customer.customer"} +``` + +#### On switching primary traffic + +The routing rules for default read-write traffic are updated to map the table on the source to the target. 
In addition the +tables are added to the “denylist” on the source keyspace which `vttablet` uses to reject queries for these tables on the +old/inactive shards. + +*global/RoutingRules* + +```proto +rules:{from_table:"commerce.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"customer.customer@replica" to_tables:"customer.customer"} +rules:{from_table:"commerce.customer" to_tables:"customer.customer"} +rules:{from_table:"customer" to_tables:"customer.customer"} +``` + +*global/keyspaces/commerce/shards/0/Shard* + +```proto +primary_alias:{cell:"zone1" uid:100} +primary_term_start_time:{seconds:1627477340 nanoseconds:740407602} +tablet_controls:{tablet_type:PRIMARY denylisted_tables:"customer"} +is_primary_serving:true +``` + +# Miscellaneous Notes: + +* In VReplication workflows, cutovers are performed manually by the user executing the `SwitchTraffic` and `ReverseTraffic` +actions e.g. for a [`MoveTables`](../../movetables/#switchtraffic) or [`Reshard`](../../reshard/#reversetraffic) vtctl +client command. +* When traffic for `REPLICA` and `RDONLY` tablets is switched not all read traffic is switched: primary/default reads will +still be served from the source shards, until `PRIMARY` tablet traffic is also switched. diff --git a/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md new file mode 100644 index 000000000..1d58d9e97 --- /dev/null +++ b/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md @@ -0,0 +1,168 @@ +--- +title: Life of a stream +description: How VReplication replicates data +weight: 1 +aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] +--- + +### Introduction + +When a VReplication workflow runs, data is copied from source to target shards. 
Each target `PRIMARY` tablet runs one +vreplication stream (`vstream`) for each source shard that the target's +[keyrange](../../../features/sharding/#key-ranges-and-partitions) overlaps with. + +The diagram below outlines how one such stream operates. VReplication can be asked to start from a specific +[`GTID`](https://dev.mysql.com/doc/refman/en/replication-gtids-concepts.html) or from the start. When starting from a +[`GTID`](https://dev.mysql.com/doc/refman/en/replication-gtids-concepts.html) the _replication_ mode is used where it +streams events from the binlog. + +![VReplication Flow](/img/VReplicationFlow.png) + +#### Full table copy + +If the entire table data is requested then the simple streaming done by the _replication_ mode can create an avalanche +of events (think 100s of millions of rows). Moreover, and more importantly, it is highly likely that necessary older +binlogs are no longer available. + +So a _copy/catchup_ mode is initiated first: data in the tables are copied over in a consistent manner using bulk +inserts. Once we have copied enough data so that we are close enough to the current position (when replication lag is +low) it switches over to, and forever stays in, the _replication_ mode. All future replication is done only by +streaming binlog events. + +While we may have multiple database sources in a workflow each `vstream` has just one source and one target. The source is +always a `vttablet` (and hence one `mysqld` instance). The target could be another `vttablet` (when resharding) or a +streaming gRPC response (for [`vtgate` `vstream` API](../../vstream/) clients). + +#### Transformation and Filtering + +Note that for all steps the data selected from the source will only be from the tables specified +in the [`Match`](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL128C5) field of the Rule +specification of the VReplication workflow. 
Furthermore, if a +[`Filter`](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL133C5) is specified for a table it will +be applied before being sent to the target. Columns may also be transformed based on the Filter’s `SELECT` clause. + +#### Source and Sink + +Each stream has two actors: the target initiates streaming by making gRPC calls to the source tablet and the source +tablet sources the data by connecting to its underlying `mysqld` server as a replica (while replicating) or using SQL +queries (in the copy phase) and streams it to the target. The target takes appropriate action: in case of resharding it +will convert the events into CRUD SQL statements and apply them to the target database. In case of [`vtgate` `vstream` +API](../../vstream/) clients the events are forwarded by `vtgate` to the client. + +Note that the target always pulls data. If the source pushes data, there are chances of buffer overruns if the target is +not able to process them in time. For example, in resharding workflows we need to convert the events to SQL `INSERT` +statements and execute them on the target's mysqld instance, which are usually much slower than just selecting data on +the source. + +### Modes, in detail + +#### Replicate + +This is the easiest to understand. The source stream just acts like a MySQL replica and processes events as they are +received. Events, after any necessary filtering and transformation, are sent to the target. Replication runs +continuously with short sleeps when there are no more events to source. Periodic heartbeats are sent to the target to +signal liveness. You will see this reflected with the `Running` state for the workflow. + +#### Initialize + +Initialize is called at the start of the copy phase. For each table to be copied an entry is created in the internal +`_vt.copy_state` table with a null primary key (PK). 
As each table copy is completed the related entries are deleted +and when there are no more entries for this workflow the copy phase is considered complete and the workflow moves into +the replication mode which you will see reflected with the `Running` state for the workflow. + +#### Copy + +Copy works on one table at a time. The source selects a set of rows from the table, for primary keys greater than the +ones copied so far, using a consistent snapshot. This results in a stream of rows to be sent to the target which +generates a bulk `INSERT` for these rows. You will see this reflected with the `Copying` state for the workflow. + +However, there are a couple of factors which complicate our story: + +* Each copy selects all rows until the current position of the binlog, but, +* Since transactions continue to be applied (presuming the database is online) the GTID position is continuously +moving forward + +Consider this example: + +We have two tables `X` and `Y`. Each table has 20 rows and we copy 10 rows at a time. (The queries below are simplified +for readability). + +The queries for the copy phase of `X` will be: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: select * from X where pk > 10 limit 10; GTID: 110, Last PK 20 + + send rows to target +``` + +There is a gotcha here: consider that there are 10 new transactions or GTIDs between times T1 and T2. Some of these +can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have +only rows from PK 11 to 20) then we will have an inconsistent state on the target: the updates to rows with PK +between 1 and 10 will not be present. 
+ +This means that we need to first stream the events between GTIDs 100 and 110 for primary keys between 1 and 10 first +and then do the second select: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: replicate from 100 to current position (110 from previous example), + + only pass events for pks 1 to 10 of X + +T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 + + send rows to target +``` + +Another gotcha: note that at time T3 when we selected the PKs from 11 to 20 the GTID position could have moved further! +This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would +still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. + +This leads us to the following flow: + +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 + + send rows to target + +T2: replicate from 100 to current position (110 from previous example), + + only pass events for pks 1 to 10 + +T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 + +T4: replicate from 111 to 112 + + only pass events for pks 1 to 10 + +T5: Send rows for pks 11 to 20 to target +``` + +This flow actually works and is the one used in Vitess VReplication! + +The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is +quick. So the position can diverge much more at T2 than at T4. Hence, we call step T2 "Catchup" and step T4 +"Fast Forward". + +#### Catchup + +As detailed above the catchup phase runs between copy phase cycles (time limited by the +[`vreplication_copy_phase_max_duration`](../../flags/#vreplication_copy_phase_duration) flag). During the copy phase the +GTID position can move significantly ahead. So we run a catchup and fast-forward phase until we come close to the current +position — i.e. the replication lag is small. 
At that point we execute another Copy cycle. + +#### Fast forward + +During the copy phase we first take a snapshot. Then we fast-forward: we replicate from the gtid position where we stopped +the Catchup to the position of the new snapshot. + +Finally once we have finished copying all the tables we proceed to the replicate or `Running` phase until our job is done: +for example if we have resharded and switched over the reads and writes to the new shards or when the +[`vstream` API](../../vstream/) client closes its connection. diff --git a/content/en/docs/15.0/reference/vreplication/internal/rfcs.md b/content/en/docs/15.0/reference/vreplication/internal/rfcs.md new file mode 100644 index 000000000..ff32584be --- /dev/null +++ b/content/en/docs/15.0/reference/vreplication/internal/rfcs.md @@ -0,0 +1,10 @@ +--- +title: Selected VReplication RFCs +description: Links to the RFC Issues in the vitess repo, for convenience +weight: 100 +--- + +- [VDiff2: Reimplementing VDiff on tablets](https://github.com/vitessio/vitess/issues/10134) +- [VStream Copy: streaming events from the beginning](https://github.com/vitessio/vitess/issues/6277) +- [Cross-Cluster Data Migration](https://github.com/vitessio/vitess/issues/7545) +- [File:Position based VReplication](https://github.com/vitessio/vitess/issues/5424) diff --git a/content/en/docs/15.0/reference/vreplication/progress.md b/content/en/docs/15.0/reference/vreplication/progress.md index d26c5b9df..717f208aa 100644 --- a/content/en/docs/15.0/reference/vreplication/progress.md +++ b/content/en/docs/15.0/reference/vreplication/progress.md @@ -16,7 +16,7 @@ MoveTables/Reshard Progress ``` ### Description -Workflows start in the copy state, (details in [VReplication Life of a stream](../../../design-docs/vreplication/life-of-a-stream), doing a bulk copy of the tables involved until they reach a low replication lag, after which we stream binlog events. Tables are copied sequentially. 
+Workflows start in the copy state, (details in [VReplication Life of a stream](../internal/life-of-a-stream/), doing a bulk copy of the tables involved until they reach a low replication lag, after which we stream binlog events. Tables are copied sequentially. `Progress` reports the progress of a workflow by showing the percentage of data copied across targets, if workflow is in copy state, and the replication lag between the target and the source once the copy phase is completed. diff --git a/content/en/docs/16.0/reference/vreplication/internal/cutover.md b/content/en/docs/16.0/reference/vreplication/internal/cutover.md index 8c3276c9e..f733a5e3c 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/16.0/reference/vreplication/internal/cutover.md @@ -2,28 +2,29 @@ title: How traffic is switched description: How Vitess signals traffic cutover for Reshard and MoveTables weight: 2 +aliases: ['/docs/design-docs/vreplication/cutover/'] --- # Related persistent Vitess objects {{< info >}} -As the objects or keys noted below are stored in [the topo server](../../../reference/features/topology-service/) and +As the objects or keys noted below are stored in [the topo server](../../../features/topology-service/) and cached locally, the processes involved will refresh their topo data throughout the cutover process. For example, each -tablet on the source and target shards that are involved in a [VReplication](../../../reference/vreplication/) workflow +tablet on the source and target shards that are involved in a [VReplication](../../) workflow will refresh their topo data multiple times as the state of things transition during the cutover. If we are *not* able to confirm that all tablets involved in a VReplication worfklow are able to refresh their topo data then the cutover command — e.g. 
[`vtctlclient SwitchTraffic`](../../../reference/vreplication/switchtraffic/) — will cancel the operation -and return an error indicating which tablet(s) is unhealthy (including `--dry_run` executions). +and return an error indicating which tablet(s) are unhealthy (including for `--dry_run` executions). {{< /info >}} ## VSchema -A [VSchema](../../../concepts/vschema/) allows you to describe how data is organized within keyspaces and shards. +A [VSchema](../../../../concepts/vschema/) allows you to describe how data is organized within keyspaces and shards. ## Shard Info -The [`global` topo](../../../reference/features/topology-service/#global-vs-local) contains -one [`Shard`](../../../reference/features/topology-service/#shard) key per keyspace which then contains one key per +The [`global` topo](../../../features/topology-service/#global-vs-local) contains +one [`Shard`](../../../features/topology-service/#shard) key per keyspace which then contains one key per shard that has been created within the keyspace. For each shard that is healthy there is an attribute `is_primary_serving` which is set to true. The other shards which have been created but are still not healthy and serving within the keyspace will not have this attribute set. Here is an example shard info record from an unsharded @@ -36,11 +37,11 @@ primary_alias:{cell:"zone1" uid:100} primary_term_start_time:{seconds:1650341417 ## SrvKeyspace -Each cell has a [`SrvKeyspace`](../../../reference/features/topology-service/#srvkeyspace) key in -the [`local` topo](../../../reference/features/topology-service/#global-vs-local) (per cell info) for each keyspace. For -each tablet type (primary/replica) there is one `partitions` object. The `partitions` objects contain all of the current -shards in the keyspace. For sharded keyspaces, the tablets which are healthy and serving have a key range specified for -that shard. 
+Each cell has a [`SrvKeyspace`](../../../features/topology-service/#srvkeyspace) key in +the [`local` topo](../../../features/topology-service/#global-vs-local) (per cell info) for each keyspace. For +each tablet type (e.g. `PRIMARY` or `REPLICA`) there is one `partitions` object. The `partitions` objects contain all of the +current shards in the keyspace. For sharded keyspaces, the tablets which are healthy and serving have a key range specified +for that shard. Also the primary can contain a `query_service_disabled` attribute which is set to `true` during resharding cutovers. This tells the primary in that shard to reject any queries made to it, as a signal to vtgate in case vtgate routes @@ -58,7 +59,7 @@ partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_ ## Routing Rules -[Routing Rules](../../../reference/features/schema-routing-rules) are stored in the `RoutingRules` key within +[Routing Rules](../../../features/schema-routing-rules) are stored in the `RoutingRules` key within the `global` topo. Routing Rules contain a list of table-specific routes. You can route a table for all or specific tablet types to another table in the same or different keyspace. Here is an example using the same commerce keyspace where we have an active [`MoveTables`](../../../reference/vreplication/movetables/) workflow to move tables to the @@ -71,88 +72,93 @@ rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table {{< info >}} In practice you would instead typically view the routing rules via the -dedicated [`vtctl GetRoutingRules`](../../../reference/programs/vtctl/schema-version-permissions/#getroutingrules) -command which will return the rules for all keyspaces in the topo. +dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions/#getroutingrules) +vtctl client command which will return the rules for all keyspaces in the topo. 
{{< /info >}} # How VTGate routes a query -This section gives a simplified logic used to determine which keyspace and table vtgate will route a simple query of the -form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` (a _write_ query). - -* Check to see if t1 has an appropriate routing rule defined. If so, use the specified target table as an alias for t1 -* Locate the keyspace for t1 using the VSchema -* For a non-sharded keyspace locate the appropriate tablet (primary, by default) from the ( - cached) `SrvKeyspace` `local` (per cell) topo record. -* For a sharded keyspace the `SrvKeyspace` record is used to find the currently active shards. This is done by checking - the list of partitions for the specific tablet type selected for the query (primary, by default, for reads and writes) - and selecting the ones whose `query_service_disabled` is not set and whose `is_primary_serving` is true. -* Finally, based on the vindex for the table from the cached `VSchema` (stored in the `global` topo), the shard for the - relevant row is computed based on the keyrange to which the id is mapped to using the declared vindex function/type. +This section walks through a simplified version of the logic used to determine which keyspace and table vtgate will route +a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` +(a _write_ query). + +1. Check to see if `t1` has an appropriate routing rule defined. If so, use the specified target table as an alias for `t1`. +2. Locate the keyspace for `t1` using the [`VSchema`](../../../features/vschema/). +3. For a non-sharded keyspace locate the appropriate tablet (`PRIMARY`, by default) from the (cached) `SrvKeyspace` `local` +(per cell) topo record. +4. For a sharded keyspace the `SrvKeyspace` record is used to find the currently active shards. 
This is done by checking +the list of partitions for the specific tablet type selected for the query (`PRIMARY`, by default, for both reads and writes) +and selecting the ones whose `query_service_disabled` field is *not* set and whose `is_primary_serving` value is true. +5. Finally, based on the [`VIndex`](../../../features/vindexes/) defined for the table from the cached +[`VSchema`](../../../features/vschema/) (stored in the `global` topo), the shard for the relevant row is computed based +on the keyrange to which the id is mapped to using the declared [`VIndex` function/type](../../../features/vindexes/#predefined-vindexes). # Changes made to the topo when traffic is switched This document outlines the steps involved in the cutover process -of [`MoveTables`](../../../reference/vreplication/movetables/) and [`Reshard`](../../../reference/vreplication/reshard/) +of [`MoveTables`](../../movetables/) and [`Reshard`](../../reshard/) workflows when traffic is switched from the source tables/shards to the target tables/shards. We use the resharding flow -provided in the local examples and show the relevant snippets from the topo for each step in the workflow. +provided in the [local examples](../../../../get-started/local/) and show the relevant snippets from the topo for each step +in the workflow. -Note: Items in italics are topo keys and the following snippet the value of the key +{{< info >}} +Items in italics are topo keys and the following snippet the value of the key +{{< /info >}} ## What happens when a Reshard is cutover -For brevity we only show the records for the 80- shard. There will be similar records for the -80 shard. +For brevity we only show the records for the `80-` shard. There will be similar records for the `-80` shard. #### Before Resharding, after -80/80- shards are created -Only shard 0 has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to 0 for both primary -and replica. 
+Only shard `0` has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to `0` for both `PRIMARY` +and `REPLICA` tablet types. -_global/keyspaces/customer/shards/0/Shard_ +*global/keyspaces/customer/shards/0/Shard* -``` +```proto primary_alias:{cell:"zone1" uid:200} primary_term_start_time:{seconds:1627465761 nanoseconds:600070156} is_primary_serving:true ``` -_global/keyspaces/customer/shards/80-/Shard_ +*global/keyspaces/customer/shards/80-/Shard* -``` +```proto primary_alias:{cell:"zone1" uid:400} primary_term_start_time:{seconds:1627465833 nanoseconds:536524508} key_range:{start:"\x80"} ``` -_zone1/keyspaces/customer/SrvKeyspace_ +*zone1/keyspaces/customer/SrvKeyspace* -``` +```proto partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} ``` ### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) -Shard 0 still has the `is_primary_serving` set as true. The primary partition is still the same. +Shard `0` still has the `is_primary_serving` set as true. The primary partition is still the same. 
The replica partition has the following changes: -* Two more shard_references for -80 and 80- +* Two more shard_references for `-80` and `80-` * Key ranges are specified for these shards -* The key range for shard 0 has been removed -* `query_service_disabled` is set to true for shard 0 +* The key range for shard `0` has been removed +* `query_service_disabled` is set to true for shard `0` -_global/keyspaces/customer/shards/0/Shard_ +*global/keyspaces/customer/shards/0/Shard* -``` +```proto primary_alias:{cell:"zone1" uid:200} primary_term_start_time:{seconds:1627466189 nanoseconds:587021377} is_primary_serving:true ``` -_global/keyspaces/customer/shards/80-/Shard_ +*global/keyspaces/customer/shards/80-/Shard* -``` +```proto primary_alias:{cell:"zone1" uid:400} primary_term_start_time:{seconds:1627466263 nanoseconds:16201490} key_range:{start:"\x80"}`` @@ -160,71 +166,75 @@ key_range:{start:"\x80"}`` _zone1/keyspaces/customer/SrvKeyspace_ -``` +```proto partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA -shard_references:{name:"-80" key_range:{end:"\x80"}} -shard_references:{name:"80-" key_range:{start:"\x80"}} -shard_tablet_controls:{name:"0" query_service_disabled:true} -shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} -shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}}} + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" key_range:{start:"\x80"}} + shard_tablet_controls:{name:"0" query_service_disabled:true} + shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} + shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} +} ``` #### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) -* `is_primary_serving` is removed from shard 0 -* `is_primary_serving` is added to shards -80 and 80- -* In the primary partition the shards -80 and 80- are added with associated key ranges -* In the primary partition the key range for shard 0 
are removed -* The replica partition is the same as in the previous step +* `is_primary_serving` is removed from shard `0` +* `is_primary_serving` is added to shards `-80` and `80-` +* In the primary partition the shards `-80` and `80-` are added with their associated key ranges +* In the primary partition the key range for shard `0` is removed +* The replica partition remains the same as in the previous step -_global/keyspaces/customer/shards/0/Shard_ +*global/keyspaces/customer/shards/0/Shard* -``` +```proto primary_alias:{cell:"zone1" uid:200} primary_term_start_time:{seconds:1627466636 nanoseconds:405646818} ``` -_global/keyspaces/customer/shards/80-/Shard_ +*global/keyspaces/customer/shards/80-/Shard* -``` +```proto primary_alias:{cell:"zone1" uid:400} primary_term_start_time:{seconds:1627466710 nanoseconds:579634511} key_range:{start:"\x80"} is_primary_serving:true ``` -_zone1/keyspace/customer/SrvKeyspace_ +*zone1/keyspaces/customer/SrvKeyspace* -``` +```proto partitions:{served_type:PRIMARY -shard_references:{name:"-80" key_range:{end:"\x80"}} -shard_references:{name:"80-" -key_range:{start:"\x80"}}} {name:"0"} + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" + key_range:{start:"\x80"}} +} {name:"0"} partitions:{served_type:REPLICA -shard_references:{name:"-80" key_range:{end:"\x80"}} -shard_references:{name:"80-" key_range:{start:"\x80"}}} -shard_tablet_controls:{name:"0" query_service_disabled:true} -shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} -shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} + shard_references:{name:"-80" key_range:{end:"\x80"}} + shard_references:{name:"80-" key_range:{start:"\x80"}}} + shard_tablet_controls:{name:"0" query_service_disabled:true} + shard_tablet_controls:{name:"-80" key_range:{end:"\x80"}} + shard_tablet_controls:{name:"80-" key_range:{start:"\x80"}} +} ``` ## What happens when a MoveTables workflow is cutover #### Before MoveTables is initiated -VSchema for 
the source keyspace contains the table name, so vtgate routes to that keyspace +The [`VSchema`](../../../features/vschema/) for the source keyspace contains the table name, so vtgate routes queries to that +keyspace. #### During MoveTables -Both the source and target now contain the tables and both VSchemas refer to them. However we have routing rules that -map the tables for each tablet type from the target keyspace to the other +Both the source and target now contain the tables and both [`VSchemas`](../../../features/vschema/) refer to them. However we +have routing rules that map the tables for each tablet type from the target keyspace to the source keyspace. -_global/RoutingRules_ +*global/RoutingRules* -``` +```proto rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} @@ -233,11 +243,11 @@ rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} #### On switching replica traffic to target -The routing rules for replicas are updated to map the table on the source to the target +The routing rules for replica targeted reads are updated to map the table on the source to the target. -_global/RoutingRules_ +*global/RoutingRules* -``` +```proto rules:{from_table:"customer.customer" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"customer.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} @@ -245,22 +255,22 @@ rules:{from_table:"customer@replica" to_tables:"customer.customer"} #### On switching primary traffic -The routing rules for the primary are updated to map the table on the source to the target. In addition the tables are -added to the “denylist” on the source keyspace which vttablet uses to reject writes for tables that have moved. 
The -denylist/routing rules are temporary and can be removed since the moved tables will only appear in the target VSchema +The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the +tables are added to the “denylist” on the source keyspace which `vttablet` uses to reject queries for these tables on the +old/inactive shards. -_global/RoutingRules_ +*global/RoutingRules* -``` +```proto rules:{from_table:"commerce.customer@replica" to_tables:"customer.customer"} rules:{from_table:"customer.customer@replica" to_tables:"customer.customer"} rules:{from_table:"commerce.customer" to_tables:"customer.customer"} rules:{from_table:"customer" to_tables:"customer.customer"} ``` -_global/keyspaces/commerce/shards/0/Shard_ +*global/keyspaces/commerce/shards/0/Shard* -``` +```proto primary_alias:{cell:"zone1" uid:100} primary_term_start_time:{seconds:1627477340 nanoseconds:740407602} tablet_controls:{tablet_type:PRIMARY denylisted_tables:"customer"} @@ -269,8 +279,8 @@ is_primary_serving:true # Miscellaneous Notes: -* In VReplication workflows, cutovers are performed manually by the user executing the `vtctl` - commands [`SwitchTraffic`](../../../reference/vreplication/switchtraffic/) - and [`ReverseTraffic`](../../../reference/vreplication/reversetraffic/) -* When traffic for replica and rdonly tablets is switched not all read traffic is switched: primary reads will still be - from the source shards, until primary traffic is also switched. +* In VReplication workflows, cutovers are performed manually by the user executing the `SwitchTraffic` and `ReverseTraffic` +actions e.g. for a [`MoveTables`](../../movetables/#switchtraffic) or [`Reshard`](../../reshard/#reversetraffic) vtctl +client command. +* When traffic for `REPLICA` and `RDONLY` tablets is switched not all read traffic is switched: primary/default reads will +still be served from the source shards, until `PRIMARY` tablet traffic is also switched. 
diff --git a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md index 8de4cbb94..1d58d9e97 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md @@ -7,104 +7,108 @@ aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] ### Introduction -When a VReplication workflow runs, data is copied from source to target shards. Each target primary runs one -vreplication stream (vstream) for each source shard that the -target's [keyrange](https://vitess.io/docs/16.0/reference/features/sharding/#key-ranges-and-partitions) overlaps with. +When a VReplication workflow runs, data is copied from source to target shards. Each target `PRIMARY` tablet runs one +vreplication stream (`vstream`) for each source shard that the target's +[keyrange](../../../features/sharding/#key-ranges-and-partitions) overlaps with. The diagram below outlines how one such stream operates. VReplication can be asked to start from a specific -GTID or from the start. When starting from a GTID the _replication_ mode is used where it streams events from the -binlog. +[`GTID`](https://dev.mysql.com/doc/refman/en/replication-gtids-concepts.html) or from the start. When starting from a +[`GTID`](https://dev.mysql.com/doc/refman/en/replication-gtids-concepts.html) the _replication_ mode is used where it +streams events from the binlog. ![VReplication Flow](/img/VReplicationFlow.png) #### Full table copy -If an entire table data is requested simple streaming done by _replication_ can create an avalanche of events (think 10s -of millions of rows). Moreover, it is highly likely that earlier binlogs are no longer available. +If the entire table data is requested then the simple streaming done by the _replication_ mode can create an avalanche +of events (think 100s of millions of rows). 
Moreover, and more importantly, it is highly likely that necessary older
+binlogs are no longer available.
Furthermore, if a -[Filter](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL133C5) is specified for a table it will -be applied before being sent to the target. Columns may also be transformed based on the Filter’s select clause. +[`Filter`](https://github.com/vitessio/vitess/blob/main/proto/binlogdata.proto#LL133C5) is specified for a table it will +be applied before being sent to the target. Columns may also be transformed based on the Filter’s `SELECT` clause. #### Source and Sink -Each stream has two actors: the target initiates streaming by making grpc calls to the source tablet and the source -tablet sources the data by connecting to its underlying mysql server as a replica (while replicating) or using sql -queries (in the coy phase) and streams it to the target. The target takes appropriate action: in case of resharding it -will convert the events into CRUD sql statements and apply them to the target database. In case of vstream clients the -events are forwarded by vtgate to the client. +Each stream has two actors: the target initiates streaming by making gRPC calls to the source tablet and the source +tablet sources the data by connecting to its underlying `mysqld` server as a replica (while replicating) or using SQL +queries (in the copy phase) and streams it to the target. The target takes appropriate action: in case of resharding it +will convert the events into CRUD SQL statements and apply them to the target database. In case of [`vtgate` `vstream` +API](../../vstream/) clients the events are forwarded by `vtgate` to the client. Note that the target always pulls data. If the source pushes data, there are chances of buffer overruns if the target is -not able to process them in time. For example, in resharding workflows we need to convert the events to sql insert -statements and execute them on the target's mysql server, which are usually much slower than just selecting data on the -source. +not able to process them in time. 
For example, in resharding workflows we need to convert the events to SQL `INSERT` +statements and execute them on the target's mysqld instance, which are usually much slower than just selecting data on +the source. ### Modes, in detail #### Replicate -This is the easiest to understand. The source stream just acts like a mysql replica and processes events as they are +This is the easiest to understand. The source stream just acts like a MySQL replica and processes events as they are received. Events, after any necessary filtering and transformation, are sent to the target. Replication runs continuously with short sleeps when there are no more events to source. Periodic heartbeats are sent to the target to -signal liveliness. +signal liveness. You will see this reflected with the `Running` state for the workflow. #### Initialize -Initialize is called at the start of the copy phase. For each table to be copied an entry is created in \_vt.copy_state -with a null primary key. As each table copy is completed the related entry is deleted and when there are no more entries -for this workflow the copy phase is considered complete and the workflow moves into the Replication mode. +Initialize is called at the start of the copy phase. For each table to be copied an entry is created in the internal +`_vt.copy_state` table with a null primary key (PK). As each table copy is completed the related entries are deleted +and when there are no more entries for this workflow the copy phase is considered complete and the workflow moves into +the replication mode which you will see reflected with the `Running` state for the workflow. #### Copy Copy works on one table at a time. The source selects a set of rows from the table, for primary keys greater than the ones copied so far, using a consistent snapshot. This results in a stream of rows to be sent to the target which -generates a bulk insert of these rows. +generates a bulk `INSERT` for these rows. 
You will see this reflected with the `Copying` state for the workflow. However, there are a couple of factors which complicate our story: * Each copy selects all rows until the current position of the binlog, but, -* Since transactions continue to be applied (presuming the database is online) the gtid positions are continuously - moving forward +* Since transactions continue to be applied (presuming the database is online) the GTID position is continuously +moving forward -Consider this example. +Consider this example: -We have two tables X and Y. Each table has 20 rows and we copy 10 rows at a time. -(The queries below simplified for readability). +We have two tables `X` and `Y`. Each table has 20 rows and we copy 10 rows at a time. (The queries below simplified +for readability). -The queries for the copy phase of X will be: +The queries for the copy phase of `X` will be: -``` -T1: select * from X where pk > 0 limit 10. GTID: 100, Last PK 10 +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 send rows to target -T2: select * from X where pk > 10 limit 10 GTID: 110, Last PK 20 +T2: select * from X where pk > 10 limit 10; GTID: 110, Last PK 20 send rows to target ``` -There is a gotcha here: onsider that there are 10 new txs between times T1 and T2. Some of these can potentially modify -the rows returned from the query at T1. Hence if we just return the rows from T2 (which have only rows from pk 11 to 20) -we will have an inconsistent state on the target: the updates to rows with PK between 1 and 10 will not be present. +There is a gotcha here: onsider that there are 10 new transactions or GTIDs between times T1 and T2. Some of these +can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have +only rows from PK 11 to 20) then we will have an inconsistent state on the target: the updates to rows with PK +between 1 and 10 will not be present. 
-This means that we need to first stream the events between GTIDs 100 and 110 for primary keys between 1 and 10 first and -then do the second select: +This means that we need to first stream the events between GTIDs 100 and 110 for primary keys between 1 and 10 first +and then do the second select: -``` -T1: select * from X where pk > 0 limit 10. GTID: 100, Last PK 10 +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 send rows to target @@ -112,19 +116,19 @@ T2: replicate from 100 to current position (110 from previous example), only pass events for pks 1 to 10 of X -T3: select * from X where pk > 10 limit 10 GTID: 112, Last PK 20 +T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 send rows to target ``` -Another gotcha!: note that at time T3 when we selected the pks from 11 to 20 the gtid position could have moved further! +Another gotcha: note that at time T3 when we selected the PKs from 11 to 20 the GTID position could have moved further! This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. This leads us to the following flow: -``` -T1: select * from X where pk > 0 limit 10. GTID: 100, Last PK 10 +```sql +T1: select * from X where pk > 0 limit 10; GTID: 100, Last PK 10 send rows to target @@ -132,7 +136,7 @@ T2: replicate from 100 to current position (110 from previous example), only pass events for pks 1 to 10 -T3: select * from X where pk > 10 limit 10 GTID: 112, Last PK 20 +T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 T4: replicate from 111 to 112 @@ -144,20 +148,21 @@ T5: Send rows for pks 11 to 20 to target This flow actually works and is the one used in Vitess VReplication! The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is -quick. 
So the position can diverge much more at T2 than at T4. Hence, we call the step in T2 as Catchup and Step T4 as a -Fast Forward. +quick. So the position can diverge much more at T2 than at T4. Hence, we call step T2 "Catchup" and step T4 +"Fast Forward". #### Catchup -As detailed above the catchup phase runs between two copy phases. During the copy phase the gtid position can move -significantly ahead. So we run a replicate till we come close to the current position i.e.the replication lag is small. -At this point we call Copy again. +As detailed above the catchup phase runs between copy phase cycles (time limited by the +[`vreplication_copy_phase_max_duration`](../../flags/#vreplication_copy_phase_duration) flag). During the copy phase the +GTID position can move significantly ahead. So we run a catchup and fast-forward phase until we come close to the current +position — i.e. the replication lag is small. At that point we execute another Copy cycle. #### Fast forward -During the copy phase we first take a snapshot. Then we fast forward: we run another replicate from the gtid position -where we stopped the Catchup to the position of the snapshot. +During the copy phase we first take a snapshot. Then we fast-forward: we replicate from the gtid position where we stopped +the Catchup to the position of the new snapshot. -Finally once we have finished copying all the tables we proceed to replicate until our job is done: for example if we -have resharded and switched over the reads and writes to the new shards or when the vstream client closes its -connection. +Finally once we have finished copying all the tables we proceed to the replicate or `Running` phase until our job is done: +for example if we have resharded and switched over the reads and writes to the new shards or when the +[`vstream` API](../../vstream/) client closes its connection. 
From 0aa77f8cd4ef7bc447101feafc9376e9cfffab35 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 12:52:02 -0500 Subject: [PATCH 10/16] Keys page Signed-off-by: Matt Lord --- .../reference/vreplication/internal/keys.md | 435 ++++++++++++++++++ .../reference/vreplication/internal/keys.md | 435 ++++++++++++++++++ .../reference/vreplication/internal/keys.md | 435 ++++++++++++++++++ .../reference/vreplication/internal/keys.md | 90 ++-- 4 files changed, 1350 insertions(+), 45 deletions(-) create mode 100644 content/en/docs/13.0/reference/vreplication/internal/keys.md create mode 100644 content/en/docs/14.0/reference/vreplication/internal/keys.md create mode 100644 content/en/docs/15.0/reference/vreplication/internal/keys.md diff --git a/content/en/docs/13.0/reference/vreplication/internal/keys.md b/content/en/docs/13.0/reference/vreplication/internal/keys.md new file mode 100644 index 000000000..ae3bee759 --- /dev/null +++ b/content/en/docs/13.0/reference/vreplication/internal/keys.md @@ -0,0 +1,435 @@ +--- +title: Role of table keys in VReplication +description: Uses and requirements for primary and unique keys in source and target tables in VReplication Workflows +weight: 3 +aliases: ['/docs/design-docs/vreplication/keys/'] +--- + +# The use of unique keys + +A VReplication stream copies data from a table on a source tablet to a table on a target tablet. In some cases, the two +tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing +rows from the source table to the target table, as well as identify binary log events from the source tablet and +apply them to the target table. To that effect, VReplication needs to be able to uniquely identify rows, so that it +can apply a specific `UPDATE` on the correct row, or so that it knows that all rows _up to a given row_ have been copied. + +Thus each row needs to be uniquely identifiable. 
In the relational model, this is trivially done by utilizing `UNIQUE KEY`s,
Consider an OnlineDDL operation that modifies the `PRIMARY KEY` as +follows: `DROP PRIMARY KEY, ADD PRIMARY KEY(col1)`. On the source table, a row is identified by `col1, col2`. On the +target table, a row is only identifiable by `col1`. This scenario still feels comfortable: all we need to do when we +apply e.g. an `UPDATE` statement on the target table is to drop `col2` from the statement: `... WHERE col1=`. + +_Note that it is the user's responsibility to make sure the data will comply with the new constraints. If not, +VReplication will fail the operation._ + +But consider the opposite case, there's a `PRIMARY KEY(col1)` and an OnlineDDL operation turns it into +`PRIMARY KEY(col1, col2)`. Now we need to apply changes using `... WHERE col1= AND col2=`. But `col2` is +not part of the source `PRIMARY KEY`. + +An extreme case is when the keys on the source table and the target table do not share _any columns_ between them. Say +the source table has `PRIMARY KEY(col1)` and the target table has `PRIMARY KEY(col2)` and with no other potential keys. +We still need to identify which row in the source table maps to which row in the target table. VReplication still supports +this scenario. + +Yet another complication is when columns are renamed along the way. Consider an +`ALTER TABLE CHANGE COLUMN col2 col_two INT UNSIGNED ...` statement. A row on the source table is identified by +`col1, col2`, but on the target table it is identified by `col1, col_two`. + +Let's now discuss what the exact requirements are for unique keys, and then discuss the implementation. 
+ +## Requirements + +To be able to create a VReplication stream between the source table and target table: + +- The source table must have a non-`NULL`able `UNIQUE/PRIMARY` key (PK or PKE) whose columns all exist in the target + table (possibly under different names) +- The target table must have a non-`NULL`able `UNIQUE/PRIMARY` key whose columns all exist in the source table (possibly + under different names) +- Except in the trivial case where both tables share the same `PRIMARY KEY` (of the same columns in the same order), + VReplication can automatically determine which keys to utilize (more on this later) + +To clarify, it is **OK** if: + +- Keys in the source table and the target table go by different names +- Chosen key in the source table and chosen key in the target table do not share any columns +- Chosen key in the source table and chosen key in the target table share some or all columns +- Chosen key in the source table and chosen key in the target table share some or all columns, but in a different order +- There are keys in the source table that cover columns not present in the target table +- There are keys in the target table that cover columns not present in the source table +- There are `NULL`able columns in the source and the target table +- There are `NULL`able keys in the source and the target table + +All it takes is _one_ viable key that can be used to uniquely identify rows in the source table, and one such viable key +in the target table to allow VReplication to work. + +### Examples of valid cases + +#### Source table and target table are the same + +```sql +CREATE TABLE `entry` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +The above is a trivial scenario. 
+ +#### Source table and target table share the same PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int, + PRIMARY KEY (`id`), + KEY ts_idx(`ts`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +The differences in structure are interesting but irrelevant to VReplication's ability to copy the data. + +#### Subset PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`, `customer_id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +#### Superset PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`, `customer_id`) +) +``` + +#### Different PRIMARY KEYs + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) +``` + +No columns are shared between the `PRIMARY KEY`s in the above. 
However: + +- `id`, covered by `source`'s PK, is found in `target` +- `uuid`, covered by `target`'s PK, is found in `source` + +#### Mixed keys + +```sql +CREATE TABLE `source` ( + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) + UNIQUE KEY uuid_idx(`uuid`) +) +``` + +The only eligible solution in the above is: + +- Use `source`'s `PRIMARY KEY` (the column `uuid` is found in `target`) +- Use `target`'s `uuid_idx` key (again using column `uuid` which is found in `source`). + +`target`'s `PRIMARY KEY` is not valid because the covered column `id` does not exist in `source`. + +Incidentally, in the above, the chosen keys differ by name, but share the same columns (`uuid`). + +### Examples of invalid cases + +#### NULLable columns + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + UNIQUE KEY (`uuid`) +) +``` + +The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. + +#### Missing columns + +```sql +CREATE TABLE `source` ( + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +`target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. 
+ +## Configuring the stream + +If both source and target table share the same `PRIMARY KEY` (covering the same columns in the same order) then there's +nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. + +In all other cases, VReplication must determine which keys are involved and which ones to use. + +### Example 1 + +Let's begin again as a trivial example, both tables have same `PRIMARY KEY`s: + +```sql +CREATE TABLE `corder` ( + `order_id` bigint NOT NULL AUTO_INCREMENT, + `customer_id` bigint DEFAULT NULL, + `sku` varbinary(128) DEFAULT NULL, + `price` bigint DEFAULT NULL, + PRIMARY KEY (`order_id`) +) +``` + +And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition +(prettified for readability): + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"corder" + filter:"select `order_id` as `order_id`, `customer_id` as `customer_id`, `sku` as `sku`, `price` as `price` from `corder`" + source_unique_key_columns:"order_id" + target_unique_key_columns:"order_id" + source_unique_key_target_columns:"order_id" + } +} +``` + +In the above: + +- `source_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on source table +- `target_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on target table +- `source_unique_key_target_columns` is the (comma delimited) list of column names in target table, which map + to `source_unique_key_columns`. This mapping is necessary because columns may change their names. 
+ +### Example 2 + +Again both the source and the target table share same `PRIMARY KEY`, but this time it covers two columns: + +```sql +CREATE TABLE `shipment` ( + `order_id` int NOT NULL, + `customer_id` int NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + PRIMARY KEY (`order_id`,`customer_id`) +) +``` + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"shipment" + filter:"select `order_id` as `order_id`, `customer_id` as `customer_id`, `ts` as `ts` from `shipment`" + source_unique_key_columns:"order_id,customer_id" + target_unique_key_columns:"order_id,customer_id" + source_unique_key_target_columns:"order_id,customer_id" + } +} +``` + +Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"`. + +### Example 3 + +Continuing the previous example, we now rename a column the target table: + +```sql +CREATE TABLE `shipment` ( + `order_id` int NOT NULL, + `cust_id` int NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + PRIMARY KEY (`order_id`,`cust_id`) +) +``` + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"shipment" + filter:"select `order_id` as `order_id`, `customer_id` as `cust_id`, `ts` as `ts` from `shipment`" + source_unique_key_columns:"order_id,customer_id" + target_unique_key_columns:"order_id,cust_id" + source_unique_key_target_columns:"order_id,cust_id" + } +} +``` + +Note: + +- `source_unique_key_columns` indicates the names of columns on the source table +- `target_unique_key_columns` indicates the names of columns on the target table +- `source_unique_key_target_columns` repeats `source_unique_key_columns`, but replaces `customer_id` with `cust_id` + +## Automation + +OnlineDDL has a mechanism to automatically analyze the differences between source and target tables, evaluate eligible +keys, choose the best keys on source and target tables, and populate the filter's +`source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields. 
Indeed, +OnlineDDL operations are most susceptible to differences in keys. The user can also supply their chosen values as an +override — using those fields in the workflow definition — in the rare case it's needed. + +VReplication more broadly will automatically use the most efficient `PRIMARY KEY` equivalent or PKE (non-`NULL`able unique +key) when there's no defined `PRIMARY KEY` on the table. + +## Implementation + +At a high level, this is how VReplication is able to work with different keys/columns between the source and target. + +Originally, VReplication was only designed to work with identical `PRIMARY KEY`s. If not specified, VReplication assumed +the source table's `PRIMARY KEY` _can be used_ on the target table, and that the target table's `PRIMARY KEY` is applied +to the source table. If not, it would error out and the workflow would fail. + +With the introduction of mechanisms to automatically determine the optimal key to use and of +the `source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields for more +fine-grained control, VReplication changes its behavior as needed. + +#### Notes about the code + +Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology +becomes incorrect. However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses +a `PRIMARY KEY` or pkColumns, etc., it may refer to a non-PK Unique Key (PKE). + +### Streamer + +Streaming is done using the `source_unique_key_columns` value if present. When present, `rowstreamer` trusts the +information in `source_unique_key_columns` to be correct. It does not validate that there is indeed a valid unique key +covering those columns, it only validates that the columns exist. 
When a `source_unique_key_columns` value is not +present, `rowstreamer` uses the `PRIMARY KEY` columns if they exist, otherwise it will determine the best +available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it will use all of the +columns in the table. + +The streamer iterates the table by the chosen index's column order. It then tracks its progress in `lastPk` as if this +was indeed a true `PRIMARY KEY`. + +### Copier + +VCopier receives rows from the `rowstreamer` in the chosen index's column order. It complies with the streamer's ordering. +When tracking progress in `_vt.copy_state` it uses `lastPk` values from the streamer, which means it uses the same index +columns as the streamer in that order. + +### Player + +VPlayer adheres to both `source_unique_key_columns` and `target_unique_key_columns` when present. If not present, again +it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it will determine the best available `PRIMARY KEY` +equivalent if one exists, and lastly if none of these are available it will use all of the columns in the table. + +- `TablePlan`'s `isOutsidePKRange()` function needs to compare values according to `rowstreamer`'s ordering, therefore + uses the chosen index columns in order. +- `tablePlanBuilder`'s `generateWhere()` function uses the target table's `target_unique_key_columns`, and then also + appends any supplemental columns from `source_unique_key_target_columns` not included in `target_unique_key_columns` + when they are present. If not present, again it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it + will determine the best available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it + will use all of the columns in the table. 
diff --git a/content/en/docs/14.0/reference/vreplication/internal/keys.md b/content/en/docs/14.0/reference/vreplication/internal/keys.md new file mode 100644 index 000000000..ae3bee759 --- /dev/null +++ b/content/en/docs/14.0/reference/vreplication/internal/keys.md @@ -0,0 +1,435 @@ +--- +title: Role of table keys in VReplication +description: Uses and requirements for primary and unique keys in source and target tables in VReplication Workflows +weight: 3 +aliases: ['/docs/design-docs/vreplication/keys/'] +--- + +# The use of unique keys + +A VReplication stream copies data from a table on a source tablet to a table on a target tablet. In some cases, the two +tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing +rows from the source table to the target table, as well as identify binary log events from the source tablet and +apply them to the target table. To that effect, VReplication needs to be able to uniquely identify rows, so that it +can apply a specific `UPDATE` on the correct row, or so that it knows that all rows _up to a given row_ have been copied. + +Thus each row needs to be uniquely identifiable. In the relational model, this is trivially done by utilizing `UNIQUE KEY`s, +preferably `PRIMARY KEY`s. A `UNIQUE KEY` made up of non-`NULL`able columns is considered a `PRIMARY KEY` equivalent (PKE) +for this purpose. + +Typically, both the source and the target tables have a similar structure and the same keys. + +In fact, in the most common use case, both tables will have the same `PRIMARY KEY` covering the same set of columns in +the same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, +and it is possible to have different keys on the source and the target table. + +## Which keys are eligible? + +Any `UNIQUE KEY` that is non-`NULL`able potentially qualifies. 
A `NULL`able `UNIQUE KEY` is a key that covers one or +more `NULL`able columns. It doesn't matter if column values do or do not actually contain `NULL`s. If a column is `NULL` +able, then a `UNIQUE KEY` that includes that column is not eligible. + +`PRIMARY KEY`s are by definition always non-`NULL`able. A `PRIMARY KEY` (PK) is typically the best choice. It gives best +iteration/read performance on InnoDB tables, as those are clustered by PK (index organized tables). + +`PRIMARY KEY` aside, `VReplication` prioritizes keys that utilize e.g. integers rather than characters, and more generally +prioritizes smaller data types over larger data types. + +However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described +below. + +## Comparable rows + +VReplication needs to be able to determine, given a row in the source table, which row it maps to in the target table. + +In the case both tables share the same `PRIMARY KEY`, the answer is trivial: given a row from the source table, take the +PK column values (say the table has `PRIMARY KEY(col1, col2)`), and compare with/apply to the target table via +`... WHERE col1= AND col2=`. + +However, other scenarios are also valid. Consider an OnlineDDL operation that modifies the `PRIMARY KEY` as +follows: `DROP PRIMARY KEY, ADD PRIMARY KEY(col1)`. On the source table, a row is identified by `col1, col2`. On the +target table, a row is only identifiable by `col1`. This scenario still feels comfortable: all we need to do when we +apply e.g. an `UPDATE` statement on the target table is to drop `col2` from the statement: `... WHERE col1=`. + +_Note that it is the user's responsibility to make sure the data will comply with the new constraints. If not, +VReplication will fail the operation._ + +But consider the opposite case, there's a `PRIMARY KEY(col1)` and an OnlineDDL operation turns it into +`PRIMARY KEY(col1, col2)`. Now we need to apply changes using `... 
WHERE col1= AND col2=`. But `col2` is +not part of the source `PRIMARY KEY`. + +An extreme case is when the keys on the source table and the target table do not share _any columns_ between them. Say +the source table has `PRIMARY KEY(col1)` and the target table has `PRIMARY KEY(col2)` and with no other potential keys. +We still need to identify which row in the source table maps to which row in the target table. VReplication still supports +this scenario. + +Yet another complication is when columns are renamed along the way. Consider an +`ALTER TABLE CHANGE COLUMN col2 col_two INT UNSIGNED ...` statement. A row on the source table is identified by +`col1, col2`, but on the target table it is identified by `col1, col_two`. + +Let's now discuss what the exact requirements are for unique keys, and then discuss the implementation. + +## Requirements + +To be able to create a VReplication stream between the source table and target table: + +- The source table must have a non-`NULL`able `UNIQUE/PRIMARY` key (PK or PKE) whose columns all exist in the target + table (possibly under different names) +- The target table must have a non-`NULL`able `UNIQUE/PRIMARY` key whose columns all exist in the source table (possibly + under different names) +- Except in the trivial case where both tables share the same `PRIMARY KEY` (of the same columns in the same order), + VReplication can automatically determine which keys to utilize (more on this later) + +To clarify, it is **OK** if: + +- Keys in the source table and the target table go by different names +- Chosen key in the source table and chosen key in the target table do not share any columns +- Chosen key in the source table and chosen key in the target table share some or all columns +- Chosen key in the source table and chosen key in the target table share some or all columns, but in a different order +- There are keys in the source table that cover columns not present in the target table +- There are keys in the target 
table that cover columns not present in the source table +- There are `NULL`able columns in the source and the target table +- There are `NULL`able keys in the source and the target table + +All it takes is _one_ viable key that can be used to uniquely identify rows in the source table, and one such viable key +in the target table to allow VReplication to work. + +### Examples of valid cases + +#### Source table and target table are the same + +```sql +CREATE TABLE `entry` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +The above is a trivial scenario. + +#### Source table and target table share the same PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int, + PRIMARY KEY (`id`), + KEY ts_idx(`ts`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +The differences in structure are interesting but irrelevant to VReplication's ability to copy the data. 
+ +#### Subset PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`, `customer_id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +#### Superset PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`, `customer_id`) +) +``` + +#### Different PRIMARY KEYs + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) +``` + +No columns are shared between the `PRIMARY KEY`s in the above. However: + +- `id`, covered by `source`'s PK, is found in `target` +- `uuid`, covered by `target`'s PK, is found in `source` + +#### Mixed keys + +```sql +CREATE TABLE `source` ( + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY uuid_idx(`uuid`) +) +``` + +The only eligible solution in the above is: + +- Use `source`'s `PRIMARY KEY` (the column `uuid` is found in `target`) + - Use `target`'s `uuid_idx` key (again using column `uuid` which is found in `source`). 
+ +`target`'s `PRIMARY KEY` is not valid because the covered column `id` does not exist in `source`. + +Incidentally, in the above, the chosen keys differ by name, but share the same columns (`uuid`). + +### Examples of invalid cases + +#### NULLable columns + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + UNIQUE KEY (`uuid`) +) +``` + +The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. + +#### Missing columns + +```sql +CREATE TABLE `source` ( + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +`target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. + +## Configuring the stream + +If both source and target table share the same `PRIMARY KEY` (covering the same columns in the same order) then there's +nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. + +In all other cases, VReplication must determine which keys are involved and which ones to use. 
+ +### Example 1 + +Let's begin again with a trivial example, where both tables have the same `PRIMARY KEY`: + +```sql +CREATE TABLE `corder` ( + `order_id` bigint NOT NULL AUTO_INCREMENT, + `customer_id` bigint DEFAULT NULL, + `sku` varbinary(128) DEFAULT NULL, + `price` bigint DEFAULT NULL, + PRIMARY KEY (`order_id`) +) +``` + +And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition +(prettified for readability): + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"corder" + filter:"select `order_id` as `order_id`, `customer_id` as `customer_id`, `sku` as `sku`, `price` as `price` from `corder`" + source_unique_key_columns:"order_id" + target_unique_key_columns:"order_id" + source_unique_key_target_columns:"order_id" + } +} +``` + +In the above: + +- `source_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on source table +- `target_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on target table +- `source_unique_key_target_columns` is the (comma delimited) list of column names in target table, which map + to `source_unique_key_columns`. This mapping is necessary because columns may change their names. 
+ +### Example 2 + +Again both the source and the target table share the same `PRIMARY KEY`, but this time it covers two columns: + +```sql +CREATE TABLE `shipment` ( + `order_id` int NOT NULL, + `customer_id` int NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + PRIMARY KEY (`order_id`,`customer_id`) +) +``` + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"shipment" + filter:"select `order_id` as `order_id`, `customer_id` as `customer_id`, `ts` as `ts` from `shipment`" + source_unique_key_columns:"order_id,customer_id" + target_unique_key_columns:"order_id,customer_id" + source_unique_key_target_columns:"order_id,customer_id" + } +} +``` + +Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"`. + +### Example 3 + +Continuing the previous example, we now rename a column in the target table: + +```sql +CREATE TABLE `shipment` ( + `order_id` int NOT NULL, + `cust_id` int NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + PRIMARY KEY (`order_id`,`cust_id`) +) +``` + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"shipment" + filter:"select `order_id` as `order_id`, `customer_id` as `cust_id`, `ts` as `ts` from `shipment`" + source_unique_key_columns:"order_id,customer_id" + target_unique_key_columns:"order_id,cust_id" + source_unique_key_target_columns:"order_id,cust_id" + } +} +``` + +Note: + +- `source_unique_key_columns` indicates the names of columns on the source table +- `target_unique_key_columns` indicates the names of columns on the target table +- `source_unique_key_target_columns` repeats `source_unique_key_columns`, but replaces `customer_id` with `cust_id` + +## Automation + +OnlineDDL has a mechanism to automatically analyze the differences between source and target tables, evaluate eligible +keys, choose the best keys on source and target tables, and populate the filter's +`source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields. 
Indeed, +OnlineDDL operations are most susceptible to differences in keys. The user can also supply their chosen values as an +override — using those fields in the workflow definition — in the rare case it's needed. + +VReplication more broadly will automatically use the most efficient `PRIMARY KEY` equivalent or PKE (non-`NULL`able unique +key) when there's no defined `PRIMARY KEY` on the table. + +## Implementation + +At a high level, this is how VReplication is able to work with different keys/columns between the source and target. + +Originally, VReplication was only designed to work with identical `PRIMARY KEY`s. If not specified, VReplication assumed +the source table's `PRIMARY KEY` _can be used_ on the target table, and that the target table's `PRIMARY KEY` is applied +to the source table. If not, it would error out and the workflow would fail. + +With the introduction of mechanisms to automatically determine the optimal key to use and of +the `source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields for more +fine-grained control, VReplication changes its behavior as needed. + +#### Notes about the code + +Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology +becomes incorrect. However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses +a `PRIMARY KEY` or pkColumns, etc., it may refer to a non-PK Unique Key (PKE). + +### Streamer + +Streaming is done using the `source_unique_key_columns` value if present. When present, `rowstreamer` trusts the +information in `source_unique_key_columns` to be correct. It does not validate that there is indeed a valid unique key +covering those columns, it only validates that the columns exist. 
When a `source_unique_key_columns` value is not +present, `rowstreamer` uses the `PRIMARY KEY` columns if they exist, otherwise it will determine the best +available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it will use all of the +columns in the table. + +The streamer iterates the table by the chosen index's column order. It then tracks its progress in `lastPk` as if this +was indeed a true `PRIMARY KEY`. + +### Copier + +VCopier receives rows from the `rowstreamer` in the chosen index's column order. It complies with the streamer's ordering. +When tracking progress in `_vt.copy_state` it uses `lastPk` values from the streamer, which means it uses the same index +columns as the streamer in that order. + +### Player + +VPlayer adheres to both `source_unique_key_columns` and `target_unique_key_columns` when present. If not present, again +it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it will determine the best available `PRIMARY KEY` +equivalent if one exists, and lastly if none of these are available it will use all of the columns in the table. + +- `TablePlan`'s `isOutsidePKRange()` function needs to compare values according to `rowstreamer`'s ordering, therefore + uses the chosen index columns in order. +- `tablePlanBuilder`'s `generateWhere()` function uses the target table's `target_unique_key_columns`, and then also + appends any supplemental columns from `source_unique_key_target_columns` not included in `target_unique_key_columns` + when they are present. If not present, again it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it + will determine the best available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it + will use all of the columns in the table. 
diff --git a/content/en/docs/15.0/reference/vreplication/internal/keys.md b/content/en/docs/15.0/reference/vreplication/internal/keys.md new file mode 100644 index 000000000..ae3bee759 --- /dev/null +++ b/content/en/docs/15.0/reference/vreplication/internal/keys.md @@ -0,0 +1,435 @@ +--- +title: Role of table keys in VReplication +description: Uses and requirements for primary and unique keys in source and target tables in VReplication Workflows +weight: 3 +aliases: ['/docs/design-docs/vreplication/keys/'] +--- + +# The use of unique keys + +A VReplication stream copies data from a table on a source tablet to a table on a target tablet. In some cases, the two +tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing +rows from the source table to the target table, as well as identify binary log events from the source tablet and +apply them to the target table. To that effect, VReplication needs to be able to uniquely identify rows, so that it +can apply a specific `UPDATE` on the correct row, or so that it knows that all rows _up to a given row_ have been copied. + +Thus each row needs to be uniquely identifiable. In the relational model, this is trivially done by utilizing `UNIQUE KEY`s, +preferably `PRIMARY KEY`s. A `UNIQUE KEY` made up of non-`NULL`able columns is considered a `PRIMARY KEY` equivalent (PKE) +for this purpose. + +Typically, both the source and the target tables have a similar structure and the same keys. + +In fact, in the most common use case, both tables will have the same `PRIMARY KEY` covering the same set of columns in +the same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, +and it is possible to have different keys on the source and the target table. + +## Which keys are eligible? + +Any `UNIQUE KEY` that is non-`NULL`able potentially qualifies. 
A `NULL`able `UNIQUE KEY` is a key that covers one or +more `NULL`able columns. It doesn't matter if column values do or do not actually contain `NULL`s. If a column is `NULL` +able, then a `UNIQUE KEY` that includes that column is not eligible. + +`PRIMARY KEY`s are by definition always non-`NULL`able. A `PRIMARY KEY` (PK) is typically the best choice. It gives best +iteration/read performance on InnoDB tables, as those are clustered by PK (index organized tables). + +`PRIMARY KEY` aside, `VReplication` prioritizes keys that utilize e.g. integers rather than characters, and more generally +prioritizes smaller data types over larger data types. + +However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described +below. + +## Comparable rows + +VReplication needs to be able to determine, given a row in the source table, which row it maps to in the target table. + +In the case both tables share the same `PRIMARY KEY`, the answer is trivial: given a row from the source table, take the +PK column values (say the table has `PRIMARY KEY(col1, col2)`), and compare with/apply to the target table via +`... WHERE col1= AND col2=`. + +However, other scenarios are also valid. Consider an OnlineDDL operation that modifies the `PRIMARY KEY` as +follows: `DROP PRIMARY KEY, ADD PRIMARY KEY(col1)`. On the source table, a row is identified by `col1, col2`. On the +target table, a row is only identifiable by `col1`. This scenario still feels comfortable: all we need to do when we +apply e.g. an `UPDATE` statement on the target table is to drop `col2` from the statement: `... WHERE col1=`. + +_Note that it is the user's responsibility to make sure the data will comply with the new constraints. If not, +VReplication will fail the operation._ + +But consider the opposite case, there's a `PRIMARY KEY(col1)` and an OnlineDDL operation turns it into +`PRIMARY KEY(col1, col2)`. Now we need to apply changes using `... 
WHERE col1= AND col2=`. But `col2` is +not part of the source `PRIMARY KEY`. + +An extreme case is when the keys on the source table and the target table do not share _any columns_ between them. Say +the source table has `PRIMARY KEY(col1)` and the target table has `PRIMARY KEY(col2)` and with no other potential keys. +We still need to identify which row in the source table maps to which row in the target table. VReplication still supports +this scenario. + +Yet another complication is when columns are renamed along the way. Consider an +`ALTER TABLE CHANGE COLUMN col2 col_two INT UNSIGNED ...` statement. A row on the source table is identified by +`col1, col2`, but on the target table it is identified by `col1, col_two`. + +Let's now discuss what the exact requirements are for unique keys, and then discuss the implementation. + +## Requirements + +To be able to create a VReplication stream between the source table and target table: + +- The source table must have a non-`NULL`able `UNIQUE/PRIMARY` key (PK or PKE) whose columns all exist in the target + table (possibly under different names) +- The target table must have a non-`NULL`able `UNIQUE/PRIMARY` key whose columns all exist in the source table (possibly + under different names) +- Except in the trivial case where both tables share the same `PRIMARY KEY` (of the same columns in the same order), + VReplication can automatically determine which keys to utilize (more on this later) + +To clarify, it is **OK** if: + +- Keys in the source table and the target table go by different names +- Chosen key in the source table and chosen key in the target table do not share any columns +- Chosen key in the source table and chosen key in the target table share some or all columns +- Chosen key in the source table and chosen key in the target table share some or all columns, but in a different order +- There are keys in the source table that cover columns not present in the target table +- There are keys in the target 
table that cover columns not present in the source table +- There are `NULL`able columns in the source and the target table +- There are `NULL`able keys in the source and the target table + +All it takes is _one_ viable key that can be used to uniquely identify rows in the source table, and one such viable key +in the target table to allow VReplication to work. + +### Examples of valid cases + +#### Source table and target table are the same + +```sql +CREATE TABLE `entry` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +The above is a trivial scenario. + +#### Source table and target table share the same PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int, + PRIMARY KEY (`id`), + KEY ts_idx(`ts`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +The differences in structure are interesting but irrelevant to VReplication's ability to copy the data. 
+ +#### Subset PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`, `customer_id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +#### Superset PRIMARY KEY + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`, `customer_id`) +) +``` + +#### Different PRIMARY KEYs + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) +``` + +No columns are shared between the `PRIMARY KEY`s in the above. However: + +- `id`, covered by `source`'s PK, is found in `target` +- `uuid`, covered by `target`'s PK, is found in `source` + +#### Mixed keys + +```sql +CREATE TABLE `source` ( + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`), + UNIQUE KEY uuid_idx(`uuid`) +) +``` + +The only eligible solution in the above is: + +- Use `source`'s `PRIMARY KEY` (the column `uuid` is found in `target`) + - Use `target`'s `uuid_idx` key (again using column `uuid` which is found in `source`). 
+ +`target`'s `PRIMARY KEY` is not valid because the covered column `id` does not exist in `source`. + +Incidentally, in the above, the chosen keys differ by name, but share the same columns (`uuid`). + +### Examples of invalid cases + +#### NULLable columns + +```sql +CREATE TABLE `source` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) DEFAULT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + UNIQUE KEY (`uuid`) +) +``` + +The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. + +#### Missing columns + +```sql +CREATE TABLE `source` ( + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`uuid`) +) + +CREATE TABLE `target` ( + `id` int NOT NULL, + `uuid` varchar(40) NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + `customer_id` int NOT NULL, + PRIMARY KEY (`id`) +) +``` + +`target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. + +## Configuring the stream + +If both source and target table share the same `PRIMARY KEY` (covering the same columns in the same order) then there's +nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. + +In all other cases, VReplication must determine which keys are involved and which ones to use. 
+ +### Example 1 + +Let's begin again with a trivial example, where both tables have the same `PRIMARY KEY`: + +```sql +CREATE TABLE `corder` ( + `order_id` bigint NOT NULL AUTO_INCREMENT, + `customer_id` bigint DEFAULT NULL, + `sku` varbinary(128) DEFAULT NULL, + `price` bigint DEFAULT NULL, + PRIMARY KEY (`order_id`) +) +``` + +And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition +(prettified for readability): + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"corder" + filter:"select `order_id` as `order_id`, `customer_id` as `customer_id`, `sku` as `sku`, `price` as `price` from `corder`" + source_unique_key_columns:"order_id" + target_unique_key_columns:"order_id" + source_unique_key_target_columns:"order_id" + } +} +``` + +In the above: + +- `source_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on source table +- `target_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on target table +- `source_unique_key_target_columns` is the (comma delimited) list of column names in target table, which map + to `source_unique_key_columns`. This mapping is necessary because columns may change their names. 
+ +### Example 2 + +Again both the source and the target table share the same `PRIMARY KEY`, but this time it covers two columns: + +```sql +CREATE TABLE `shipment` ( + `order_id` int NOT NULL, + `customer_id` int NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + PRIMARY KEY (`order_id`,`customer_id`) +) +``` + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"shipment" + filter:"select `order_id` as `order_id`, `customer_id` as `customer_id`, `ts` as `ts` from `shipment`" + source_unique_key_columns:"order_id,customer_id" + target_unique_key_columns:"order_id,customer_id" + source_unique_key_target_columns:"order_id,customer_id" + } +} +``` + +Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"`. + +### Example 3 + +Continuing the previous example, we now rename a column in the target table: + +```sql +CREATE TABLE `shipment` ( + `order_id` int NOT NULL, + `cust_id` int NOT NULL, + `ts` timestamp NULL DEFAULT NULL, + PRIMARY KEY (`order_id`,`cust_id`) +) +``` + +```proto +keyspace:"commerce" shard:"0" filter:{ + rules:{ + match:"shipment" + filter:"select `order_id` as `order_id`, `customer_id` as `cust_id`, `ts` as `ts` from `shipment`" + source_unique_key_columns:"order_id,customer_id" + target_unique_key_columns:"order_id,cust_id" + source_unique_key_target_columns:"order_id,cust_id" + } +} +``` + +Note: + +- `source_unique_key_columns` indicates the names of columns on the source table +- `target_unique_key_columns` indicates the names of columns on the target table +- `source_unique_key_target_columns` repeats `source_unique_key_columns`, but replaces `customer_id` with `cust_id` + +## Automation + +OnlineDDL has a mechanism to automatically analyze the differences between source and target tables, evaluate eligible +keys, choose the best keys on source and target tables, and populate the filter's +`source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields. 
Indeed, +OnlineDDL operations are most susceptible to differences in keys. The user can also supply their chosen values as an +override — using those fields in the workflow definition — in the rare case it's needed. + +VReplication more broadly will automatically use the most efficient `PRIMARY KEY` equivalent or PKE (non-`NULL`able unique +key) when there's no defined `PRIMARY KEY` on the table. + +## Implementation + +At a high level, this is how VReplication is able to work with different keys/columns between the source and target. + +Originally, VReplication was only designed to work with identical `PRIMARY KEY`s. If not specified, VReplication assumed +the source table's `PRIMARY KEY` _can be used_ on the target table, and that the target table's `PRIMARY KEY` is applied +to the source table. If not, it would error out and the workflow would fail. + +With the introduction of mechanisms to automatically determine the optimal key to use and of +the `source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields for more +fine-grained control, VReplication changes its behavior as needed. + +#### Notes about the code + +Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology +becomes incorrect. However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses +a `PRIMARY KEY` or pkColumns, etc., it may refer to a non-PK Unique Key (PKE). + +### Streamer + +Streaming is done using the `source_unique_key_columns` value if present. When present, `rowstreamer` trusts the +information in `source_unique_key_columns` to be correct. It does not validate that there is indeed a valid unique key +covering those columns, it only validates that the columns exist. 
When a `source_unique_key_columns` value is not +present, `rowstreamer` uses the `PRIMARY KEY` columns if they exist, otherwise it will determine the best +available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it will use all of the +columns in the table. + +The streamer iterates the table by the chosen index's column order. It then tracks its progress in `lastPk` as if this +was indeed a true `PRIMARY KEY`. + +### Copier + +VCopier receives rows from the `rowstreamer` in the chosen index's column order. It complies with the streamer's ordering. +When tracking progress in `_vt.copy_state` it uses `lastPk` values from the streamer, which means it uses the same index +columns as the streamer in that order. + +### Player + +VPlayer adheres to both `source_unique_key_columns` and `target_unique_key_columns` when present. If not present, again +it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it will determine the best available `PRIMARY KEY` +equivalent if one exists, and lastly if none of these are available it will use all of the columns in the table. + +- `TablePlan`'s `isOutsidePKRange()` function needs to compare values according to `rowstreamer`'s ordering, therefore + uses the chosen index columns in order. +- `tablePlanBuilder`'s `generateWhere()` function uses the target table's `target_unique_key_columns`, and then also + appends any supplemental columns from `source_unique_key_target_columns` not included in `target_unique_key_columns` + when they are present. If not present, again it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it + will determine the best available `PRIMARY KEY` equivalent if one exists, and lastly if none of these are available it + will use all of the columns in the table. 
diff --git a/content/en/docs/16.0/reference/vreplication/internal/keys.md b/content/en/docs/16.0/reference/vreplication/internal/keys.md index 6dbd1b0a7..ae3bee759 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/16.0/reference/vreplication/internal/keys.md @@ -1,22 +1,23 @@ --- title: Role of table keys in VReplication -description: Uses and requirements for primary and unique keys in source and target table in VReplication Workflows +description: Uses and requirements for primary and unique keys in source and target tables in VReplication Workflows weight: 3 +aliases: ['/docs/design-docs/vreplication/keys/'] --- # The use of unique keys -A VReplication stream copies data from a table on a source target to a table on a target tablet. In some cases, the two +A VReplication stream copies data from a table on a source tablet to a table on a target tablet. In some cases, the two tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing -rows from the source table to the target table, as well as identify binary log events from the source tablet, and +rows from the source table to the target table, as well as identify binary log events from the source tablet and apply them to the target table. To that effect, VReplication needs to be able to uniquely identify rows, so that it -can apply a specific `UPDATE` on the correct row, or so that it knows all rows _up to a given row_ have been copied. +can apply a specific `UPDATE` on the correct row, or so that it knows that all rows _up to a given row_ have been copied. -Each row needs to be uniquely identifiable. In the relational model, this is trivially done by utilizing `UNIQUE KEY`s, -preferably `PRIMARY KEY`s. A `UNIQUE KEY` made up of non-`NULL`able columns is considered a `PRIMARY KEY` equivalent ( -PKE) for this purpose. +Thus each row needs to be uniquely identifiable. 
In the relational model, this is trivially done by utilizing a `UNIQUE KEY`s, +preferably `PRIMARY KEY`s. A `UNIQUE KEY` made up of non-`NULL`able columns is considered a `PRIMARY KEY` equivalent (PKE) +for this purpose. -Typically, both the source and the target table have a similar structure and the same keys. +Typically, both the source and the target tables have a similar structure and the same keys. In fact, in the most common use case, both tables will have the same `PRIMARY KEY` covering the same set of columns in the same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, @@ -29,9 +30,9 @@ more `NULL`able columns. It doesn't matter if column values do or do not actuall able, then a `UNIQUE KEY` that includes that column is not eligible. `PRIMARY KEY`s are by definition always non-`NULL`able. A `PRIMARY KEY` (PK) is typically the best choice. It gives best -iteration/read performance on InnoDB tables, as those are clustered by PK order. +iteration/read performance on InnoDB tables, as those are clustered by PK (index organized tables). -`PRIMARY KEY` aside, `VReplication` prioritizes keys that utilize e.g. integers rather than texts, and more generally +`PRIMARY KEY` aside, `VReplication` prioritizes keys that utilize e.g. integers rather than characters, and more generally prioritizes smaller data types over larger data types. However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described @@ -42,10 +43,10 @@ below. VReplication needs to be able to determine, given a row in the source table, which row it maps to in the target table. In the case both tables share the same `PRIMARY KEY`, the answer is trivial: given a row from the source table, take the -PK column values (say the table has `PRIMARY KEY(col1, col2)`), and compare with/apply to target table -via `... WHERE col1= AND col2=`. 
+PK column values (say the table has `PRIMARY KEY(col1, col2)`), and compare with/apply to the target table via +`... WHERE col1= AND col2=`. -However, other scenarios are valid. Consider an OnlineDDL operation that modifies the `PRIMARY KEY` as +However, other scenarios are also valid. Consider an OnlineDDL operation that modifies the `PRIMARY KEY` as follows: `DROP PRIMARY KEY, ADD PRIMARY KEY(col1)`. On the source table, a row is identified by `col1, col2`. On the target table, a row is only identifiable by `col1`. This scenario still feels comfortable: all we need to do when we apply e.g. an `UPDATE` statement on the target table is to drop `col2` from the statement: `... WHERE col1=`. @@ -53,21 +54,20 @@ apply e.g. an `UPDATE` statement on the target table is to drop `col2` from the _Note that it is the user's responsibility to make sure the data will comply with the new constraints. If not, VReplication will fail the operation._ -But consider the opposite case, there's a `PRIMARY KEY(col1)` and an OnlineDDL operation turns it -into `PRIMARY KEY(col1, col2)`. Now we need to apply changes `... WHERE col1= AND col2=`. But `col2` is not -part of the source `PRIMARY KEY`. +But consider the opposite case, there's a `PRIMARY KEY(col1)` and an OnlineDDL operation turns it into +`PRIMARY KEY(col1, col2)`. Now we need to apply changes using `... WHERE col1= AND col2=`. But `col2` is +not part of the source `PRIMARY KEY`. An extreme case is when the keys on the source table and the target table do not share _any columns_ between them. Say the source table has `PRIMARY KEY(col1)` and the target table has `PRIMARY KEY(col2)` and with no other potential keys. -We still -need to identify which row in the source table maps to which row in the target table. VReplication still supports this -scenario. +We still need to identify which row in the source table maps to which row in the target table. VReplication still supports +this scenario. 
-Yet another complication is when columns are renamed along the way. Consider -an `ALTER TABLE CHANGE COLUMN col2 col_two INT UNSIGNED ...`. A row on the source table is identified by `col1, col2`, -but on the target table it is identified by `col1, col_two`. +Yet another complication is when columns are renamed along the way. Consider an +`ALTER TABLE CHANGE COLUMN col2 col_two INT UNSIGNED ...` statement. A row on the source table is identified by +`col1, col2`, but on the target table it is identified by `col1, col_two`. -Let's now discuss what the exact requirements are for unique keys, and then discuss implementation. +Let's now discuss what the exact requirements are for unique keys, and then discuss the implementation. ## Requirements @@ -78,14 +78,14 @@ To be able to create a VReplication stream between the source table and target t - The target table must have a non-`NULL`able `UNIQUE/PRIMARY` key whose columns all exist in the source table (possibly under different names) - Except in the trivial case where both tables share the same `PRIMARY KEY` (of the same columns in the same order), - VReplication can automatically determine which keys to utilize (more on this later on) + VReplication can automatically determine which keys to utilize (more on this later) To clarify, it is **OK** if: - Keys in the source table and the target table go by different names - Chosen key in the source table and chosen key in the target table do not share any columns - Chosen key in the source table and chosen key in the target table share some or all columns -- Chosen key in the source table and chosen key in the target table share some or all columns but in a different order +- Chosen key in the source table and chosen key in the target table share some or all columns, but in a different order - There are keys in the source table that cover columns not present in the target table - There are keys in the target table that cover columns not present in the source table - There 
are `NULL`able columns in the source and the target table @@ -96,7 +96,7 @@ in the target table to allow VReplication to work. ### Examples of valid cases -#### Source table and target table are the same: +#### Source table and target table are the same ```sql CREATE TABLE `entry` ( @@ -249,7 +249,7 @@ CREATE TABLE `target` ( ) ``` -The only `UNIQUE KEY` on `target` is `NULL`able, hence not eligible. +The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. #### Missing columns @@ -293,10 +293,10 @@ CREATE TABLE `corder` ( ) ``` -And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition ( -prettified for readability): +And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition +(prettified for readability): -``` +```proto keyspace:"commerce" shard:"0" filter:{ rules:{ match:"corder" @@ -328,10 +328,10 @@ CREATE TABLE `shipment` ( ) ``` -``` +```proto keyspace:"commerce" shard:"0" filter:{ rules:{ - match:"_a363f199_f4f8_11eb_a520_0a43f95f28a3_20210804075030_vrepl" + match:"shipment" filter:"select `order_id` as `order_id`, `customer_id` as `customer_id`, `ts` as `ts` from `shipment`" source_unique_key_columns:"order_id,customer_id" target_unique_key_columns:"order_id,customer_id" @@ -340,11 +340,11 @@ keyspace:"commerce" shard:"0" filter:{ } ``` -Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"` +Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"`. 
### Example 3 -Continuing previous example, we now rename a column the target table: +Continuing the previous example, we now rename a column the target table: ```sql CREATE TABLE `shipment` ( @@ -355,10 +355,10 @@ CREATE TABLE `shipment` ( ) ``` -``` +```proto keyspace:"commerce" shard:"0" filter:{ rules:{ - match:"_e285cc54_f369_11eb_afef_0a43f95f28a3_20210802081607_vrepl" + match:"shipment" filter:"select `order_id` as `order_id`, `customer_id` as `cust_id`, `ts` as `ts` from `shipment`" source_unique_key_columns:"order_id,customer_id" target_unique_key_columns:"order_id,cust_id" @@ -376,25 +376,25 @@ Note: ## Automation OnlineDDL has a mechanism to automatically analyze the differences between source and target tables, evaluate eligible -keys, choose the keys on source and target tables, and populate the -filter's `source_unique_key_columns`, `target_unique_key_columns`, `source_unique_key_target_columns` fields. Indeed, +keys, choose the best keys on source and target tables, and populate the filter's +`source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields. Indeed, OnlineDDL operations are most susceptible to differences in keys. The user can also supply their chosen values as an override — using those fields in the workflow definition — in the rare case it's needed. -VReplication more broadly will automatically use the most efficient `PRIMARY KEY` equivalent (non-`NULL`able unique key) -when there's no defined `PRIMARY KEY` on the table. +VReplication more broadly will automatically use the most efficient `PRIMARY KEY` equivalent or PKE (non-`NULL`able unique +key) when there's no defined `PRIMARY KEY` on the table. ## Implementation -At a high level, this is how VReplication is able to work with different keys/columns. +At a high level, this is how VReplication is able to work with different keys/columns between the source and target. 
Originally, VReplication was only designed to work with identical `PRIMARY KEY`s. If not specified, VReplication assumed the source table's `PRIMARY KEY` _can be used_ on the target table, and that the target table's `PRIMARY KEY` is applied to the source table. If not, it would error out and the workflow would fail. With the introduction of mechanisms to automatically determine the optimal key to use and of -the `source_unique_key_columns`, `target_unique_key_columns`, `source_unique_key_target_columns` fields for more -fine-grained control, VReplication changes behavior as follows: +the `source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields for more +fine-grained control, VReplication changes its behavior as needed. #### Notes about the code @@ -416,7 +416,7 @@ was indeed a true `PRIMARY KEY`. ### Copier -VCopier receives rows from the streamer in the chosen index's column order. It complies with the streamer's ordering. +VCopier receives rows from the `rowstreamer` in the chosen index's column order. It complies with the streamer's ordering. When tracking progress in `_vt.copy_state` it uses `lastPk` values from the streamer, which means it uses the same index columns as the streamer in that order. @@ -427,7 +427,7 @@ it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it will de equivalent if one exists, and lastly if none of these are available it will use all of the columns in the table. - `TablePlan`'s `isOutsidePKRange()` function needs to compare values according to `rowstreamer`'s ordering, therefore - uses the chosen index columns in order + uses the chosen index columns in order. - `tablePlanBuilder`'s `generateWhere()` function uses the target table's `target_unique_key_columns`, and then also appends any supplemental columns from `source_unique_key_target_columns` not included in `target_unique_key_columns` when they are present. 
If not present, again it attempts to use the `PRIMARY KEY` columns if they exist, otherwise it From 311fdd8524ea7e2084cb9c46db323add29f57ef8 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 13:18:23 -0500 Subject: [PATCH 11/16] Schema tracker page Signed-off-by: Matt Lord --- .../vreplication/internal/tracker.md | 182 ++++++++++++++++++ .../vreplication/internal/tracker.md | 182 ++++++++++++++++++ .../vreplication/internal/tracker.md | 182 ++++++++++++++++++ .../vreplication/internal/tracker.md | 128 ++++++------ 4 files changed, 615 insertions(+), 59 deletions(-) create mode 100644 content/en/docs/13.0/reference/vreplication/internal/tracker.md create mode 100644 content/en/docs/14.0/reference/vreplication/internal/tracker.md create mode 100644 content/en/docs/15.0/reference/vreplication/internal/tracker.md diff --git a/content/en/docs/13.0/reference/vreplication/internal/tracker.md b/content/en/docs/13.0/reference/vreplication/internal/tracker.md new file mode 100644 index 000000000..08fd12941 --- /dev/null +++ b/content/en/docs/13.0/reference/vreplication/internal/tracker.md @@ -0,0 +1,182 @@ +--- +title: Schema Tracker +description: Tracking schema changes in VStreams +aliases: [] +weight: 4 +aliases: ['/user-guide/update-stream', '/docs/design-docs/vreplication/vstream/tracker/'] +--- + +# Tracking Schema Changes in VStreams + +## Motivation + +Currently, vstreams work with a single (the latest or current) database schema. On every DDL the schema engine reloads the +schema from the database engine. + +All vstreams on a tablet share a common schema engine. VStreams that are lagging can see a more recent schema than when +the older binlog events occurred. So the lagging vstreams will see an incorrect version of the schema in case DDLs were +applied in between that affect the schema of the tables involved in those lagging events. + +In addition, reloading schemas is an expensive operation. 
If there are multiple vstreams then each of them will separately
+receive a DDL event resulting in multiple reloads for the same DDL.
+
+{{< info >}}
+For full functionality, schema tracking relies on these non-default Vitess `vttablet` flags:
+[`--watch_replication_stream`](../../flags/#watch_replication_stream) and
+[`--track_schema_versions`](../../flags/#track_schema_versions). Specifically, performing a vstream from a non-PRIMARY
+tablet while concurrently making DDL changes to the keyspace without one or both of these tablet options could result in
+incorrect vstream results.
+{{< /info >}}
+
+## Goals
+
+1. Provide a mechanism for maintaining versions of the schema
+2. Reduce the number of redundant schema loads
+
+## Model
+
+We add a new `schema_version` table in the internal `_vt` database with columns, including the `GTID` position, the
+schema as of that position, and the DDL that led to this schema. Inserting into this table generates a `version` event
+in the vstream.
+
+## Actors
+
+#### Schema Engine
+
+Schema engine gets the schema from the database and only keeps the last (latest) copy it loaded. It notifies subscribers
+if the schema changes. It polls for the latest schema at intervals or can be explicitly requested to load the schema for
+a tablet using the [`ReloadSchemaKeyspace`](../../../programs/vtctl/schema-version-permissions/#reloadschemakeyspace)
+vtctl client command.
+
+#### Replication Watcher
+
+Replication watcher is a separate vstream that is started by the tabletserver. It notifies subscribers when it encounters
+a DDL in the workflow stream.
+
+#### Version Tracker
+
+Version tracker runs on the `PRIMARY` tablet. It subscribes to the replication watcher and inserts a new row into the
+`_vt.schema_version` table with the latest schema.
+
+#### Version Historian
+
+Version historian runs on both `PRIMARY` and `REPLICA` tablets and handles DDL events. 
For a given `GTID` it looks in its
+cache to check if it has a valid schema for that `GTID`. If not, it looks it up in the `schema_version` table on the `REPLICA`
+tablet. If no schema is found then it provides the latest schema -- which is updated by subscribing to the schema engine’s
+change notification.
+
+### Notes
+
+- Schema Engine is an existing service
+- Replication Watcher is used as an optional vstream that the user can run. It doesn’t do anything user specific: it is only
+used for the side-effect that a vstream loads the schema on a DDL to proactively load the latest schema
+
+## Basic Flow for Version Tracking
+
+### Primary
+
+#### Version Tracker:
+
+1. When the primary comes up the replication watcher (a vstream) is started from the current `GTID` position. The
+tracker subscribes to the watcher.
+1. Say, a DDL is applied
+1. The watcher vstream sees the DDL and
+1. Asks the schema engine to reload the schema, also providing the corresponding `GTID` position
+1. Notifies the tracker of a schema change
+1. Tracker stores its latest schema into the `_vt.schema_version` table associated with the given `GTID` and DDL
+
+#### Historian/VStreams:
+
+1. Historian warms its cache from the `_vt.schema_version` table when it starts
+2. When the tracker inserts the latest schema into `_vt.schema_version` table, the vstream converts it into a (new)
+   version event
+3. For every version event the vstream registers it with the historian
+4. On the version event, the tracker loads the new row from the `_vt.schema_version` table
+5. When a vstream needs a new `TableMap` event it asks the historian for it along with the corresponding `GTID`
+6. Historian looks in its cache for a schema version for that `GTID`. If not present it provides the latest schema it
+   has received from the schema engine
+
+#### Replica
+
+1. Version tracker does not run: the tracker can only store versions on the `PRIMARY` since it requires writing to the
+database
+2. 
Historian functionality is identical to that on the `PRIMARY` + +## Flags + +### Primary + +Schema version snapshots are stored only on the `PRIMARY`. This is done when the Replication Watcher gets a DDL event +resulting in a `SchemaUpdated()` call. There are two independent flows here: + +1. Replication Watcher is running +2. Schema snapshots are saved to `_vt.schema_version` when `SchemaUpdated()` is called + +Point 2 is performed only when the [`--track_schema_versions`](../../flags/#track_schema_versions) `vttablet` flag is enabled. +This implies that #1 also has to happen when [`--track_schema_versions`](../../flags/#track_schema_versions) is enabled +independently of the [`--watch_replication_stream`](../../flags/#watch_replication_stream) flag. + +However if the [`--watch_replication_stream`](../../flags/#watch_replication_stream) flag is enabled but +[`--track_schema_versions`](../../flags/#track_schema_versions) is disabled we still need to run the Replication +Watcher since the user has requested it, but we do not store any schema versions. + +So the logic is: + +1. WatchReplication==true \ + => Replication Watcher is running + +2. TrackSchemaVersions==false + => SchemaUpdated is a noop + +3. TrackSchemaVersions=true + => Replication Watcher is running \ + => SchemaUpdated is handled + +The historian behavior is identical to that of the replica: of course if versions are not stored in `_vt.schema_versions` +it will always provide the latest version of the schema. + +### Replica + +Schema versions are never stored directly on `REPLICA` tablets, so SchemaUpdated is always a noop. Versions are provided +as appropriate by the historian. The historian provides the latest schema if there is no appropriate version. + +So the logic is: + +1. WatchReplication==true \ + => Replication Watcher is running + +2. 
TrackSchemaVersions==false || true //noop \ + => Historian tries to get appropriate schema version + +## Caveat + +Only best-effort versioning can be provided due to races between DDLs and DMLs. Some examples below: + +### Situation 1 + +If multiple DDLs are applied in a quick sequence we can end up with the following binlog scenario: + +```text +T1: DDL 1 on table1 + +T2: DDL 2 on table1 + +T3: Version Event DDL1 // gets written because of the time taken by tracker processing DDL1 + +T4: DML1 on table1 + +T5: Version Event DDL2 // gets written AFTER DML1 +``` + +So now on the `REPLICA`, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. + +### Situation 2 + +If version tracking is turned off on the `PRIMARY` for some time, correct versions may not be available to the historian +which will always return the latest schema. This might result in an incorrect schema when a vstream is processing events +in the past. + +#### Possible new features around this functionality + +- Schema tracking vstream client for notifications of all ddls +- Raw history of schema changes for auditing, root cause analysis, etc. diff --git a/content/en/docs/14.0/reference/vreplication/internal/tracker.md b/content/en/docs/14.0/reference/vreplication/internal/tracker.md new file mode 100644 index 000000000..08fd12941 --- /dev/null +++ b/content/en/docs/14.0/reference/vreplication/internal/tracker.md @@ -0,0 +1,182 @@ +--- +title: Schema Tracker +description: Tracking schema changes in VStreams +aliases: [] +weight: 4 +aliases: ['/user-guide/update-stream', '/docs/design-docs/vreplication/vstream/tracker/'] +--- + +# Tracking Schema Changes in VStreams + +## Motivation + +Currently, vstreams work with a single (the latest or current) database schema. On every DDL the schema engine reloads the +schema from the database engine. + +All vstreams on a tablet share a common schema engine. 
VStreams that are lagging can see a more recent schema than when
+the older binlog events occurred. So the lagging vstreams will see an incorrect version of the schema in case DDLs were
+applied in between that affect the schema of the tables involved in those lagging events.
+
+In addition, reloading schemas is an expensive operation. If there are multiple vstreams then each of them will separately
+receive a DDL event resulting in multiple reloads for the same DDL.
+
+{{< info >}}
+For full functionality, schema tracking relies on these non-default Vitess `vttablet` flags:
+[`--watch_replication_stream`](../../flags/#watch_replication_stream) and
+[`--track_schema_versions`](../../flags/#track_schema_versions). Specifically, performing a vstream from a non-PRIMARY
+tablet while concurrently making DDL changes to the keyspace without one or both of these tablet options could result in
+incorrect vstream results.
+{{< /info >}}
+
+## Goals
+
+1. Provide a mechanism for maintaining versions of the schema
+2. Reduce the number of redundant schema loads
+
+## Model
+
+We add a new `schema_version` table in the internal `_vt` database with columns, including the `GTID` position, the
+schema as of that position, and the DDL that led to this schema. Inserting into this table generates a `version` event
+in the vstream.
+
+## Actors
+
+#### Schema Engine
+
+Schema engine gets the schema from the database and only keeps the last (latest) copy it loaded. It notifies subscribers
+if the schema changes. It polls for the latest schema at intervals or can be explicitly requested to load the schema for
+a tablet using the [`ReloadSchemaKeyspace`](../../../programs/vtctl/schema-version-permissions/#reloadschemakeyspace)
+vtctl client command.
+
+#### Replication Watcher
+
+Replication watcher is a separate vstream that is started by the tabletserver. It notifies subscribers when it encounters
+a DDL in the workflow stream.
+
+#### Version Tracker
+
+Version tracker runs on the `PRIMARY` tablet. It subscribes to the replication watcher and inserts a new row into the
+`_vt.schema_version` table with the latest schema.
+
+#### Version Historian
+
+Version historian runs on both `PRIMARY` and `REPLICA` tablets and handles DDL events. For a given `GTID` it looks in its
+cache to check if it has a valid schema for that `GTID`. If not, it looks it up in the `schema_version` table on the `REPLICA`
+tablet. If no schema is found then it provides the latest schema -- which is updated by subscribing to the schema engine’s
+change notification.
+
+### Notes
+
+- Schema Engine is an existing service
+- Replication Watcher is used as an optional vstream that the user can run. It doesn’t do anything user specific: it is only
+used for the side-effect that a vstream loads the schema on a DDL to proactively load the latest schema
+
+## Basic Flow for Version Tracking
+
+### Primary
+
+#### Version Tracker:
+
+1. When the primary comes up the replication watcher (a vstream) is started from the current `GTID` position. The
+tracker subscribes to the watcher.
+1. Say, a DDL is applied
+1. The watcher vstream sees the DDL and
+1. Asks the schema engine to reload the schema, also providing the corresponding `GTID` position
+1. Notifies the tracker of a schema change
+1. Tracker stores its latest schema into the `_vt.schema_version` table associated with the given `GTID` and DDL
+
+#### Historian/VStreams:
+
+1. Historian warms its cache from the `_vt.schema_version` table when it starts
+2. When the tracker inserts the latest schema into `_vt.schema_version` table, the vstream converts it into a (new)
+   version event
+3. For every version event the vstream registers it with the historian
+4. On the version event, the tracker loads the new row from the `_vt.schema_version` table
+5. When a vstream needs a new `TableMap` event it asks the historian for it along with the corresponding `GTID`
+6. 
Historian looks in its cache for a schema version for that `GTID`. If not present it provides the latest schema it + has received from the schema engine + +#### Replica + +1. Version tracker does not run: the tracker can only store versions on the `PRIMARY` since it requires writing to the +database +2. Historian functionality is identical to that on the `PRIMARY` + +## Flags + +### Primary + +Schema version snapshots are stored only on the `PRIMARY`. This is done when the Replication Watcher gets a DDL event +resulting in a `SchemaUpdated()` call. There are two independent flows here: + +1. Replication Watcher is running +2. Schema snapshots are saved to `_vt.schema_version` when `SchemaUpdated()` is called + +Point 2 is performed only when the [`--track_schema_versions`](../../flags/#track_schema_versions) `vttablet` flag is enabled. +This implies that #1 also has to happen when [`--track_schema_versions`](../../flags/#track_schema_versions) is enabled +independently of the [`--watch_replication_stream`](../../flags/#watch_replication_stream) flag. + +However if the [`--watch_replication_stream`](../../flags/#watch_replication_stream) flag is enabled but +[`--track_schema_versions`](../../flags/#track_schema_versions) is disabled we still need to run the Replication +Watcher since the user has requested it, but we do not store any schema versions. + +So the logic is: + +1. WatchReplication==true \ + => Replication Watcher is running + +2. TrackSchemaVersions==false + => SchemaUpdated is a noop + +3. TrackSchemaVersions=true + => Replication Watcher is running \ + => SchemaUpdated is handled + +The historian behavior is identical to that of the replica: of course if versions are not stored in `_vt.schema_versions` +it will always provide the latest version of the schema. + +### Replica + +Schema versions are never stored directly on `REPLICA` tablets, so SchemaUpdated is always a noop. Versions are provided +as appropriate by the historian. 
The historian provides the latest schema if there is no appropriate version. + +So the logic is: + +1. WatchReplication==true \ + => Replication Watcher is running + +2. TrackSchemaVersions==false || true //noop \ + => Historian tries to get appropriate schema version + +## Caveat + +Only best-effort versioning can be provided due to races between DDLs and DMLs. Some examples below: + +### Situation 1 + +If multiple DDLs are applied in a quick sequence we can end up with the following binlog scenario: + +```text +T1: DDL 1 on table1 + +T2: DDL 2 on table1 + +T3: Version Event DDL1 // gets written because of the time taken by tracker processing DDL1 + +T4: DML1 on table1 + +T5: Version Event DDL2 // gets written AFTER DML1 +``` + +So now on the `REPLICA`, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. + +### Situation 2 + +If version tracking is turned off on the `PRIMARY` for some time, correct versions may not be available to the historian +which will always return the latest schema. This might result in an incorrect schema when a vstream is processing events +in the past. + +#### Possible new features around this functionality + +- Schema tracking vstream client for notifications of all ddls +- Raw history of schema changes for auditing, root cause analysis, etc. diff --git a/content/en/docs/15.0/reference/vreplication/internal/tracker.md b/content/en/docs/15.0/reference/vreplication/internal/tracker.md new file mode 100644 index 000000000..08fd12941 --- /dev/null +++ b/content/en/docs/15.0/reference/vreplication/internal/tracker.md @@ -0,0 +1,182 @@ +--- +title: Schema Tracker +description: Tracking schema changes in VStreams +aliases: [] +weight: 4 +aliases: ['/user-guide/update-stream', '/docs/design-docs/vreplication/vstream/tracker/'] +--- + +# Tracking Schema Changes in VStreams + +## Motivation + +Currently, vstreams work with a single (the latest or current) database schema. 
On every DDL the schema engine reloads the +schema from the database engine. + +All vstreams on a tablet share a common schema engine. VStreams that are lagging can see a more recent schema than when +the older binlog events occurred. So the lagging vstreams will see an incorrect version of the schema in case DDLs were +applied in between that affect the schema of the tables involved in those lagging events. + +In addition, reloading schemas is an expensive operation. If there are multiple vstreams then each of them will separately +receive a DDL event resulting in multiple reloads for the same DDL. + +{{< info >}} +For full functionality, schema tracking relies on these non-default Vitess `vttablet` flags: +[`--watch_replication_stream`](../../flags/#watch_replication_stream) and +[`--track_schema_versions`](../../flags/#track_schema_versions). Specifically, performing a vstream from a non-PRIMARY +tablet while concurrently making DDL changes to the keyspace without one or both of these tablet options could result in +incorrect vstream results. +{{< /info >}} + +## Goals + +1. Provide a mechanism for maintaining versions of the schema +2. Reduce the number of redundant schema loads + +## Model + +We add a new `schema_version` table in the internal `_vt` database with columns including the `GTID` position, the +schema as of that position, and the DDL that led to this schema. Inserting into this table generates a `version` event +in the vstream. + +## Actors + +#### Schema Engine + +Schema engine gets the schema from the database and only keeps the last (latest) copy it loaded. It notifies subscribers +if the schema changes. It polls for the latest schema at intervals or can be explicitly requested to load the schema for +a tablet using the [`ReloadSchemaKeyspace`](../../../programs/vtctl/schema-version-permissions/#reloadschemakeyspace) +vtctl client command.
+ +#### Replication Watcher + +Replication watcher is a separate vstream that is started by the tabletserver. It notifies subscribers when it encounters +a DDL in the workflow stream. + +#### Version Tracker + +Version tracker runs on the `PRIMARY` tablet. It subscribes to the replication watcher and inserts a new row into the +`_vt.schema_version` table with the latest schema. + +#### Version Historian + +Version historian runs on both `PRIMARY` and `REPLICA` tablets and handles DDL events. For a given `GTID` it looks in its +cache to check if it has a valid schema for that `GTID`. If not, it looks in the `schema_version` table on the `REPLICA` +tablet. If no schema is found then it provides the latest schema -- which is updated by subscribing to the schema engine’s +change notification. + +### Notes + +- Schema Engine is an existing service +- Replication Watcher is used as an optional vstream that the user can run. It doesn’t do anything user-specific: it is only +used for the side-effect that a vstream loads the schema on a DDL to proactively load the latest schema + +## Basic Flow for Version Tracking + +### Primary + +#### Version Tracker: + +1. When the primary comes up the replication watcher (a vstream) is started from the current `GTID` position. The +tracker subscribes to the watcher. +1. Say, a DDL is applied +1. The watcher vstream sees the DDL and +1. Asks the schema engine to reload the schema, also providing the corresponding `GTID` position +1. Notifies the tracker of a schema change +1. Tracker stores its latest schema into the `_vt.schema_version` table associated with the given `GTID` and DDL + +#### Historian/VStreams: + +1. Historian warms its cache from the `_vt.schema_version` table when it starts +2. When the tracker inserts the latest schema into `_vt.schema_version` table, the vstream converts it into a (new) + version event +3. For every version event the vstream registers it with the historian +4.
On the version event, the tracker loads the new row from the `_vt.schema_version` table +5. When a vstream needs a new `TableMap` event it asks the historian for it along with the corresponding `GTID` +6. Historian looks in its cache for a schema version for that `GTID`. If not present it provides the latest schema it + has received from the schema engine + +#### Replica + +1. Version tracker does not run: the tracker can only store versions on the `PRIMARY` since it requires writing to the +database +2. Historian functionality is identical to that on the `PRIMARY` + +## Flags + +### Primary + +Schema version snapshots are stored only on the `PRIMARY`. This is done when the Replication Watcher gets a DDL event +resulting in a `SchemaUpdated()` call. There are two independent flows here: + +1. Replication Watcher is running +2. Schema snapshots are saved to `_vt.schema_version` when `SchemaUpdated()` is called + +Point 2 is performed only when the [`--track_schema_versions`](../../flags/#track_schema_versions) `vttablet` flag is enabled. +This implies that #1 also has to happen when [`--track_schema_versions`](../../flags/#track_schema_versions) is enabled +independently of the [`--watch_replication_stream`](../../flags/#watch_replication_stream) flag. + +However if the [`--watch_replication_stream`](../../flags/#watch_replication_stream) flag is enabled but +[`--track_schema_versions`](../../flags/#track_schema_versions) is disabled we still need to run the Replication +Watcher since the user has requested it, but we do not store any schema versions. + +So the logic is: + +1. WatchReplication==true \ + => Replication Watcher is running + +2. TrackSchemaVersions==false + => SchemaUpdated is a noop + +3. 
TrackSchemaVersions==true + => Replication Watcher is running \ + => SchemaUpdated is handled + +The historian behavior is identical to that of the replica: of course if versions are not stored in `_vt.schema_version` +it will always provide the latest version of the schema. + +### Replica + +Schema versions are never stored directly on `REPLICA` tablets, so SchemaUpdated is always a noop. Versions are provided +as appropriate by the historian. The historian provides the latest schema if there is no appropriate version. + +So the logic is: + +1. WatchReplication==true \ + => Replication Watcher is running + +2. TrackSchemaVersions==false || true //noop \ + => Historian tries to get appropriate schema version + +## Caveat + +Only best-effort versioning can be provided due to races between DDLs and DMLs. Some examples below: + +### Situation 1 + +If multiple DDLs are applied in a quick sequence we can end up with the following binlog scenario: + +```text +T1: DDL 1 on table1 + +T2: DDL 2 on table1 + +T3: Version Event DDL1 // gets written because of the time taken by tracker processing DDL1 + +T4: DML1 on table1 + +T5: Version Event DDL2 // gets written AFTER DML1 +``` + +So now on the `REPLICA`, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. + +### Situation 2 + +If version tracking is turned off on the `PRIMARY` for some time, correct versions may not be available to the historian +which will always return the latest schema. This might result in an incorrect schema when a vstream is processing events +in the past. + +#### Possible new features around this functionality + +- Schema tracking vstream client for notifications of all ddls +- Raw history of schema changes for auditing, root cause analysis, etc.
diff --git a/content/en/docs/16.0/reference/vreplication/internal/tracker.md b/content/en/docs/16.0/reference/vreplication/internal/tracker.md index 8d761f26d..08fd12941 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/16.0/reference/vreplication/internal/tracker.md @@ -1,28 +1,31 @@ --- title: Schema Tracker -description: Tracking schema changes in Vstreams -aliases: ["/user-guide/update-stream"] +description: Tracking schema changes in VStreams +aliases: [] weight: 4 +aliases: ['/user-guide/update-stream', '/docs/design-docs/vreplication/vstream/tracker/'] --- -# Tracking schema changes in Vstreams +# Tracking Schema Changes in VStreams ## Motivation -Currently, Vstreams work with a single (the latest) database schema. On every DDL the schema engine reloads the schema -from the database engine. +Currently, vstreams work with a single (the latest or current) database schema. On every DDL the schema engine reloads the +schema from the database engine. -All Vstreams on a tablet share a common schema engine. Vstreams that are lagging can see a more recent schema than when -the older binlog events occurred. So the lagging vstreams will see an incorrect version of the schema in case ddls were +All vstreams on a tablet share a common schema engine. VStreams that are lagging can see a more recent schema than when +the older binlog events occurred. So the lagging vstreams will see an incorrect version of the schema in case DDLs were applied in between that affect the schema of the tables involved in those lagging events. -In addition, reloading schemas is an expensive operation. If there are multiple Vstreams each of them will separately +In addition, reloading schemas is an expensive operation. If there are multiple vstreams them each of them will separately receive a DDL event resulting in multiple reloads for the same DDL. 
{{< info >}} -For full functionality, schema tracking relies on non-default Vitess vttablet options: `-watch_replication_stream` -and `-track_schema_versions`. Specifically, performing a Vstream from a non-primary tablet while concurrently making DDL -changes to the keyspace without one or both of these tablet options could result in incorrect Vstream results. +For full functionality, schema tracking relies on these non-default Vitess `vttablet` flags: +[`--watch_replication_stream`](../../flags/#watch_replication_stream) and +[`--track_schema_versions`](../../flags/#track_schema_versions). Specifically, performing a vstream from a non-PRIMARY +tablet while concurrently making DDL changes to the keyspace without one or both of these tablet options could result in +incorrect vstream results. {{< /info >}} ## Goals @@ -32,8 +35,9 @@ changes to the keyspace without one or both of these tablet options could result ## Model -We add a new schema_version table in \_vt with columns, including, the gtid position, the schema as of that position, -and the ddl that led to this schema. Inserting into this table generates a Version event in Vstream. +We add a new `schema_version` table in the internal `_vt` database with columns, including, the `GTID` position, the +schema as of that position, and the DDL that led to this schema. Inserting into this table generates a `version` event +in the vstream. ## Actors @@ -41,76 +45,80 @@ and the ddl that led to this schema. Inserting into this table generates a Versi Schema engine gets the schema from the database and only keeps the last (latest) copy it loaded. It notifies subscribers if the schema changes. It polls for the latest schema at intervals or can be explicitly requested to load the schema for -a tablet using `vtctl`'s `ReloadSchema`. +a tablet using the [`ReloadSchemaKeyspace`](../../../programs/vtctl/schema-version-permissions/#reloadschemakeyspace) +vtctl client command. 
-#### Replication watcher +#### Replication Watcher -Replication watcher is a Vstream that is started by the tabletserver. It notifies subscribers when it encounters a DDL +Replication watcher is a separate vstream that is started by the tabletserver. It notifies subscribers when it encounters +a DDL in the workflow stream. #### Version Tracker -Version tracker runs on the primary. It subscribes to the replication watcher and inserts a new row into the -\_vt.schema_version table with the latest schema. +Version tracker runs on the `PRIMARY` tablet. It subscribes to the replication watcher and inserts a new row into the +`_vt.schema_version` table with the latest schema. #### Version Historian -Version historian runs on both primary and replica and handles DDL events. For a given GTID it looks up its cache to -check if it has a schema valid for that GTID. If not, on the replica, it looks up the schema_version table. If no schema -is found then it provides the latest schema -- which is updated by subscribing to the schema engine’s change -notification. +Version historian runs on both `PRIMARY` and `REPLICA` tablets and handles DDL events. For a given `GTID` it looks in its +cache to check if it has a valid schema for that `GTID`. If not, it looks up the in the `schema_version` table on `REPLICA` +tablet. If no schema is found then it provides the latest schema -- which is updated by subscribing to the schema engine’s +change notification. ### Notes - Schema Engine is an existing service -- Replication Watcher is used as an optional Vstream that the user can run. It doesn’t do anything - specific: it is used for the side-effect that a Vstream loads the schema on a DDL, to proactively load the latest - schema. +- Replication Watcher is used as an optional vstream that the user can run. 
It doesn’t do anything user specific: it is only +used for the side-effect that a vstream loads the schema on a DDL to proactively load the latest schema -## Basic Flow for version tracking +## Basic Flow for Version Tracking ### Primary -#### Version tracker: +#### Version Tracker: -1. When the primary comes up the replication watcher (a Vstream) is started from the current GTID position. The Tracker - subscribes to the watcher. +1. When the primary comes up the replication watcher (a vstream) is started from the current `GTID` position. The +tracker subscribes to the watcher. 1. Say, a DDL is applied -1. The watcher Vstream sees the DDL and -1. Asks the schema engine to reload the schema, also providing the corresponding gtid position +1. The watcher vstream sees the DDL and +1. Asks the schema engine to reload the schema, also providing the corresponding `GTID` position 1. Notifies the tracker of a schema change -1. Tracker stores its latest schema into the \_vt.schema_version table associated with the given GTID and DDL +1. Tracker stores its latest schema into the `_vt.schema_version` table associated with the given `GTID` and DDL -#### Historian/Vstreams: +#### Historian/VStreams: -1. Historian warms its cache from the schema_version table when it loads -2. When the tracker inserts the latest schema into \_vt.schema_version table, the Vstream converts it into a (new) - Version event -3. For every Version event the Vstream registers it with the Historian -4. On the Version event, the tracker loads the new row from the \_vt.schema_version table -5. When a Vstream needs a new TableMap it asks the Historian for it along with the corresponding GTID. -6. Historian looks up its cache for a schema version for that GTID. If not present just provides the latest schema it - has received from the schema engine. +1. Historian warms its cache from the `_vt.schema_version` table when it starts +2. 
When the tracker inserts the latest schema into `_vt.schema_version` table, the vstream converts it into a (new) + version event +3. For every version event the vstream registers it with the historian +4. On the version event, the tracker loads the new row from the `_vt.schema_version` table +5. When a vstream needs a new `TableMap` event it asks the historian for it along with the corresponding `GTID` +6. Historian looks in its cache for a schema version for that `GTID`. If not present it provides the latest schema it + has received from the schema engine #### Replica -1. Version tracker does not run: the tracker can only store versions on the primary since it is writing to the database. -2. Historian functionality is identical to that on the primary. +1. Version tracker does not run: the tracker can only store versions on the `PRIMARY` since it requires writing to the +database +2. Historian functionality is identical to that on the `PRIMARY` ## Flags ### Primary -Schema version snapshots are stored only on the primary. This is done when the Replication Watcher gets a DDL event -resulting in a SchemaUpdated(). There are two independent flows here: +Schema version snapshots are stored only on the `PRIMARY`. This is done when the Replication Watcher gets a DDL event +resulting in a `SchemaUpdated()` call. There are two independent flows here: 1. Replication Watcher is running -2. Schema snapshots are saved to \_vt.schema_version when SchemaUpdated is called +2. Schema snapshots are saved to `_vt.schema_version` when `SchemaUpdated()` is called -Point 2 is performed only when the flag TrackSchemaVersions is enabled. This implies that #1 also has to happen when -TrackSchemaVersions is enabled independently of the WatchReplication flag +Point 2 is performed only when the [`--track_schema_versions`](../../flags/#track_schema_versions) `vttablet` flag is enabled. 
+This implies that #1 also has to happen when [`--track_schema_versions`](../../flags/#track_schema_versions) is enabled +independently of the [`--watch_replication_stream`](../../flags/#watch_replication_stream) flag. -However if the WatchReplication flag is enabled but TrackSchemaVersions is disabled we still need to run the Replication -Watcher since the user has requested it, but we should not store schema versions. +However if the [`--watch_replication_stream`](../../flags/#watch_replication_stream) flag is enabled but +[`--track_schema_versions`](../../flags/#track_schema_versions) is disabled we still need to run the Replication +Watcher since the user has requested it, but we do not store any schema versions. So the logic is: @@ -124,13 +132,13 @@ So the logic is: => Replication Watcher is running \ => SchemaUpdated is handled -The Historian behavior is identical to that of the replica: of course if versions are not stored in \_vt.schema_versions -it will always provide the latest version of the scheme. +The historian behavior is identical to that of the replica: of course if versions are not stored in `_vt.schema_versions` +it will always provide the latest version of the schema. ### Replica -Schema versions are never stored on replicas, so SchemaUpdated is always a Noop. Versions are provided as appropriate by -the historian. The historian provides the latest schema if there is no appropriate version. +Schema versions are never stored directly on `REPLICA` tablets, so SchemaUpdated is always a noop. Versions are provided +as appropriate by the historian. The historian provides the latest schema if there is no appropriate version. So the logic is: @@ -146,8 +154,9 @@ Only best-effort versioning can be provided due to races between DDLs and DMLs. ### Situation 1 -If multiple DDLs are applied in a quick sequence we can end up with the following binlog. 
+If multiple DDLs are applied in a quick sequence we can end up with the following binlog scenario: +```text T1: DDL 1 on table1 T2: DDL 2 on table1 @@ -157,16 +166,17 @@ T3: Version Event DDL1 // gets written because of the time taken by tracker proc T4: DML1 on table1 T5: Version Event DDL2 // gets written AFTER DML1 +``` -So now on the replica, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. +So now on the `REPLICA`, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. ### Situation 2 -If version tracking is turned off on the primary for some time, correct versions may not be available to the historian -which will always return the latest schema. This might result in an incorrect schema when a Vstream is processing events +If version tracking is turned off on the `PRIMARY` for some time, correct versions may not be available to the historian +which will always return the latest schema. This might result in an incorrect schema when a vstream is processing events in the past. #### Possible new features around this functionality -- Schema tracking Vstream client for notifications of all ddls +- Schema tracking vstream client for notifications of all ddls - Raw history of schema changes for auditing, root cause analysis, etc. 
From c5dd3177319bb4e6226c98767083736c947164c4 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 13:31:13 -0500 Subject: [PATCH 12/16] Title capitalization Signed-off-by: Matt Lord --- .../vreplication/internal/cutover.md | 26 +++++++++---------- .../reference/vreplication/internal/keys.md | 20 +++++++------- .../vreplication/internal/life-of-a-stream.md | 8 +++--- .../vreplication/internal/tracker.md | 2 +- .../vreplication/internal/cutover.md | 26 +++++++++---------- .../reference/vreplication/internal/keys.md | 20 +++++++------- .../vreplication/internal/life-of-a-stream.md | 8 +++--- .../vreplication/internal/tracker.md | 2 +- .../vreplication/internal/cutover.md | 26 +++++++++---------- .../reference/vreplication/internal/keys.md | 20 +++++++------- .../vreplication/internal/life-of-a-stream.md | 8 +++--- .../vreplication/internal/tracker.md | 2 +- .../vreplication/internal/cutover.md | 26 +++++++++---------- .../reference/vreplication/internal/keys.md | 20 +++++++------- .../vreplication/internal/life-of-a-stream.md | 8 +++--- .../vreplication/internal/tracker.md | 2 +- 16 files changed, 112 insertions(+), 112 deletions(-) diff --git a/content/en/docs/13.0/reference/vreplication/internal/cutover.md b/content/en/docs/13.0/reference/vreplication/internal/cutover.md index f733a5e3c..376ffc7dc 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/13.0/reference/vreplication/internal/cutover.md @@ -1,11 +1,11 @@ --- -title: How traffic is switched +title: How Traffic Is Switched description: How Vitess signals traffic cutover for Reshard and MoveTables weight: 2 aliases: ['/docs/design-docs/vreplication/cutover/'] --- -# Related persistent Vitess objects +# Related Persistent Vitess Objects {{< info >}} As the objects or keys noted below are stored in [the topo server](../../../features/topology-service/) and @@ -76,7 +76,7 @@ dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions 
vtctl client command which will return the rules for all keyspaces in the topo. {{< /info >}} -# How VTGate routes a query +# How VTGate Routes a Query This section walks through a simplified version of the logic used to determine which keyspace and table vtgate will route a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` @@ -93,7 +93,7 @@ and selecting the ones whose `query_service_disabled` field is *not* set and who [`VSchema`](../../../features/vschema/) (stored in the `global` topo), the shard for the relevant row is computed based on the keyrange to which the id is mapped to using the declared [`VIndex` function/type](../../../features/vindexes/#predefined-vindexes). -# Changes made to the topo when traffic is switched +# Changes Made to the Topo When Traffic Is Switched This document outlines the steps involved in the cutover process of [`MoveTables`](../../movetables/) and [`Reshard`](../../reshard/) @@ -105,11 +105,11 @@ in the workflow. Items in italics are topo keys and the following snippet the value of the key {{< /info >}} -## What happens when a Reshard is cutover +## What Happens When a Reshard Is Cutover For brevity we only show the records for the `80-` shard. There will be similar records for the `-80` shard. -#### Before Resharding, after -80/80- shards are created +#### Before Resharding, After -80/80- Shards Are Created Only shard `0` has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to `0` for both `PRIMARY` and `REPLICA` tablet types. @@ -137,7 +137,7 @@ partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} ``` -### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) +### After Replica Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchReads) Shard `0` still has the `is_primary_serving` set as true. 
The primary partition is still the same. @@ -178,7 +178,7 @@ partitions:{served_type:REPLICA } ``` -#### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) +#### After Primary Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchWrites) * `is_primary_serving` is removed from shard `0` * `is_primary_serving` is added to shards `-80` and `80-` @@ -220,9 +220,9 @@ partitions:{served_type:REPLICA } ``` -## What happens when a MoveTables workflow is cutover +## What Happens When a MoveTables Workflow Is Cutover -#### Before MoveTables is initiated +#### Before MoveTables Is Initiated The [`VSchema`](../../../features/vschema/) for the source keyspace contains the table name, so vtgate routes queries to that keyspace. @@ -241,7 +241,7 @@ rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} ``` -#### On switching replica traffic to target +#### On Switching Replica Traffic to Target The routing rules for replica targeted reads are updated to map the table on the source to the target. @@ -253,7 +253,7 @@ rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} ``` -#### On switching primary traffic +#### On Switching Primary Traffic The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the tables are added to the “denylist” on the source keyspace which `vttablet` uses to reject queries for these tables on the @@ -277,7 +277,7 @@ tablet_controls:{tablet_type:PRIMARY denylisted_tables:"customer"} is_primary_serving:true ``` -# Miscellaneous Notes: +# Miscellaneous Notes * In VReplication workflows, cutovers are performed manually by the user executing the `SwitchTraffic` and `ReverseTraffic` actions e.g. 
for a [`MoveTables`](../../movetables/#switchtraffic) or [`Reshard`](../../reshard/#reversetraffic) vtctl diff --git a/content/en/docs/13.0/reference/vreplication/internal/keys.md b/content/en/docs/13.0/reference/vreplication/internal/keys.md index ae3bee759..0be41810e 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/13.0/reference/vreplication/internal/keys.md @@ -1,11 +1,11 @@ --- -title: Role of table keys in VReplication +title: Role of Table Keys in VReplication description: Uses and requirements for primary and unique keys in source and target tables in VReplication Workflows weight: 3 aliases: ['/docs/design-docs/vreplication/keys/'] --- -# The use of unique keys +# The Use of Unique Keys A VReplication stream copies data from a table on a source tablet to a table on a target tablet. In some cases, the two tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing @@ -23,7 +23,7 @@ In fact, in the most common use case, both tables will have the same `PRIMARY KE the same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, and it is possible to have different keys on the source and the target table. -## Which keys are eligible? +## Which Keys Are Eligible? Any `UNIQUE KEY` that is non-`NULL`able potentially qualifies. A `NULL`able `UNIQUE KEY` is a key that covers one or more `NULL`able columns. It doesn't matter if column values do or do not actually contain `NULL`s. If a column is `NULL` @@ -38,7 +38,7 @@ prioritizes smaller data types over larger data types. However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described below. -## Comparable rows +## Comparable Rows VReplication needs to be able to determine, given a row in the source table, which row it maps to in the target table. 
@@ -94,9 +94,9 @@ To clarify, it is **OK** if: All it takes is _one_ viable key that can be used to uniquely identify rows in the source table, and one such viable key in the target table to allow VReplication to work. -### Examples of valid cases +### Examples of Valid Cases -#### Source table and target table are the same +#### Source Table and Target Table Are the Same ```sql CREATE TABLE `entry` ( @@ -110,7 +110,7 @@ CREATE TABLE `entry` ( The above is a trivial scenario. -#### Source table and target table share the same PRIMARY KEY +#### Source Table and Target Table Share the Same PRIMARY KEY ```sql CREATE TABLE `source` ( @@ -227,7 +227,7 @@ The only eligible solution in the above is: Incidentally, in the above, the chosen keys differ by name, but share the same columns (`uuid`). -### Examples of invalid cases +### Examples of Invalid Cases #### NULLable columns @@ -272,7 +272,7 @@ CREATE TABLE `target` ( `target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. -## Configuring the stream +## Configuring the Stream If both source and target table share the same `PRIMARY KEY` (covering the same columns in the same order) then there's nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. @@ -396,7 +396,7 @@ With the introduction of mechanisms to automatically determine the optimal key t the `source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields for more fine-grained control, VReplication changes its behavior as needed. -#### Notes about the code +#### Notes About the Code Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology becomes incorrect.
However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses diff --git a/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md index 1d58d9e97..5648fc2fe 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md @@ -1,5 +1,5 @@ --- -title: Life of a stream +title: Life of a Stream description: How VReplication replicates data weight: 1 aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] @@ -18,7 +18,7 @@ streams events from the binlog. ![VReplication Flow](/img/VReplicationFlow.png) -#### Full table copy +#### Full Table Copy If the entire table data is requested then the simple streaming done by the _replication_ mode can create an avalanche of events (think 100s of millions of rows). Moreover, and more importantly, it is highly likely that necesasry older @@ -54,7 +54,7 @@ not able to process them in time. For example, in resharding workflows we need t statements and execute them on the target's mysqld instance, which are usually much slower than just selecting data on the source. -### Modes, in detail +### Modes, in Detail #### Replicate @@ -158,7 +158,7 @@ As detailed above the catchup phase runs between copy phase cycles (time limited GTID position can move significantly ahead. So we run a catchup and fast-forward phase until we come close to the current position — i.e. the replication lag is small. At that point we execute another Copy cycle. -#### Fast forward +#### Fast Forward During the copy phase we first take a snapshot. Then we fast-forward: we replicate from the gtid position where we stopped the Catchup to the position of the new snapshot. 
diff --git a/content/en/docs/13.0/reference/vreplication/internal/tracker.md b/content/en/docs/13.0/reference/vreplication/internal/tracker.md index 08fd12941..4a9a73478 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/13.0/reference/vreplication/internal/tracker.md @@ -176,7 +176,7 @@ If version tracking is turned off on the `PRIMARY` for some time, correct versio which will always return the latest schema. This might result in an incorrect schema when a vstream is processing events in the past. -#### Possible new features around this functionality +#### Possible New Features Around This Functionality - Schema tracking vstream client for notifications of all ddls - Raw history of schema changes for auditing, root cause analysis, etc. diff --git a/content/en/docs/14.0/reference/vreplication/internal/cutover.md b/content/en/docs/14.0/reference/vreplication/internal/cutover.md index f733a5e3c..376ffc7dc 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/14.0/reference/vreplication/internal/cutover.md @@ -1,11 +1,11 @@ --- -title: How traffic is switched +title: How Traffic Is Switched description: How Vitess signals traffic cutover for Reshard and MoveTables weight: 2 aliases: ['/docs/design-docs/vreplication/cutover/'] --- -# Related persistent Vitess objects +# Related Persistent Vitess Objects {{< info >}} As the objects or keys noted below are stored in [the topo server](../../../features/topology-service/) and @@ -76,7 +76,7 @@ dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions vtctl client command which will return the rules for all keyspaces in the topo. 
{{< /info >}} -# How VTGate routes a query +# How VTGate Routes a Query This section walks through a simplified version of the logic used to determine which keyspace and table vtgate will route a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` @@ -93,7 +93,7 @@ and selecting the ones whose `query_service_disabled` field is *not* set and who [`VSchema`](../../../features/vschema/) (stored in the `global` topo), the shard for the relevant row is computed based on the keyrange to which the id is mapped to using the declared [`VIndex` function/type](../../../features/vindexes/#predefined-vindexes). -# Changes made to the topo when traffic is switched +# Changes Made to the Topo When Traffic Is Switched This document outlines the steps involved in the cutover process of [`MoveTables`](../../movetables/) and [`Reshard`](../../reshard/) @@ -105,11 +105,11 @@ in the workflow. Items in italics are topo keys and the following snippet the value of the key {{< /info >}} -## What happens when a Reshard is cutover +## What Happens When a Reshard Is Cutover For brevity we only show the records for the `80-` shard. There will be similar records for the `-80` shard. -#### Before Resharding, after -80/80- shards are created +#### Before Resharding, After -80/80- Shards Are Created Only shard `0` has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to `0` for both `PRIMARY` and `REPLICA` tablet types. @@ -137,7 +137,7 @@ partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} ``` -### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) +### After Replica Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchReads) Shard `0` still has the `is_primary_serving` set as true. The primary partition is still the same. 
@@ -178,7 +178,7 @@ partitions:{served_type:REPLICA } ``` -#### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) +#### After Primary Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchWrites) * `is_primary_serving` is removed from shard `0` * `is_primary_serving` is added to shards `-80` and `80-` @@ -220,9 +220,9 @@ partitions:{served_type:REPLICA } ``` -## What happens when a MoveTables workflow is cutover +## What Happens When a MoveTables Workflow Is Cutover -#### Before MoveTables is initiated +#### Before MoveTables Is Initiated The [`VSchema`](../../../features/vschema/) for the source keyspace contains the table name, so vtgate routes queries to that keyspace. @@ -241,7 +241,7 @@ rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} ``` -#### On switching replica traffic to target +#### On Switching Replica Traffic to Target The routing rules for replica targeted reads are updated to map the table on the source to the target. @@ -253,7 +253,7 @@ rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} ``` -#### On switching primary traffic +#### On Switching Primary Traffic The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the tables are added to the “denylist” on the source keyspace which `vttablet` uses to reject queries for these tables on the @@ -277,7 +277,7 @@ tablet_controls:{tablet_type:PRIMARY denylisted_tables:"customer"} is_primary_serving:true ``` -# Miscellaneous Notes: +# Miscellaneous Notes * In VReplication workflows, cutovers are performed manually by the user executing the `SwitchTraffic` and `ReverseTraffic` actions e.g. 
for a [`MoveTables`](../../movetables/#switchtraffic) or [`Reshard`](../../reshard/#reversetraffic) vtctl diff --git a/content/en/docs/14.0/reference/vreplication/internal/keys.md b/content/en/docs/14.0/reference/vreplication/internal/keys.md index ae3bee759..0be41810e 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/14.0/reference/vreplication/internal/keys.md @@ -1,11 +1,11 @@ --- -title: Role of table keys in VReplication +title: Role of Table Keys in VReplication description: Uses and requirements for primary and unique keys in source and target tables in VReplication Workflows weight: 3 aliases: ['/docs/design-docs/vreplication/keys/'] --- -# The use of unique keys +# The Use of Unique Keys A VReplication stream copies data from a table on a source tablet to a table on a target tablet. In some cases, the two tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing @@ -23,7 +23,7 @@ In fact, in the most common use case, both tables will have the same `PRIMARY KE the same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, and it is possible to have different keys on the source and the target table. -## Which keys are eligible? +## Which Keys Are Eligible? Any `UNIQUE KEY` that is non-`NULL`able potentially qualifies. A `NULL`able `UNIQUE KEY` is a key that covers one or more `NULL`able columns. It doesn't matter if column values do or do not actually contain `NULL`s. If a column is `NULL` @@ -38,7 +38,7 @@ prioritizes smaller data types over larger data types. However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described below. -## Comparable rows +## Comparable Rows VReplication needs to be able to determine, given a row in the source table, which row it maps to in the target table. 
@@ -94,9 +94,9 @@ To clarify, it is **OK** if: All it takes is _one_ viable key that can be used to uniquely identify rows in the source table, and one such viable key in the target table to allow VReplication to work. -### Examples of valid cases +### Examples of Valid Cases -#### Source table and target table are the same +#### Source Table and Target Table Are the Same ```sql CREATE TABLE `entry` ( @@ -110,7 +110,7 @@ CREATE TABLE `entry` ( The above is a trivial scenario. -#### Source table and target table share the same PRIMARY KEY +#### Source Table and Target Table Share the Same PRIMARY KEY ```sql CREATE TABLE `source` ( @@ -227,7 +227,7 @@ The only eligible solution in the above is: Incidentally, in the above, the chosen keys differ by name, but share the same columns (`uuid`). -### Examples of invalid cases +### Examples of Invalid Cases #### NULLable columns @@ -272,7 +272,7 @@ CREATE TABLE `target` ( `target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. -## Configuring the stream +## Configuring the Stream If both source and target table share the same `PRIMARY KEY` (covering the same columns in the same order) then there's nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. @@ -396,7 +396,7 @@ With the introduction of mechanisms to automatically determine the optimal key t the `source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields for more fine-grained control, VReplication changes its behavior as needed. -#### Notes about the code +#### Notes About the Code Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology becomes incorrect. 
However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses diff --git a/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md index 1d58d9e97..5648fc2fe 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md @@ -1,5 +1,5 @@ --- -title: Life of a stream +title: Life of a Stream description: How VReplication replicates data weight: 1 aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] @@ -18,7 +18,7 @@ streams events from the binlog. ![VReplication Flow](/img/VReplicationFlow.png) -#### Full table copy +#### Full Table Copy If the entire table data is requested then the simple streaming done by the _replication_ mode can create an avalanche of events (think 100s of millions of rows). Moreover, and more importantly, it is highly likely that necesasry older @@ -54,7 +54,7 @@ not able to process them in time. For example, in resharding workflows we need t statements and execute them on the target's mysqld instance, which are usually much slower than just selecting data on the source. -### Modes, in detail +### Modes, in Detail #### Replicate @@ -158,7 +158,7 @@ As detailed above the catchup phase runs between copy phase cycles (time limited GTID position can move significantly ahead. So we run a catchup and fast-forward phase until we come close to the current position — i.e. the replication lag is small. At that point we execute another Copy cycle. -#### Fast forward +#### Fast Forward During the copy phase we first take a snapshot. Then we fast-forward: we replicate from the gtid position where we stopped the Catchup to the position of the new snapshot. 
diff --git a/content/en/docs/14.0/reference/vreplication/internal/tracker.md b/content/en/docs/14.0/reference/vreplication/internal/tracker.md index 08fd12941..4a9a73478 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/14.0/reference/vreplication/internal/tracker.md @@ -176,7 +176,7 @@ If version tracking is turned off on the `PRIMARY` for some time, correct versio which will always return the latest schema. This might result in an incorrect schema when a vstream is processing events in the past. -#### Possible new features around this functionality +#### Possible New Features Around This Functionality - Schema tracking vstream client for notifications of all ddls - Raw history of schema changes for auditing, root cause analysis, etc. diff --git a/content/en/docs/15.0/reference/vreplication/internal/cutover.md b/content/en/docs/15.0/reference/vreplication/internal/cutover.md index f733a5e3c..376ffc7dc 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/15.0/reference/vreplication/internal/cutover.md @@ -1,11 +1,11 @@ --- -title: How traffic is switched +title: How Traffic Is Switched description: How Vitess signals traffic cutover for Reshard and MoveTables weight: 2 aliases: ['/docs/design-docs/vreplication/cutover/'] --- -# Related persistent Vitess objects +# Related Persistent Vitess Objects {{< info >}} As the objects or keys noted below are stored in [the topo server](../../../features/topology-service/) and @@ -76,7 +76,7 @@ dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions vtctl client command which will return the rules for all keyspaces in the topo. 
{{< /info >}} -# How VTGate routes a query +# How VTGate Routes a Query This section walks through a simplified version of the logic used to determine which keyspace and table vtgate will route a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` @@ -93,7 +93,7 @@ and selecting the ones whose `query_service_disabled` field is *not* set and who [`VSchema`](../../../features/vschema/) (stored in the `global` topo), the shard for the relevant row is computed based on the keyrange to which the id is mapped to using the declared [`VIndex` function/type](../../../features/vindexes/#predefined-vindexes). -# Changes made to the topo when traffic is switched +# Changes Made to the Topo When Traffic Is Switched This document outlines the steps involved in the cutover process of [`MoveTables`](../../movetables/) and [`Reshard`](../../reshard/) @@ -105,11 +105,11 @@ in the workflow. Items in italics are topo keys and the following snippet the value of the key {{< /info >}} -## What happens when a Reshard is cutover +## What Happens When a Reshard Is Cutover For brevity we only show the records for the `80-` shard. There will be similar records for the `-80` shard. -#### Before Resharding, after -80/80- shards are created +#### Before Resharding, After -80/80- Shards Are Created Only shard `0` has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to `0` for both `PRIMARY` and `REPLICA` tablet types. @@ -137,7 +137,7 @@ partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} ``` -### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) +### After Replica Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchReads) Shard `0` still has the `is_primary_serving` set as true. The primary partition is still the same. 
@@ -178,7 +178,7 @@ partitions:{served_type:REPLICA } ``` -#### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) +#### After Primary Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchWrites) * `is_primary_serving` is removed from shard `0` * `is_primary_serving` is added to shards `-80` and `80-` @@ -220,9 +220,9 @@ partitions:{served_type:REPLICA } ``` -## What happens when a MoveTables workflow is cutover +## What Happens When a MoveTables Workflow Is Cutover -#### Before MoveTables is initiated +#### Before MoveTables Is Initiated The [`VSchema`](../../../features/vschema/) for the source keyspace contains the table name, so vtgate routes queries to that keyspace. @@ -241,7 +241,7 @@ rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} ``` -#### On switching replica traffic to target +#### On Switching Replica Traffic to Target The routing rules for replica targeted reads are updated to map the table on the source to the target. @@ -253,7 +253,7 @@ rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} ``` -#### On switching primary traffic +#### On Switching Primary Traffic The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the tables are added to the “denylist” on the source keyspace which `vttablet` uses to reject queries for these tables on the @@ -277,7 +277,7 @@ tablet_controls:{tablet_type:PRIMARY denylisted_tables:"customer"} is_primary_serving:true ``` -# Miscellaneous Notes: +# Miscellaneous Notes * In VReplication workflows, cutovers are performed manually by the user executing the `SwitchTraffic` and `ReverseTraffic` actions e.g. 
for a [`MoveTables`](../../movetables/#switchtraffic) or [`Reshard`](../../reshard/#reversetraffic) vtctl diff --git a/content/en/docs/15.0/reference/vreplication/internal/keys.md b/content/en/docs/15.0/reference/vreplication/internal/keys.md index ae3bee759..0be41810e 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/15.0/reference/vreplication/internal/keys.md @@ -1,11 +1,11 @@ --- -title: Role of table keys in VReplication +title: Role of Table Keys in VReplication description: Uses and requirements for primary and unique keys in source and target tables in VReplication Workflows weight: 3 aliases: ['/docs/design-docs/vreplication/keys/'] --- -# The use of unique keys +# The Use of Unique Keys A VReplication stream copies data from a table on a source tablet to a table on a target tablet. In some cases, the two tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing @@ -23,7 +23,7 @@ In fact, in the most common use case, both tables will have the same `PRIMARY KE the same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, and it is possible to have different keys on the source and the target table. -## Which keys are eligible? +## Which Keys Are Eligible? Any `UNIQUE KEY` that is non-`NULL`able potentially qualifies. A `NULL`able `UNIQUE KEY` is a key that covers one or more `NULL`able columns. It doesn't matter if column values do or do not actually contain `NULL`s. If a column is `NULL` @@ -38,7 +38,7 @@ prioritizes smaller data types over larger data types. However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described below. -## Comparable rows +## Comparable Rows VReplication needs to be able to determine, given a row in the source table, which row it maps to in the target table. 
@@ -94,9 +94,9 @@ To clarify, it is **OK** if: All it takes is _one_ viable key that can be used to uniquely identify rows in the source table, and one such viable key in the target table to allow VReplication to work. -### Examples of valid cases +### Examples of Valid Cases -#### Source table and target table are the same +#### Source Table and Target Table Are the Same ```sql CREATE TABLE `entry` ( @@ -110,7 +110,7 @@ CREATE TABLE `entry` ( The above is a trivial scenario. -#### Source table and target table share the same PRIMARY KEY +#### Source Table and Target Table Share the Same PRIMARY KEY ```sql CREATE TABLE `source` ( @@ -227,7 +227,7 @@ The only eligible solution in the above is: Incidentally, in the above, the chosen keys differ by name, but share the same columns (`uuid`). -### Examples of invalid cases +### Examples of Invalid Cases #### NULLable columns @@ -272,7 +272,7 @@ CREATE TABLE `target` ( `target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. -## Configuring the stream +## Configuring the Stream If both source and target table share the same `PRIMARY KEY` (covering the same columns in the same order) then there's nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. @@ -396,7 +396,7 @@ With the introduction of mechanisms to automatically determine the optimal key t the `source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields for more fine-grained control, VReplication changes its behavior as needed. -#### Notes about the code +#### Notes About the Code Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology becomes incorrect. 
However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses diff --git a/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md index 1d58d9e97..5648fc2fe 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md @@ -1,5 +1,5 @@ --- -title: Life of a stream +title: Life of a Stream description: How VReplication replicates data weight: 1 aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] @@ -18,7 +18,7 @@ streams events from the binlog. ![VReplication Flow](/img/VReplicationFlow.png) -#### Full table copy +#### Full Table Copy If the entire table data is requested then the simple streaming done by the _replication_ mode can create an avalanche of events (think 100s of millions of rows). Moreover, and more importantly, it is highly likely that necesasry older @@ -54,7 +54,7 @@ not able to process them in time. For example, in resharding workflows we need t statements and execute them on the target's mysqld instance, which are usually much slower than just selecting data on the source. -### Modes, in detail +### Modes, in Detail #### Replicate @@ -158,7 +158,7 @@ As detailed above the catchup phase runs between copy phase cycles (time limited GTID position can move significantly ahead. So we run a catchup and fast-forward phase until we come close to the current position — i.e. the replication lag is small. At that point we execute another Copy cycle. -#### Fast forward +#### Fast Forward During the copy phase we first take a snapshot. Then we fast-forward: we replicate from the gtid position where we stopped the Catchup to the position of the new snapshot. 
diff --git a/content/en/docs/15.0/reference/vreplication/internal/tracker.md b/content/en/docs/15.0/reference/vreplication/internal/tracker.md index 08fd12941..4a9a73478 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/15.0/reference/vreplication/internal/tracker.md @@ -176,7 +176,7 @@ If version tracking is turned off on the `PRIMARY` for some time, correct versio which will always return the latest schema. This might result in an incorrect schema when a vstream is processing events in the past. -#### Possible new features around this functionality +#### Possible New Features Around This Functionality - Schema tracking vstream client for notifications of all ddls - Raw history of schema changes for auditing, root cause analysis, etc. diff --git a/content/en/docs/16.0/reference/vreplication/internal/cutover.md b/content/en/docs/16.0/reference/vreplication/internal/cutover.md index f733a5e3c..376ffc7dc 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/16.0/reference/vreplication/internal/cutover.md @@ -1,11 +1,11 @@ --- -title: How traffic is switched +title: How Traffic Is Switched description: How Vitess signals traffic cutover for Reshard and MoveTables weight: 2 aliases: ['/docs/design-docs/vreplication/cutover/'] --- -# Related persistent Vitess objects +# Related Persistent Vitess Objects {{< info >}} As the objects or keys noted below are stored in [the topo server](../../../features/topology-service/) and @@ -76,7 +76,7 @@ dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions vtctl client command which will return the rules for all keyspaces in the topo. 
{{< /info >}} -# How VTGate routes a query +# How VTGate Routes a Query This section walks through a simplified version of the logic used to determine which keyspace and table vtgate will route a simple query of the form `select * from t1 where id = 1` (a _read_ query) or `insert into t1 (id, val) values (1,'abc')` @@ -93,7 +93,7 @@ and selecting the ones whose `query_service_disabled` field is *not* set and who [`VSchema`](../../../features/vschema/) (stored in the `global` topo), the shard for the relevant row is computed based on the keyrange to which the id is mapped to using the declared [`VIndex` function/type](../../../features/vindexes/#predefined-vindexes). -# Changes made to the topo when traffic is switched +# Changes Made to the Topo When Traffic Is Switched This document outlines the steps involved in the cutover process of [`MoveTables`](../../movetables/) and [`Reshard`](../../reshard/) @@ -105,11 +105,11 @@ in the workflow. Items in italics are topo keys and the following snippet the value of the key {{< /info >}} -## What happens when a Reshard is cutover +## What Happens When a Reshard Is Cutover For brevity we only show the records for the `80-` shard. There will be similar records for the `-80` shard. -#### Before Resharding, after -80/80- shards are created +#### Before Resharding, After -80/80- Shards Are Created Only shard `0` has `is_primary_serving` set to true. The `SrvKeyspace` record only has references to `0` for both `PRIMARY` and `REPLICA` tablet types. @@ -137,7 +137,7 @@ partitions:{served_type:PRIMARY shard_references:{name:"0"}} partitions:{served_type:REPLICA shard_references:{name:"0"}} ``` -### After replica traffic is switched using `SwitchTraffic` (previously known as SwitchReads) +### After Replica Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchReads) Shard `0` still has the `is_primary_serving` set as true. The primary partition is still the same. 
@@ -178,7 +178,7 @@ partitions:{served_type:REPLICA } ``` -#### After primary traffic is switched using `SwitchTraffic` (previously known as SwitchWrites) +#### After Primary Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchWrites) * `is_primary_serving` is removed from shard `0` * `is_primary_serving` is added to shards `-80` and `80-` @@ -220,9 +220,9 @@ partitions:{served_type:REPLICA } ``` -## What happens when a MoveTables workflow is cutover +## What Happens When a MoveTables Workflow Is Cutover -#### Before MoveTables is initiated +#### Before MoveTables Is Initiated The [`VSchema`](../../../features/vschema/) for the source keyspace contains the table name, so vtgate routes queries to that keyspace. @@ -241,7 +241,7 @@ rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} ``` -#### On switching replica traffic to target +#### On Switching Replica Traffic to Target The routing rules for replica targeted reads are updated to map the table on the source to the target. @@ -253,7 +253,7 @@ rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} ``` -#### On switching primary traffic +#### On Switching Primary Traffic The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the tables are added to the “denylist” on the source keyspace which `vttablet` uses to reject queries for these tables on the @@ -277,7 +277,7 @@ tablet_controls:{tablet_type:PRIMARY denylisted_tables:"customer"} is_primary_serving:true ``` -# Miscellaneous Notes: +# Miscellaneous Notes * In VReplication workflows, cutovers are performed manually by the user executing the `SwitchTraffic` and `ReverseTraffic` actions e.g. 
for a [`MoveTables`](../../movetables/#switchtraffic) or [`Reshard`](../../reshard/#reversetraffic) vtctl diff --git a/content/en/docs/16.0/reference/vreplication/internal/keys.md b/content/en/docs/16.0/reference/vreplication/internal/keys.md index ae3bee759..0be41810e 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/16.0/reference/vreplication/internal/keys.md @@ -1,11 +1,11 @@ --- -title: Role of table keys in VReplication +title: Role of Table Keys in VReplication description: Uses and requirements for primary and unique keys in source and target tables in VReplication Workflows weight: 3 aliases: ['/docs/design-docs/vreplication/keys/'] --- -# The use of unique keys +# The Use of Unique Keys A VReplication stream copies data from a table on a source tablet to a table on a target tablet. In some cases, the two tablets may be the same one, but the stream is oblivious to such nuance. VReplication needs to be able to copy existing @@ -23,7 +23,7 @@ In fact, in the most common use case, both tables will have the same `PRIMARY KE the same order. This is the default assumption and expectation by VReplication. But this doesn't have to be the case, and it is possible to have different keys on the source and the target table. -## Which keys are eligible? +## Which Keys Are Eligible? Any `UNIQUE KEY` that is non-`NULL`able potentially qualifies. A `NULL`able `UNIQUE KEY` is a key that covers one or more `NULL`able columns. It doesn't matter if column values do or do not actually contain `NULL`s. If a column is `NULL` @@ -38,7 +38,7 @@ prioritizes smaller data types over larger data types. However, not all eligible `UNIQUE KEY`s, or even `PRIMARY KEY`s are usable for all VReplication streams, as described below. -## Comparable rows +## Comparable Rows VReplication needs to be able to determine, given a row in the source table, which row it maps to in the target table. 
@@ -94,9 +94,9 @@ To clarify, it is **OK** if: All it takes is _one_ viable key that can be used to uniquely identify rows in the source table, and one such viable key in the target table to allow VReplication to work. -### Examples of valid cases +### Examples of Valid Cases -#### Source table and target table are the same +#### Source Table and Target Table Are the Same ```sql CREATE TABLE `entry` ( @@ -110,7 +110,7 @@ CREATE TABLE `entry` ( The above is a trivial scenario. -#### Source table and target table share the same PRIMARY KEY +#### Source Table and Target Table Share the Same PRIMARY KEY ```sql CREATE TABLE `source` ( @@ -227,7 +227,7 @@ The only eligible solution in the above is: Incidentally, in the above, the chosen keys differ by name, but share the same columns (`uuid`). -### Examples of invalid cases +### Examples of Invalid Cases #### NULLable columns @@ -272,7 +272,7 @@ CREATE TABLE `target` ( `target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. -## Configuring the stream +## Configuring the Stream If both source and target table share the same `PRIMARY KEY` (covering the same columns in the same order) then there's nothing to be done. VReplication will pick `PRIMARY KEY` on both ends by default. @@ -396,7 +396,7 @@ With the introduction of mechanisms to automatically determine the optimal key t the `source_unique_key_columns`, `target_unique_key_columns`, and `source_unique_key_target_columns` fields for more fine-grained control, VReplication changes its behavior as needed. -#### Notes about the code +#### Notes About the Code Much of the code uses "PK" terminology. With the introduction of _any_ unique key utilization the "PK" terminology becomes incorrect. 
However, to avoid mass rewrites we kept this terminology, and wherever VReplication discusses diff --git a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md index 1d58d9e97..5648fc2fe 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md @@ -1,5 +1,5 @@ --- -title: Life of a stream +title: Life of a Stream description: How VReplication replicates data weight: 1 aliases: ['/docs/design-docs/vreplication/life-of-a-stream/'] @@ -18,7 +18,7 @@ streams events from the binlog. ![VReplication Flow](/img/VReplicationFlow.png) -#### Full table copy +#### Full Table Copy If the entire table data is requested then the simple streaming done by the _replication_ mode can create an avalanche of events (think 100s of millions of rows). Moreover, and more importantly, it is highly likely that necesasry older @@ -54,7 +54,7 @@ not able to process them in time. For example, in resharding workflows we need t statements and execute them on the target's mysqld instance, which are usually much slower than just selecting data on the source. -### Modes, in detail +### Modes, in Detail #### Replicate @@ -158,7 +158,7 @@ As detailed above the catchup phase runs between copy phase cycles (time limited GTID position can move significantly ahead. So we run a catchup and fast-forward phase until we come close to the current position — i.e. the replication lag is small. At that point we execute another Copy cycle. -#### Fast forward +#### Fast Forward During the copy phase we first take a snapshot. Then we fast-forward: we replicate from the gtid position where we stopped the Catchup to the position of the new snapshot. 
diff --git a/content/en/docs/16.0/reference/vreplication/internal/tracker.md b/content/en/docs/16.0/reference/vreplication/internal/tracker.md index 08fd12941..4a9a73478 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/16.0/reference/vreplication/internal/tracker.md @@ -176,7 +176,7 @@ If version tracking is turned off on the `PRIMARY` for some time, correct versio which will always return the latest schema. This might result in an incorrect schema when a vstream is processing events in the past. -#### Possible new features around this functionality +#### Possible New Features Around This Functionality - Schema tracking vstream client for notifications of all ddls - Raw history of schema changes for auditing, root cause analysis, etc. From 347c8356997388e6f82b9eff342788720c605fb1 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 14:04:53 -0500 Subject: [PATCH 13/16] VStream Skew page Signed-off-by: Matt Lord --- .../internal/vstream-skew-detection.md | 71 ++++++++++--------- 1 file changed, 37 insertions(+), 34 deletions(-) diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream-skew-detection.md b/content/en/docs/16.0/reference/vreplication/internal/vstream-skew-detection.md index 34ccdeecb..dd444246a 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/vstream-skew-detection.md +++ b/content/en/docs/16.0/reference/vreplication/internal/vstream-skew-detection.md @@ -2,74 +2,77 @@ title: VStream Skew Minimization description: Aligning streams from different shards in the VStream API weight: 7 +aliases: ['/docs/design-docs/vreplication/vstream/skew-detection/'] --- ## VStream Skew Detection ### Motivation -When the VStream API is streaming from multiple shards we have multiple sources of events: one primary or replica tablet -for each shard in the provided VGTID. 
The rate at which the events will be streamed from the underlying sources can vary -depending on various factors, like: +When the [VStream API](../../vstream/) is streaming from multiple shards we have multiple sources of events: one `PRIMARY` +or `REPLICA` tablet for each shard in the provided [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid). +The rate at which the events will be streamed from the underlying sources can vary depending on various factors, such as: -* the replication lag on the source tablets (if a replica is selected as the source for the VStream) -* the cpu load on the source tablet -* possible network partitions or network delays +* The replication lag on the source tablets (if a `REPLICA` tablet is selected as the source for the vstream) +* The CPU load on the source tablet +* Possible network partitions or network delays -This can result in the events in the VStream from some shards being well ahead of other shards. So, for example, if a -row moves from the faster shard to a slower shard we might see the delete event in the VStream from the faster shard -long before the insert from the second. This would result in the row going "invisible" for the duration of the skew. -This can affect user experience in applications where the VStream events are used to refresh UI, for example. +This can result in the events in the vstream from some shards being well ahead of other shards. So, for example, if a +row moves from the faster shard to a slower shard we might see the `DELETE` event in the vstream from the faster shard +long before the `INSERT` from the second. This would result in the row going "invisible" for the duration of the skew. +This can affect the user experience in applications where the vstream events are used to refresh a UI, for example. -For most applications where VStream API events feed into change data capture systems for auditing or reporting purposes -these delays may be acceptable. 
However for applications which are using these events for user-facing functions this can -cause unexpected behavior. See https://github.com/vitessio/vitess/issues/7402 for one such case. +For most applications where [VStream API](../../vstream/) events feed into change data capture systems for auditing or +reporting purposes these delays may be acceptable. However, for applications which are using these events for user-facing +functions this can cause unexpected behavior. See https://github.com/vitessio/vitess/issues/7402 for one such case. ### Goal -It is not practically possible to provide exact ordering of events across Vitess shards. The VStream API will inherently -stream events from one shard independently of another. However, VStreamer events do keep track of the binlog event +It is not practically possible to provide exact ordering of events across Vitess shards. The [VStream API](../../vstream/) +will inherently stream events from one shard independently of another. However, vstream events +([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) do keep track of the binlog event timestamps which we can use to loosely coordinate the streams. Since binlog timestamp granularity is only to the nearest -second, we attempt to align the streams to within a second. +second, we attempt to align the streams to within a second. ### Implementation -The skew minimization feature adds a flag that the client can set. This flag enables skew detection between the various -streams. Once a skew is detected, events for streams that are ahead are held back until the lagging streams catch up -causing the skew to reach an acceptable level. +The skew minimization feature adds a [`MinimizeSkew` flag](../../vstream/#minimizeskew) that the client can set. This flag +enables skew detection between the various streams. 
Once a skew is detected, events for streams that are ahead are held back +until the lagging streams catch up causing the skew to reach an acceptable level. -Each VStreamer event (_vevent_) contains two timestamps: one when the database transaction occurred, and the other, the -current time on the source tablet where the vevent was created. This lets us compute how far in the past the event we -just received was created. We use this to determine which shard has the most recent event and which one has the oldest -event. Note that, for shards where there are no activity, VStreamer sends a heartbeat event every second. The -transaction time for an heartbeat is the same as the current time on the source. (These heartbeats are not forwarded to -the VStream since they are synthetic vreplication events.) +Each vstream event ([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) contains two timestamps: +one when the database transaction occurred, and the other the current time on the source tablet where the +[`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent) was created. This lets us compute how far in +the past the event we just received was created. We use this to determine which shard has the most recent event and which one +has the oldest. Note that for shards where there is no activity, the vstreamer sends a heartbeat event every second and the +transaction time for a heartbeat is the same as the current time on the source. (These heartbeats are not forwarded to clients +in the vstream since they are synthetic/internal VReplication events.) If the difference between the fastest and slowest streams is greater than a threshold, we declare that we have detected -a skew. MySQL binlogs store the transaction timestamp in seconds. Also, on the VTGate serving the VStream, we adjust -this time for clock skews between the VTGate and the source MySQL server. 
When the user sets the `MinimizeSkew` flag we -want to keep the events across shards to be in the same second: each transaction timestamp is within 1 second of each +a skew. MySQL binlogs store the transaction timestamp in seconds. Also, on the `vtgate` serving the vstream, we adjust +this time for clock skews between the `vtgate` and the source tablet's `mysqld` server. When the user sets the `MinimizeSkew` +flag we want to keep the events across shards within the same second: each transaction timestamp is within 1 second of each other. To account for rounding-off of the transaction timestamp and the clock-skew we set the threshold to be 2 seconds, instead of 1 second, so that we don't keep stalling the streams due to cumulative round-offs. -### Possible unexpected behavior +### Possible Unexpected Behavior If there are no events for a second in a shard then a heartbeat is sent. On receiving a heartbeat we reset the skew. This is necessary to avoid shards with no events starving other shards. The current logic will align streams only if they are all getting events faster than the heartbeat frequency. This means that we cannot guarantee the skew alignment feature will work as expected in certain conditions. This could -happen mainly while streaming from replicas with high replication lags, say, due to high write qps or a network +happen mainly while streaming from `REPLICA` tablets with high replication lag, say, due to high write QPS or a network partition. -Thus it is recommended that you stream from primaries when using this feature. Note, however, that even primaries with -skewed loads could trigger such a situation. +Thus it is recommended that you stream from `PRIMARY` tablets when using the [VStream feature](../../vstream/). +Note, however, that even `PRIMARY` tablets with skewed loads could potentially trigger such a situation. 
### API -This is how you would turn on the skew detection and alignment feature in a VStream client: +This is how you would turn on the skew detection and alignment feature in a [VStream](../../vstream/) client: -``` +```go import vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" ... ... From 133091add9eb8a5394e9f6d85937a5d12d3a728d Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 14:22:46 -0500 Subject: [PATCH 14/16] VStream Migration page Signed-off-by: Matt Lord --- .../internal/vstream-skew-detection.md | 84 +++++++++++++++++++ .../internal/vstream-stream-migration.md | 74 ++++++++++++++++ .../internal/vstream-skew-detection.md | 84 +++++++++++++++++++ .../internal/vstream-stream-migration.md | 74 ++++++++++++++++ .../internal/vstream-skew-detection.md | 84 +++++++++++++++++++ .../internal/vstream-stream-migration.md | 74 ++++++++++++++++ .../internal/vstream-stream-migration.md | 55 ++++++------ 7 files changed, 503 insertions(+), 26 deletions(-) create mode 100644 content/en/docs/13.0/reference/vreplication/internal/vstream-skew-detection.md create mode 100644 content/en/docs/13.0/reference/vreplication/internal/vstream-stream-migration.md create mode 100644 content/en/docs/14.0/reference/vreplication/internal/vstream-skew-detection.md create mode 100644 content/en/docs/14.0/reference/vreplication/internal/vstream-stream-migration.md create mode 100644 content/en/docs/15.0/reference/vreplication/internal/vstream-skew-detection.md create mode 100644 content/en/docs/15.0/reference/vreplication/internal/vstream-stream-migration.md diff --git a/content/en/docs/13.0/reference/vreplication/internal/vstream-skew-detection.md b/content/en/docs/13.0/reference/vreplication/internal/vstream-skew-detection.md new file mode 100644 index 000000000..dd444246a --- /dev/null +++ b/content/en/docs/13.0/reference/vreplication/internal/vstream-skew-detection.md @@ -0,0 +1,84 @@ +--- +title: VStream Skew Minimization +description: Aligning streams from different 
shards in the VStream API +weight: 7 +aliases: ['/docs/design-docs/vreplication/vstream/skew-detection/'] +--- + +## VStream Skew Detection + +### Motivation + +When the [VStream API](../../vstream/) is streaming from multiple shards we have multiple sources of events: one `PRIMARY` +or `REPLICA` tablet for each shard in the provided [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid). +The rate at which the events will be streamed from the underlying sources can vary depending on various factors, such as: + +* The replication lag on the source tablets (if a `REPLICA` tablet is selected as the source for the vstream) +* The CPU load on the source tablet +* Possible network partitions or network delays + +This can result in the events in the vstream from some shards being well ahead of other shards. So, for example, if a +row moves from the faster shard to a slower shard we might see the `DELETE` event in the vstream from the faster shard +long before the `INSERT` from the second. This would result in the row going "invisible" for the duration of the skew. +This can affect the user experience in applications where the vstream events are used to refresh a UI, for example. + +For most applications where [VStream API](../../vstream/) events feed into change data capture systems for auditing or +reporting purposes these delays may be acceptable. However, for applications which are using these events for user-facing +functions this can cause unexpected behavior. See https://github.com/vitessio/vitess/issues/7402 for one such case. + +### Goal + +It is not practically possible to provide exact ordering of events across Vitess shards. The [VStream API](../../vstream/) +will inherently stream events from one shard independently of another. However, vstream events +([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) do keep track of the binlog event +timestamps which we can use to loosely coordinate the streams. 
Since binlog timestamp granularity is only to the nearest +second, and we attempt to align the streams to within a second. + +### Implementation + +The skew minimization feature adds a [`MinimizeSkew` flag](../../vstream/#minimizeskew) that the client can set. This flag +enables skew detection between the various streams. Once a skew is detected, events for streams that are ahead are held back +until the lagging streams catch up causing the skew to reach an acceptable level. + +Each vstream event ([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) contains two timestamps: +one when the database transaction occurred, and the other the current time on the source tablet where the +[`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent) was created. This lets us compute how far in +the past the event we just received was created. We use this to determine which shard has the most recent event and which one +has the oldest. Note that for shards where there is no activity, the vstreamer sends a heartbeat event every second and the +transaction time for a heartbeat is the same as the current time on the source. (These heartbeats are not forwarded to clients +in the vstream since they are synthetic/internal VReplication events.) + +If the difference between the fastest and slowest streams is greater than a threshold, we declare that we have detected +a skew. MySQL binlogs store the transaction timestamp in seconds. Also, on the `vtgate` serving the vstream, we adjust +this time for clock skews between the `vtgate` and the source tablet's `mysqld` server. When the user sets the `MinimizeSkew` +flag we want to keep the events across shards within the same second: each transaction timestamp is within 1 second of each +other. 
To account for rounding-off of the transaction timestamp and the clock-skew we set the threshold to be 2 seconds, +instead of 1 second, so that we don't keep stalling the streams due to cumulative round-offs. + +### Possible Unexpected Behavior + +If there are no events for a second in a shard then a heartbeat is sent. On receiving a heartbeat we reset the skew. +This is necessary to avoid shards with no events starving other shards. The current logic will align streams only if +they are all getting events faster than the heartbeat frequency. + +This means that we cannot guarantee the skew alignment feature will work as expected in certain conditions. This could +happen mainly while streaming from `REPLICA` tablets with high replication lag, say, due to high write QPS or a network +partition. + +Thus it is recommended that you stream from `PRIMARY` tablets when using the [VStream feature](../../vstream/). +Note, however, that even `PRIMARY` tablets with skewed loads could potentially trigger such a situation. + +### API + +This is how you would turn on the skew detection and alignment feature in a [VStream](../../vstream/) client: + +```go + import vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" + ... + ... 
+ flags := &vtgatepb.VStreamFlags{}; + flags.MinimizeSkew = true; + + reader, err := conn.VStream(ctx, topodatapb.TabletType_PRIMARY, vgtid, filter, flags) + +``` diff --git a/content/en/docs/13.0/reference/vreplication/internal/vstream-stream-migration.md b/content/en/docs/13.0/reference/vreplication/internal/vstream-stream-migration.md new file mode 100644 index 000000000..54b62f9e7 --- /dev/null +++ b/content/en/docs/13.0/reference/vreplication/internal/vstream-stream-migration.md @@ -0,0 +1,74 @@ +--- +title: VStream API and Resharding +description: How VStream API handles a reshard +weight: 8 +aliases: ['/docs/design-docs/vreplication/vstream/stream-migration/'] +--- + +## Stream Migration on a Resharding Operation + +While subscribing to the [VStream API](../../vstream/) you need to specify the shards from which to stream events. While +streaming it is possible that the underlying keyspace is resharded. Thus some or all of the shards which were originally +specified may be replaced by new shards after the resharding operation is completed. + +Stream migration logic within VReplication handles this transparently within `vtgate`. The Event streaming will be paused +momentarily during the actual cutover (when writes are switched) and you will start getting the events +([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) (and updated +[`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s) for the new set of shards once the cutover +is completed. + +### An Illustration + +Here is a sample session using the scripts from the [local example](../../../../get-started/local). + +Run the steps up to and including `205_clean_commerce.sh`. Now start a [VStream API](../../vstream/) client in a +separate terminal to stream events from the `customer` table in the `customer` keyspace, which is currently unsharded. 
+ +```json +{ + ShardGtids: []*binlogdatapb.ShardGtid{ + { + Keyspace: "customer", + Shard: "0", + }, + }, +} +``` + +Initial events will be streamed: + +```proto +[type:BEGIN type:FIELD field_event: fields: > ] +[type:VGTID vgtid: > ] +[type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:VGTID vgtid: > > > > type:COMMIT ] +[type:BEGIN type:VGTID vgtid: > type:COMMIT ] +``` + +Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, and 304). The following events are now seen: + +```proto +[type:VGTID vgtid: > type:DDL timestamp:1616748652 statement:"alter table customer change customer_id customer_id bigint not null" current_time:1616748652480051077 ] +[type:VGTID vgtid: > type:OTHER timestamp:1616748652 current_time:1616748652553883482 ] +``` + +Run the 305 step/script to switch writes. You will see that the [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s) +will include the new shards `-80` and `80-` instead of `0`: + +```proto +[type:BEGIN timestamp:1616748733 current_time:1616748733480901644 type:VGTID vgtid: > type:COMMIT timestamp:1616748733 current_time:1616748733480932466 ] +[type:BEGIN timestamp:1616748733 current_time:1616748733486715446 type:VGTID vgtid: > type:COMMIT timestamp:1616748733 current_time:1616748733486749728 ] + +[type:BEGIN timestamp:1616748733 current_time:1616748733519198641 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733519244822 ] +[type:BEGIN timestamp:1616748733 current_time:1616748733520355854 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733520403210 ] +``` + +Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of +resharding. 
+ +```bash +$ mysql -u root --host=127.0.0.1 -P 15306 -e "insert into customer(customer_id, email) values(6,'rohit@planetscale.com'), (7, 'mlord@planetscale.com')" +``` + +```proto +[type:BEGIN timestamp:1616749631 current_time:1616749631516372189 type:FIELD timestamp:1616749631 field_event: fields: > current_time:1616749631517765487 type:ROW timestamp:1616749631 row_event: > row_changes: > > current_time:1616749631517779353 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616749631 current_time:1616749631517789376 ] +``` diff --git a/content/en/docs/14.0/reference/vreplication/internal/vstream-skew-detection.md b/content/en/docs/14.0/reference/vreplication/internal/vstream-skew-detection.md new file mode 100644 index 000000000..dd444246a --- /dev/null +++ b/content/en/docs/14.0/reference/vreplication/internal/vstream-skew-detection.md @@ -0,0 +1,84 @@ +--- +title: VStream Skew Minimization +description: Aligning streams from different shards in the VStream API +weight: 7 +aliases: ['/docs/design-docs/vreplication/vstream/skew-detection/'] +--- + +## VStream Skew Detection + +### Motivation + +When the [VStream API](../../vstream/) is streaming from multiple shards we have multiple sources of events: one `PRIMARY` +or `REPLICA` tablet for each shard in the provided [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid). +The rate at which the events will be streamed from the underlying sources can vary depending on various factors, such as: + +* The replication lag on the source tablets (if a `REPLICA` tablet is selected as the source for the vstream) +* The CPU load on the source tablet +* Possible network partitions or network delays + +This can result in the events in the vstream from some shards being well ahead of other shards. So, for example, if a +row moves from the faster shard to a slower shard we might see the `DELETE` event in the vstream from the faster shard +long before the `INSERT` from the second. 
This would result in the row going "invisible" for the duration of the skew. +This can affect the user experience in applications where the vstream events are used to refresh a UI, for example. + +For most applications where [VStream API](../../vstream/) events feed into change data capture systems for auditing or +reporting purposes these delays may be acceptable. However, for applications which are using these events for user-facing +functions this can cause unexpected behavior. See https://github.com/vitessio/vitess/issues/7402 for one such case. + +### Goal + +It is not practically possible to provide exact ordering of events across Vitess shards. The [VStream API](../../vstream/) +will inherently stream events from one shard independently of another. However, vstream events +([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) do keep track of the binlog event +timestamps which we can use to loosely coordinate the streams. Since binlog timestamp granularity is only to the nearest +second, we attempt to align the streams to within a second. + +### Implementation + +The skew minimization feature adds a [`MinimizeSkew` flag](../../vstream/#minimizeskew) that the client can set. This flag +enables skew detection between the various streams. Once a skew is detected, events for streams that are ahead are held back +until the lagging streams catch up causing the skew to reach an acceptable level. + +Each vstream event ([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) contains two timestamps: +one when the database transaction occurred, and the other the current time on the source tablet where the +[`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent) was created. This lets us compute how far in +the past the event we just received was created. We use this to determine which shard has the most recent event and which one +has the oldest. 
Note that for shards where there is no activity, the vstreamer sends a heartbeat event every second and the +transaction time for a heartbeat is the same as the current time on the source. (These heartbeats are not forwarded to clients +in the vstream since they are synthetic/internal VReplication events.) + +If the difference between the fastest and slowest streams is greater than a threshold, we declare that we have detected +a skew. MySQL binlogs store the transaction timestamp in seconds. Also, on the `vtgate` serving the vstream, we adjust +this time for clock skews between the `vtgate` and the source tablet's `mysqld` server. When the user sets the `MinimizeSkew` +flag we want to keep the events across shards within the same second: each transaction timestamp is within 1 second of each +other. To account for rounding-off of the transaction timestamp and the clock-skew we set the threshold to be 2 seconds, +instead of 1 second, so that we don't keep stalling the streams due to cumulative round-offs. + +### Possible Unexpected Behavior + +If there are no events for a second in a shard then a heartbeat is sent. On receiving a heartbeat we reset the skew. +This is necessary to avoid shards with no events starving other shards. The current logic will align streams only if +they are all getting events faster than the heartbeat frequency. + +This means that we cannot guarantee the skew alignment feature will work as expected in certain conditions. This could +happen mainly while streaming from `REPLICA` tablets with high replication lag, say, due to high write QPS or a network +partition. + +Thus it is recommended that you stream from `PRIMARY` tablets when using the [VStream feature](../../vstream/). +Note, however, that even `PRIMARY` tablets with skewed loads could potentially trigger such a situation. 
+ +### API + +This is how you would turn on the skew detection and alignment feature in a [VStream](../../vstream/) client: + +```go + import vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" + ... + ... + flags := &vtgatepb.VStreamFlags{}; + flags.MinimizeSkew = true; + + reader, err := conn.VStream(ctx, topodatapb.TabletType_PRIMARY, vgtid, filter, flags) + +``` diff --git a/content/en/docs/14.0/reference/vreplication/internal/vstream-stream-migration.md b/content/en/docs/14.0/reference/vreplication/internal/vstream-stream-migration.md new file mode 100644 index 000000000..54b62f9e7 --- /dev/null +++ b/content/en/docs/14.0/reference/vreplication/internal/vstream-stream-migration.md @@ -0,0 +1,74 @@ +--- +title: VStream API and Resharding +description: How VStream API handles a reshard +weight: 8 +aliases: ['/docs/design-docs/vreplication/vstream/stream-migration/'] +--- + +## Stream Migration on a Resharding Operation + +While subscribing to the [VStream API](../../vstream/) you need to specify the shards from which to stream events. While +streaming it is possible that the underlying keyspace is resharded. Thus some or all of the shards which were originally +specified may be replaced by new shards after the resharding operation is completed. + +Stream migration logic within VReplication handles this transparently within `vtgate`. The Event streaming will be paused +momentarily during the actual cutover (when writes are switched) and you will start getting the events +([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) (and updated +[`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s) for the new set of shards once the cutover +is completed. + +### An Illustration + +Here is a sample session using the scripts from the [local example](../../../../get-started/local). + +Run the steps up to and including `205_clean_commerce.sh`. 
Now start a [VStream API](../../vstream/) client in a +separate terminal to stream events from the `customer` table in the `customer` keyspace, which is currently unsharded. + +```json +{ + ShardGtids: []*binlogdatapb.ShardGtid{ + { + Keyspace: "customer", + Shard: "0", + }, + }, +} +``` + +Initial events will be streamed: + +```proto +[type:BEGIN type:FIELD field_event: fields: > ] +[type:VGTID vgtid: > ] +[type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:VGTID vgtid: > > > > type:COMMIT ] +[type:BEGIN type:VGTID vgtid: > type:COMMIT ] +``` + +Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, and 304). The following events are now seen: + +```proto +[type:VGTID vgtid: > type:DDL timestamp:1616748652 statement:"alter table customer change customer_id customer_id bigint not null" current_time:1616748652480051077 ] +[type:VGTID vgtid: > type:OTHER timestamp:1616748652 current_time:1616748652553883482 ] +``` + +Run the 305 step/script to switch writes. You will see that the [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s) +will include the new shards `-80` and `80-` instead of `0`: + +```proto +[type:BEGIN timestamp:1616748733 current_time:1616748733480901644 type:VGTID vgtid: > type:COMMIT timestamp:1616748733 current_time:1616748733480932466 ] +[type:BEGIN timestamp:1616748733 current_time:1616748733486715446 type:VGTID vgtid: > type:COMMIT timestamp:1616748733 current_time:1616748733486749728 ] + +[type:BEGIN timestamp:1616748733 current_time:1616748733519198641 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733519244822 ] +[type:BEGIN timestamp:1616748733 current_time:1616748733520355854 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733520403210 ] +``` + +Insert new rows: this will result in row events from the new shards. 
Shards will only stream changes from the point of +resharding. + +```bash +$ mysql -u root --host=127.0.0.1 -P 15306 -e "insert into customer(customer_id, email) values(6,'rohit@planetscale.com'), (7, 'mlord@planetscale.com')" +``` + +```proto +[type:BEGIN timestamp:1616749631 current_time:1616749631516372189 type:FIELD timestamp:1616749631 field_event: fields: > current_time:1616749631517765487 type:ROW timestamp:1616749631 row_event: > row_changes: > > current_time:1616749631517779353 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616749631 current_time:1616749631517789376 ] +``` diff --git a/content/en/docs/15.0/reference/vreplication/internal/vstream-skew-detection.md b/content/en/docs/15.0/reference/vreplication/internal/vstream-skew-detection.md new file mode 100644 index 000000000..dd444246a --- /dev/null +++ b/content/en/docs/15.0/reference/vreplication/internal/vstream-skew-detection.md @@ -0,0 +1,84 @@ +--- +title: VStream Skew Minimization +description: Aligning streams from different shards in the VStream API +weight: 7 +aliases: ['/docs/design-docs/vreplication/vstream/skew-detection/'] +--- + +## VStream Skew Detection + +### Motivation + +When the [VStream API](../../vstream/) is streaming from multiple shards we have multiple sources of events: one `PRIMARY` +or `REPLICA` tablet for each shard in the provided [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid). +The rate at which the events will be streamed from the underlying sources can vary depending on various factors, such as: + +* The replication lag on the source tablets (if a `REPLICA` tablet is selected as the source for the vstream) +* The CPU load on the source tablet +* Possible network partitions or network delays + +This can result in the events in the vstream from some shards being well ahead of other shards. 
So, for example, if a +row moves from the faster shard to a slower shard we might see the `DELETE` event in the vstream from the faster shard +long before the `INSERT` from the second. This would result in the row going "invisible" for the duration of the skew. +This can affect the user experience in applications where the vstream events are used to refresh a UI, for example. + +For most applications where [VStream API](../../vstream/) events feed into change data capture systems for auditing or +reporting purposes these delays may be acceptable. However, for applications which are using these events for user-facing +functions this can cause unexpected behavior. See https://github.com/vitessio/vitess/issues/7402 for one such case. + +### Goal + +It is not practically possible to provide exact ordering of events across Vitess shards. The [VStream API](../../vstream/) +will inherently stream events from one shard independently of another. However, vstream events +([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) do keep track of the binlog event +timestamps which we can use to loosely coordinate the streams. Since binlog timestamp granularity is only to the nearest +second, we attempt to align the streams to within a second. + +### Implementation + +The skew minimization feature adds a [`MinimizeSkew` flag](../../vstream/#minimizeskew) that the client can set. This flag +enables skew detection between the various streams. Once a skew is detected, events for streams that are ahead are held back +until the lagging streams catch up causing the skew to reach an acceptable level. + +Each vstream event ([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) contains two timestamps: +one when the database transaction occurred, and the other the current time on the source tablet where the +[`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent) was created.
This lets us compute how far in +the past the event we just received was created. We use this to determine which shard has the most recent event and which one +has the oldest. Note that for shards where there is no activity, the vstreamer sends a heartbeat event every second and the +transaction time for a heartbeat is the same as the current time on the source. (These heartbeats are not forwarded to clients +in the vstream since they are synthetic/internal VReplication events.) + +If the difference between the fastest and slowest streams is greater than a threshold, we declare that we have detected +a skew. MySQL binlogs store the transaction timestamp in seconds. Also, on the `vtgate` serving the vstream, we adjust +this time for clock skews between the `vtgate` and the source tablet's `mysqld` server. When the user sets the `MinimizeSkew` +flag we want to keep the events across shards within the same second: each transaction timestamp is within 1 second of each +other. To account for rounding-off of the transaction timestamp and the clock-skew we set the threshold to be 2 seconds, +instead of 1 second, so that we don't keep stalling the streams due to cumulative round-offs. + +### Possible Unexpected Behavior + +If there are no events for a second in a shard then a heartbeat is sent. On receiving a heartbeat we reset the skew. +This is necessary to avoid shards with no events starving other shards. The current logic will align streams only if +they are all getting events faster than the heartbeat frequency. + +This means that we cannot guarantee the skew alignment feature will work as expected in certain conditions. This could +happen mainly while streaming from `REPLICA` tablets with high replication lag, say, due to high write QPS or a network +partition. + +Thus it is recommended that you stream from `PRIMARY` tablets when using the [VStream feature](../../vstream/). 
+Note, however, that even `PRIMARY` tablets with skewed loads could potentially trigger such a situation. + +### API + +This is how you would turn on the skew detection and alignment feature in a [VStream](../../vstream/) client: + +```go + import vtgatepb "vitess.io/vitess/go/vt/proto/vtgate" + ... + ... + flags := &vtgatepb.VStreamFlags{}; + flags.MinimizeSkew = true; + + reader, err := conn.VStream(ctx, topodatapb.TabletType_PRIMARY, vgtid, filter, flags) + +``` diff --git a/content/en/docs/15.0/reference/vreplication/internal/vstream-stream-migration.md b/content/en/docs/15.0/reference/vreplication/internal/vstream-stream-migration.md new file mode 100644 index 000000000..54b62f9e7 --- /dev/null +++ b/content/en/docs/15.0/reference/vreplication/internal/vstream-stream-migration.md @@ -0,0 +1,74 @@ +--- +title: VStream API and Resharding +description: How VStream API handles a reshard +weight: 8 +aliases: ['/docs/design-docs/vreplication/vstream/stream-migration/'] +--- + +## Stream Migration on a Resharding Operation + +While subscribing to the [VStream API](../../vstream/) you need to specify the shards from which to stream events. While +streaming it is possible that the underlying keyspace is resharded. Thus some or all of the shards which were originally +specified may be replaced by new shards after the resharding operation is completed. + +Stream migration logic within VReplication handles this transparently within `vtgate`. The Event streaming will be paused +momentarily during the actual cutover (when writes are switched) and you will start getting the events +([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) (and updated +[`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s) for the new set of shards once the cutover +is completed. + +### An Illustration + +Here is a sample session using the scripts from the [local example](../../../../get-started/local). 
+ +Run the steps up to and including `205_clean_commerce.sh`. Now start a [VStream API](../../vstream/) client in a +separate terminal to stream events from the `customer` table in the `customer` keyspace, which is currently unsharded. + +```json +{ + ShardGtids: []*binlogdatapb.ShardGtid{ + { + Keyspace: "customer", + Shard: "0", + }, + }, +} +``` + +Initial events will be streamed: + +```proto +[type:BEGIN type:FIELD field_event: fields: > ] +[type:VGTID vgtid: > ] +[type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:VGTID vgtid: > > > > type:COMMIT ] +[type:BEGIN type:VGTID vgtid: > type:COMMIT ] +``` + +Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, and 304). The following events are now seen: + +```proto +[type:VGTID vgtid: > type:DDL timestamp:1616748652 statement:"alter table customer change customer_id customer_id bigint not null" current_time:1616748652480051077 ] +[type:VGTID vgtid: > type:OTHER timestamp:1616748652 current_time:1616748652553883482 ] +``` + +Run the 305 step/script to switch writes. 
You will see that the [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s) +will include the new shards `-80` and `80-` instead of `0`: + +```proto +[type:BEGIN timestamp:1616748733 current_time:1616748733480901644 type:VGTID vgtid: > type:COMMIT timestamp:1616748733 current_time:1616748733480932466 ] +[type:BEGIN timestamp:1616748733 current_time:1616748733486715446 type:VGTID vgtid: > type:COMMIT timestamp:1616748733 current_time:1616748733486749728 ] + +[type:BEGIN timestamp:1616748733 current_time:1616748733519198641 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733519244822 ] +[type:BEGIN timestamp:1616748733 current_time:1616748733520355854 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733520403210 ] +``` + +Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of +resharding. + +```bash +$ mysql -u root --host=127.0.0.1 -P 15306 -e "insert into customer(customer_id, email) values(6,'rohit@planetscale.com'), (7, 'mlord@planetscale.com')" +``` + +```proto +[type:BEGIN timestamp:1616749631 current_time:1616749631516372189 type:FIELD timestamp:1616749631 field_event: fields: > current_time:1616749631517765487 type:ROW timestamp:1616749631 row_event: > row_changes: > > current_time:1616749631517779353 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616749631 current_time:1616749631517789376 ] +``` diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md b/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md index 9797d6cb9..54b62f9e7 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md +++ b/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md @@ -2,56 +2,59 @@ title: VStream API and Resharding description: How VStream API handles a reshard weight: 8 +aliases: 
['/docs/design-docs/vreplication/vstream/stream-migration/'] --- -## Stream migration on a resharding operation +## Stream Migration on a Resharding Operation -While subscribing to the VStream API you need to specify the shards from which to stream events. While streaming it is -possible that the underlying keyspace is resharded. Thus some or all of the shards which were specified may be replaced -by new shards after the resharding is completed. +While subscribing to the [VStream API](../../vstream/) you need to specify the shards from which to stream events. While +streaming it is possible that the underlying keyspace is resharded. Thus some or all of the shards which were originally +specified may be replaced by new shards after the resharding operation is completed. -Stream migration logic within VReplication handles this transparently within VTGate. Event sending will be put on hold -momentarily during the actual cutover (when writes are switched) and you will start getting the events (and vgtids) for -the new set of shards once the cutover is completed. +Stream migration logic within VReplication handles this transparently within `vtgate`. The Event streaming will be paused +momentarily during the actual cutover (when writes are switched) and you will start getting the events +([`VEvent`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VEvent)) (and updated +[`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s) for the new set of shards once the cutover +is completed. -### An illustration +### An Illustration -Here is a sample session using the scripts from the [local example](/docs/get-started/local). +Here is a sample session using the scripts from the [local example](../../../../get-started/local). -Run the steps up to and including `205_clean_commerce.sh`. Now start a vstream api client in a separate terminal to -stream events from the customer table in the customer keyspace, which is currently unsharded. 
+Run the steps up to and including `205_clean_commerce.sh`. Now start a [VStream API](../../vstream/) client in a +separate terminal to stream events from the `customer` table in the `customer` keyspace, which is currently unsharded. -``` +```json { ShardGtids: []*binlogdatapb.ShardGtid{ - { - Keyspace: "customer", - Shard: "0", - - }, - }, + { + Keyspace: "customer", + Shard: "0", + }, + }, } ``` Initial events will be streamed: -``` +```proto [type:BEGIN type:FIELD field_event: fields: > ] [type:VGTID vgtid: > ] [type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:ROW row_event: > > type:VGTID vgtid: > > > > type:COMMIT ] [type:BEGIN type:VGTID vgtid: > type:COMMIT ] ``` -Now run the resharding scripts and switch reads (301, 302, 303, 304). The following events are now seen: +Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, and 304). The following events are now seen: -``` +```proto [type:VGTID vgtid: > type:DDL timestamp:1616748652 statement:"alter table customer change customer_id customer_id bigint not null" current_time:1616748652480051077 ] [type:VGTID vgtid: > type:OTHER timestamp:1616748652 current_time:1616748652553883482 ] ``` -Run the 305 script to switch writes. You will see that vgtids will include the new shards `-80` and `80-` instead of `0` +Run the 305 step/script to switch writes. You will see that the [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s) +will include the new shards `-80` and `80-` instead of `0`: -``` +```proto [type:BEGIN timestamp:1616748733 current_time:1616748733480901644 type:VGTID vgtid: > type:COMMIT timestamp:1616748733 current_time:1616748733480932466 ] [type:BEGIN timestamp:1616748733 current_time:1616748733486715446 type:VGTID vgtid: > type:COMMIT timestamp:1616748733 current_time:1616748733486749728 ] @@ -62,10 +65,10 @@ Run the 305 script to switch writes. 
You will see that vgtids will include the n Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of resharding. -``` -$ mysql -u root --host=127.0.0.1 -P 15306 -e "insert into customer(customer_id, email) values(6,'sougou@planetscale.com'), (7, 'deepthi@planetscale.com');" +```bash +$ mysql -u root --host=127.0.0.1 -P 15306 -e "insert into customer(customer_id, email) values(6,'rohit@planetscale.com'), (7, 'mlord@planetscale.com')" ``` -``` +```proto [type:BEGIN timestamp:1616749631 current_time:1616749631516372189 type:FIELD timestamp:1616749631 field_event: fields: > current_time:1616749631517765487 type:ROW timestamp:1616749631 row_event: > row_changes: > > current_time:1616749631517779353 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616749631 current_time:1616749631517789376 ] ``` From 3cc135f9e79f83ff812e724169cdf566a3100eae Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 14:33:53 -0500 Subject: [PATCH 15/16] Address Netlify spacing issues Signed-off-by: Matt Lord --- .../vreplication/internal/cutover.md | 22 +++++++++++++++++ .../reference/vreplication/internal/keys.md | 24 +++++++++++++++++++ .../vreplication/internal/life-of-a-stream.md | 6 +++++ .../vreplication/internal/tracker.md | 2 ++ .../internal/vstream-stream-migration.md | 8 +++++++ .../vreplication/internal/cutover.md | 22 +++++++++++++++++ .../reference/vreplication/internal/keys.md | 24 +++++++++++++++++++ .../vreplication/internal/life-of-a-stream.md | 6 +++++ .../vreplication/internal/tracker.md | 2 ++ .../internal/vstream-stream-migration.md | 8 +++++++ .../vreplication/internal/cutover.md | 22 +++++++++++++++++ .../reference/vreplication/internal/keys.md | 24 +++++++++++++++++++ .../vreplication/internal/life-of-a-stream.md | 6 +++++ .../vreplication/internal/tracker.md | 2 ++ .../internal/vstream-stream-migration.md | 8 +++++++ .../vreplication/internal/cutover.md | 22 +++++++++++++++++ 
.../reference/vreplication/internal/keys.md | 24 +++++++++++++++++++ .../vreplication/internal/life-of-a-stream.md | 6 +++++ .../vreplication/internal/tracker.md | 2 ++ .../internal/vstream-stream-migration.md | 8 +++++++ 20 files changed, 248 insertions(+) diff --git a/content/en/docs/13.0/reference/vreplication/internal/cutover.md b/content/en/docs/13.0/reference/vreplication/internal/cutover.md index 376ffc7dc..a54f120d3 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/13.0/reference/vreplication/internal/cutover.md @@ -70,6 +70,8 @@ $ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/RoutingRules' rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer.corder" to_tables:"commerce.corder"} rules:{from_table:"customer.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"commerce.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"corder" to_tables:"commerce.corder"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} ``` +
+ {{< info >}} In practice you would instead typically view the routing rules via the dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions/#getroutingrules) @@ -122,6 +124,8 @@ primary_term_start_time:{seconds:1627465761 nanoseconds:600070156} is_primary_serving:true ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -130,6 +134,8 @@ primary_term_start_time:{seconds:1627465833 nanoseconds:536524508} key_range:{start:"\x80"} ``` +
+ *zone1/keyspaces/customer/SrvKeyspace* ```proto @@ -156,6 +162,8 @@ primary_term_start_time:{seconds:1627466189 nanoseconds:587021377} is_primary_serving:true ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -164,6 +172,8 @@ primary_term_start_time:{seconds:1627466263 nanoseconds:16201490} key_range:{start:"\x80"}`` ``` +
+ _zone1/keyspaces/customer/SrvKeyspace_ ```proto @@ -178,6 +188,8 @@ partitions:{served_type:REPLICA } ``` +
+ #### After Primary Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchWrites) * `is_primary_serving` is removed from shard `0` @@ -193,6 +205,8 @@ primary_alias:{cell:"zone1" uid:200} primary_term_start_time:{seconds:1627466636 nanoseconds:405646818} ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -202,6 +216,8 @@ key_range:{start:"\x80"} is_primary_serving:true ``` +
+ *zone1/keyspaces/customer/SrvKeyspace* ```proto @@ -241,6 +257,8 @@ rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} ``` +
+ #### On Switching Replica Traffic to Target The routing rules for replica targeted reads are updated to map the table on the source to the target. @@ -253,6 +271,8 @@ rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} ``` +
+ #### On Switching Primary Traffic The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the @@ -268,6 +288,8 @@ rules:{from_table:"commerce.customer" to_tables:"customer.customer"} rules:{from_table:"customer" to_tables:"customer.customer"} ``` +
+ *global/keyspaces/commerce/shards/0/Shard* ```proto diff --git a/content/en/docs/13.0/reference/vreplication/internal/keys.md b/content/en/docs/13.0/reference/vreplication/internal/keys.md index 0be41810e..0b041d03f 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/13.0/reference/vreplication/internal/keys.md @@ -108,6 +108,8 @@ CREATE TABLE `entry` ( ) ``` +
+ The above is a trivial scenario. #### Source Table and Target table Share the Same PRIMARY KEY @@ -131,6 +133,8 @@ CREATE TABLE `target` ( ) ``` +
+ The differences in structure are interesting but irrelevant to VReplication's ability to copy the data. #### Subset PRIMARY KEY @@ -153,6 +157,8 @@ CREATE TABLE `target` ( ) ``` +
+ #### Superset PRIMARY KEY ```sql @@ -173,6 +179,8 @@ CREATE TABLE `target` ( ) ``` +
+ #### Different PRIMARY KEYs ```sql @@ -193,6 +201,8 @@ CREATE TABLE `target` ( ) ``` +
+ No columns are shared between the `PRIMARY KEY`s in the above. However: - `id`, covered by `source`'s PK, is found in `target` @@ -218,6 +228,8 @@ CREATE TABLE `target` ( ) ``` +
+ The only eligible solution in the above is: - Use `source`'s `PRIMARY KEY` (the column `uuid` is found in `target`) @@ -249,6 +261,8 @@ CREATE TABLE `target` ( ) ``` +
+ The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. #### Missing columns @@ -270,6 +284,8 @@ CREATE TABLE `target` ( ) ``` +
+ `target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. ## Configuring The Stream @@ -293,6 +309,8 @@ CREATE TABLE `corder` ( ) ``` +
+ And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition (prettified for readability): @@ -308,6 +326,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ In the above: - `source_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on source table @@ -340,6 +360,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"`. ### Example 3 @@ -367,6 +389,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ Note: - `source_unique_key_columns` indicates the names of columns on the source table diff --git a/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md index 5648fc2fe..278635c87 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/13.0/reference/vreplication/internal/life-of-a-stream.md @@ -99,6 +99,8 @@ T2: select * from X where pk > 10 limit 10; GTID: 110, Last PK 20 send rows to target ``` +
+ + There is a gotcha here: consider that there are 10 new transactions or GTIDs between times T1 and T2. Some of these can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have only rows from PK 11 to 20) then we will have an inconsistent state on the target: the updates to rows with PK @@ -121,6 +123,8 @@ T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 send rows to target ```
+ Another gotcha: note that at time T3 when we selected the PKs from 11 to 20 the GTID position could have moved further! This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. @@ -145,6 +149,8 @@ T4: replicate from 111 to 112 T5: Send rows for pks 11 to 20 to target ``` +
+ This flow actually works and is the one used in Vitess VReplication! The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is diff --git a/content/en/docs/13.0/reference/vreplication/internal/tracker.md b/content/en/docs/13.0/reference/vreplication/internal/tracker.md index 4a9a73478..1aa688c12 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/13.0/reference/vreplication/internal/tracker.md @@ -168,6 +168,8 @@ T4: DML1 on table1 T5: Version Event DDL2 // gets written AFTER DML1 ``` +
+ So now on the `REPLICA`, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. ### Situation 2 diff --git a/content/en/docs/13.0/reference/vreplication/internal/vstream-stream-migration.md b/content/en/docs/13.0/reference/vreplication/internal/vstream-stream-migration.md index 54b62f9e7..f22aea00f 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/vstream-stream-migration.md +++ b/content/en/docs/13.0/reference/vreplication/internal/vstream-stream-migration.md @@ -35,6 +35,8 @@ separate terminal to stream events from the `customer` table in the `customer` k } ``` +
+ Initial events will be streamed: ```proto @@ -44,6 +46,8 @@ Initial events will be streamed: [type:BEGIN type:VGTID vgtid: > type:COMMIT ] ``` +
+ Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, and 304). The following events are now seen: ```proto @@ -51,6 +55,8 @@ Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, an [type:VGTID vgtid: > type:OTHER timestamp:1616748652 current_time:1616748652553883482 ] ``` +
+ + Run the 305 step/script to switch writes. You will see that the [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s will include the new shards `-80` and `80-` instead of `0`: @@ -62,6 +68,8 @@ will include the new shards `-80` and `80-` instead of `0`: [type:BEGIN timestamp:1616748733 current_time:1616748733520355854 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733520403210 ] ```
+ Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of resharding. diff --git a/content/en/docs/14.0/reference/vreplication/internal/cutover.md b/content/en/docs/14.0/reference/vreplication/internal/cutover.md index 376ffc7dc..a54f120d3 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/14.0/reference/vreplication/internal/cutover.md @@ -70,6 +70,8 @@ $ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/RoutingRules' rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer.corder" to_tables:"commerce.corder"} rules:{from_table:"customer.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"commerce.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"corder" to_tables:"commerce.corder"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} ``` +
+ {{< info >}} In practice you would instead typically view the routing rules via the dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions/#getroutingrules) @@ -122,6 +124,8 @@ primary_term_start_time:{seconds:1627465761 nanoseconds:600070156} is_primary_serving:true ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -130,6 +134,8 @@ primary_term_start_time:{seconds:1627465833 nanoseconds:536524508} key_range:{start:"\x80"} ``` +
+ *zone1/keyspaces/customer/SrvKeyspace* ```proto @@ -156,6 +162,8 @@ primary_term_start_time:{seconds:1627466189 nanoseconds:587021377} is_primary_serving:true ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -164,6 +172,8 @@ primary_term_start_time:{seconds:1627466263 nanoseconds:16201490} key_range:{start:"\x80"}`` ``` +
+ _zone1/keyspaces/customer/SrvKeyspace_ ```proto @@ -178,6 +188,8 @@ partitions:{served_type:REPLICA } ``` +
+ #### After Primary Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchWrites) * `is_primary_serving` is removed from shard `0` @@ -193,6 +205,8 @@ primary_alias:{cell:"zone1" uid:200} primary_term_start_time:{seconds:1627466636 nanoseconds:405646818} ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -202,6 +216,8 @@ key_range:{start:"\x80"} is_primary_serving:true ``` +
+ *zone1/keyspaces/customer/SrvKeyspace* ```proto @@ -241,6 +257,8 @@ rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} ``` +
+ #### On Switching Replica Traffic to Target The routing rules for replica targeted reads are updated to map the table on the source to the target. @@ -253,6 +271,8 @@ rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} ``` +
+ #### On Switching Primary Traffic The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the @@ -268,6 +288,8 @@ rules:{from_table:"commerce.customer" to_tables:"customer.customer"} rules:{from_table:"customer" to_tables:"customer.customer"} ``` +
+ *global/keyspaces/commerce/shards/0/Shard* ```proto diff --git a/content/en/docs/14.0/reference/vreplication/internal/keys.md b/content/en/docs/14.0/reference/vreplication/internal/keys.md index 0be41810e..0b041d03f 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/14.0/reference/vreplication/internal/keys.md @@ -108,6 +108,8 @@ CREATE TABLE `entry` ( ) ``` +
+ The above is a trivial scenario. #### Source Table and Target table Share the Same PRIMARY KEY @@ -131,6 +133,8 @@ CREATE TABLE `target` ( ) ``` +
+ The differences in structure are interesting but irrelevant to VReplication's ability to copy the data. #### Subset PRIMARY KEY @@ -153,6 +157,8 @@ CREATE TABLE `target` ( ) ``` +
+ #### Superset PRIMARY KEY ```sql @@ -173,6 +179,8 @@ CREATE TABLE `target` ( ) ``` +
+ #### Different PRIMARY KEYs ```sql @@ -193,6 +201,8 @@ CREATE TABLE `target` ( ) ``` +
+ No columns are shared between the `PRIMARY KEY`s in the above. However: - `id`, covered by `source`'s PK, is found in `target` @@ -218,6 +228,8 @@ CREATE TABLE `target` ( ) ``` +
+ The only eligible solution in the above is: - Use `source`'s `PRIMARY KEY` (the column `uuid` is found in `target`) @@ -249,6 +261,8 @@ CREATE TABLE `target` ( ) ``` +
+ The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. #### Missing columns @@ -270,6 +284,8 @@ CREATE TABLE `target` ( ) ``` +
+ `target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. ## Configuring The Stream @@ -293,6 +309,8 @@ CREATE TABLE `corder` ( ) ``` +
+ And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition (prettified for readability): @@ -308,6 +326,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ In the above: - `source_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on source table @@ -340,6 +360,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"`. ### Example 3 @@ -367,6 +389,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ Note: - `source_unique_key_columns` indicates the names of columns on the source table diff --git a/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md index 5648fc2fe..278635c87 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/14.0/reference/vreplication/internal/life-of-a-stream.md @@ -99,6 +99,8 @@ T2: select * from X where pk > 10 limit 10; GTID: 110, Last PK 20 send rows to target ``` +
+ There is a gotcha here: consider that there are 10 new transactions or GTIDs between times T1 and T2. Some of these can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have only rows from PK 11 to 20) then we will have an inconsistent state on the target: the updates to rows with PK @@ -121,6 +123,8 @@ T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 send rows to target ``` +
+ Another gotcha: note that at time T3 when we selected the PKs from 11 to 20 the GTID position could have moved further! This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. @@ -145,6 +149,8 @@ T4: replicate from 111 to 112 T5: Send rows for pks 11 to 20 to target ``` +
+ This flow actually works and is the one used in Vitess VReplication! The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is diff --git a/content/en/docs/14.0/reference/vreplication/internal/tracker.md b/content/en/docs/14.0/reference/vreplication/internal/tracker.md index 4a9a73478..1aa688c12 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/14.0/reference/vreplication/internal/tracker.md @@ -168,6 +168,8 @@ T4: DML1 on table1 T5: Version Event DDL2 // gets written AFTER DML1 ``` +
+ So now on the `REPLICA`, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. ### Situation 2 diff --git a/content/en/docs/14.0/reference/vreplication/internal/vstream-stream-migration.md b/content/en/docs/14.0/reference/vreplication/internal/vstream-stream-migration.md index 54b62f9e7..f22aea00f 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/vstream-stream-migration.md +++ b/content/en/docs/14.0/reference/vreplication/internal/vstream-stream-migration.md @@ -35,6 +35,8 @@ separate terminal to stream events from the `customer` table in the `customer` k } ``` +
+ Initial events will be streamed: ```proto @@ -44,6 +46,8 @@ Initial events will be streamed: [type:BEGIN type:VGTID vgtid: > type:COMMIT ] ``` +
+ Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, and 304). The following events are now seen: ```proto @@ -51,6 +55,8 @@ Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, an [type:VGTID vgtid: > type:OTHER timestamp:1616748652 current_time:1616748652553883482 ] ``` +
+ Run the 305 step/script to switch writes. You will see that the [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s will include the new shards `-80` and `80-` instead of `0`: @@ -62,6 +68,8 @@ will include the new shards `-80` and `80-` instead of `0`: [type:BEGIN timestamp:1616748733 current_time:1616748733520355854 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733520403210 ] ``` +
+ Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of resharding. diff --git a/content/en/docs/15.0/reference/vreplication/internal/cutover.md b/content/en/docs/15.0/reference/vreplication/internal/cutover.md index 376ffc7dc..a54f120d3 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/15.0/reference/vreplication/internal/cutover.md @@ -70,6 +70,8 @@ $ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/RoutingRules' rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer.corder" to_tables:"commerce.corder"} rules:{from_table:"customer.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"commerce.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"corder" to_tables:"commerce.corder"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} ``` +
+ {{< info >}} In practice you would instead typically view the routing rules via the dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions/#getroutingrules) @@ -122,6 +124,8 @@ primary_term_start_time:{seconds:1627465761 nanoseconds:600070156} is_primary_serving:true ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -130,6 +134,8 @@ primary_term_start_time:{seconds:1627465833 nanoseconds:536524508} key_range:{start:"\x80"} ``` +
+ *zone1/keyspaces/customer/SrvKeyspace* ```proto @@ -156,6 +162,8 @@ primary_term_start_time:{seconds:1627466189 nanoseconds:587021377} is_primary_serving:true ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -164,6 +172,8 @@ primary_term_start_time:{seconds:1627466263 nanoseconds:16201490} key_range:{start:"\x80"}`` ``` +
+ _zone1/keyspaces/customer/SrvKeyspace_ ```proto @@ -178,6 +188,8 @@ partitions:{served_type:REPLICA } ``` +
+ #### After Primary Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchWrites) * `is_primary_serving` is removed from shard `0` @@ -193,6 +205,8 @@ primary_alias:{cell:"zone1" uid:200} primary_term_start_time:{seconds:1627466636 nanoseconds:405646818} ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -202,6 +216,8 @@ key_range:{start:"\x80"} is_primary_serving:true ``` +
+ *zone1/keyspaces/customer/SrvKeyspace* ```proto @@ -241,6 +257,8 @@ rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} ``` +
+ #### On Switching Replica Traffic to Target The routing rules for replica targeted reads are updated to map the table on the source to the target. @@ -253,6 +271,8 @@ rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} ``` +
+ #### On Switching Primary Traffic The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the @@ -268,6 +288,8 @@ rules:{from_table:"commerce.customer" to_tables:"customer.customer"} rules:{from_table:"customer" to_tables:"customer.customer"} ``` +
+ *global/keyspaces/commerce/shards/0/Shard* ```proto diff --git a/content/en/docs/15.0/reference/vreplication/internal/keys.md b/content/en/docs/15.0/reference/vreplication/internal/keys.md index 0be41810e..0b041d03f 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/15.0/reference/vreplication/internal/keys.md @@ -108,6 +108,8 @@ CREATE TABLE `entry` ( ) ``` +
+ The above is a trivial scenario. #### Source Table and Target table Share the Same PRIMARY KEY @@ -131,6 +133,8 @@ CREATE TABLE `target` ( ) ``` +
+ The differences in structure are interesting but irrelevant to VReplication's ability to copy the data. #### Subset PRIMARY KEY @@ -153,6 +157,8 @@ CREATE TABLE `target` ( ) ``` +
+ #### Superset PRIMARY KEY ```sql @@ -173,6 +179,8 @@ CREATE TABLE `target` ( ) ``` +
+ #### Different PRIMARY KEYs ```sql @@ -193,6 +201,8 @@ CREATE TABLE `target` ( ) ``` +
+ No columns are shared between the `PRIMARY KEY`s in the above. However: - `id`, covered by `source`'s PK, is found in `target` @@ -218,6 +228,8 @@ CREATE TABLE `target` ( ) ``` +
+ The only eligible solution in the above is: - Use `source`'s `PRIMARY KEY` (the column `uuid` is found in `target`) @@ -249,6 +261,8 @@ CREATE TABLE `target` ( ) ``` +
+ The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. #### Missing columns @@ -270,6 +284,8 @@ CREATE TABLE `target` ( ) ``` +
+ `target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. ## Configuring The Stream @@ -293,6 +309,8 @@ CREATE TABLE `corder` ( ) ``` +
+ And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition (prettified for readability): @@ -308,6 +326,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ In the above: - `source_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on source table @@ -340,6 +360,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"`. ### Example 3 @@ -367,6 +389,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ Note: - `source_unique_key_columns` indicates the names of columns on the source table diff --git a/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md index 5648fc2fe..278635c87 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/15.0/reference/vreplication/internal/life-of-a-stream.md @@ -99,6 +99,8 @@ T2: select * from X where pk > 10 limit 10; GTID: 110, Last PK 20 send rows to target ``` +
+ There is a gotcha here: consider that there are 10 new transactions or GTIDs between times T1 and T2. Some of these can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have only rows from PK 11 to 20) then we will have an inconsistent state on the target: the updates to rows with PK @@ -121,6 +123,8 @@ T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 send rows to target ``` +
+ Another gotcha: note that at time T3 when we selected the PKs from 11 to 20 the GTID position could have moved further! This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. @@ -145,6 +149,8 @@ T4: replicate from 111 to 112 T5: Send rows for pks 11 to 20 to target ``` +
+ This flow actually works and is the one used in Vitess VReplication! The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is diff --git a/content/en/docs/15.0/reference/vreplication/internal/tracker.md b/content/en/docs/15.0/reference/vreplication/internal/tracker.md index 4a9a73478..1aa688c12 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/15.0/reference/vreplication/internal/tracker.md @@ -168,6 +168,8 @@ T4: DML1 on table1 T5: Version Event DDL2 // gets written AFTER DML1 ``` +
+ So now on the `REPLICA`, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. ### Situation 2 diff --git a/content/en/docs/15.0/reference/vreplication/internal/vstream-stream-migration.md b/content/en/docs/15.0/reference/vreplication/internal/vstream-stream-migration.md index 54b62f9e7..f22aea00f 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/vstream-stream-migration.md +++ b/content/en/docs/15.0/reference/vreplication/internal/vstream-stream-migration.md @@ -35,6 +35,8 @@ separate terminal to stream events from the `customer` table in the `customer` k } ``` +
+ Initial events will be streamed: ```proto @@ -44,6 +46,8 @@ Initial events will be streamed: [type:BEGIN type:VGTID vgtid: > type:COMMIT ] ``` +
+ Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, and 304). The following events are now seen: ```proto @@ -51,6 +55,8 @@ Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, an [type:VGTID vgtid: > type:OTHER timestamp:1616748652 current_time:1616748652553883482 ] ``` +
+ Run the 305 step/script to switch writes. You will see that the [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s will include the new shards `-80` and `80-` instead of `0`: @@ -62,6 +68,8 @@ will include the new shards `-80` and `80-` instead of `0`: [type:BEGIN timestamp:1616748733 current_time:1616748733520355854 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733520403210 ] ``` +
+ Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of resharding. diff --git a/content/en/docs/16.0/reference/vreplication/internal/cutover.md b/content/en/docs/16.0/reference/vreplication/internal/cutover.md index 376ffc7dc..a54f120d3 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/cutover.md +++ b/content/en/docs/16.0/reference/vreplication/internal/cutover.md @@ -70,6 +70,8 @@ $ vtctlclient --server=localhost:15999 TopoCat -- --decode_proto '/RoutingRules' rules:{from_table:"corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer.corder" to_tables:"commerce.corder"} rules:{from_table:"customer.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"customer.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@replica" to_tables:"commerce.corder"} rules:{from_table:"commerce.corder@rdonly" to_tables:"commerce.corder"} rules:{from_table:"commerce.customer@rdonly" to_tables:"commerce.customer"} rules:{from_table:"corder" to_tables:"commerce.corder"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"commerce.customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer.customer" to_tables:"commerce.customer"} ``` +
+ {{< info >}} In practice you would instead typically view the routing rules via the dedicated [`GetRoutingRules`](../../../programs/vtctl/schema-version-permissions/#getroutingrules) @@ -122,6 +124,8 @@ primary_term_start_time:{seconds:1627465761 nanoseconds:600070156} is_primary_serving:true ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -130,6 +134,8 @@ primary_term_start_time:{seconds:1627465833 nanoseconds:536524508} key_range:{start:"\x80"} ``` +
+ *zone1/keyspaces/customer/SrvKeyspace* ```proto @@ -156,6 +162,8 @@ primary_term_start_time:{seconds:1627466189 nanoseconds:587021377} is_primary_serving:true ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -164,6 +172,8 @@ primary_term_start_time:{seconds:1627466263 nanoseconds:16201490} key_range:{start:"\x80"}`` ``` +
+ _zone1/keyspaces/customer/SrvKeyspace_ ```proto @@ -178,6 +188,8 @@ partitions:{served_type:REPLICA } ``` +
+ #### After Primary Traffic Is Switched Using `SwitchTraffic` (Previously Known as SwitchWrites) * `is_primary_serving` is removed from shard `0` @@ -193,6 +205,8 @@ primary_alias:{cell:"zone1" uid:200} primary_term_start_time:{seconds:1627466636 nanoseconds:405646818} ``` +
+ *global/keyspaces/customer/shards/80-/Shard* ```proto @@ -202,6 +216,8 @@ key_range:{start:"\x80"} is_primary_serving:true ``` +
+ *zone1/keyspaces/customer/SrvKeyspace* ```proto @@ -241,6 +257,8 @@ rules:{from_table:"customer@replica" to_tables:"commerce.customer"} rules:{from_table:"customer.customer@replica" to_tables:"commerce.customer"} ``` +
+ #### On Switching Replica Traffic to Target The routing rules for replica targeted reads are updated to map the table on the source to the target. @@ -253,6 +271,8 @@ rules:{from_table:"customer" to_tables:"commerce.customer"} rules:{from_table:"customer@replica" to_tables:"customer.customer"} ``` +
+ #### On Switching Primary Traffic The routing rules for default read-write traffic are updated to map the table on the source to the target. In addition the @@ -268,6 +288,8 @@ rules:{from_table:"commerce.customer" to_tables:"customer.customer"} rules:{from_table:"customer" to_tables:"customer.customer"} ``` +
+ *global/keyspaces/commerce/shards/0/Shard* ```proto diff --git a/content/en/docs/16.0/reference/vreplication/internal/keys.md b/content/en/docs/16.0/reference/vreplication/internal/keys.md index 0be41810e..0b041d03f 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/16.0/reference/vreplication/internal/keys.md @@ -108,6 +108,8 @@ CREATE TABLE `entry` ( ) ``` +
+ The above is a trivial scenario. #### Source Table and Target table Share the Same PRIMARY KEY @@ -131,6 +133,8 @@ CREATE TABLE `target` ( ) ``` +
+ The differences in structure are interesting but irrelevant to VReplication's ability to copy the data. #### Subset PRIMARY KEY @@ -153,6 +157,8 @@ CREATE TABLE `target` ( ) ``` +
+ #### Superset PRIMARY KEY ```sql @@ -173,6 +179,8 @@ CREATE TABLE `target` ( ) ``` +
+ #### Different PRIMARY KEYs ```sql @@ -193,6 +201,8 @@ CREATE TABLE `target` ( ) ``` +
+ No columns are shared between the `PRIMARY KEY`s in the above. However: - `id`, covered by `source`'s PK, is found in `target` @@ -218,6 +228,8 @@ CREATE TABLE `target` ( ) ``` +
+ The only eligible solution in the above is: - Use `source`'s `PRIMARY KEY` (the column `uuid` is found in `target`) @@ -249,6 +261,8 @@ CREATE TABLE `target` ( ) ``` +
+ The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. #### Missing columns @@ -270,6 +284,8 @@ CREATE TABLE `target` ( ) ``` +
+ `target` only has one possible key, the `PRIMARY KEY`, covering `id`. But `id` is not found in `source`. ## Configuring The Stream @@ -293,6 +309,8 @@ CREATE TABLE `corder` ( ) ``` +
+ And even though we don't _have to_, here's how we could manually configure the VReplication workflow definition (prettified for readability): @@ -308,6 +326,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ In the above: - `source_unique_key_columns` is the (comma delimited) list of columns covered by the chosen key on source table @@ -340,6 +360,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ Not much changed from the previous example, just note how we comma separate `"order_id,customer_id"`. ### Example 3 @@ -367,6 +389,8 @@ keyspace:"commerce" shard:"0" filter:{ } ``` +
+ Note: - `source_unique_key_columns` indicates the names of columns on the source table diff --git a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md index 5648fc2fe..278635c87 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md +++ b/content/en/docs/16.0/reference/vreplication/internal/life-of-a-stream.md @@ -99,6 +99,8 @@ T2: select * from X where pk > 10 limit 10; GTID: 110, Last PK 20 send rows to target ``` +
+ There is a gotcha here: consider that there are 10 new transactions or GTIDs between times T1 and T2. Some of these can potentially modify the rows returned from the query at T1. Hence if we just return the rows from T2 (which have only rows from PK 11 to 20) then we will have an inconsistent state on the target: the updates to rows with PK @@ -121,6 +123,8 @@ T3: select * from X where pk > 10 limit 10; GTID: 112, Last PK 20 send rows to target ``` +
+ Another gotcha: note that at time T3 when we selected the PKs from 11 to 20 the GTID position could have moved further! This could be due to transactions that were applied between T2 and T3. So if we just applied the rows from T3 we would still have an inconsistent state, if transactions 111 and 112 affected the rows from pks 1 to 10. @@ -145,6 +149,8 @@ T4: replicate from 111 to 112 T5: Send rows for pks 11 to 20 to target ``` +
+ This flow actually works and is the one used in Vitess VReplication! The transactions to be applied at T1 can take a long time (due to the bulk inserts). T3 (which is just a snapshot) is diff --git a/content/en/docs/16.0/reference/vreplication/internal/tracker.md b/content/en/docs/16.0/reference/vreplication/internal/tracker.md index 4a9a73478..1aa688c12 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/tracker.md +++ b/content/en/docs/16.0/reference/vreplication/internal/tracker.md @@ -168,6 +168,8 @@ T4: DML1 on table1 T5: Version Event DDL2 // gets written AFTER DML1 ``` +
+ So now on the `REPLICA`, at T4, the version historian will incorrectly provide the schema from T1 after DDL1 was applied. ### Situation 2 diff --git a/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md b/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md index 54b62f9e7..f22aea00f 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md +++ b/content/en/docs/16.0/reference/vreplication/internal/vstream-stream-migration.md @@ -35,6 +35,8 @@ separate terminal to stream events from the `customer` table in the `customer` k } ``` +
+ Initial events will be streamed: ```proto @@ -44,6 +46,8 @@ Initial events will be streamed: [type:BEGIN type:VGTID vgtid: > type:COMMIT ] ``` +
+ Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, and 304). The following events are now seen: ```proto @@ -51,6 +55,8 @@ Now run the resharding scripts and switch reads (steps/scripts 301, 302, 303, an [type:VGTID vgtid: > type:OTHER timestamp:1616748652 current_time:1616748652553883482 ] ``` +
+ Run the 305 step/script to switch writes. You will see that the [`VGTID`](https://pkg.go.dev/vitess.io/vitess/go/vt/proto/binlogdata#VGtid)s will include the new shards `-80` and `80-` instead of `0`: @@ -62,6 +68,8 @@ will include the new shards `-80` and `80-` instead of `0`: [type:BEGIN timestamp:1616748733 current_time:1616748733520355854 type:VGTID vgtid: shard_gtids: > type:COMMIT timestamp:1616748733 current_time:1616748733520403210 ] ``` +
+ Insert new rows: this will result in row events from the new shards. Shards will only stream changes from the point of resharding. From 257fa9454b118ef6cfc6d5214a6afeed6d0f1615 Mon Sep 17 00:00:00 2001 From: Matt Lord Date: Fri, 20 Jan 2023 14:40:50 -0500 Subject: [PATCH 16/16] Minor capitalization Signed-off-by: Matt Lord --- .../en/docs/13.0/reference/vreplication/internal/keys.md | 6 +++--- .../en/docs/14.0/reference/vreplication/internal/keys.md | 6 +++--- .../en/docs/15.0/reference/vreplication/internal/keys.md | 6 +++--- .../en/docs/16.0/reference/vreplication/internal/keys.md | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/content/en/docs/13.0/reference/vreplication/internal/keys.md b/content/en/docs/13.0/reference/vreplication/internal/keys.md index 0b041d03f..6cb62f6f1 100644 --- a/content/en/docs/13.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/13.0/reference/vreplication/internal/keys.md @@ -208,7 +208,7 @@ No columns are shared between the `PRIMARY KEY`s in the above. However: - `id`, covered by `source`'s PK, is found in `target` - `uuid`, covered by `target`'s PK, is found in `source` -#### Mixed keys +#### Mixed Keys ```sql CREATE TABLE `source` ( @@ -241,7 +241,7 @@ Incidentally, in the above, the chosen keys differ by name, but share the same c ### Examples of Invalid Cases -#### NULLable columns +#### NULLable Columns ```sql CREATE TABLE `source` ( @@ -265,7 +265,7 @@ CREATE TABLE `target` ( The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. 
-#### Missing columns +#### Missing Columns ```sql CREATE TABLE `source` ( diff --git a/content/en/docs/14.0/reference/vreplication/internal/keys.md b/content/en/docs/14.0/reference/vreplication/internal/keys.md index 0b041d03f..6cb62f6f1 100644 --- a/content/en/docs/14.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/14.0/reference/vreplication/internal/keys.md @@ -208,7 +208,7 @@ No columns are shared between the `PRIMARY KEY`s in the above. However: - `id`, covered by `source`'s PK, is found in `target` - `uuid`, covered by `target`'s PK, is found in `source` -#### Mixed keys +#### Mixed Keys ```sql CREATE TABLE `source` ( @@ -241,7 +241,7 @@ Incidentally, in the above, the chosen keys differ by name, but share the same c ### Examples of Invalid Cases -#### NULLable columns +#### NULLable Columns ```sql CREATE TABLE `source` ( @@ -265,7 +265,7 @@ CREATE TABLE `target` ( The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. -#### Missing columns +#### Missing Columns ```sql CREATE TABLE `source` ( diff --git a/content/en/docs/15.0/reference/vreplication/internal/keys.md b/content/en/docs/15.0/reference/vreplication/internal/keys.md index 0b041d03f..6cb62f6f1 100644 --- a/content/en/docs/15.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/15.0/reference/vreplication/internal/keys.md @@ -208,7 +208,7 @@ No columns are shared between the `PRIMARY KEY`s in the above. However: - `id`, covered by `source`'s PK, is found in `target` - `uuid`, covered by `target`'s PK, is found in `source` -#### Mixed keys +#### Mixed Keys ```sql CREATE TABLE `source` ( @@ -241,7 +241,7 @@ Incidentally, in the above, the chosen keys differ by name, but share the same c ### Examples of Invalid Cases -#### NULLable columns +#### NULLable Columns ```sql CREATE TABLE `source` ( @@ -265,7 +265,7 @@ CREATE TABLE `target` ( The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. 
-#### Missing columns +#### Missing Columns ```sql CREATE TABLE `source` ( diff --git a/content/en/docs/16.0/reference/vreplication/internal/keys.md b/content/en/docs/16.0/reference/vreplication/internal/keys.md index 0b041d03f..6cb62f6f1 100644 --- a/content/en/docs/16.0/reference/vreplication/internal/keys.md +++ b/content/en/docs/16.0/reference/vreplication/internal/keys.md @@ -208,7 +208,7 @@ No columns are shared between the `PRIMARY KEY`s in the above. However: - `id`, covered by `source`'s PK, is found in `target` - `uuid`, covered by `target`'s PK, is found in `source` -#### Mixed keys +#### Mixed Keys ```sql CREATE TABLE `source` ( @@ -241,7 +241,7 @@ Incidentally, in the above, the chosen keys differ by name, but share the same c ### Examples of Invalid Cases -#### NULLable columns +#### NULLable Columns ```sql CREATE TABLE `source` ( @@ -265,7 +265,7 @@ CREATE TABLE `target` ( The only `UNIQUE KEY` on `target` is `NULL`able, hence _not_ eligible. -#### Missing columns +#### Missing Columns ```sql CREATE TABLE `source` (